Feat: full optimization on connector dashboard (#14979)

### What problem does this PR solve?

This PR improves the connector dashboard task management experience and
adds better visibility into connector execution logs.

### Overview:

#### Before
<img width="700" alt="image"
src="https://github.com/user-attachments/assets/e4a8ed6f-2e18-4f0f-8528-41a514550052"
/>

#### Now:
<img width="700" alt="Screenshot from 2026-05-18 16-31-30"
src="https://github.com/user-attachments/assets/d4ca193b-847a-49ae-9e4f-5fbca60ea627"
/>

### 1. Add a new logging page to the connector dashboard

A new logging page has been added so users can view connector task
execution logs directly from the connector dashboard.

### 2. Merge the Resume button into Confirm

The separate **Resume** button has been removed. The **Confirm** button
now represents different actions depending on the current task state:

- **Save**: Save form changes and reschedule tasks.
- **Stop**: Cancel currently scheduled or running tasks.
- **Resume**: Create new scheduled tasks after the previous tasks have
been stopped.
- **Start**: Start tasks when no task has been started yet.

### 3. Separate syncing and pruning tasks

Connector tasks are now separated into **syncing** and **pruning**.

Pruning is controlled by the **Sync deleted files** option:

- When **Sync deleted files** is disabled, only syncing tasks are shown.
- When **Sync deleted files** is enabled, both syncing and pruning tasks
are shown.

**Now: Sync deleted files disabled**

<img width="700" alt="Sync deleted files disabled"
src="https://github.com/user-attachments/assets/dbd9232e-614a-407f-a0b1-c109e5fa567d"
/>

**Now: Sync deleted files enabled**

<img width="700" alt="Sync deleted files enabled"
src="https://github.com/user-attachments/assets/1f527f48-ccb3-4ee8-97ca-086891489296"
/>

### 4. Update logs in backend

<img width="700" alt="image"
src="https://github.com/user-attachments/assets/10a95a3f-98c1-4e67-8afa-ddf6cda5b0b2"
/>

### 5. Remove connector resume API

- Removed: `POST /v1/connectors/<connector_id>/resume`
- Replaced by: `PATCH /v1/connectors/<connector_id>`


### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Magicbook1108
2026-05-19 10:07:11 +08:00
committed by GitHub
parent 41a9fc0030
commit b69a6a5d80
15 changed files with 706 additions and 557 deletions

View File

@@ -53,17 +53,34 @@ async def update_connector(connector_id):
return _connector_auth_error(connector_id, current_user.id)
req = await get_request_json()
if isinstance(req, dict) and isinstance(req.get("data"), dict):
req = req["data"]
e, conn = ConnectorService.get_by_id(connector_id)
if not e:
return get_data_error_result(message="Can't find this Connector!")
should_sleep = False
if req:
conn = {fld: req[fld] for fld in ["prune_freq", "refresh_freq", "config", "timeout_secs"] if fld in req}
conn["id"] = connector_id
ConnectorService.update_by_id(connector_id, conn)
update_fields = {fld: req[fld] for fld in ["prune_freq", "refresh_freq", "config", "timeout_secs"] if fld in req}
if update_fields:
update_fields["id"] = connector_id
ConnectorService.update_by_id(connector_id, update_fields)
should_sleep = True
await asyncio.sleep(1)
if req.get("reschedule"):
ConnectorService.cancel_tasks(connector_id)
ConnectorService.schedule_tasks(connector_id)
elif req.get("status") in [TaskStatus.CANCEL, "CANCEL"]:
ConnectorService.cancel_tasks(connector_id)
elif req.get("status") in [TaskStatus.SCHEDULE, "SCHEDULE"]:
ConnectorService.schedule_tasks(connector_id)
if should_sleep:
await asyncio.sleep(1)
e, conn = ConnectorService.get_by_id(connector_id)
if not e:
return get_data_error_result(message="Can't find this Connector!")
return get_json_result(data=conn.to_dict())
@@ -83,9 +100,9 @@ async def create_connector():
"input_type": InputType.POLL,
"config": req["config"],
"refresh_freq": int(req.get("refresh_freq", 5)),
"prune_freq": int(req.get("prune_freq", 720)),
"prune_freq": int(req.get("prune_freq", 5)),
"timeout_secs": int(req.get("timeout_secs", 60 * 29)),
"status": TaskStatus.SCHEDULE,
"status": TaskStatus.UNSTART,
}
ConnectorService.save(**conn)
@@ -127,21 +144,6 @@ def list_logs(connector_id):
return get_json_result(data={"total": total, "logs": arr})
@manager.route("/connectors/<connector_id>/resume", methods=["POST"]) # noqa: F821
@login_required
async def resume(connector_id):
"""Resume or cancel sync for an accessible connector."""
if not ConnectorService.accessible(connector_id, current_user.id):
return _connector_auth_error(connector_id, current_user.id)
req = await get_request_json()
if req.get("resume"):
ConnectorService.resume(connector_id, TaskStatus.SCHEDULE)
else:
ConnectorService.resume(connector_id, TaskStatus.CANCEL)
return get_json_result(data=True)
@manager.route("/connectors/<connector_id>/rebuild", methods=["POST"]) # noqa: F821
@login_required
async def rebuild(connector_id):
@@ -166,7 +168,7 @@ def rm_connector(connector_id):
if not ConnectorService.accessible(connector_id, current_user.id):
return _connector_auth_error(connector_id, current_user.id)
ConnectorService.resume(connector_id, TaskStatus.CANCEL)
ConnectorService.cancel_tasks(connector_id)
ConnectorService.delete_by_id(connector_id)
return get_json_result(data=True)

View File

@@ -1224,6 +1224,7 @@ class DateTimeTzField(CharField):
class SyncLogs(DataBaseModel):
id = CharField(max_length=32, primary_key=True)
connector_id = CharField(max_length=32, index=True)
task_type = CharField(max_length=32, null=False, default="sync", index=True)
status = CharField(max_length=128, null=False, help_text="Processing status", index=True)
from_beginning = CharField(max_length=1, null=True, help_text="", default="0", index=False)
new_docs_indexed = IntegerField(default=0, index=False)
@@ -1632,6 +1633,7 @@ def migrate_db():
alter_db_add_column(migrator, "llm_factories", "rank", IntegerField(default=0, index=False))
alter_db_add_column(migrator, "api_4_conversation", "name", CharField(max_length=255, null=True, help_text="conversation name", index=False))
alter_db_add_column(migrator, "api_4_conversation", "exp_user_id", CharField(max_length=255, null=True, help_text="exp_user_id", index=True))
alter_db_add_column(migrator, "sync_logs", "task_type", CharField(max_length=32, null=False, default="sync", index=True))
# Migrate system_settings.value from CharField to TextField for longer sandbox configs
alter_db_column_type(migrator, "system_settings", "value", TextField(null=False, help_text="Configuration value (JSON, string, etc.)"))
alter_db_add_column(migrator, "document", "content_hash", CharField(max_length=32, null=True, help_text="xxhash128 of document content for change detection", default="", index=True))

View File

@@ -28,7 +28,7 @@ from api.db.services.document_service import DocumentService
from api.db.services.document_service import DocMetadataService
from api.utils.common import hash128
from common.misc_utils import get_uuid
from common.constants import TaskStatus
from common.constants import ConnectorTaskType, TaskStatus
from common.settings import TIMEZONE
from common.time_utils import current_timestamp, timestamp_to_date
@@ -38,6 +38,33 @@ LOGGER = logging.getLogger(__name__)
class ConnectorService(CommonService):
model = Connector
@classmethod
def cancel_tasks(cls, connector_id):
e, conn = cls.get_by_id(connector_id)
if not e:
return
logging.info(
"[Connector] stop connector=%s(%s)",
conn.name,
connector_id,
)
for c2k in Connector2KbService.query(connector_id=connector_id):
SyncLogsService.filter_update(
[
SyncLogs.connector_id == connector_id,
SyncLogs.kb_id == c2k.kb_id,
SyncLogs.status.in_([TaskStatus.SCHEDULE, TaskStatus.RUNNING]),
],
{"status": TaskStatus.CANCEL},
)
ConnectorService.update_by_id(connector_id, {"status": TaskStatus.CANCEL})
logging.info(
"[Connector] connector=%s status updated to %s",
connector_id,
TaskStatus.CANCEL,
)
@classmethod
@DB.connection_context()
def accessible(cls, connector_id: str, user_id: str) -> bool:
@@ -64,25 +91,39 @@ class ConnectorService(CommonService):
return has_access
@classmethod
def resume(cls, connector_id, status):
def schedule_tasks(cls, connector_id):
e, conn = cls.get_by_id(connector_id)
if not e:
return
logging.info("[Connector] schedule connector=%s(%s)", conn.name, connector_id)
prune_enabled = bool((conn.config or {}).get("sync_deleted_files"))
for c2k in Connector2KbService.query(connector_id=connector_id):
task = SyncLogsService.get_latest_task(connector_id, c2k.kb_id)
if not task:
if status == TaskStatus.SCHEDULE:
SyncLogsService.schedule(connector_id, c2k.kb_id)
ConnectorService.update_by_id(connector_id, {"status": status})
return
sync_task = SyncLogsService.get_latest_task(
connector_id,
c2k.kb_id,
ConnectorTaskType.SYNC,
)
poll_range_start = None
total_docs_indexed = 0
if sync_task and sync_task.status == TaskStatus.DONE:
poll_range_start = sync_task.poll_range_end
total_docs_indexed = sync_task.total_docs_indexed
if task.status == TaskStatus.DONE:
if status == TaskStatus.SCHEDULE:
SyncLogsService.schedule(connector_id, c2k.kb_id, task.poll_range_end, total_docs_indexed=task.total_docs_indexed)
ConnectorService.update_by_id(connector_id, {"status": status})
return
SyncLogsService.schedule(
connector_id,
c2k.kb_id,
poll_range_start,
total_docs_indexed=total_docs_indexed,
task_type=ConnectorTaskType.SYNC,
)
task = task.to_dict()
task["status"] = status
SyncLogsService.update_by_id(task["id"], task)
ConnectorService.update_by_id(connector_id, {"status": status})
if prune_enabled:
SyncLogsService.schedule(
connector_id,
c2k.kb_id,
task_type=ConnectorTaskType.PRUNE,
)
@classmethod
def list(cls, tenant_id):
@@ -105,7 +146,9 @@ class ConnectorService(CommonService):
SyncLogsService.filter_delete([SyncLogs.connector_id==connector_id, SyncLogs.kb_id==kb_id])
docs = DocumentService.query(source_type=f"{conn.source}/{conn.id}", kb_id=kb_id)
err = FileService.delete_docs([d.id for d in docs], tenant_id)
SyncLogsService.schedule(connector_id, kb_id, reindex=True)
SyncLogsService.schedule(connector_id, kb_id, reindex=True, task_type=ConnectorTaskType.SYNC)
if (conn.config or {}).get("sync_deleted_files"):
SyncLogsService.schedule(connector_id, kb_id, task_type=ConnectorTaskType.PRUNE)
return err
@classmethod
@@ -170,30 +213,25 @@ class ConnectorService(CommonService):
class SyncLogsService(CommonService):
model = SyncLogs
@classmethod
def list_sync_tasks(cls, connector_id=None, page_number=None, items_per_page=15) -> Tuple[List[dict], int]:
fields = [
cls.model.id,
cls.model.connector_id,
cls.model.task_type,
cls.model.kb_id,
cls.model.update_date,
cls.model.poll_range_start,
cls.model.poll_range_end,
cls.model.new_docs_indexed,
cls.model.total_docs_indexed,
cls.model.docs_removed_from_index,
cls.model.error_msg,
cls.model.full_exception_trace,
cls.model.error_count,
Connector.name,
Connector.source,
Connector.tenant_id,
Connector.timeout_secs,
cls.model.time_started.alias("time_started"),
Connector.refresh_freq.alias("refresh_freq"),
Connector.prune_freq.alias("prune_freq"),
Knowledgebase.name.alias("kb_name"),
Knowledgebase.avatar.alias("kb_avatar"),
Connector2Kb.auto_parse,
cls.model.from_beginning.alias("reindex"),
cls.model.status,
cls.model.update_time
]
if not connector_id:
fields.append(Connector.config)
@@ -225,6 +263,80 @@ class SyncLogsService(CommonService):
return list(query.dicts()), total
@classmethod
def list_due_sync_tasks(cls) -> List[dict]:
return cls._list_due_tasks_for_freq(
ConnectorTaskType.SYNC,
"refresh_freq",
)
@classmethod
def list_due_prune_tasks(cls) -> List[dict]:
tasks = cls._list_due_tasks_for_freq(
ConnectorTaskType.PRUNE,
"prune_freq",
)
return [
task for task in tasks
# Prune is opt-in at the connector config level; keep the scheduler
# blind to prune_freq until the flag is enabled.
if bool((task.get("config") or {}).get("sync_deleted_files"))
and int(task.get("prune_freq") or 0) > 0
]
@classmethod
def _list_due_tasks_for_freq(cls, task_type: str, freq_field: str) -> List[dict]:
fields = [
cls.model.id,
cls.model.connector_id,
cls.model.task_type,
cls.model.kb_id,
cls.model.update_date,
cls.model.poll_range_start,
cls.model.poll_range_end,
cls.model.new_docs_indexed,
cls.model.total_docs_indexed,
cls.model.error_msg,
cls.model.full_exception_trace,
cls.model.error_count,
Connector.name,
Connector.source,
Connector.tenant_id,
Connector.timeout_secs,
Connector.config,
Connector.refresh_freq,
Connector.prune_freq,
Knowledgebase.name.alias("kb_name"),
Knowledgebase.avatar.alias("kb_avatar"),
Connector2Kb.auto_parse,
cls.model.from_beginning.alias("reindex"),
cls.model.status,
cls.model.update_time,
]
query = cls.model.select(*fields)\
.join(Connector, on=(cls.model.connector_id==Connector.id))\
.join(Connector2Kb, on=(cls.model.kb_id==Connector2Kb.kb_id))\
.join(Knowledgebase, on=(cls.model.kb_id==Knowledgebase.id))
query = query.where(
Connector.input_type == InputType.POLL,
Connector.status == TaskStatus.SCHEDULE,
cls.model.status == TaskStatus.SCHEDULE,
cls.model.task_type == task_type,
)
database_type = os.getenv("DB_TYPE", "mysql")
if "postgres" in database_type.lower():
expr = SQL(
f"NOW() AT TIME ZONE '{TIMEZONE}' - make_interval(mins => t2.{freq_field})"
)
else:
expr = SQL(f"NOW() - INTERVAL `t2`.`{freq_field}` MINUTE")
query = query.where(cls.model.update_date < expr)
return list(query.distinct().order_by(cls.model.update_time.desc()).dicts())
@classmethod
def start(cls, id, connector_id):
cls.update_by_id(id, {"status": TaskStatus.RUNNING, "time_started": datetime.now().strftime('%Y-%m-%d %H:%M:%S') })
@@ -236,7 +348,15 @@ class SyncLogsService(CommonService):
ConnectorService.update_by_id(connector_id, {"status": TaskStatus.DONE})
@classmethod
def schedule(cls, connector_id, kb_id, poll_range_start=None, reindex=False, total_docs_indexed=0):
def schedule(
cls,
connector_id,
kb_id,
poll_range_start=None,
reindex=False,
total_docs_indexed=0,
task_type=ConnectorTaskType.SYNC,
):
try:
if cls.model.select().where(cls.model.kb_id == kb_id, cls.model.connector_id == connector_id).count() > 100:
rm_ids = [m.id for m in cls.model.select(cls.model.id).where(cls.model.kb_id == kb_id, cls.model.connector_id == connector_id).order_by(cls.model.update_time.asc()).limit(70)]
@@ -246,21 +366,33 @@ class SyncLogsService(CommonService):
logging.exception(e)
try:
e = cls.query(kb_id=kb_id, connector_id=connector_id, status=TaskStatus.SCHEDULE)
e = cls.query(
kb_id=kb_id,
connector_id=connector_id,
status=TaskStatus.SCHEDULE,
task_type=task_type,
)
if e:
logging.warning(f"{kb_id}--{connector_id} has already had a scheduling sync task which is abnormal.")
logging.warning(
"%s--%s already has a scheduled %s task.",
kb_id,
connector_id,
task_type,
)
return None
reindex = "1" if reindex else "0"
ConnectorService.update_by_id(connector_id, {"status": TaskStatus.SCHEDULE})
return cls.save(**{
"id": get_uuid(),
"kb_id": kb_id, "status": TaskStatus.SCHEDULE, "connector_id": connector_id,
"task_type": task_type,
"poll_range_start": poll_range_start, "from_beginning": reindex,
"total_docs_indexed": total_docs_indexed
"total_docs_indexed": total_docs_indexed,
"time_started": datetime.now().strftime('%Y-%m-%d %H:%M:%S')
})
except Exception as e:
logging.exception(e)
task = cls.get_latest_task(connector_id, kb_id)
task = cls.get_latest_task(connector_id, kb_id, task_type)
if task:
cls.model.update(status=TaskStatus.SCHEDULE,
poll_range_start=poll_range_start,
@@ -337,11 +469,14 @@ class SyncLogsService(CommonService):
return errs, doc_ids
@classmethod
def get_latest_task(cls, connector_id, kb_id):
return cls.model.select().where(
def get_latest_task(cls, connector_id, kb_id, task_type=None):
query = cls.model.select().where(
cls.model.connector_id==connector_id,
cls.model.kb_id == kb_id
).order_by(cls.model.update_time.desc()).first()
)
if task_type is not None:
query = query.where(cls.model.task_type == task_type)
return query.order_by(cls.model.update_time.desc()).first()
class Connector2KbService(CommonService):
@@ -364,7 +499,10 @@ class Connector2KbService(CommonService):
"kb_id": kb_id,
"auto_parse": conn.get("auto_parse", "1")
})
SyncLogsService.schedule(conn_id, kb_id, reindex=True)
SyncLogsService.schedule(conn_id, kb_id, reindex=True, task_type=ConnectorTaskType.SYNC)
e, full_conn = ConnectorService.get_by_id(conn_id)
if e and (full_conn.config or {}).get("sync_deleted_files"):
SyncLogsService.schedule(conn_id, kb_id, task_type=ConnectorTaskType.PRUNE)
errs = []
for conn_id in old_conn_ids:

View File

@@ -93,6 +93,11 @@ class TaskStatus(StrEnum):
VALID_TASK_STATUS = {TaskStatus.UNSTART, TaskStatus.RUNNING, TaskStatus.CANCEL, TaskStatus.DONE, TaskStatus.FAIL, TaskStatus.SCHEDULE}
class ConnectorTaskType(StrEnum):
SYNC = "sync"
PRUNE = "prune"
class ParserType(StrEnum):
PRESENTATION = "presentation"
LAWS = "laws"

View File

@@ -41,7 +41,7 @@ from api.db.services.connector_service import ConnectorService, SyncLogsService
from api.db.services.document_service import DocumentService
from api.db.services.knowledgebase_service import KnowledgebaseService
from common import settings
from common.constants import FileSource, TaskStatus
from common.constants import ConnectorTaskType, FileSource, TaskStatus
from common.config_utils import show_configs
from common.data_source.config import INDEX_BATCH_SIZE
from common.data_source import (
@@ -76,8 +76,6 @@ from common.log_utils import init_root_logger
from common.signal_utils import start_tracemalloc_and_snapshot, stop_tracemalloc
from common.versions import get_ragflow_version
from box_sdk_gen import BoxOAuth, OAuthConfig, AccessToken
from collections import namedtuple
MAX_CONCURRENT_TASKS = int(os.environ.get("MAX_CONCURRENT_TASKS", "5"))
task_limiter = asyncio.Semaphore(MAX_CONCURRENT_TASKS)
@@ -157,30 +155,37 @@ class SyncBase:
})
return
SyncLogsService.schedule(task["connector_id"], task["kb_id"], task["poll_range_start"])
task_type = task.get("task_type", ConnectorTaskType.SYNC)
if task_type == ConnectorTaskType.SYNC:
SyncLogsService.schedule(
task["connector_id"],
task["kb_id"],
task.get("poll_range_start"),
task_type=ConnectorTaskType.SYNC,
)
elif task_type == ConnectorTaskType.PRUNE and self.conf.get("sync_deleted_files"):
SyncLogsService.schedule(
task["connector_id"],
task["kb_id"],
task_type=ConnectorTaskType.PRUNE,
)
async def _run_task_logic(self, task: dict):
task_type = task.get("task_type", ConnectorTaskType.SYNC)
if task_type == ConnectorTaskType.PRUNE:
await self._run_prune_task_logic(task)
return
await self._run_sync_task_logic(task)
async def _run_sync_task_logic(self, task: dict):
"""
Executes the core synchronization pipeline for a data source task.
This method retrieves documents from the external source via the `_generate` method,
parses and upserts them into the Knowledge Base (KB), and handles stale document
reconciliation (sync deletion) if a remote snapshot (`file_list`) is provided.
"""
generate_output = await self._generate(task)
# `_generate()` currently supports two outputs:
# 1. `document_batch_generator`
# 2. `(document_batch_generator, file_list)`
if isinstance(generate_output, tuple):
document_batch_generator, file_list = generate_output
else:
document_batch_generator = generate_output
file_list = None
document_batch_generator = await self._generate(task)
failed_docs = 0
added_docs = 0
updated_docs = 0
removed_docs = 0
next_update = datetime(1970, 1, 1, tzinfo=timezone.utc)
source_type = f"{self.SOURCE_NAME}/{task['connector_id']}"
existing_doc_ids = {
@@ -252,34 +257,12 @@ class SyncBase:
prefix = self._get_source_prefix()
prefix = f"{prefix} " if prefix else ""
next_update_info = self._format_window_boundary(next_update)
expects_deleted_file_snapshot = (
task.get("reindex") != "1"
and task.get("poll_range_start")
and self.conf.get("sync_deleted_files")
)
cleanup_errors = []
if expects_deleted_file_snapshot and file_list is None:
logging.warning(
"%s deleted-file snapshot retrieval failed "
"(connector_id=%s, kb_id=%s)",
self.SOURCE_NAME,
task["connector_id"],
task["kb_id"],
)
elif file_list is not None:
removed_docs, cleanup_errors = ConnectorService.cleanup_stale_documents_for_task(
task["id"],
task["connector_id"],
task["kb_id"],
task["tenant_id"],
file_list,
)
total_changed_docs = added_docs + updated_docs + removed_docs
total_changed_docs = added_docs + updated_docs
summary = (
f"{prefix}sync summary till {next_update_info}: "
f"total={total_changed_docs}, added={added_docs}, "
f"updated={updated_docs}, deleted={removed_docs}"
f"updated={updated_docs}"
)
if failed_docs > 0:
summary = f"{summary}, skipped={failed_docs}"
@@ -288,19 +271,80 @@ class SyncBase:
if (
isinstance(self, _RDBMSBase)
and failed_docs == 0
and (not expects_deleted_file_snapshot or file_list is not None)
and not cleanup_errors
):
self.connector.persist_sync_state()
SyncLogsService.done(task["id"], task["connector_id"])
task["poll_range_start"] = next_update
async def _run_prune_task_logic(self, task: dict):
if not self.conf.get("sync_deleted_files"):
SyncLogsService.done(task["id"], task["connector_id"])
return
await self._initialize_for_prune(task)
file_list = self._collect_prune_snapshot(task)
if file_list is None:
logging.warning(
"%s prune snapshot retrieval failed (connector_id=%s, kb_id=%s)",
self.SOURCE_NAME,
task["connector_id"],
task["kb_id"],
)
SyncLogsService.done(task["id"], task["connector_id"])
return
removed_docs, cleanup_errors = ConnectorService.cleanup_stale_documents_for_task(
task["id"],
task["connector_id"],
task["kb_id"],
task["tenant_id"],
file_list,
)
logging.info(
"%s prune summary: deleted=%s, errors=%s",
self.SOURCE_NAME,
removed_docs,
len(cleanup_errors),
)
SyncLogsService.done(task["id"], task["connector_id"])
async def _generate(self, task: dict):
raise NotImplementedError
def _get_source_prefix(self):
return ""
async def _initialize_for_prune(self, task: dict):
await self._generate(task)
def _get_prune_snapshot_kwargs(self, task: dict) -> dict[str, Any]:
return {}
def _collect_prune_snapshot(self, task: dict):
if not getattr(self, "connector", None):
return None
if not hasattr(self.connector, "retrieve_all_slim_docs_perm_sync"):
return None
file_list = []
snapshot_kwargs = self._get_prune_snapshot_kwargs(task)
try:
for slim_batch in self.connector.retrieve_all_slim_docs_perm_sync(**snapshot_kwargs):
file_list.extend(slim_batch)
except TypeError:
for slim_batch in self.connector.retrieve_all_slim_docs_perm_sync():
file_list.extend(slim_batch)
except Exception:
logging.exception(
"%s prune snapshot failed (connector_id=%s, kb_id=%s)",
self.SOURCE_NAME,
task["connector_id"],
task["kb_id"],
)
return None
return file_list
class _BlobLikeBase(SyncBase):
DEFAULT_BUCKET_TYPE: str = "s3"
@@ -391,7 +435,6 @@ class _BlobLikeBase(SyncBase):
self.connector.set_allow_images(self.conf.get("allow_images", False))
self.connector.load_credentials(self.conf["credentials"])
file_list = None
# Fingerprint-bypass path: skip GetObject for unchanged ETags. Disabled
# on full reindex (we want to re-fetch everything in that case).
use_fingerprint_path = task["reindex"] != "1"
@@ -400,15 +443,6 @@ class _BlobLikeBase(SyncBase):
else:
document_batch_generator = self.connector.load_from_state()
if (
task["reindex"] != "1"
and task["poll_range_start"]
and self.conf.get("sync_deleted_files")
):
file_list = []
for slim_batch in self.connector.retrieve_all_slim_docs_perm_sync():
file_list.extend(slim_batch)
_begin_info = (
"fingerprint-bypass"
if use_fingerprint_path
@@ -423,7 +457,7 @@ class _BlobLikeBase(SyncBase):
_begin_info,
)
)
return document_batch_generator, file_list
return document_batch_generator
class S3(_BlobLikeBase):
@@ -461,28 +495,11 @@ class RSS(SyncBase):
return self.connector.load_from_state()
end_time = datetime.now(timezone.utc).timestamp()
file_list = None
if self.conf.get("sync_deleted_files"):
logging.info(
"[RSS] Syncing deleted files via slim snapshot (connector_id=%s)",
task["connector_id"],
)
snapshot_start = time.perf_counter()
file_list = []
for slim_batch in self.connector.retrieve_all_slim_docs_perm_sync():
file_list.extend(slim_batch)
logging.info(
"[RSS] Slim snapshot fetched %d docs in %.2f seconds",
len(file_list),
time.perf_counter() - snapshot_start,
)
document_generator = self.connector.poll_source(
task["poll_range_start"].timestamp(),
end_time,
)
if file_list is not None:
return document_generator, file_list
return document_generator
@@ -525,16 +542,11 @@ class Confluence(SyncBase):
credential_json=self.conf["credentials"])
self.connector.set_credentials_provider(credentials_provider)
file_list = None
# Determine the time range for synchronization based on reindex or poll_range_start
if task["reindex"] == "1" or not task["poll_range_start"]:
start_time = 0.0
else:
start_time = task["poll_range_start"].timestamp()
if self.conf.get("sync_deleted_files"):
file_list = []
for slim_batch in self.connector.retrieve_all_slim_docs_perm_sync():
file_list.extend(slim_batch)
end_time = datetime.now(timezone.utc).timestamp()
@@ -580,7 +592,7 @@ class Confluence(SyncBase):
yield batch
self.log_connection("Confluence", self.conf["wiki_base"], task)
return wrapper(), file_list
return wrapper()
class Notion(SyncBase):
@@ -589,7 +601,6 @@ class Notion(SyncBase):
async def _generate(self, task: dict):
self.connector = NotionConnector(root_page_id=self.conf["root_page_id"])
self.connector.load_credentials(self.conf["credentials"])
file_list = None
document_generator = (
self.connector.load_from_state()
if task["reindex"] == "1" or not task["poll_range_start"]
@@ -597,19 +608,10 @@ class Notion(SyncBase):
datetime.now(timezone.utc).timestamp())
)
if (
task["reindex"] != "1"
and task["poll_range_start"]
and self.conf.get("sync_deleted_files")
):
file_list = []
for slim_batch in self.connector.retrieve_all_slim_docs_perm_sync():
file_list.extend(slim_batch)
_begin_info = "totally" if task["reindex"] == "1" or not task["poll_range_start"] else "from {}".format(
task["poll_range_start"])
self.log_connection("Notion", f"root({self.conf['root_page_id']})", task)
return document_generator, file_list
return document_generator
class Discord(SyncBase):
@@ -627,26 +629,17 @@ class Discord(SyncBase):
batch_size=self.conf.get("batch_size", 1024),
)
self.connector.load_credentials(self.conf["credentials"])
file_list = None
document_generator = (
self.connector.load_from_state()
if task["reindex"] == "1" or not task["poll_range_start"]
else self.connector.poll_source(task["poll_range_start"].timestamp(),
datetime.now(timezone.utc).timestamp())
)
if (
task["reindex"] != "1"
and task["poll_range_start"]
and self.conf.get("sync_deleted_files")
):
file_list = []
for slim_batch in self.connector.retrieve_all_slim_docs_perm_sync():
file_list.extend(slim_batch)
_begin_info = "totally" if task["reindex"] == "1" or not task["poll_range_start"] else "from {}".format(
task["poll_range_start"])
self.log_connection("Discord", f"servers({server_ids}), channel({channel_names})", task)
return document_generator, file_list
return document_generator
class Gmail(SyncBase):
@@ -685,8 +678,6 @@ class Gmail(SyncBase):
task["connector_id"],
)
file_list = None
# Decide between full reindex and incremental polling by time range.
if task["reindex"] == "1" or not task.get("poll_range_start"):
start_time = None
@@ -706,17 +697,13 @@ class Gmail(SyncBase):
end_time = datetime.now(timezone.utc).timestamp()
_begin_info = f"from {poll_start}"
document_generator = self.connector.poll_source(start_time, end_time)
if self.conf.get("sync_deleted_files"):
file_list = []
for slim_batch in self.connector.retrieve_all_slim_docs_perm_sync():
file_list.extend(slim_batch)
try:
admin_email = self.connector.primary_admin_email
except RuntimeError:
admin_email = "unknown"
self.log_connection("Gmail", f"as {admin_email}", task)
return document_generator, file_list
return document_generator
class Dropbox(SyncBase):
@@ -726,22 +713,16 @@ class Dropbox(SyncBase):
self.connector = DropboxConnector(batch_size=self.conf.get("batch_size", INDEX_BATCH_SIZE))
self.connector.load_credentials(self.conf["credentials"])
poll_start = task["poll_range_start"]
file_list = None
if task["reindex"] == "1" or not poll_start:
document_generator = self.connector.load_from_state()
_begin_info = "totally"
else:
end_time = datetime.now(timezone.utc).timestamp()
if self.conf.get("sync_deleted_files"):
file_list = []
for slim_batch in self.connector.retrieve_all_slim_docs_perm_sync():
file_list.extend(slim_batch)
document_generator = self.connector.poll_source(poll_start.timestamp(), end_time)
_begin_info = f"from {poll_start}"
self.log_connection("Dropbox", "workspace", task)
return document_generator, file_list
return document_generator
class GoogleDrive(SyncBase):
@@ -775,8 +756,6 @@ class GoogleDrive(SyncBase):
if new_credentials:
self._persist_rotated_credentials(task["connector_id"], new_credentials)
file_list = None
# Capture end_time BEFORE the snapshot to prevent the ingestion race condition
end_time = datetime.now(timezone.utc).timestamp()
@@ -786,18 +765,6 @@ class GoogleDrive(SyncBase):
else:
start_time = task["poll_range_start"].timestamp()
_begin_info = f"from {task['poll_range_start']}"
if self.conf.get("sync_deleted_files"):
file_list = []
SlimDoc = namedtuple('SlimDoc', ['id'])
# Add observability timing so operators can track the O(N) cost
snapshot_start = time.perf_counter()
for slim_batch in self.connector.retrieve_all_slim_docs_perm_sync():
file_list.extend(SlimDoc(doc.id) for doc in slim_batch)
logging.info("Slim snapshot fetched %d files in %.2f seconds", len(file_list), time.perf_counter() - snapshot_start)
raw_batch_size = self.conf.get("sync_batch_size") or self.conf.get("batch_size") or INDEX_BATCH_SIZE
try:
@@ -843,7 +810,7 @@ class GoogleDrive(SyncBase):
admin_email = "unknown"
self.log_connection("Google Drive", f"as {admin_email}", task)
return document_batches(), file_list
return document_batches()
def _persist_rotated_credentials(self, connector_id: str, credentials: dict[str, Any]) -> None:
"""Saves refreshed OAuth credentials back to the database configuration."""
@@ -886,17 +853,12 @@ class Jira(SyncBase):
self.connector.load_credentials(credentials)
self.connector.validate_connector_settings()
file_list = None
if task["reindex"] == "1" or not task["poll_range_start"]:
start_time = 0.0
_begin_info = "totally"
else:
start_time = task["poll_range_start"].timestamp()
if self.conf.get("sync_deleted_files"):
file_list = []
for slim_batch in self.connector.retrieve_all_slim_docs_perm_sync():
file_list.extend(slim_batch)
_begin_info = f"from {task['poll_range_start']}"
end_time = datetime.now(timezone.utc).timestamp()
@@ -955,7 +917,7 @@ class Jira(SyncBase):
f"overlap_buffer_s={getattr(self.connector, 'time_buffer_seconds', connector_kwargs.get('time_buffer_seconds'))}"
),
)
return document_batches(), file_list
return document_batches()
@staticmethod
def _normalize_list(values: Any) -> list[str] | None:
@@ -1007,25 +969,11 @@ class WebDAV(SyncBase):
self.connector.set_allow_images(self.conf.get("allow_images", False))
self.connector.load_credentials(self.conf["credentials"])
file_list = None
if task["reindex"] == "1" or not task["poll_range_start"]:
document_batch_generator = self.connector.load_from_state()
_begin_info = "totally"
else:
end_ts = datetime.now(timezone.utc).timestamp()
if self.conf.get("sync_deleted_files"):
file_list = []
try:
for slim_batch in self.connector.retrieve_all_slim_docs_perm_sync():
file_list.extend(slim_batch)
except Exception:
logging.exception(
"WebDAV slim snapshot failed; continuing without stale-document cleanup "
"(connector_id=%s, kb_id=%s)",
task["connector_id"],
task["kb_id"],
)
file_list = None
document_batch_generator = self.connector.poll_source(
task["poll_range_start"].timestamp(),
end_ts,
@@ -1038,7 +986,7 @@ class WebDAV(SyncBase):
for document_batch in document_batch_generator:
yield document_batch
return wrapper(), file_list
return wrapper()
class Moodle(SyncBase):
@@ -1054,7 +1002,6 @@ class Moodle(SyncBase):
# Determine the time range for synchronization based on reindex or poll_range_start
poll_start = task.get("poll_range_start")
file_list = None
if task["reindex"] == "1" or poll_start is None:
document_generator = self.connector.load_from_state()
@@ -1066,20 +1013,6 @@ class Moodle(SyncBase):
# could be polled as new and at the same time be missing from
# the slim list, which would mark it as stale and delete it.
end_ts = datetime.now(timezone.utc).timestamp()
if self.conf.get("sync_deleted_files"):
file_list = []
try:
for slim_batch in self.connector.retrieve_all_slim_docs_perm_sync():
file_list.extend(slim_batch)
except Exception:
logging.exception(
"Moodle slim snapshot failed; skipping stale-document cleanup "
"(connector_id=%s, kb_id=%s)",
task.get("connector_id"),
task.get("kb_id"),
)
file_list = None
document_generator = self.connector.poll_source(
poll_start.timestamp(),
end_ts,
@@ -1087,7 +1020,7 @@ class Moodle(SyncBase):
_begin_info = f"from {poll_start}"
self.log_connection("Moodle", self.conf["moodle_url"], task)
return document_generator, file_list
return document_generator
class BOX(SyncBase):
@@ -1115,23 +1048,18 @@ class BOX(SyncBase):
self.connector.load_credentials(auth)
poll_start = task["poll_range_start"]
file_list = None
if task["reindex"] == "1" or poll_start is None:
document_generator = self.connector.load_from_state()
_begin_info = "totally"
else:
if self.conf.get("sync_deleted_files"):
file_list = []
for slim_batch in self.connector.retrieve_all_slim_docs_perm_sync():
file_list.extend(slim_batch)
document_generator = self.connector.poll_source(
poll_start.timestamp(),
datetime.now(timezone.utc).timestamp(),
)
_begin_info = f"from {poll_start}"
self.log_connection("Box", f"folder_id({self.conf['folder_id']})", task)
return document_generator, file_list
return document_generator
class Airtable(SyncBase):
@@ -1156,16 +1084,11 @@ class Airtable(SyncBase):
)
poll_start = task.get("poll_range_start")
file_list = None
if task.get("reindex") == "1" or poll_start is None:
document_generator = self.connector.load_from_state()
_begin_info = "totally"
else:
if self.conf.get("sync_deleted_files"):
file_list = []
for slim_batch in self.connector.retrieve_all_slim_docs_perm_sync():
file_list.extend(slim_batch)
document_generator = self.connector.poll_source(
poll_start.timestamp(),
datetime.now(timezone.utc).timestamp(),
@@ -1178,7 +1101,7 @@ class Airtable(SyncBase):
task,
)
return document_generator, file_list
return document_generator
class Asana(SyncBase):
SOURCE_NAME: str = FileSource.ASANA
@@ -1198,17 +1121,12 @@ class Asana(SyncBase):
)
poll_start = task.get("poll_range_start")
file_list = None
if task.get("reindex") == "1" or not poll_start:
document_generator = self.connector.load_from_state()
_begin_info = "totally"
else:
end_time = datetime.now(timezone.utc).timestamp()
if self.conf.get("sync_deleted_files"):
file_list = []
for slim_batch in self.connector.retrieve_all_slim_docs_perm_sync():
file_list.extend(slim_batch)
document_generator = self.connector.poll_source(
poll_start.timestamp(),
end_time,
@@ -1221,7 +1139,7 @@ class Asana(SyncBase):
task,
)
return document_generator, file_list
return document_generator
class Github(SyncBase):
SOURCE_NAME: str = FileSource.GITHUB
@@ -1247,15 +1165,10 @@ class Github(SyncBase):
{"github_access_token": credentials["github_access_token"]}
)
file_list = None
if task.get("reindex") == "1" or not task.get("poll_range_start"):
start_time = datetime.fromtimestamp(0, tz=timezone.utc)
else:
start_time = task.get("poll_range_start")
if self.conf.get("sync_deleted_files"):
file_list = []
for slim_batch in self.connector.retrieve_all_slim_docs_perm_sync():
file_list.extend(slim_batch)
end_time = datetime.now(timezone.utc)
@@ -1292,7 +1205,7 @@ class Github(SyncBase):
task,
)
return wrapper(), file_list
return wrapper()
class IMAP(SyncBase):
SOURCE_NAME: str = FileSource.IMAP
@@ -1348,27 +1261,10 @@ class IMAP(SyncBase):
task["connector_id"],
)
file_list = None
if (
task["reindex"] != "1"
and task["poll_range_start"]
and self.conf.get("sync_deleted_files")
):
file_list = []
try:
for slim_batch in self.connector.retrieve_all_slim_docs_perm_sync(
start=initial_sync_start,
end=end_time,
):
file_list.extend(slim_batch)
except Exception:
logging.exception(
"IMAP slim snapshot failed; continuing without stale-document cleanup "
"(connector_id=%s, kb_id=%s)",
task["connector_id"],
task["kb_id"],
)
file_list = None
self._prune_snapshot_kwargs = {
"start": initial_sync_start,
"end": end_time,
}
raw_batch_size = self.conf.get("sync_batch_size") or self.conf.get("batch_size") or INDEX_BATCH_SIZE
try:
@@ -1414,7 +1310,10 @@ class IMAP(SyncBase):
f"host({self.conf['imap_host']}) port({self.conf['imap_port']}) user({self.conf['credentials']['imap_username']}) folder({self.conf['imap_mailbox']})",
task,
)
return wrapper(), file_list
return wrapper()
def _get_prune_snapshot_kwargs(self, task: dict) -> dict[str, Any]:
return getattr(self, "_prune_snapshot_kwargs", {})
class Zendesk(SyncBase):
@@ -1424,26 +1323,11 @@ class Zendesk(SyncBase):
self.connector.load_credentials(self.conf["credentials"])
end_time = datetime.now(timezone.utc).timestamp()
file_list = None
if task["reindex"] == "1" or not task.get("poll_range_start"):
start_time = 0
_begin_info = "totally"
else:
start_time = task["poll_range_start"].timestamp()
if self.conf.get("sync_deleted_files"):
logging.info(
"[Zendesk] Syncing deleted files via slim snapshot (connector_id=%s)",
task.get("connector_id"),
)
snapshot_start = time.perf_counter()
file_list = []
for slim_batch in self.connector.retrieve_all_slim_docs_perm_sync():
file_list.extend(slim_batch)
logging.info(
"[Zendesk] Slim snapshot fetched %d docs in %.2f seconds",
len(file_list),
time.perf_counter() - snapshot_start,
)
_begin_info = f"from {task['poll_range_start']}"
raw_batch_size = (
@@ -1504,9 +1388,6 @@ class Zendesk(SyncBase):
yield batch
self.log_connection("Zendesk", f"subdomain({self.conf['credentials'].get('zendesk_subdomain')})", task)
if file_list is not None:
return wrapper(), file_list
return wrapper()
@@ -1533,7 +1414,6 @@ class Gitlab(SyncBase):
}
)
file_list = None
if task["reindex"] == "1" or not task["poll_range_start"]:
document_generator = self.connector.load_from_state()
_begin_info = "totally"
@@ -1547,13 +1427,9 @@ class Gitlab(SyncBase):
poll_start.timestamp(),
datetime.now(timezone.utc).timestamp()
)
if self.conf.get("sync_deleted_files"):
file_list = []
for slim_batch in self.connector.retrieve_all_slim_docs_perm_sync():
file_list.extend(slim_batch)
_begin_info = "from {}".format(poll_start)
self.log_connection("Gitlab", f"({self.conf['project_name']})", task)
return document_generator, file_list
return document_generator
class Bitbucket(SyncBase):
@@ -1572,17 +1448,12 @@ class Bitbucket(SyncBase):
"bitbucket_api_token": self.conf["credentials"].get("bitbucket_api_token"),
}
)
file_list = None
if task["reindex"] == "1" or not task["poll_range_start"]:
start_time = datetime.fromtimestamp(0, tz=timezone.utc)
_begin_info = "totally"
else:
start_time = task.get("poll_range_start")
if self.conf.get("sync_deleted_files"):
file_list = []
for slim_batch in self.connector.retrieve_all_slim_docs_perm_sync():
file_list.extend(slim_batch)
_begin_info = f"from {start_time}"
end_time = datetime.now(timezone.utc)
@@ -1614,8 +1485,6 @@ class Bitbucket(SyncBase):
yield batch
self.log_connection("Bitbucket", f"workspace({self.conf.get('workspace')})", task)
if file_list is not None:
return wrapper(), file_list
return wrapper()
@@ -1642,26 +1511,12 @@ class SeaFile(SyncBase):
)
self.connector.load_credentials(conf["credentials"])
file_list = None
poll_start = task.get("poll_range_start")
if task["reindex"] == "1" or poll_start is None:
document_generator = self.connector.load_from_state()
_begin_info = "totally"
else:
end_ts = datetime.now(timezone.utc).timestamp()
if self.conf.get("sync_deleted_files"):
file_list = []
try:
for slim_batch in self.connector.retrieve_all_slim_docs_perm_sync():
file_list.extend(slim_batch)
except Exception:
logging.exception(
"SeaFile slim snapshot failed; continuing without stale-document cleanup "
"(connector_id=%s, kb_id=%s)",
task["connector_id"],
task["kb_id"],
)
file_list = None
document_generator = self.connector.poll_source(
poll_start.timestamp(),
end_ts,
@@ -1676,7 +1531,7 @@ class SeaFile(SyncBase):
extra += f" path={conf.get('sync_path')}"
self.log_connection("SeaFile", f"{conf['seafile_url']} (scope={scope}{extra})", task)
return document_generator, file_list
return document_generator
class DingTalkAITable(SyncBase):
@@ -1709,33 +1564,12 @@ class DingTalkAITable(SyncBase):
)
poll_start = task.get("poll_range_start")
file_list = None
if task.get("reindex") == "1" or poll_start is None:
document_generator = self.connector.load_from_state()
_begin_info = "totally"
else:
end_ts = datetime.now(timezone.utc).timestamp()
if self.conf.get("sync_deleted_files"):
file_list = []
logging.info(
"DingTalk AI Table: fetching slim snapshot for stale-document reconciliation "
"(connector_id=%s, kb_id=%s, table_id=%s)",
task["connector_id"],
task["kb_id"],
self.conf.get("table_id"),
)
try:
for slim_batch in self.connector.retrieve_all_slim_docs_perm_sync():
file_list.extend(slim_batch)
except Exception:
logging.exception(
"DingTalk AI Table slim snapshot failed; continuing without stale-document cleanup "
"(connector_id=%s, kb_id=%s)",
task["connector_id"],
task["kb_id"],
)
file_list = None
document_generator = self.connector.poll_source(
poll_start.timestamp(),
end_ts,
@@ -1748,7 +1582,7 @@ class DingTalkAITable(SyncBase):
task,
)
return document_generator, file_list
return document_generator
class _RDBMSBase(SyncBase):
@@ -1778,16 +1612,6 @@ class _RDBMSBase(SyncBase):
self.connector.validate_connector_settings()
self.connector.prepare_sync_state(task["connector_id"], self.conf)
file_list = None
if (
task["reindex"] != "1"
and task["poll_range_start"]
and self.conf.get("sync_deleted_files")
):
file_list = []
for slim_batch in self.connector.retrieve_all_slim_docs_perm_sync():
file_list.extend(slim_batch)
if task["reindex"] == "1" or not task["poll_range_start"]:
document_generator = self.connector.load_from_state()
_begin_info = "totally"
@@ -1804,7 +1628,7 @@ class _RDBMSBase(SyncBase):
_begin_info = f"from {poll_start}"
self.log_connection(self.LOG_NAME, f"{self.conf.get('host')}:{self.conf.get('database')}", task)
return document_generator, file_list
return document_generator
class MySQL(_RDBMSBase):
@@ -1886,14 +1710,17 @@ async def dispatch_tasks():
"""Polls the database for pending synchronization tasks and dispatches them concurrently."""
while True:
try:
list(SyncLogsService.list_sync_tasks()[0])
SyncLogsService.list_due_sync_tasks()
SyncLogsService.list_due_prune_tasks()
break
except Exception as e:
logging.warning(f"DB is not ready yet: {e}")
await asyncio.sleep(3)
due_sync_tasks = SyncLogsService.list_due_sync_tasks()
due_prune_tasks = SyncLogsService.list_due_prune_tasks()
tasks = []
for task in SyncLogsService.list_sync_tasks()[0]:
for task in [*due_sync_tasks, *due_prune_tasks]:
if task["poll_range_start"]:
task["poll_range_start"] = task["poll_range_start"].astimezone(timezone.utc)
if task["poll_range_end"]:

View File

@@ -205,7 +205,7 @@ def _load_connector_app(monkeypatch):
return True
@staticmethod
def resume(*_args, **_kwargs):
def cancel_tasks(*_args, **_kwargs):
return True
@staticmethod
@@ -252,7 +252,11 @@ def _load_connector_app(monkeypatch):
PERMISSION_ERROR=403,
AUTHENTICATION_ERROR=109,
)
constants_mod.TaskStatus = SimpleNamespace(SCHEDULE="schedule", CANCEL="cancel")
constants_mod.TaskStatus = SimpleNamespace(
UNSTART="unstart",
SCHEDULE="schedule",
CANCEL="cancel",
)
monkeypatch.setitem(sys.modules, "common.constants", constants_mod)
config_mod = ModuleType("common.data_source.config")
@@ -349,7 +353,7 @@ def test_connector_basic_routes_and_task_controls(monkeypatch):
records = {"conn-1": _FakeConnectorRecord({"id": "conn-1", "source": "drive"})}
update_calls = []
save_calls = []
resume_calls = []
cancel_calls = []
delete_calls = []
monkeypatch.setattr(module.ConnectorService, "update_by_id", lambda cid, payload: update_calls.append((cid, payload)))
@@ -362,7 +366,7 @@ def test_connector_basic_routes_and_task_controls(monkeypatch):
monkeypatch.setattr(module.ConnectorService, "get_by_id", lambda cid: (True, records[cid]))
monkeypatch.setattr(module.ConnectorService, "list", lambda tenant_id: [{"id": "listed", "tenant": tenant_id}])
monkeypatch.setattr(module.SyncLogsService, "list_sync_tasks", lambda cid, page, page_size: ([{"id": "log-1"}], 9))
monkeypatch.setattr(module.ConnectorService, "resume", lambda cid, status: resume_calls.append((cid, status)))
monkeypatch.setattr(module.ConnectorService, "cancel_tasks", lambda cid: cancel_calls.append(cid))
monkeypatch.setattr(module.ConnectorService, "delete_by_id", lambda cid: delete_calls.append(cid))
monkeypatch.setattr(module, "get_uuid", lambda: "generated-id")
@@ -384,6 +388,7 @@ def test_connector_basic_routes_and_task_controls(monkeypatch):
assert save_calls[-1]["id"] == "generated-id"
assert save_calls[-1]["tenant_id"] == "tenant-1"
assert save_calls[-1]["input_type"] == module.InputType.POLL
assert save_calls[-1]["status"] == module.TaskStatus.UNSTART
assert res["data"]["id"] == "generated-id"
list_res = module.list_connector()
@@ -401,14 +406,6 @@ def test_connector_basic_routes_and_task_controls(monkeypatch):
logs_res = module.list_logs("conn-log")
assert logs_res["data"] == {"total": 9, "logs": [{"id": "log-1"}]}
monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({"resume": True}))
assert _run(module.resume("conn-r1"))["data"] is True
monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({"resume": False}))
assert _run(module.resume("conn-r2"))["data"] is True
assert ("conn-r1", module.TaskStatus.SCHEDULE) in resume_calls
assert ("conn-r2", module.TaskStatus.CANCEL) in resume_calls
monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({"kb_id": "kb-1"}))
monkeypatch.setattr(module.ConnectorService, "rebuild", lambda *_args: "rebuild-failed")
failed_rebuild = _run(module.rebuild("conn-rb"))
@@ -421,7 +418,7 @@ def test_connector_basic_routes_and_task_controls(monkeypatch):
rm_res = module.rm_connector("conn-rm")
assert rm_res["data"] is True
assert ("conn-rm", module.TaskStatus.CANCEL) in resume_calls
assert cancel_calls == ["conn-rm"]
assert delete_calls == ["conn-rm"]
@@ -434,14 +431,14 @@ def test_connector_by_id_routes_reject_cross_tenant_access(monkeypatch):
monkeypatch.setattr(module.ConnectorService, "accessible", lambda cid, uid: False)
monkeypatch.setattr(module.ConnectorService, "get_by_id", lambda *_args: touched.append("get_by_id"))
monkeypatch.setattr(module.SyncLogsService, "list_sync_tasks", lambda *_args: touched.append("list_sync_tasks"))
monkeypatch.setattr(module.ConnectorService, "resume", lambda *_args: touched.append("resume"))
monkeypatch.setattr(module.ConnectorService, "cancel_tasks", lambda *_args: touched.append("cancel_tasks"))
monkeypatch.setattr(module.ConnectorService, "delete_by_id", lambda *_args: touched.append("delete_by_id"))
monkeypatch.setattr(module.ConnectorService, "update_by_id", lambda *_args: touched.append("update_by_id"))
monkeypatch.setattr(module.ConnectorService, "rebuild", lambda *_args: touched.append("rebuild"))
def _get_request_json():
touched.append("get_request_json")
return _AwaitableValue({"resume": True, "config": {"x": 1}})
return _AwaitableValue({"config": {"x": 1}})
monkeypatch.setattr(module, "get_request_json", _get_request_json)
@@ -449,7 +446,6 @@ def test_connector_by_id_routes_reject_cross_tenant_access(monkeypatch):
_run(module.update_connector("conn-victim")),
module.get_connector("conn-victim"),
module.list_logs("conn-victim"),
_run(module.resume("conn-victim")),
_run(module.rebuild("conn-victim")),
module.rm_connector("conn-victim"),
_run(module.test_connector("conn-victim")),

View File

@@ -205,7 +205,7 @@ def _load_connector_app(monkeypatch):
return True
@staticmethod
def resume(*_args, **_kwargs):
def cancel_tasks(*_args, **_kwargs):
return True
@staticmethod
@@ -349,7 +349,7 @@ def test_connector_basic_routes_and_task_controls(monkeypatch):
records = {"conn-1": _FakeConnectorRecord({"id": "conn-1", "source": "drive"})}
update_calls = []
save_calls = []
resume_calls = []
cancel_calls = []
delete_calls = []
monkeypatch.setattr(module.ConnectorService, "update_by_id", lambda cid, payload: update_calls.append((cid, payload)))
@@ -362,7 +362,7 @@ def test_connector_basic_routes_and_task_controls(monkeypatch):
monkeypatch.setattr(module.ConnectorService, "get_by_id", lambda cid: (True, records[cid]))
monkeypatch.setattr(module.ConnectorService, "list", lambda tenant_id: [{"id": "listed", "tenant": tenant_id}])
monkeypatch.setattr(module.SyncLogsService, "list_sync_tasks", lambda cid, page, page_size: ([{"id": "log-1"}], 9))
monkeypatch.setattr(module.ConnectorService, "resume", lambda cid, status: resume_calls.append((cid, status)))
monkeypatch.setattr(module.ConnectorService, "cancel_tasks", lambda cid: cancel_calls.append(cid))
monkeypatch.setattr(module.ConnectorService, "delete_by_id", lambda cid: delete_calls.append(cid))
monkeypatch.setattr(module, "get_uuid", lambda: "generated-id")
@@ -401,14 +401,6 @@ def test_connector_basic_routes_and_task_controls(monkeypatch):
logs_res = module.list_logs("conn-log")
assert logs_res["data"] == {"total": 9, "logs": [{"id": "log-1"}]}
monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({"resume": True}))
assert _run(module.resume("conn-r1"))["data"] is True
monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({"resume": False}))
assert _run(module.resume("conn-r2"))["data"] is True
assert ("conn-r1", module.TaskStatus.SCHEDULE) in resume_calls
assert ("conn-r2", module.TaskStatus.CANCEL) in resume_calls
monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({"kb_id": "kb-1"}))
monkeypatch.setattr(module.ConnectorService, "rebuild", lambda *_args: "rebuild-failed")
failed_rebuild = _run(module.rebuild("conn-rb"))
@@ -421,7 +413,7 @@ def test_connector_basic_routes_and_task_controls(monkeypatch):
rm_res = module.rm_connector("conn-rm")
assert rm_res["data"] is True
assert ("conn-rm", module.TaskStatus.CANCEL) in resume_calls
assert cancel_calls == ["conn-rm"]
assert delete_calls == ["conn-rm"]
@@ -434,14 +426,14 @@ def test_connector_by_id_routes_reject_cross_tenant_access(monkeypatch):
monkeypatch.setattr(module.ConnectorService, "accessible", lambda cid, uid: False)
monkeypatch.setattr(module.ConnectorService, "get_by_id", lambda *_args: touched.append("get_by_id"))
monkeypatch.setattr(module.SyncLogsService, "list_sync_tasks", lambda *_args: touched.append("list_sync_tasks"))
monkeypatch.setattr(module.ConnectorService, "resume", lambda *_args: touched.append("resume"))
monkeypatch.setattr(module.ConnectorService, "cancel_tasks", lambda *_args: touched.append("cancel_tasks"))
monkeypatch.setattr(module.ConnectorService, "delete_by_id", lambda *_args: touched.append("delete_by_id"))
monkeypatch.setattr(module.ConnectorService, "update_by_id", lambda *_args: touched.append("update_by_id"))
monkeypatch.setattr(module.ConnectorService, "rebuild", lambda *_args: touched.append("rebuild"))
def _get_request_json():
touched.append("get_request_json")
return _AwaitableValue({"resume": True, "config": {"x": 1}})
return _AwaitableValue({"config": {"x": 1}})
monkeypatch.setattr(module, "get_request_json", _get_request_json)
@@ -449,7 +441,6 @@ def test_connector_by_id_routes_reject_cross_tenant_access(monkeypatch):
_run(module.update_connector("conn-victim")),
module.get_connector("conn-victim"),
module.list_logs("conn-victim"),
_run(module.resume("conn-victim")),
_run(module.rebuild("conn-victim")),
module.rm_connector("conn-victim"),
_run(module.test_connector("conn-victim")),

View File

@@ -133,7 +133,53 @@ def _patch_common_dependencies(monkeypatch):
@pytest.mark.anyio
@pytest.mark.p2
async def test_run_task_logic_cleans_up_for_empty_snapshot(monkeypatch):
async def test_run_task_logic_skips_empty_sync_batches(monkeypatch):
_patch_common_dependencies(monkeypatch)
monkeypatch.setattr(
sync_data_source.SyncLogsService,
"increase_docs",
lambda *_args, **_kwargs: pytest.fail("increase_docs should not be called for empty batches"),
)
monkeypatch.setattr(
sync_data_source.KnowledgebaseService,
"get_by_id",
lambda *_args, **_kwargs: pytest.fail("get_by_id should not be called for empty batches"),
)
monkeypatch.setattr(
sync_data_source.SyncLogsService,
"duplicate_and_parse",
lambda *_args, **_kwargs: pytest.fail("duplicate_and_parse should not be called for empty batches"),
)
await _FakeSync(iter(([],)))._run_task_logic(_make_task())
@pytest.mark.anyio
@pytest.mark.p2
async def test_run_task_logic_skips_multiple_empty_sync_batches(monkeypatch):
_patch_common_dependencies(monkeypatch)
monkeypatch.setattr(
sync_data_source.SyncLogsService,
"increase_docs",
lambda *_args, **_kwargs: pytest.fail("increase_docs should not be called for empty batches"),
)
monkeypatch.setattr(
sync_data_source.KnowledgebaseService,
"get_by_id",
lambda *_args, **_kwargs: pytest.fail("get_by_id should not be called for empty batches"),
)
monkeypatch.setattr(
sync_data_source.SyncLogsService,
"duplicate_and_parse",
lambda *_args, **_kwargs: pytest.fail("duplicate_and_parse should not be called for empty batches"),
)
await _FakeSync(iter(([], [],)))._run_task_logic(_make_task())
@pytest.mark.anyio
@pytest.mark.p2
async def test_run_prune_task_logic_cleans_up_for_empty_snapshot(monkeypatch):
cleanup_calls = []
_patch_common_dependencies(monkeypatch)
@@ -148,7 +194,14 @@ async def test_run_task_logic_cleans_up_for_empty_snapshot(monkeypatch):
_fake_cleanup,
)
await _FakeSync((iter(()), []))._run_task_logic(_make_task())
task = {**_make_task(), "task_type": sync_data_source.ConnectorTaskType.PRUNE}
sync = _FakeSync(iter(()))
sync.conf["sync_deleted_files"] = True
sync.connector = types.SimpleNamespace(
retrieve_all_slim_docs_perm_sync=lambda: iter(([],))
)
await sync._run_task_logic(task)
assert cleanup_calls == [
(
@@ -166,7 +219,7 @@ async def test_run_task_logic_cleans_up_for_empty_snapshot(monkeypatch):
@pytest.mark.anyio
@pytest.mark.p2
async def test_run_task_logic_cleans_up_for_non_empty_snapshot(monkeypatch):
async def test_run_prune_task_logic_cleans_up_for_non_empty_snapshot(monkeypatch):
cleanup_calls = []
_patch_common_dependencies(monkeypatch)
@@ -182,7 +235,14 @@ async def test_run_task_logic_cleans_up_for_non_empty_snapshot(monkeypatch):
)
file_list = [types.SimpleNamespace(id="doc-1")]
await _FakeSync((iter(()), file_list))._run_task_logic(_make_task())
task = {**_make_task(), "task_type": sync_data_source.ConnectorTaskType.PRUNE}
sync = _FakeSync(iter(()))
sync.conf["sync_deleted_files"] = True
sync.connector = types.SimpleNamespace(
retrieve_all_slim_docs_perm_sync=lambda: iter((file_list,))
)
await sync._run_task_logic(task)
assert cleanup_calls == [
(
@@ -285,12 +345,13 @@ async def test_rdbms_generate_keeps_deleted_file_snapshot_without_timestamp_colu
}
)
document_generator, file_list = await sync._generate(task)
document_generator = await sync._generate(task)
connector = _FakeRDBMSConnector.instance
assert connector is not None
assert connector.load_from_state_called is True
assert connector.load_from_cursor_range_called is False
file_list = sync._collect_prune_snapshot(task)
assert connector.retrieve_all_slim_docs_perm_sync_called is True
assert file_list is not None
assert [doc.id for doc in file_list] == ["row-1"]
@@ -447,14 +508,15 @@ async def test_dropbox_generate_returns_snapshot_when_sync_deleted_enabled(monke
}
)
document_generator, file_list = await sync._generate(task)
document_generator = await sync._generate(task)
connector = _FakeDropboxConnector.instance
assert list(document_generator) == [["poll-sync"]]
file_list = sync._collect_prune_snapshot(task)
assert [doc.id for doc in file_list] == ["dropbox:id-1", "dropbox:id-2"]
assert connector.credentials == {"dropbox_access_token": "token-1"}
assert connector.retrieve_all_slim_docs_perm_sync_called is True
assert connector.snapshot_called_before_poll is True
assert connector.snapshot_called_before_poll is False
assert connector.poll_source_call[0] == poll_start.timestamp()
assert connector.poll_source_call[1] >= poll_start.timestamp()
@@ -477,11 +539,12 @@ async def test_dropbox_generate_skips_snapshot_for_full_reindex(monkeypatch):
}
)
document_generator, file_list = await sync._generate(task)
document_generator = await sync._generate(task)
connector = _FakeDropboxConnector.instance
assert list(document_generator) == [["full-sync"]]
assert file_list is None
assert connector.load_from_state_called is True
assert connector.retrieve_all_slim_docs_perm_sync_called is False
file_list = sync._collect_prune_snapshot(task)
assert [doc.id for doc in file_list] == ["dropbox:id-1", "dropbox:id-2"]
assert connector.retrieve_all_slim_docs_perm_sync_called is True
assert connector.poll_source_called is False

View File

@@ -111,10 +111,12 @@ interface DynamicFormProps<T extends FieldValues> {
// Form ref interface
export interface DynamicFormRef {
submit: () => void;
isDirty: () => boolean;
getValues: (name?: string) => any;
reset: (values?: any) => void;
trigger: UseFormTrigger<any>;
watch: (field: string, callback: (value: any) => void) => () => void;
watchDirty: (callback: (isDirty: boolean, values: any) => void) => () => void;
updateFieldType: (fieldName: string, newType: FormFieldType) => void;
onFieldUpdate: (
fieldName: string,
@@ -809,6 +811,7 @@ const DynamicForm = {
onSubmit(filteredValues);
})();
},
isDirty: () => form.formState.isDirty,
getValues: form.getValues,
reset: (values?: T) => {
if (values) {
@@ -828,6 +831,12 @@ const DynamicForm = {
});
return unsubscribe;
},
watchDirty: (callback: (isDirty: boolean, values: any) => void) => {
const { unsubscribe } = form.watch((values: any) => {
callback(form.formState.isDirty, values);
});
return unsubscribe;
},
onFieldUpdate: (
fieldName: string,

View File

@@ -9,9 +9,9 @@ import { Button } from '@/components/ui/button';
import { Card, CardContent, CardHeader, CardTitle } from '@/components/ui/card';
import { Input } from '@/components/ui/input';
import { Separator } from '@/components/ui/separator';
import { RunningStatus } from '@/constants/knowledge';
import { RunningStatus, RunningStatusOld } from '@/constants/knowledge';
import { t } from 'i18next';
import { CirclePause, Repeat } from 'lucide-react';
import { isEqual } from 'lodash';
import { useCallback, useEffect, useMemo, useRef, useState } from 'react';
import { FieldValues } from 'react-hook-form';
import {
@@ -25,9 +25,9 @@ import {
} from '../constant';
import {
useAddDataSource,
useDataSourceResume,
useFetchDataSourceDetail,
useTestDataSource,
useUpdateDataSourceStatus,
} from '../hooks';
import { DataSourceLogsTable } from './log-table';
@@ -35,7 +35,8 @@ const SourceDetailPage = () => {
const formRef = useRef<DynamicFormRef>(null);
const { data: detail } = useFetchDataSourceDetail();
const { handleResume } = useDataSourceResume();
const { updateStatus, loading: statusUpdateLoading } =
useUpdateDataSourceStatus();
const { dataSourceInfo } = useDataSourceInfo();
const detailInfo = useMemo(() => {
if (detail) {
@@ -44,83 +45,52 @@ const SourceDetailPage = () => {
}, [detail, dataSourceInfo]);
const [fields, setFields] = useState<FormFieldConfig[]>([]);
const [isDirty, setIsDirty] = useState(false);
const [defaultValues, setDefaultValues] = useState<FieldValues>(
DataSourceFormDefaultValues[
detail?.source as keyof typeof DataSourceFormDefaultValues
] as FieldValues,
);
const runSchedule = useCallback(() => {
handleResume({
resume:
detail?.status === RunningStatus.RUNNING ||
detail?.status === RunningStatus.SCHEDULE
? false
: true,
});
}, [detail, handleResume]);
const customFields = useMemo(() => {
return [
{
label: 'Prune Freq',
name: 'prune_freq',
type: FormFieldType.Number,
required: false,
shouldRender: (values: any) => !!values?.config?.sync_deleted_files,
render: (fieldProps: FormFieldConfig) => {
return (
<Input
{...fieldProps}
type={FormFieldType.Number}
suffix={
<span className="px-2 text-text-secondary italic">
{t('setting.minutes')}
</span>
}
/>
);
},
},
{
label: 'Refresh Freq',
name: 'refresh_freq',
type: FormFieldType.Number,
required: false,
render: (fieldProps: FormFieldConfig) => (
<div className="flex items-center gap-1 w-full relative">
<div className="flex-1">
<Input
{...fieldProps}
type={FormFieldType.Number}
suffix={
<span className="px-2 text-text-secondary italic">
{t('setting.minutes')}
</span>
}
/>
</div>
<button
type="button"
className="text-text-secondary bg-bg-input rounded-sm text-xs h-full p-2 border border-border-button hover:bg-border-button hover:text-text-primary"
onClick={() => {
runSchedule();
}}
>
{detail?.status === RunningStatus.RUNNING ||
detail?.status === RunningStatus.SCHEDULE ? (
<CirclePause size={12} />
) : (
<Repeat size={12} />
)}
</button>
</div>
<Input
{...fieldProps}
type={FormFieldType.Number}
suffix={
<span className="px-2 text-text-secondary italic">
{t('setting.minutes')}
</span>
}
/>
),
},
{
label: 'Prune Freq',
name: 'prune_freq',
type: FormFieldType.Number,
required: false,
hidden: true,
render: (fieldProps: FormFieldConfig) => {
return (
<div className="flex items-center gap-1 w-full relative">
<div className="flex-1">
<Input
{...fieldProps}
type={FormFieldType.Number}
suffix={
<span className="px-2 text-text-secondary italic">
hours
</span>
}
/>
</div>
</div>
);
},
},
{
label: 'Timeout Secs',
name: 'timeout_secs',
@@ -143,7 +113,7 @@ const SourceDetailPage = () => {
),
},
];
}, [detail, runSchedule]);
}, []);
const { addLoading, handleAddOk } = useAddDataSource({ isEdit: true });
const { loading: testLoading, handleTest } = useTestDataSource();
@@ -152,6 +122,54 @@ const SourceDetailPage = () => {
formRef?.current?.submit();
}, []);
const isUnstarted = useMemo(
() =>
detail?.status === RunningStatus.UNSTART ||
detail?.status === RunningStatusOld.UNSTART,
[detail?.status],
);
const isConnectorActive = useMemo(
() =>
detail?.status === RunningStatus.RUNNING ||
detail?.status === RunningStatus.SCHEDULE ||
detail?.status === RunningStatusOld.RUNNING ||
detail?.status === RunningStatusOld.SCHEDULE,
[detail?.status],
);
const actionMode = useMemo(() => {
if (isDirty) {
return 'save' as const;
}
if (isUnstarted) {
return 'save' as const;
}
if (isConnectorActive) {
return 'stop' as const;
}
return 'resume' as const;
}, [isConnectorActive, isDirty, isUnstarted]);
const handlePrimaryAction = useCallback(() => {
if (actionMode === 'save') {
onSubmit();
return;
}
updateStatus(
actionMode === 'resume' ? RunningStatus.SCHEDULE : RunningStatus.CANCEL,
);
}, [actionMode, onSubmit, updateStatus]);
const primaryActionLabel = useMemo(() => {
if (actionMode === 'stop') return 'Stop';
if (actionMode === 'resume') return 'Resume';
return 'Save';
}, [actionMode]);
useEffect(() => {
const baseFields = DataSourceFormBaseFields.map((field) => {
if (field.name === 'name') {
@@ -191,9 +209,20 @@ const SourceDetailPage = () => {
),
};
setDefaultValues(defaultValueTemp);
setIsDirty(false);
}
}, [detail, customFields, onSubmit]);
useEffect(() => {
const instance = formRef.current;
if (!instance) return;
setIsDirty(!isEqual(instance.getValues(), defaultValues));
return instance.watchDirty((_nextIsDirty, values) => {
setIsDirty(!isEqual(values, defaultValues));
});
}, [defaultValues, fields]);
return (
<div className="px-10 py-5">
<BackButton />
@@ -229,22 +258,21 @@ const SourceDetailPage = () => {
)}
<Button
type="button"
onClick={onSubmit}
disabled={addLoading}
loading={addLoading}
onClick={handlePrimaryAction}
disabled={addLoading || statusUpdateLoading}
loading={
(addLoading && actionMode === 'save') ||
(statusUpdateLoading && actionMode !== 'save')
}
>
{t('common.confirm')}
{/* {addLoading && <Loader2 className="mr-2 h-4 w-4 animate-spin" />}
{addLoading
? t('modal.loadingText', { defaultValue: 'Submitting...' })
: t('modal.okText', { defaultValue: 'Submit' })} */}
{primaryActionLabel}
</Button>
</div>
<section className="flex flex-col gap-2">
<div className="text-2xl text-text-primary mb-2">
{t('setting.log')}
</div>
<DataSourceLogsTable refresh_freq={detail?.refresh_freq || false} />
<DataSourceLogsTable autoRefresh={isConnectorActive} />
</section>
</CardContent>
</Card>

View File

@@ -1,6 +1,5 @@
import FileStatusBadge from '@/components/file-status-badge';
import { RAGFlowAvatar } from '@/components/ragflow-avatar';
import { Button } from '@/components/ui/button';
import { RAGFlowPagination } from '@/components/ui/ragflow-pagination';
import {
Table,
@@ -14,11 +13,6 @@ import { RunningStatusMap } from '@/constants/knowledge';
import { RunningStatus } from '@/pages/dataset/dataset/constant';
import { Routes } from '@/routes';
import { formatDate } from '@/utils/date';
import {
HoverCard,
HoverCardContent,
HoverCardTrigger,
} from '@radix-ui/react-hover-card';
import {
ColumnDef,
flexRender,
@@ -30,15 +24,86 @@ import {
} from '@tanstack/react-table';
import { t } from 'i18next';
import { pick } from 'lodash';
import { Eye } from 'lucide-react';
import { useCallback, useMemo } from 'react';
import { useCallback, useEffect, useMemo, useState } from 'react';
import { useNavigate } from 'react-router';
import { useLogListDataSource } from '../hooks';
import { IDataSourceLog } from '../interface';
const formatDuration = (seconds: number) => {
const safeSeconds = Math.max(0, seconds);
const hours = Math.floor(safeSeconds / 3600);
const minutes = Math.floor((safeSeconds % 3600) / 60);
const remainingSeconds = safeSeconds % 60;
if (hours > 0) {
return `${hours}h ${minutes}m ${remainingSeconds}s`;
}
if (minutes > 0) {
return `${minutes}m ${remainingSeconds}s`;
}
return `${remainingSeconds}s`;
};
const getTaskCountdownSeconds = (row: IDataSourceLog, now: number) => {
const freqMinutes =
row.task_type === 'prune'
? Number(row.prune_freq || 0)
: Number(row.refresh_freq || 0);
const scheduledAt = row.time_started
? new Date(row.time_started).getTime()
: 0;
if (!freqMinutes || !scheduledAt) {
return null;
}
const nextRunAt = scheduledAt + freqMinutes * 60 * 1000;
return Math.ceil((nextRunAt - now) / 1000);
};
const TaskCountdown = ({ row, now }: { row: IDataSourceLog; now: number }) => {
const remainingSeconds = getTaskCountdownSeconds(row, now);
if (remainingSeconds === null) {
return '';
}
return <span>Task starts in {formatDuration(remainingSeconds)}</span>;
};
const getSummary = (row: IDataSourceLog, now: number) => {
if (row.status === RunningStatus.SCHEDULE || row.status === '5') {
return <TaskCountdown row={row} now={now} />;
}
if (row.status === RunningStatus.RUNNING || row.status === '1') {
return row.task_type === 'prune' ? 'Prune in progress' : 'Sync in progress';
}
if (row.status === RunningStatus.FAIL || row.status === '4') {
return row.error_msg || 'Task failed';
}
if (row.status === RunningStatus.CANCEL || row.status === '2') {
return '';
}
if (row.task_type === 'prune') {
return `deleted=${row.docs_removed_from_index || 0}, error=${row.error_count || 0}`;
}
return `total=${row.total_docs_indexed || 0}, added=${row.new_docs_indexed || 0}, updated=${Math.max(
0,
(row.total_docs_indexed || 0) - (row.new_docs_indexed || 0),
)}, error=${row.error_count || 0}`;
};
const columns = ({
handleToDataSetDetail,
now,
}: {
handleToDataSetDetail: (id: string) => void;
now: number;
}) => {
return [
{
@@ -71,7 +136,6 @@ const columns = ({
<div
className="flex items-center gap-2 text-text-primary cursor-pointer"
onClick={() => {
console.log('handleToDataSetDetail', row.original.kb_id);
handleToDataSetDetail(row.original.kb_id);
}}
>
@@ -86,39 +150,16 @@ const columns = ({
},
},
{
accessorKey: 'new_docs_indexed',
header: t('setting.newDocs'),
accessorKey: 'task_type',
header: 'Task Type',
cell: ({ row }) => row.original.task_type || 'sync',
},
{
id: 'operations',
header: t('setting.errorMsg'),
id: 'summary',
header: 'Summary',
cell: ({ row }) => (
<div className="flex gap-1 items-center">
{row.original.error_msg}
{row.original.error_msg && (
<div className="flex justify-start space-x-2 opacity-0 group-hover:opacity-100 transition-opacity">
<HoverCard>
<HoverCardTrigger>
<Button
variant="ghost"
size="sm"
className="p-1"
// onClick={() => {
// showLog(row, LogTabs.FILE_LOGS);
// }}
>
<Eye />
</Button>
</HoverCardTrigger>
<HoverCardContent className="w-[40vw] max-h-[40vh] overflow-auto bg-bg-base z-[999] px-3 py-2 rounded-md border border-border-default">
<div className="space-y-2">
{row.original.full_exception_trace}
</div>
</HoverCardContent>
</HoverCard>
</div>
)}
<div className="max-w-[32rem] whitespace-normal break-words text-text-primary">
{getSummary(row.original as IDataSourceLog, now)}
</div>
),
},
@@ -131,14 +172,22 @@ const columns = ({
// total: 0,
// };
export const DataSourceLogsTable = ({
refresh_freq,
autoRefresh,
}: {
refresh_freq: number | false;
autoRefresh: boolean;
}) => {
// const [pagination, setPagination] = useState(paginationInit);
const { data, pagination, setPagination } =
useLogListDataSource(refresh_freq);
const { data, pagination, setPagination } = useLogListDataSource(autoRefresh);
const navigate = useNavigate();
const [now, setNow] = useState(() => Date.now());
useEffect(() => {
const timer = window.setInterval(() => {
setNow(Date.now());
}, 1000);
return () => window.clearInterval(timer);
}, []);
const currentPagination = useMemo(
() => ({
pageIndex: (pagination.current || 1) - 1,
@@ -149,15 +198,14 @@ export const DataSourceLogsTable = ({
const handleToDataSetDetail = useCallback(
(id: string) => {
console.log('handleToDataSetDetail', id);
navigate(`${Routes.DatasetBase}${Routes.DatasetBase}/${id}`);
navigate(`${Routes.Dataset}/${id}`);
},
[navigate],
);
const table = useReactTable<any>({
data: data || [],
columns: columns({ handleToDataSetDetail }),
columns: columns({ handleToDataSetDetail, now }),
manualPagination: true,
getCoreRowModel: getCoreRowModel(),
getPaginationRowModel: getPaginationRowModel(),

View File

@@ -1,9 +1,9 @@
import message from '@/components/ui/message';
import { RunningStatus } from '@/constants/knowledge';
import { useSetModalState } from '@/hooks/common-hooks';
import { useGetPaginationWithRouter } from '@/hooks/logic-hooks';
import dataSourceService, {
dataSourceRebuild,
dataSourceResume,
dataSourceUpdate,
deleteDataSource,
featchDataSourceDetail,
@@ -15,7 +15,12 @@ import { t } from 'i18next';
import { useCallback, useMemo, useState } from 'react';
import { useParams, useSearchParams } from 'react-router';
import { DataSourceKey, useDataSourceInfo } from './constant';
import { IDataSorceInfo, IDataSource, IDataSourceBase } from './interface';
import {
IDataSorceInfo,
IDataSource,
IDataSourceBase,
IDataSourceLog,
} from './interface';
export const useListDataSource = () => {
const { dataSourceInfo } = useDataSourceInfo();
@@ -28,10 +33,8 @@ export const useListDataSource = () => {
});
const categorizeDataBySource = (data: IDataSourceBase[]) => {
const categorizedData: Record<DataSourceKey, any[]> = {} as Record<
DataSourceKey,
any[]
>;
const categorizedData: Partial<Record<DataSourceKey, IDataSourceBase[]>> =
{};
data.forEach((item) => {
const source = item.source;
@@ -93,17 +96,29 @@ export const useAddDataSource = ({ isEdit = false }: { isEdit?: boolean }) => {
async (data: any) => {
setAddLoading(true);
const { data: res } = isEdit
? await dataSourceUpdate(data.id, data)
? await dataSourceUpdate(data.id, {
...data,
reschedule: true,
})
: await dataSourceService.dataSourceSet(data);
console.log('🚀 ~ handleAddOk ~ code:', res.code);
if (res.code === 0) {
if (isEdit && res.data?.id) {
queryClient.setQueryData(
['data-source-detail', res.data.id],
res.data,
);
queryClient.invalidateQueries({
queryKey: ['data-source-detail', res.data.id],
});
}
queryClient.invalidateQueries({ queryKey: ['data-source'] });
message.success(t(`message.operated`));
hideAddingModal();
}
setAddLoading(false);
},
[hideAddingModal, queryClient],
[hideAddingModal, isEdit, queryClient],
);
return {
@@ -117,24 +132,25 @@ export const useAddDataSource = ({ isEdit = false }: { isEdit?: boolean }) => {
};
};
export const useLogListDataSource = (refresh_freq: number | false) => {
export const useLogListDataSource = (autoRefresh: boolean) => {
const { pagination, setPagination } = useGetPaginationWithRouter();
const [currentQueryParameters] = useSearchParams();
const id = currentQueryParameters.get('id');
const { data, isFetching } = useQuery<{ logs: IDataSource[]; total: number }>(
{
queryKey: ['data-source-logs', id, pagination, refresh_freq],
refetchInterval: refresh_freq ? refresh_freq * 60 * 1000 : false,
queryFn: async () => {
const { data } = await getDataSourceLogs(id as string, {
page_size: pagination.pageSize,
page: pagination.current,
});
return data.data;
},
const { data, isFetching } = useQuery<{
logs: IDataSourceLog[];
total: number;
}>({
queryKey: ['data-source-logs', id, pagination, autoRefresh],
refetchInterval: autoRefresh ? 15 * 1000 : false,
queryFn: async () => {
const { data } = await getDataSourceLogs(id as string, {
page_size: pagination.pageSize,
page: pagination.current,
});
return data.data;
},
);
});
return {
data: data?.logs,
isFetching,
@@ -179,21 +195,49 @@ export const useFetchDataSourceDetail = () => {
return { data };
};
export const useDataSourceResume = () => {
export const useUpdateDataSourceStatus = () => {
const [currentQueryParameters] = useSearchParams();
const id = currentQueryParameters.get('id');
const queryClient = useQueryClient();
const handleResume = useCallback(
async (param: { resume: boolean }) => {
const { data } = await dataSourceResume(id as string, param);
if (data.code === 0) {
queryClient.invalidateQueries({ queryKey: ['data-source-detail', id] });
message.success(t(`message.operated`));
const [loading, setLoading] = useState(false);
const updateStatus = useCallback(
async (status: RunningStatus.SCHEDULE | RunningStatus.CANCEL) => {
if (!id) return;
setLoading(true);
try {
const { data } = await dataSourceUpdate(id, {
status,
});
if (data.code === 0) {
queryClient.setQueryData(
['data-source-detail', id],
(previous?: IDataSource) => ({
...(previous || {}),
...(data.data || {}),
status: data.data?.status ?? status,
}),
);
await Promise.all([
queryClient.invalidateQueries({
queryKey: ['data-source-detail', id],
}),
queryClient.invalidateQueries({ queryKey: ['data-source'] }),
queryClient.invalidateQueries({
queryKey: ['data-source-logs', id],
}),
]);
message.success(t(`message.operated`));
}
} finally {
setLoading(false);
}
},
[id, queryClient],
);
return { handleResume };
return { updateStatus, loading };
};
export const useDataSourceRebuild = () => {

View File

@@ -1,5 +1,5 @@
import { RunningStatus } from '@/constants/knowledge';
import { DataSourceKey } from './contant';
import { DataSourceKey } from './constant';
export interface IDataSorceInfo {
id: DataSourceKey;
@@ -28,20 +28,20 @@ export interface IDataSourceBase {
export interface IDataSourceLog {
connector_id: string;
docs_removed_from_index?: number;
error_count: number;
error_msg: string;
id: string;
kb_id: string;
kb_name: string;
name: string;
new_docs_indexed: number;
poll_range_end: null | string;
poll_range_start: null | string;
reindex: string;
source: DataSourceKey;
prune_freq?: number;
refresh_freq?: number;
status: RunningStatus;
tenant_id: string;
timeout_secs: number;
task_type?: string;
time_started?: string | null;
total_docs_indexed?: number;
update_date: string;
}
interface IDataSourceInfoItem {

View File

@@ -20,15 +20,12 @@ const dataSourceService = registerServer<keyof typeof methods>(
export const deleteDataSource = (id: string) =>
request.delete(api.dataSourceDel(id));
export const dataSourceResume = (id: string, data: { resume: boolean }) => {
return request.post(api.dataSourceResume(id), { data });
};
export const dataSourceRebuild = (id: string, data: { kb_id: string }) => {
return request.post(api.dataSourceRebuild(id), { data });
};
export const dataSourceUpdate = (id: string, data: { kb_id: string }) => {
export const dataSourceUpdate = (id: string, data: Record<string, any>) => {
return request.patch(api.dataSourceUpdate(id), { data });
};

View File

@@ -39,7 +39,6 @@ export default {
dataSourceSet: `${restAPIv1}/connectors`,
dataSourceList: `${restAPIv1}/connectors`,
dataSourceDel: (id: string) => `${restAPIv1}/connectors/${id}`,
dataSourceResume: (id: string) => `${restAPIv1}/connectors/${id}/resume`,
dataSourceRebuild: (id: string) => `${restAPIv1}/connectors/${id}/rebuild`,
dataSourceLogs: (id: string) => `${restAPIv1}/connectors/${id}/logs`,
dataSourceDetail: (id: string) => `${restAPIv1}/connectors/${id}`,