mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 23:41:12 +08:00
feat(zendesk): support deleted-file sync (#14487)
### What problem does this PR solve? Refs #14362. This PR enables syncing deleted files for Zendesk data sources. Previously, Zendesk incremental sync never returned a slim remote snapshot to the shared stale-document cleanup path, so deleted remote Zendesk records could remain in RAGFlow. The existing Zendesk slim snapshot also included records that ingestion intentionally skips, such as draft articles, articles without bodies, skipped-label articles, empty-body articles, and tickets with `status == "deleted"`. This PR: - exposes the deleted-file sync option for Zendesk in the data source UI - returns Zendesk slim snapshots during incremental sync when `sync_deleted_files` is enabled - reuses Zendesk indexability rules so cleanup compares against the same records ingestion can materialize - adds start/end logs around Zendesk slim snapshot collection for operational visibility Per maintainer request, this PR contains no test-case changes. Manual verification recording will be provided separately. Validation: - `uv run ruff check common/data_source/zendesk_connector.py rag/svr/sync_data_source.py` - `uv run pytest test/unit_test/rag/test_sync_data_source.py -q` - `./node_modules/.bin/eslint src/pages/user-setting/data-source/constant/index.tsx` ### Type of change - [ ] Bug Fix (non-breaking change which fixes an issue) - [x] New Feature (non-breaking change which adds functionality) - [ ] Documentation Update - [ ] Refactoring - [ ] Performance Improvement - [ ] Other (please describe):
This commit is contained in:
@@ -246,6 +246,18 @@ def _article_to_document(
|
||||
)
|
||||
|
||||
|
||||
def _is_indexable_article(article: dict[str, Any]) -> bool:
|
||||
body = article.get("body")
|
||||
return (
|
||||
bool(body)
|
||||
and not article.get("draft")
|
||||
and not any(
|
||||
label in ZENDESK_CONNECTOR_SKIP_ARTICLE_LABELS
|
||||
for label in article.get("label_names") or []
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def _get_comment_text(
|
||||
comment: dict[str, Any],
|
||||
author_map: dict[str, BasicExpertInfo],
|
||||
@@ -333,6 +345,10 @@ def _ticket_to_document(
|
||||
)
|
||||
|
||||
|
||||
def _is_indexable_ticket(ticket: dict[str, Any]) -> bool:
|
||||
return ticket.get("status") != "deleted"
|
||||
|
||||
|
||||
class ZendeskConnectorCheckpoint(ConnectorCheckpoint):
|
||||
# We use cursor-based paginated retrieval for articles
|
||||
after_cursor_articles: str | None
|
||||
@@ -419,14 +435,7 @@ class ZendeskConnector(
|
||||
has_more = response.has_more
|
||||
after_cursor = response.meta.get("after_cursor")
|
||||
for article in articles:
|
||||
if (
|
||||
article.get("body") is None
|
||||
or article.get("draft")
|
||||
or any(
|
||||
label in ZENDESK_CONNECTOR_SKIP_ARTICLE_LABELS
|
||||
for label in article.get("label_names", [])
|
||||
)
|
||||
):
|
||||
if not _is_indexable_article(article):
|
||||
continue
|
||||
|
||||
try:
|
||||
@@ -498,7 +507,7 @@ class ZendeskConnector(
|
||||
has_more = ticket_response.has_more
|
||||
next_start_time = ticket_response.meta["end_time"]
|
||||
for ticket in tickets:
|
||||
if ticket.get("status") == "deleted":
|
||||
if not _is_indexable_ticket(ticket):
|
||||
continue
|
||||
|
||||
try:
|
||||
@@ -559,6 +568,8 @@ class ZendeskConnector(
|
||||
if self.content_type == "articles":
|
||||
articles = _get_articles(self.client)
|
||||
for article in articles:
|
||||
if not _is_indexable_article(article):
|
||||
continue
|
||||
slim_doc_batch.append(
|
||||
SlimDocument(
|
||||
id=f"article:{article['id']}",
|
||||
@@ -570,6 +581,8 @@ class ZendeskConnector(
|
||||
elif self.content_type == "tickets":
|
||||
tickets = _get_tickets(self.client)
|
||||
for ticket in tickets:
|
||||
if not _is_indexable_ticket(ticket):
|
||||
continue
|
||||
slim_doc_batch.append(
|
||||
SlimDocument(
|
||||
id=f"zendesk_ticket_{ticket['id']}",
|
||||
|
||||
@@ -1232,11 +1232,26 @@ class Zendesk(SyncBase):
|
||||
self.connector.load_credentials(self.conf["credentials"])
|
||||
|
||||
end_time = datetime.now(timezone.utc).timestamp()
|
||||
file_list = None
|
||||
if task["reindex"] == "1" or not task.get("poll_range_start"):
|
||||
start_time = 0
|
||||
_begin_info = "totally"
|
||||
else:
|
||||
start_time = task["poll_range_start"].timestamp()
|
||||
if self.conf.get("sync_deleted_files"):
|
||||
logging.info(
|
||||
"[Zendesk] Syncing deleted files via slim snapshot (connector_id=%s)",
|
||||
task.get("connector_id"),
|
||||
)
|
||||
snapshot_start = time.perf_counter()
|
||||
file_list = []
|
||||
for slim_batch in self.connector.retrieve_all_slim_docs_perm_sync():
|
||||
file_list.extend(slim_batch)
|
||||
logging.info(
|
||||
"[Zendesk] Slim snapshot fetched %d docs in %.2f seconds",
|
||||
len(file_list),
|
||||
time.perf_counter() - snapshot_start,
|
||||
)
|
||||
_begin_info = f"from {task['poll_range_start']}"
|
||||
|
||||
raw_batch_size = (
|
||||
@@ -1298,6 +1313,8 @@ class Zendesk(SyncBase):
|
||||
|
||||
self.log_connection("Zendesk", f"subdomain({self.conf['credentials'].get('zendesk_subdomain')})", task)
|
||||
|
||||
if file_list is not None:
|
||||
return wrapper(), file_list
|
||||
return wrapper()
|
||||
|
||||
|
||||
|
||||
@@ -105,6 +105,9 @@ export const DataSourceFeatureVisibilityMap: Partial<
|
||||
[DataSourceKey.AIRTABLE]: {
|
||||
syncDeletedFiles: true,
|
||||
},
|
||||
[DataSourceKey.ZENDESK]: {
|
||||
syncDeletedFiles: true,
|
||||
},
|
||||
[DataSourceKey.SEAFILE]: {
|
||||
syncDeletedFiles: true,
|
||||
},
|
||||
|
||||
Reference in New Issue
Block a user