From 17eda04b8d717c14fc0fe364363291ea56f40cc0 Mon Sep 17 00:00:00 2001 From: bitloi <89318445+bitloi@users.noreply.github.com> Date: Thu, 30 Apr 2026 03:44:05 -0300 Subject: [PATCH] feat(zendesk): support deleted-file sync (#14487) ### What problem does this PR solve? Refs #14362. This PR enables syncing deleted files for Zendesk data sources. Previously, Zendesk incremental sync never returned a slim remote snapshot to the shared stale-document cleanup path, so deleted remote Zendesk records could remain in RAGFlow. The existing Zendesk slim snapshot also included records that ingestion intentionally skips, such as draft articles, articles without bodies, skipped-label articles, empty-body articles, and tickets with `status == "deleted"`. This PR: - exposes the deleted-file sync option for Zendesk in the data source UI - returns Zendesk slim snapshots during incremental sync when `sync_deleted_files` is enabled - reuses Zendesk indexability rules so cleanup compares against the same records ingestion can materialize - adds start/end logs around Zendesk slim snapshot collection for operational visibility Per maintainer request, this PR contains no test-case changes. Manual verification recording will be provided separately. Validation: - `uv run ruff check common/data_source/zendesk_connector.py rag/svr/sync_data_source.py` - `uv run pytest test/unit_test/rag/test_sync_data_source.py -q` - `./node_modules/.bin/eslint src/pages/user-setting/data-source/constant/index.tsx` ### Type of change - [ ] Bug Fix (non-breaking change which fixes an issue) - [x] New Feature (non-breaking change which adds functionality) - [ ] Documentation Update - [ ] Refactoring - [ ] Performance Improvement - [ ] Other (please describe): --- common/data_source/zendesk_connector.py | 31 +++++++++++++------ rag/svr/sync_data_source.py | 17 ++++++++++ .../data-source/constant/index.tsx | 3 ++ 3 files changed, 42 insertions(+), 9 deletions(-) diff --git a/common/data_source/zendesk_connector.py b/common/data_source/zendesk_connector.py index 8ea48d553b..c357b500fb 100644 --- a/common/data_source/zendesk_connector.py +++ b/common/data_source/zendesk_connector.py @@ -246,6 +246,18 @@ def _article_to_document( ) +def _is_indexable_article(article: dict[str, Any]) -> bool: + body = article.get("body") + return ( + bool(body) + and not article.get("draft") + and not any( + label in ZENDESK_CONNECTOR_SKIP_ARTICLE_LABELS + for label in article.get("label_names") or [] + ) + ) + + def _get_comment_text( comment: dict[str, Any], author_map: dict[str, BasicExpertInfo], @@ -333,6 +345,10 @@ def _ticket_to_document( ) +def _is_indexable_ticket(ticket: dict[str, Any]) -> bool: + return ticket.get("status") != "deleted" + + class ZendeskConnectorCheckpoint(ConnectorCheckpoint): # We use cursor-based paginated retrieval for articles after_cursor_articles: str | None @@ -419,14 +435,7 @@ class ZendeskConnector( has_more = response.has_more after_cursor = response.meta.get("after_cursor") for article in articles: - if ( - article.get("body") is None - or article.get("draft") - or any( - label in ZENDESK_CONNECTOR_SKIP_ARTICLE_LABELS - for label in article.get("label_names", []) - ) - ): + if not _is_indexable_article(article): continue try: @@ -498,7 +507,7 @@ class ZendeskConnector( has_more = ticket_response.has_more next_start_time = ticket_response.meta["end_time"] for ticket in tickets: - if ticket.get("status") == "deleted": + if not _is_indexable_ticket(ticket): continue try: @@ -559,6 +568,8 @@ class ZendeskConnector( if self.content_type == "articles": articles = _get_articles(self.client) for article in articles: + if not _is_indexable_article(article): + continue slim_doc_batch.append( SlimDocument( id=f"article:{article['id']}", @@ -570,6 +581,8 @@ class ZendeskConnector( elif self.content_type == "tickets": tickets = _get_tickets(self.client) for ticket in tickets: + if not _is_indexable_ticket(ticket): + continue slim_doc_batch.append( SlimDocument( id=f"zendesk_ticket_{ticket['id']}", diff --git a/rag/svr/sync_data_source.py b/rag/svr/sync_data_source.py index 5ada9f52a9..c00c209e0f 100644 --- a/rag/svr/sync_data_source.py +++ b/rag/svr/sync_data_source.py @@ -1232,11 +1232,26 @@ class Zendesk(SyncBase): self.connector.load_credentials(self.conf["credentials"]) end_time = datetime.now(timezone.utc).timestamp() + file_list = None if task["reindex"] == "1" or not task.get("poll_range_start"): start_time = 0 _begin_info = "totally" else: start_time = task["poll_range_start"].timestamp() + if self.conf.get("sync_deleted_files"): + logging.info( + "[Zendesk] Syncing deleted files via slim snapshot (connector_id=%s)", + task.get("connector_id"), + ) + snapshot_start = time.perf_counter() + file_list = [] + for slim_batch in self.connector.retrieve_all_slim_docs_perm_sync(): + file_list.extend(slim_batch) + logging.info( + "[Zendesk] Slim snapshot fetched %d docs in %.2f seconds", + len(file_list), + time.perf_counter() - snapshot_start, + ) _begin_info = f"from {task['poll_range_start']}" raw_batch_size = ( @@ -1298,6 +1313,8 @@ class Zendesk(SyncBase): self.log_connection("Zendesk", f"subdomain({self.conf['credentials'].get('zendesk_subdomain')})", task) + if file_list is not None: + return wrapper(), file_list return wrapper() diff --git a/web/src/pages/user-setting/data-source/constant/index.tsx b/web/src/pages/user-setting/data-source/constant/index.tsx index 718bdb4e93..9d8777be0d 100644 --- a/web/src/pages/user-setting/data-source/constant/index.tsx +++ b/web/src/pages/user-setting/data-source/constant/index.tsx @@ -105,6 +105,9 @@ export const DataSourceFeatureVisibilityMap: Partial< [DataSourceKey.AIRTABLE]: { syncDeletedFiles: true, }, + [DataSourceKey.ZENDESK]: { + syncDeletedFiles: true, + }, [DataSourceKey.SEAFILE]: { syncDeletedFiles: true, },