diff --git a/common/data_source/zendesk_connector.py b/common/data_source/zendesk_connector.py
index 8ea48d553b..c357b500fb 100644
--- a/common/data_source/zendesk_connector.py
+++ b/common/data_source/zendesk_connector.py
@@ -246,6 +246,24 @@ def _article_to_document(
     )
 
 
+def _is_indexable_article(article: dict[str, Any]) -> bool:
+    """Return True when a Zendesk article should be indexed.
+
+    An article is skipped when its body is missing or empty, when it is
+    flagged as a draft, or when any of its labels appears in
+    ZENDESK_CONNECTOR_SKIP_ARTICLE_LABELS.
+    """
+    body = article.get("body")
+    return (
+        bool(body)
+        and not article.get("draft")
+        and not any(
+            label in ZENDESK_CONNECTOR_SKIP_ARTICLE_LABELS
+            for label in article.get("label_names") or []
+        )
+    )
+
+
 def _get_comment_text(
     comment: dict[str, Any],
     author_map: dict[str, BasicExpertInfo],
@@ -333,6 +351,11 @@ def _ticket_to_document(
     )
 
 
+def _is_indexable_ticket(ticket: dict[str, Any]) -> bool:
+    """Return True unless the ticket's status is "deleted"."""
+    return ticket.get("status") != "deleted"
+
+
 class ZendeskConnectorCheckpoint(ConnectorCheckpoint):
     # We use cursor-based paginated retrieval for articles
     after_cursor_articles: str | None
@@ -419,14 +442,7 @@ class ZendeskConnector(
         has_more = response.has_more
         after_cursor = response.meta.get("after_cursor")
         for article in articles:
-            if (
-                article.get("body") is None
-                or article.get("draft")
-                or any(
-                    label in ZENDESK_CONNECTOR_SKIP_ARTICLE_LABELS
-                    for label in article.get("label_names", [])
-                )
-            ):
+            if not _is_indexable_article(article):
                 continue
 
             try:
@@ -498,7 +514,7 @@ class ZendeskConnector(
         has_more = ticket_response.has_more
         next_start_time = ticket_response.meta["end_time"]
         for ticket in tickets:
-            if ticket.get("status") == "deleted":
+            if not _is_indexable_ticket(ticket):
                 continue
 
             try:
@@ -559,6 +575,8 @@ class ZendeskConnector(
         if self.content_type == "articles":
             articles = _get_articles(self.client)
             for article in articles:
+                if not _is_indexable_article(article):
+                    continue
                 slim_doc_batch.append(
                     SlimDocument(
                         id=f"article:{article['id']}",
@@ -570,6 +588,8 @@ class ZendeskConnector(
         elif self.content_type == "tickets":
             tickets = _get_tickets(self.client)
             for ticket in tickets:
+                if not _is_indexable_ticket(ticket):
+                    continue
                 slim_doc_batch.append(
                     SlimDocument(
                         id=f"zendesk_ticket_{ticket['id']}",
diff --git a/rag/svr/sync_data_source.py b/rag/svr/sync_data_source.py
index 5ada9f52a9..c00c209e0f 100644
--- a/rag/svr/sync_data_source.py
+++ b/rag/svr/sync_data_source.py
@@ -1232,11 +1232,28 @@ class Zendesk(SyncBase):
         self.connector.load_credentials(self.conf["credentials"])
 
         end_time = datetime.now(timezone.utc).timestamp()
+        file_list = None
         if task["reindex"] == "1" or not task.get("poll_range_start"):
             start_time = 0
             _begin_info = "totally"
         else:
             start_time = task["poll_range_start"].timestamp()
+            if self.conf.get("sync_deleted_files"):
+                # Only incremental runs take the slim snapshot; when present it
+                # is returned next to the document generator by this method.
+                logging.info(
+                    "[Zendesk] Syncing deleted files via slim snapshot (connector_id=%s)",
+                    task.get("connector_id"),
+                )
+                snapshot_start = time.perf_counter()
+                file_list = []
+                for slim_batch in self.connector.retrieve_all_slim_docs_perm_sync():
+                    file_list.extend(slim_batch)
+                logging.info(
+                    "[Zendesk] Slim snapshot fetched %d docs in %.2f seconds",
+                    len(file_list),
+                    time.perf_counter() - snapshot_start,
+                )
             _begin_info = f"from {task['poll_range_start']}"
 
         raw_batch_size = (
@@ -1298,6 +1315,8 @@ class Zendesk(SyncBase):
 
 
         self.log_connection("Zendesk", f"subdomain({self.conf['credentials'].get('zendesk_subdomain')})", task)
+        if file_list is not None:
+            return wrapper(), file_list
         return wrapper()
 
 
diff --git a/web/src/pages/user-setting/data-source/constant/index.tsx b/web/src/pages/user-setting/data-source/constant/index.tsx
index 718bdb4e93..9d8777be0d 100644
--- a/web/src/pages/user-setting/data-source/constant/index.tsx
+++ b/web/src/pages/user-setting/data-source/constant/index.tsx
@@ -105,6 +105,9 @@ export const DataSourceFeatureVisibilityMap: Partial<
   [DataSourceKey.AIRTABLE]: {
     syncDeletedFiles: true,
   },
+  [DataSourceKey.ZENDESK]: {
+    syncDeletedFiles: true,
+  },
   [DataSourceKey.SEAFILE]: {
     syncDeletedFiles: true,
   },