feat(zendesk): support deleted-file sync (#14487)

### What problem does this PR solve?

Refs #14362.

This PR enables syncing deleted files for Zendesk data sources.

Previously, Zendesk incremental sync never returned a slim remote
snapshot to the shared stale-document cleanup path, so deleted remote
Zendesk records could remain in RAGFlow. The existing Zendesk slim
snapshot also included records that ingestion intentionally skips, such
as draft articles, articles without bodies, skipped-label articles,
empty-body articles, and tickets with `status == "deleted"`.

This PR:
- exposes the deleted-file sync option for Zendesk in the data source UI
- returns Zendesk slim snapshots during incremental sync when
`sync_deleted_files` is enabled
- reuses Zendesk indexability rules so cleanup compares against the same
records ingestion can materialize
- adds start/end logs around Zendesk slim snapshot collection for
operational visibility

Per maintainer request, this PR contains no test-case changes. Manual
verification recording will be provided separately.

Validation:
- `uv run ruff check common/data_source/zendesk_connector.py
rag/svr/sync_data_source.py`
- `uv run pytest test/unit_test/rag/test_sync_data_source.py -q`
- `./node_modules/.bin/eslint
src/pages/user-setting/data-source/constant/index.tsx`

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [ ] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):
This commit is contained in:
bitloi
2026-04-30 03:44:05 -03:00
committed by GitHub
parent 8f75e52bbf
commit 17eda04b8d
3 changed files with 42 additions and 9 deletions

View File

@@ -246,6 +246,18 @@ def _article_to_document(
)
def _is_indexable_article(article: dict[str, Any]) -> bool:
body = article.get("body")
return (
bool(body)
and not article.get("draft")
and not any(
label in ZENDESK_CONNECTOR_SKIP_ARTICLE_LABELS
for label in article.get("label_names") or []
)
)
def _get_comment_text(
comment: dict[str, Any],
author_map: dict[str, BasicExpertInfo],
@@ -333,6 +345,10 @@ def _ticket_to_document(
)
def _is_indexable_ticket(ticket: dict[str, Any]) -> bool:
return ticket.get("status") != "deleted"
class ZendeskConnectorCheckpoint(ConnectorCheckpoint):
# We use cursor-based paginated retrieval for articles
after_cursor_articles: str | None
@@ -419,14 +435,7 @@ class ZendeskConnector(
has_more = response.has_more
after_cursor = response.meta.get("after_cursor")
for article in articles:
if (
article.get("body") is None
or article.get("draft")
or any(
label in ZENDESK_CONNECTOR_SKIP_ARTICLE_LABELS
for label in article.get("label_names", [])
)
):
if not _is_indexable_article(article):
continue
try:
@@ -498,7 +507,7 @@ class ZendeskConnector(
has_more = ticket_response.has_more
next_start_time = ticket_response.meta["end_time"]
for ticket in tickets:
if ticket.get("status") == "deleted":
if not _is_indexable_ticket(ticket):
continue
try:
@@ -559,6 +568,8 @@ class ZendeskConnector(
if self.content_type == "articles":
articles = _get_articles(self.client)
for article in articles:
if not _is_indexable_article(article):
continue
slim_doc_batch.append(
SlimDocument(
id=f"article:{article['id']}",
@@ -570,6 +581,8 @@ class ZendeskConnector(
elif self.content_type == "tickets":
tickets = _get_tickets(self.client)
for ticket in tickets:
if not _is_indexable_ticket(ticket):
continue
slim_doc_batch.append(
SlimDocument(
id=f"zendesk_ticket_{ticket['id']}",

View File

@@ -1232,11 +1232,26 @@ class Zendesk(SyncBase):
self.connector.load_credentials(self.conf["credentials"])
end_time = datetime.now(timezone.utc).timestamp()
file_list = None
if task["reindex"] == "1" or not task.get("poll_range_start"):
start_time = 0
_begin_info = "totally"
else:
start_time = task["poll_range_start"].timestamp()
if self.conf.get("sync_deleted_files"):
logging.info(
"[Zendesk] Syncing deleted files via slim snapshot (connector_id=%s)",
task.get("connector_id"),
)
snapshot_start = time.perf_counter()
file_list = []
for slim_batch in self.connector.retrieve_all_slim_docs_perm_sync():
file_list.extend(slim_batch)
logging.info(
"[Zendesk] Slim snapshot fetched %d docs in %.2f seconds",
len(file_list),
time.perf_counter() - snapshot_start,
)
_begin_info = f"from {task['poll_range_start']}"
raw_batch_size = (
@@ -1298,6 +1313,8 @@ class Zendesk(SyncBase):
self.log_connection("Zendesk", f"subdomain({self.conf['credentials'].get('zendesk_subdomain')})", task)
if file_list is not None:
return wrapper(), file_list
return wrapper()

View File

@@ -105,6 +105,9 @@ export const DataSourceFeatureVisibilityMap: Partial<
[DataSourceKey.AIRTABLE]: {
syncDeletedFiles: true,
},
[DataSourceKey.ZENDESK]: {
syncDeletedFiles: true,
},
[DataSourceKey.SEAFILE]: {
syncDeletedFiles: true,
},