diff --git a/common/data_source/google_drive/connector.py b/common/data_source/google_drive/connector.py index add3b775f8..479c60e0b6 100644 --- a/common/data_source/google_drive/connector.py +++ b/common/data_source/google_drive/connector.py @@ -159,6 +159,7 @@ class GoogleDriveConnector(SlimConnectorWithPermSync, CheckpointedConnectorWithP self._creds: OAuthCredentials | ServiceAccountCredentials | None = None self._creds_dict: dict[str, Any] | None = None + self._all_drive_ids_cache: set[str] | None = None # ids of folders and shared drives that have been traversed self._retrieved_folder_and_drive_ids: set[str] = set() @@ -211,6 +212,7 @@ class GoogleDriveConnector(SlimConnectorWithPermSync, CheckpointedConnectorWithP self.include_files_shared_with_me = True self._creds_dict = new_creds_dict + self._all_drive_ids_cache = None return new_creds_dict @@ -249,7 +251,11 @@ class GoogleDriveConnector(SlimConnectorWithPermSync, CheckpointedConnectorWithP return user_emails def get_all_drive_ids(self) -> set[str]: - return self._get_all_drives_for_user(self.primary_admin_email) + if self._all_drive_ids_cache is None: + self._all_drive_ids_cache = self._get_all_drives_for_user( + self.primary_admin_email + ) + return set(self._all_drive_ids_cache) def _get_all_drives_for_user(self, user_email: str) -> set[str]: drive_service = get_drive_service(self.creds, user_email) @@ -265,7 +271,14 @@ class GoogleDriveConnector(SlimConnectorWithPermSync, CheckpointedConnectorWithP all_drive_ids.add(drive["id"]) if not all_drive_ids: - self.logger.warning("No drives found even though indexing shared drives was requested.") + if self._requested_shared_drive_ids: + self.logger.warning( + "No shared drives found for user %s while resolving requested shared drives.", + user_email, + ) + elif self.include_shared_drives: + log_fn = self.logger.warning if is_service_account else self.logger.info + log_fn("No shared drives found for user %s.", user_email) return all_drive_ids diff --git a/common/data_source/google_util/resource.py b/common/data_source/google_util/resource.py index eb060e4688..ba4199cb07 100644 --- a/common/data_source/google_util/resource.py +++ b/common/data_source/google_util/resource.py @@ -85,9 +85,19 @@ def _get_google_service( if isinstance(creds, ServiceAccountCredentials): # NOTE: https://developers.google.com/identity/protocols/oauth2/service-account#error-codes creds = creds.with_subject(user_email) - service = build(service_name, service_version, credentials=creds) + service = build( + service_name, + service_version, + credentials=creds, + cache_discovery=False, + ) elif isinstance(creds, OAuthCredentials): - service = build(service_name, service_version, credentials=creds) + service = build( + service_name, + service_version, + credentials=creds, + cache_discovery=False, + ) return service diff --git a/rag/svr/sync_data_source.py b/rag/svr/sync_data_source.py index ac0d4d0cb7..6f160a3564 100644 --- a/rag/svr/sync_data_source.py +++ b/rag/svr/sync_data_source.py @@ -577,6 +577,8 @@ class Gmail(SyncBase): task["connector_id"], ) + file_list = None + # Decide between full reindex and incremental polling by time range. if task["reindex"] == "1" or not task.get("poll_range_start"): start_time = None @@ -596,13 +598,17 @@ class Gmail(SyncBase): end_time = datetime.now(timezone.utc).timestamp() _begin_info = f"from {poll_start}" document_generator = self.connector.poll_source(start_time, end_time) + if self.conf.get("sync_deleted_files"): + file_list = [] + for slim_batch in self.connector.retrieve_all_slim_docs_perm_sync(): + file_list.extend(slim_batch) try: admin_email = self.connector.primary_admin_email except RuntimeError: admin_email = "unknown" self.log_connection("Gmail", f"as {admin_email}", task) - return document_generator + return document_generator, file_list class Dropbox(SyncBase): @@ -671,7 +677,6 @@ class GoogleDrive(SyncBase): if self.conf.get("sync_deleted_files"): file_list = [] - logging.info("Syncing deleted files (connector_id=%s)", task["connector_id"]) SlimDoc = namedtuple('SlimDoc', ['id']) # Add observability timing so operators can track the O(N) cost diff --git a/web/src/pages/user-setting/data-source/constant/index.tsx b/web/src/pages/user-setting/data-source/constant/index.tsx index 774b5c3f91..5816193e54 100644 --- a/web/src/pages/user-setting/data-source/constant/index.tsx +++ b/web/src/pages/user-setting/data-source/constant/index.tsx @@ -61,6 +61,9 @@ export const DataSourceFeatureVisibilityMap = { [DataSourceKey.GOOGLE_DRIVE]: { syncDeletedFiles: true, }, + [DataSourceKey.GMAIL]: { + syncDeletedFiles: true, + }, [DataSourceKey.CONFLUENCE]: { syncDeletedFiles: true, },