"""SharePoint connector Ingests files from SharePoint document libraries via the Microsoft Graph API (Office365-REST-Python-Client). Authentication uses MSAL client-credentials (app-only) flow, so it requires an Azure AD app with the ``Sites.Read.All`` and ``Files.Read.All`` application permissions (admin-consented). The connector implements the checkpointed-connector interface used by the sync worker: ``load_from_checkpoint`` walks every document library under the configured site, downloads each file, and yields blob-based ``Document`` objects. Incremental syncs are bounded by the file ``lastModifiedDateTime``. """ import logging from datetime import datetime, timezone from typing import Any, Generator import msal from office365.graph_client import GraphClient from common.data_source.config import INDEX_BATCH_SIZE from common.data_source.exceptions import ( ConnectorMissingCredentialError, ConnectorValidationError, ) from common.data_source.interfaces import ( CheckpointedConnectorWithPermSync, SecondsSinceUnixEpoch, SlimConnectorWithPermSync, ) from common.data_source.models import ( ConnectorCheckpoint, ConnectorFailure, Document, DocumentFailure, SlimDocument, ) GRAPH_SCOPES = ["https://graph.microsoft.com/.default"] class SharePointConnector(CheckpointedConnectorWithPermSync, SlimConnectorWithPermSync): """SharePoint connector for accessing SharePoint sites and documents.""" def __init__(self, batch_size: int = INDEX_BATCH_SIZE) -> None: self.batch_size = batch_size self.graph_client: GraphClient | None = None self._site_url: str | None = None # -- credentials --------------------------------------------------------- def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None: """Configure a Microsoft Graph client from app-only credentials. The token is acquired lazily through a callback (the way ``GraphClient`` expects it), so this method performs no network call; the first real request triggers ``acquire_token_for_client``. """ tenant_id = credentials.get("tenant_id") client_id = credentials.get("client_id") client_secret = credentials.get("client_secret") site_url = credentials.get("site_url") if not all([tenant_id, client_id, client_secret, site_url]): raise ConnectorMissingCredentialError("SharePoint credentials are incomplete") self._site_url = site_url authority = f"https://login.microsoftonline.com/{tenant_id}" def _acquire_token() -> dict[str, Any]: app = msal.ConfidentialClientApplication( client_id=client_id, client_credential=client_secret, authority=authority, ) token = app.acquire_token_for_client(scopes=GRAPH_SCOPES) if "access_token" not in token: detail = token.get("error_description") or token.get("error") or token raise ConnectorMissingCredentialError( f"Failed to acquire SharePoint access token: {detail}" ) return token self.graph_client = GraphClient(_acquire_token) return None def validate_connector_settings(self) -> None: """Validate credentials by resolving the configured site.""" if self.graph_client is None or not self._site_url: raise ConnectorMissingCredentialError("SharePoint") try: site = self.graph_client.sites.get_by_url(self._site_url).execute_query() if not site: raise ConnectorValidationError("Failed to access SharePoint site") except ConnectorValidationError: raise except Exception as e: message = str(e) if "401" in message or "403" in message: raise ConnectorValidationError( "Invalid credentials or insufficient permissions for SharePoint" ) raise ConnectorValidationError(f"SharePoint validation error: {e}") # -- traversal helpers --------------------------------------------------- def _iter_drives(self): site = self.graph_client.sites.get_by_url(self._site_url).execute_query() return site.drives.get().execute_query() @staticmethod def _is_folder(drive_item: Any) -> bool: return "folder" in getattr(drive_item, "properties", {}) def _walk_files(self, root_item: Any) -> Generator[Any, None, None]: """Depth-first walk of a drive yielding file (non-folder) driveItems.""" stack = [root_item] while stack: folder = stack.pop() children = folder.children.get().execute_query() for child in children: if self._is_folder(child): stack.append(child) else: yield child @staticmethod def _modified_dt(drive_item: Any) -> datetime | None: value = getattr(drive_item, "last_modified_datetime", None) if value is None: value = getattr(drive_item, "properties", {}).get("lastModifiedDateTime") if value is None: return None if isinstance(value, str): try: value = datetime.fromisoformat(value.replace("Z", "+00:00")) except ValueError: return None if value.tzinfo is None: value = value.replace(tzinfo=timezone.utc) return value @staticmethod def _composite_doc_id(drive_id: Any, drive_item: Any) -> str: # Graph driveItem IDs are only unique within a single drive. A site can # expose multiple document libraries (drives), so we namespace the item # ID by drive ID to keep document identifiers globally unique. return f"{drive_id}:{drive_item.id}" def _drive_item_to_document(self, drive_item: Any, drive_id: Any, drive_name: str) -> Document: name = drive_item.name or str(drive_item.id) content_result = drive_item.get_content().execute_query() blob = content_result.value or b"" if isinstance(blob, str): blob = blob.encode("utf-8") extension = "" if "." in name: extension = "." + name.rsplit(".", 1)[1] size_bytes = getattr(drive_item, "properties", {}).get("size") if not size_bytes: size_bytes = len(blob) modified = self._modified_dt(drive_item) or datetime.now(timezone.utc) metadata = {"drive": drive_name, "drive_id": str(drive_id), "drive_item_id": str(drive_item.id)} web_url = getattr(drive_item, "web_url", None) if web_url: metadata["web_url"] = web_url return Document( id=self._composite_doc_id(drive_id, drive_item), source="sharepoint", semantic_identifier=name, extension=extension, blob=blob, size_bytes=int(size_bytes), doc_updated_at=modified, metadata=metadata, ) def _generate_documents( self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch, ) -> Generator[Document | ConnectorFailure, None, None]: if self.graph_client is None or not self._site_url: raise ConnectorMissingCredentialError("SharePoint") for drive in self._iter_drives(): drive_name = getattr(drive, "name", None) or getattr(drive, "properties", {}).get("name", "") drive_id = getattr(drive, "id", None) or getattr(drive, "properties", {}).get("id", "") for drive_item in self._walk_files(drive.root): try: modified = self._modified_dt(drive_item) if modified is not None: ts = modified.timestamp() # start is an exclusive lower bound; full reindex passes start=0. if not (start < ts <= end): continue yield self._drive_item_to_document(drive_item, drive_id, drive_name) except Exception as e: logging.exception("SharePoint failed to process drive item") yield ConnectorFailure( failed_document=DocumentFailure( document_id=self._composite_doc_id(drive_id, drive_item) if getattr(drive_item, "id", None) is not None else "unknown", document_link=getattr(drive_item, "web_url", "") or "", ), failure_message=str(e), exception=e, ) # -- checkpointed connector interface ------------------------------------ def load_from_checkpoint( self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch, checkpoint: ConnectorCheckpoint, ) -> Generator[Document | ConnectorFailure, None, ConnectorCheckpoint]: """Yield every file under the site as a Document, then finish. The whole library is enumerated in a single pass, so the returned checkpoint always has ``has_more=False``. """ yield from self._generate_documents(start, end) return ConnectorCheckpoint(has_more=False) def load_from_checkpoint_with_perm_sync( self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch, checkpoint: ConnectorCheckpoint, ) -> Generator[Document | ConnectorFailure, None, ConnectorCheckpoint]: """Permission-aware variant. SharePoint ACL -> ExternalAccess mapping is not yet wired through the sync pipeline (the pipeline does not persist ExternalAccess), so this currently yields the same documents as ``load_from_checkpoint``. """ return self.load_from_checkpoint(start, end, checkpoint) def build_dummy_checkpoint(self) -> ConnectorCheckpoint: return ConnectorCheckpoint(has_more=True) def validate_checkpoint_json(self, checkpoint_json: str) -> ConnectorCheckpoint: return ConnectorCheckpoint(has_more=True) def retrieve_all_slim_docs_perm_sync( self, callback: Any = None, ) -> Generator[list[SlimDocument], None, None]: """Yield batches of slim documents (ids only) for prune/permission sync.""" if self.graph_client is None or not self._site_url: raise ConnectorMissingCredentialError("SharePoint") batch: list[SlimDocument] = [] for drive in self._iter_drives(): drive_id = getattr(drive, "id", None) or getattr(drive, "properties", {}).get("id", "") for drive_item in self._walk_files(drive.root): batch.append(SlimDocument(id=self._composite_doc_id(drive_id, drive_item))) if len(batch) >= self.batch_size: yield batch batch = [] if batch: yield batch