feat: implement SharePoint data source connector (#15190)

### What problem does this PR solve? Closes #15189. RAGFlow shipped a SharePoint connector stub (`common/data_source/sharepoint_connector.py`) whose document-loading methods all returned `[]`, `SharePoint._generate()` was a `pass`, and SharePoint was commented out of the data-source settings UI. As a result there was no way to index files stored in SharePoint document libraries. This PR implements the connector end to end on top of Microsoft Graph (Office365-REST-Python-Client). **Backend** - `common/data_source/sharepoint_connector.py` - `load_credentials()` now builds the Graph client using an MSAL client-credentials **token callback** — the form `GraphClient` actually expects. (The previous stub passed a raw access-token string to `GraphClient(...)`, which is not how that client is driven.) Token acquisition is lazy, so credential loading does no network call. - `validate_connector_settings()` resolves the configured site via Graph. - `load_from_checkpoint()` is now a generator that enumerates every document library under the site, walks folders depth-first, downloads each file, and yields blob-based `Document` objects (`extension` / `blob` / `size_bytes` / `doc_updated_at`). Incremental syncs are bounded by file `lastModifiedDateTime`. Per-file errors are surfaced as `ConnectorFailure` rather than aborting the run. - `retrieve_all_slim_docs_perm_sync()` yields id-only `SlimDocument` batches (no downloads) and the checkpoint helpers return proper checkpoints. - ACL → `ExternalAccess` mapping is intentionally left best-effort (`load_from_checkpoint_with_perm_sync` delegates to the standard load) because the sync pipeline does not currently persist `ExternalAccess`; this can be extended once that plumbing exists. - `rag/svr/sync_data_source.py` - Implemented `SharePoint._generate()` using the existing `CheckpointOutputWrapper` pattern (same shape as Confluence/Jira/Google Drive), supporting full reindex and incremental polling from `poll_range_start`. 
- `SharePointConnector` is already exported from `common/data_source/__init__.py`. **Frontend (`web/`)** - Enabled the `SHAREPOINT` data-source enum and added its form fields (`site_url`, `tenant_id`, `client_id`, `client_secret`), default values, display metadata, and a SharePoint icon. - Added `sharepointDescription` / `sharepointSiteUrlTip` to `en.ts` and `zh.ts`. **Tests** - `test/unit_test/data_source/test_sharepoint_connector_unit.py`: mock-based unit tests covering credential loading (incomplete creds raise, happy path sets the Graph client, fetch-without-creds raises), drive traversal + file download, incremental `lastModifiedDateTime` filtering, and slim-doc listing. All 6 pass; `ruff check` is clean. ### Type of change - [x] New Feature (non-breaking change which adds functionality)
2026-06-29 15:31:05 +08:00 · 2026-05-27 23:26:08 -06:00
parent 0aff6a3f32
commit c4c4e228e3
7 changed files with 627 additions and 71 deletions
--- a/common/data_source/sharepoint_connector.py
+++ b/common/data_source/sharepoint_connector.py
@@ -1,119 +1,270 @@
-"""SharePoint connector"""
+"""SharePoint connector
+
+Ingests files from SharePoint document libraries via the Microsoft Graph API
+(Office365-REST-Python-Client). Authentication uses MSAL client-credentials
+(app-only) flow, so it requires an Azure AD app with the ``Sites.Read.All`` and
+``Files.Read.All`` application permissions (admin-consented).
+
+The connector implements the checkpointed-connector interface used by the sync
+worker: ``load_from_checkpoint`` walks every document library under the
+configured site, downloads each file, and yields blob-based ``Document``
+objects. Incremental syncs are bounded by the file ``lastModifiedDateTime``.
+"""
+
+import logging
+from datetime import datetime, timezone
+from typing import Any, Generator

-from typing import Any
 import msal
 from office365.graph_client import GraphClient
-from office365.runtime.client_request import ClientRequestException
-from office365.sharepoint.client_context import ClientContext

 from common.data_source.config import INDEX_BATCH_SIZE
-from common.data_source.exceptions import ConnectorValidationError, ConnectorMissingCredentialError
+from common.data_source.exceptions import (
+    ConnectorMissingCredentialError,
+    ConnectorValidationError,
+)
 from common.data_source.interfaces import (
    CheckpointedConnectorWithPermSync,
    SecondsSinceUnixEpoch,
-    SlimConnectorWithPermSync
+    SlimConnectorWithPermSync,
 )
 from common.data_source.models import (
-    ConnectorCheckpoint
+    ConnectorCheckpoint,
+    ConnectorFailure,
+    Document,
+    DocumentFailure,
+    SlimDocument,
 )

+GRAPH_SCOPES = ["https://graph.microsoft.com/.default"]
+

 class SharePointConnector(CheckpointedConnectorWithPermSync, SlimConnectorWithPermSync):
-    """SharePoint connector for accessing SharePoint sites and documents"""
+    """SharePoint connector for accessing SharePoint sites and documents."""

    def __init__(self, batch_size: int = INDEX_BATCH_SIZE) -> None:
        self.batch_size = batch_size
-        self.sharepoint_client = None
-        self.graph_client = None
+        self.graph_client: GraphClient | None = None
+        self._site_url: str | None = None
+
+    # -- credentials ---------------------------------------------------------

    def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
-        """Load SharePoint credentials"""
-        try:
-            tenant_id = credentials.get("tenant_id")
-            client_id = credentials.get("client_id")
-            client_secret = credentials.get("client_secret")
-            site_url = credentials.get("site_url")
-            
-            if not all([tenant_id, client_id, client_secret, site_url]):
-                raise ConnectorMissingCredentialError("SharePoint credentials are incomplete")
-            
-            # Create MSAL confidential client
+        """Configure a Microsoft Graph client from app-only credentials.
+
+        The token is acquired lazily through a callback (the way
+        ``GraphClient`` expects it), so this method performs no network call;
+        the first real request triggers ``acquire_token_for_client``.
+        """
+        tenant_id = credentials.get("tenant_id")
+        client_id = credentials.get("client_id")
+        client_secret = credentials.get("client_secret")
+        site_url = credentials.get("site_url")
+
+        if not all([tenant_id, client_id, client_secret, site_url]):
+            raise ConnectorMissingCredentialError("SharePoint credentials are incomplete")
+
+        self._site_url = site_url
+        authority = f"https://login.microsoftonline.com/{tenant_id}"
+
+        def _build_msal_app() -> Any:
+            # Constructed on the first token request (not here in
+            # load_credentials) so credential loading stays free of MSAL work.
            app = msal.ConfidentialClientApplication(
                client_id=client_id,
                client_credential=client_secret,
-                authority=f"https://login.microsoftonline.com/{tenant_id}"
+                authority=authority,
            )
-            
-            # Get access token
-            result = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
-            
-            if "access_token" not in result:
-                raise ConnectorMissingCredentialError("Failed to acquire SharePoint access token")
-            
-            # Create Graph client
-            self.graph_client = GraphClient(result["access_token"])
-            
-            # Create SharePoint client context
-            self.sharepoint_client = ClientContext(site_url).with_access_token(result["access_token"])
-            
-            return None
-        except Exception as e:
-            raise ConnectorMissingCredentialError(f"SharePoint: {e}")
+            return app
+
+        # Holds the single MSAL application shared by every token request.
+        app_holder: list[Any] = []
+
+        def _acquire_token() -> dict[str, Any]:
+            """Return the MSAL token response for the Graph ``.default`` scope.
+
+            The MSAL application is created once and reused, so its in-memory
+            token cache survives between calls instead of being thrown away
+            with a fresh application object on every invocation.
+
+            Raises:
+                ConnectorMissingCredentialError: if the response carries no
+                    ``access_token``.
+            """
+            if not app_holder:
+                app_holder.append(_build_msal_app())
+            token = app_holder[0].acquire_token_for_client(scopes=GRAPH_SCOPES)
+            if "access_token" not in token:
+                # Prefer the most descriptive error field MSAL returned.
+                detail = token.get("error_description") or token.get("error") or token
+                raise ConnectorMissingCredentialError(
+                    f"Failed to acquire SharePoint access token: {detail}"
+                )
+            return token
+
+        self.graph_client = GraphClient(_acquire_token)
+        return None

    def validate_connector_settings(self) -> None:
-        """Validate SharePoint connector settings"""
-        if not self.sharepoint_client or not self.graph_client:
+        """Validate credentials by resolving the configured site."""
+        if self.graph_client is None or not self._site_url:
            raise ConnectorMissingCredentialError("SharePoint")
-        
+
        try:
-            # Test connection by getting site info
-            site = self.sharepoint_client.site.get().execute_query()
+            site = self.graph_client.sites.get_by_url(self._site_url).execute_query()
            if not site:
                raise ConnectorValidationError("Failed to access SharePoint site")
-        except ClientRequestException as e:
-            if "401" in str(e) or "403" in str(e):
-                raise ConnectorValidationError("Invalid credentials or insufficient permissions")
-            else:
-                raise ConnectorValidationError(f"SharePoint validation error: {e}")
+        except ConnectorValidationError:
+            raise
+        except Exception as e:
+            message = str(e)
+            if "401" in message or "403" in message:
+                raise ConnectorValidationError(
+                    "Invalid credentials or insufficient permissions for SharePoint"
+                )
+            raise ConnectorValidationError(f"SharePoint validation error: {e}")

-    def poll_source(self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch) -> Any:
-        """Poll SharePoint for recent documents"""
-        # Simplified implementation - in production this would handle actual polling
-        return []
+    # -- traversal helpers ---------------------------------------------------
+
+    def _iter_drives(self):
+        """Resolve the configured site and return its loaded drive collection."""
+        site_query = self.graph_client.sites.get_by_url(self._site_url)
+        resolved_site = site_query.execute_query()
+        drives_query = resolved_site.drives.get()
+        return drives_query.execute_query()
+
+    @staticmethod
+    def _is_folder(drive_item: Any) -> bool:
+        """Return True when the item's ``properties`` mapping has a ``folder`` key."""
+        item_properties = getattr(drive_item, "properties", {})
+        return "folder" in item_properties
+
+    def _walk_files(self, root_item: Any) -> Generator[Any, None, None]:
+        """Yield every non-folder driveItem reachable from ``root_item``.
+
+        Folders wait on a LIFO stack, so the most recently discovered folder
+        is listed next; files are yielded while their parent is being listed.
+        """
+        pending_folders = [root_item]
+        while pending_folders:
+            current = pending_folders.pop()
+            # NOTE(review): only the collection returned by this single query
+            # is iterated - confirm the client pages through large folders.
+            listing = current.children.get().execute_query()
+            for entry in listing:
+                if not self._is_folder(entry):
+                    yield entry
+                    continue
+                pending_folders.append(entry)
+
+    @staticmethod
+    def _modified_dt(drive_item: Any) -> datetime | None:
+        """Return the item's last-modified time as a timezone-aware datetime.
+
+        The ``last_modified_datetime`` attribute is read first, falling back
+        to ``properties["lastModifiedDateTime"]``. String values are parsed as
+        ISO 8601 (a trailing ``Z`` is treated as ``+00:00``); naive datetimes
+        are tagged as UTC. Returns ``None`` when no value is present or the
+        string cannot be parsed.
+        """
+        raw = getattr(drive_item, "last_modified_datetime", None)
+        if raw is None:
+            raw = getattr(drive_item, "properties", {}).get("lastModifiedDateTime")
+        if raw is None:
+            return None
+
+        stamp = raw
+        if isinstance(raw, str):
+            try:
+                stamp = datetime.fromisoformat(raw.replace("Z", "+00:00"))
+            except ValueError:
+                return None
+        return stamp if stamp.tzinfo is not None else stamp.replace(tzinfo=timezone.utc)
+
+    @staticmethod
+    def _composite_doc_id(drive_id: Any, drive_item: Any) -> str:
+        """Build the document id as ``<drive id>:<item id>``.
+
+        Item ids are treated as unique only within one drive; because a site
+        can expose several drives (document libraries), prefixing the drive id
+        keeps document identifiers distinct across the whole site.
+        """
+        item_id = drive_item.id
+        return f"{drive_id}:{item_id}"
+
+    def _drive_item_to_document(self, drive_item: Any, drive_id: Any, drive_name: str) -> Document:
+        """Download one driveItem and wrap it as a blob-based ``Document``.
+
+        The document id is the drive-scoped composite id; the drive name, drive
+        id, item id and (when present) web URL are recorded in the metadata.
+        """
+        display_name = drive_item.name or str(drive_item.id)
+
+        # Fetch the content; a text payload is normalised to UTF-8 bytes.
+        payload = drive_item.get_content().execute_query().value or b""
+        if isinstance(payload, str):
+            payload = payload.encode("utf-8")
+
+        # The extension is everything after the last dot, dot included.
+        _, dot, suffix = display_name.rpartition(".")
+        file_ext = dot + suffix if dot else ""
+
+        # Prefer the size reported in the item properties, else measure the blob.
+        reported_size = getattr(drive_item, "properties", {}).get("size")
+        byte_count = reported_size if reported_size else len(payload)
+
+        # Items without a usable modification time are stamped with "now" (UTC).
+        updated_at = self._modified_dt(drive_item) or datetime.now(timezone.utc)
+
+        doc_metadata = {
+            "drive": drive_name,
+            "drive_id": str(drive_id),
+            "drive_item_id": str(drive_item.id),
+        }
+        link = getattr(drive_item, "web_url", None)
+        if link:
+            doc_metadata["web_url"] = link
+
+        return Document(
+            id=self._composite_doc_id(drive_id, drive_item),
+            source="sharepoint",
+            semantic_identifier=display_name,
+            extension=file_ext,
+            blob=payload,
+            size_bytes=int(byte_count),
+            doc_updated_at=updated_at,
+            metadata=doc_metadata,
+        )
+
+    def _generate_documents(
+        self,
+        start: SecondsSinceUnixEpoch,
+        end: SecondsSinceUnixEpoch,
+    ) -> Generator[Document | ConnectorFailure, None, None]:
+        """Yield a ``Document`` (or a ``ConnectorFailure``) per file in the site.
+
+        Every drive of the configured site is walked. A file whose modification
+        time is known is emitted only when ``start < modified <= end``; a file
+        with no usable modification time is always emitted. Any exception while
+        filtering or downloading a single file is logged and yielded as a
+        ``ConnectorFailure`` so the remaining files are still processed.
+
+        Raises:
+            ConnectorMissingCredentialError: if credentials were not loaded.
+        """
+        if self.graph_client is None or not self._site_url:
+            raise ConnectorMissingCredentialError("SharePoint")
+
+        for drive in self._iter_drives():
+            # Fall back to the raw properties mapping when the attribute is
+            # missing or empty.
+            drive_name = getattr(drive, "name", None) or getattr(drive, "properties", {}).get("name", "")
+            drive_id = getattr(drive, "id", None) or getattr(drive, "properties", {}).get("id", "")
+            for drive_item in self._walk_files(drive.root):
+                try:
+                    modified = self._modified_dt(drive_item)
+                    if modified is not None:
+                        ts = modified.timestamp()
+                        # start is an exclusive lower bound; full reindex passes start=0.
+                        if not (start < ts <= end):
+                            continue
+                    yield self._drive_item_to_document(drive_item, drive_id, drive_name)
+                except Exception as e:
+                    # Per-file failure: report it and keep going with the next file.
+                    logging.exception("SharePoint failed to process drive item")
+                    yield ConnectorFailure(
+                        failed_document=DocumentFailure(
+                            # Use the composite id when the item exposes an id at all.
+                            document_id=self._composite_doc_id(drive_id, drive_item)
+                            if getattr(drive_item, "id", None) is not None
+                            else "unknown",
+                            document_link=getattr(drive_item, "web_url", "") or "",
+                        ),
+                        failure_message=str(e),
+                        exception=e,
+                    )
+
+    # -- checkpointed connector interface ------------------------------------

    def load_from_checkpoint(
        self,
        start: SecondsSinceUnixEpoch,
        end: SecondsSinceUnixEpoch,
        checkpoint: ConnectorCheckpoint,
-    ) -> Any:
-        """Load documents from checkpoint"""
-        # Simplified implementation
-        return []
+    ) -> Generator[Document | ConnectorFailure, None, ConnectorCheckpoint]:
+        """Yield every file under the site as a Document, then finish.
+
+        The whole library is enumerated in a single pass, so the returned
+        checkpoint always has ``has_more=False``.
+        """
+        yield from self._generate_documents(start, end)
+        return ConnectorCheckpoint(has_more=False)

    def load_from_checkpoint_with_perm_sync(
        self,
        start: SecondsSinceUnixEpoch,
        end: SecondsSinceUnixEpoch,
        checkpoint: ConnectorCheckpoint,
-    ) -> Any:
-        """Load documents from checkpoint with permission sync"""
-        # Simplified implementation
-        return []
+    ) -> Generator[Document | ConnectorFailure, None, ConnectorCheckpoint]:
+        """Permission-aware variant.
+
+        SharePoint ACL -> ExternalAccess mapping is not yet wired through the
+        sync pipeline (the pipeline does not persist ExternalAccess), so this
+        currently yields the same documents as ``load_from_checkpoint``.
+        """
+        return self.load_from_checkpoint(start, end, checkpoint)

    def build_dummy_checkpoint(self) -> ConnectorCheckpoint:
-        """Build dummy checkpoint"""
-        return ConnectorCheckpoint()
+        return ConnectorCheckpoint(has_more=True)

    def validate_checkpoint_json(self, checkpoint_json: str) -> ConnectorCheckpoint:
-        """Validate checkpoint JSON"""
-        # Simplified implementation
-        return ConnectorCheckpoint()
+        return ConnectorCheckpoint(has_more=True)

    def retrieve_all_slim_docs_perm_sync(
        self,
        callback: Any = None,
-    ) -> Any:
-        """Retrieve all simplified documents with permission sync"""
-        # Simplified implementation
-        return []
+    ) -> Generator[list[SlimDocument], None, None]:
+        """Yield batches of slim documents (ids only) for prune/permission sync."""
+        if self.graph_client is None or not self._site_url:
+            raise ConnectorMissingCredentialError("SharePoint")
+
+        batch: list[SlimDocument] = []
+        for drive in self._iter_drives():
+            drive_id = getattr(drive, "id", None) or getattr(drive, "properties", {}).get("id", "")
+            for drive_item in self._walk_files(drive.root):
+                batch.append(SlimDocument(id=self._composite_doc_id(drive_id, drive_item)))
+                if len(batch) >= self.batch_size:
+                    yield batch
+                    batch = []
+        if batch:
+            yield batch
--- a/rag/svr/sync_data_source.py
+++ b/rag/svr/sync_data_source.py
@@ -61,6 +61,7 @@ from common.data_source import (
    RDBMSConnector,
    DingTalkAITableConnector,
    RestAPIConnector,
+    SharePointConnector,
 )
 from common.data_source.models import ConnectorFailure, SeafileSyncScope
 from common.data_source.webdav_connector import WebDAVConnector
@@ -932,7 +933,66 @@ class SharePoint(SyncBase):
    SOURCE_NAME: str = FileSource.SHAREPOINT

    async def _generate(self, task: dict):
-        pass
+        """Build a generator of SharePoint document batches for one sync task.
+
+        Loads and validates the configured credentials, chooses the time
+        window (from 0 on a full reindex or when no ``poll_range_start`` is
+        set, otherwise from ``task["poll_range_start"]``; always up to "now"),
+        and returns a generator that yields lists of documents. Connector
+        failures are logged as warnings and skipped.
+        """
+        self.connector = SharePointConnector(
+            batch_size=self.conf.get("batch_size", INDEX_BATCH_SIZE),
+        )
+
+        credentials = self.conf.get("credentials") or {}
+        self.connector.load_credentials(credentials)
+        self.connector.validate_connector_settings()
+
+        if task["reindex"] == "1" or not task["poll_range_start"]:
+            start_time = 0.0
+            _begin_info = "totally"
+        else:
+            start_time = task["poll_range_start"].timestamp()
+            _begin_info = f"from {task['poll_range_start']}"
+
+        end_time = datetime.now(timezone.utc).timestamp()
+
+        # Accept a string or missing batch size; anything unusable falls back
+        # to the default.
+        raw_batch_size = self.conf.get("sync_batch_size") or self.conf.get("batch_size") or INDEX_BATCH_SIZE
+        try:
+            batch_size = int(raw_batch_size)
+        except (TypeError, ValueError):
+            batch_size = INDEX_BATCH_SIZE
+        if batch_size <= 0:
+            batch_size = INDEX_BATCH_SIZE
+
+        def document_batches():
+            """Drain the connector checkpoint loop, yielding document lists."""
+            checkpoint = self.connector.build_dummy_checkpoint()
+            pending_docs = []
+            iterations = 0
+            iteration_limit = 100_000
+
+            while checkpoint.has_more:
+                wrapper = CheckpointOutputWrapper()
+                doc_generator = wrapper(
+                    self.connector.load_from_checkpoint(start_time, end_time, checkpoint)
+                )
+                for document, failure, next_checkpoint in doc_generator:
+                    if failure is not None:
+                        logging.warning(
+                            "SharePoint connector failure: %s",
+                            getattr(failure, "failure_message", failure),
+                        )
+                        continue
+                    if document is not None:
+                        pending_docs.append(document)
+                        if len(pending_docs) >= batch_size:
+                            yield pending_docs
+                            pending_docs = []
+                    if next_checkpoint is not None:
+                        checkpoint = next_checkpoint
+
+                # Guard against a connector that never clears has_more.
+                iterations += 1
+                if iterations > iteration_limit:
+                    raise RuntimeError("Too many iterations while loading SharePoint documents.")
+
+            if pending_docs:
+                yield pending_docs
+
+        # Reuse the normalised credentials dict: the raw config value may be
+        # None, in which case chaining .get() on it would raise.
+        self.log_connection("SharePoint", credentials.get("site_url", ""), task)
+        return document_batches()


 class Slack(SyncBase):
--- a/test/unit_test/data_source/test_sharepoint_connector_unit.py
+++ b/test/unit_test/data_source/test_sharepoint_connector_unit.py
@@ -0,0 +1,284 @@
+#
+#  Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import importlib.util
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+from types import ModuleType
+
+import pytest
+
+
+def _load_sharepoint_connector_module():
+    """Load sharepoint_connector.py in isolation (avoid the package __init__)."""
+    repo_root = Path(__file__).resolve().parents[3]
+    package_name = "common.data_source"
+    # Remember every already-imported module of the package so the finally
+    # block can put sys.modules back exactly as it was.
+    saved_modules = {
+        name: module
+        for name, module in sys.modules.items()
+        if name == package_name or name.startswith(f"{package_name}.")
+    }
+    # Install an empty stand-in package whose __path__ points at the real
+    # directory, so the package's own __init__ is not executed.
+    package_stub = ModuleType(package_name)
+    package_stub.__path__ = [str(repo_root / "common" / "data_source")]
+    sys.modules[package_name] = package_stub
+
+    try:
+        # Execute the connector file under a private module name.
+        spec = importlib.util.spec_from_file_location(
+            "_sharepoint_connector_under_test",
+            repo_root / "common" / "data_source" / "sharepoint_connector.py",
+        )
+        module = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(module)
+        return module
+    finally:
+        # Restore previously imported entries and drop anything this load added
+        # (including the stub) under the package namespace.
+        for name in list(sys.modules):
+            if name == package_name or name.startswith(f"{package_name}."):
+                if name in saved_modules:
+                    sys.modules[name] = saved_modules[name]
+                else:
+                    sys.modules.pop(name, None)
+
+
+sharepoint_connector = _load_sharepoint_connector_module()
+SharePointConnector = sharepoint_connector.SharePointConnector
+
+
+# --- fakes for the office365 fluent API ------------------------------------
+
+
+class _Query:
+    """Mimics the `.get()` / `.get_by_url()` -> `.execute_query()` chain."""
+
+    def __init__(self, value):
+        self._value = value
+
+    def execute_query(self):
+        return self._value
+
+
+class _Content:
+    def __init__(self, value: bytes):
+        self.value = value
+
+    def execute_query(self):
+        return self
+
+
+class _FakeDriveItem:
+    def __init__(self, item_id, name=None, content=None, modified=None, children=None, size=None):
+        self.id = item_id
+        self.name = name
+        self.web_url = f"https://contoso.sharepoint.com/{item_id}"
+        self.last_modified_datetime = modified
+        self._content = content
+        self._children = children or []
+        self.properties = {}
+        if children is not None:
+            self.properties["folder"] = {"childCount": len(children)}
+        else:
+            self.properties["file"] = {"mimeType": "text/plain"}
+        if size is not None:
+            self.properties["size"] = size
+
+    @property
+    def children(self):
+        return _FakeDrivesAccessor(self._children)
+
+    def get_content(self):
+        return _Content(self._content)
+
+
+class _FakeDrive:
+    def __init__(self, name, root, drive_id=None):
+        self.name = name
+        self.root = root
+        self.id = drive_id or f"drive-{name}"
+        self.properties = {"name": name, "id": self.id}
+
+
+class _FakeDrivesAccessor:
+    def __init__(self, drives):
+        self._drives = drives
+
+    def get(self):
+        return _Query(self._drives)
+
+
+class _FakeSite:
+    def __init__(self, drives):
+        self.drives = _FakeDrivesAccessor(drives)
+
+    def __bool__(self):
+        return True
+
+
+class _FakeSitesAccessor:
+    def __init__(self, site):
+        self._site = site
+
+    def get_by_url(self, url):
+        return _Query(self._site)
+
+
+class _FakeGraphClient:
+    def __init__(self, site):
+        self.sites = _FakeSitesAccessor(site)
+
+
+def _build_connector_with_tree():
+    jan = datetime(2026, 1, 1, 12, tzinfo=timezone.utc)
+    feb = datetime(2026, 2, 1, 12, tzinfo=timezone.utc)
+
+    readme = _FakeDriveItem("f1", "readme.txt", b"hello sharepoint", jan, size=16)
+    nested = _FakeDriveItem("f2", "report.md", b"# Report", feb, size=8)
+    subfolder = _FakeDriveItem("d2", "sub", children=[nested])
+    root = _FakeDriveItem("d1", "root", children=[readme, subfolder])
+    drive = _FakeDrive("Documents", root, drive_id="drv-A")
+    site = _FakeSite([drive])
+
+    connector = SharePointConnector(batch_size=10)
+    connector.graph_client = _FakeGraphClient(site)
+    connector._site_url = "https://contoso.sharepoint.com/sites/MySite"
+    return connector, jan, feb
+
+
+# --- credential loading -----------------------------------------------------
+
+
+def test_load_credentials_incomplete_raises():
+    connector = SharePointConnector()
+    with pytest.raises(sharepoint_connector.ConnectorMissingCredentialError):
+        connector.load_credentials({"tenant_id": "t", "client_id": "c"})
+
+
+def test_load_credentials_sets_graph_client(monkeypatch):
+    """Complete credentials install a Graph client driven by a lazy token callback."""
+    captured = {}
+
+    class _FakeApp:
+        def __init__(self, **kwargs):
+            captured.update(kwargs)
+
+        def acquire_token_for_client(self, scopes):
+            return {"access_token": "tok"}
+
+    monkeypatch.setattr(sharepoint_connector.msal, "ConfidentialClientApplication", _FakeApp)
+    monkeypatch.setattr(sharepoint_connector, "GraphClient", lambda token_callback: ("client", token_callback))
+
+    connector = SharePointConnector()
+    result = connector.load_credentials(
+        {
+            "tenant_id": "tenant",
+            "client_id": "client",
+            "client_secret": "secret",
+            "site_url": "https://contoso.sharepoint.com/sites/MySite",
+        }
+    )
+
+    assert result is None
+    assert connector._site_url == "https://contoso.sharepoint.com/sites/MySite"
+
+    # The patched GraphClient returns a tuple carrying the callback it was given.
+    marker, token_callback = connector.graph_client
+    assert marker == "client"
+
+    # Credential loading is lazy: no MSAL application exists until the
+    # callback is actually invoked.
+    assert captured == {}
+
+    assert token_callback() == {"access_token": "tok"}
+    assert captured["client_id"] == "client"
+    assert captured["client_credential"] == "secret"
+    assert captured["authority"] == "https://login.microsoftonline.com/tenant"
+
+
+def test_fetch_without_credentials_raises():
+    connector = SharePointConnector()
+    with pytest.raises(sharepoint_connector.ConnectorMissingCredentialError):
+        list(connector.load_from_checkpoint(0.0, 9e12, connector.build_dummy_checkpoint()))
+
+
+# --- document generation ----------------------------------------------------
+
+
+def _collect(generator):
+    """Exhaust a checkpoint generator and return (yielded items, return value)."""
+    collected = []
+    while True:
+        try:
+            item = next(generator)
+        except StopIteration as finished:
+            # The generator's ``return`` value travels on StopIteration.value.
+            return collected, finished.value
+        collected.append(item)
+
+
+def test_load_from_checkpoint_walks_libraries_and_downloads():
+    connector, _jan, _feb = _build_connector_with_tree()
+
+    docs, checkpoint = _collect(
+        connector.load_from_checkpoint(0.0, 9e12, connector.build_dummy_checkpoint())
+    )
+
+    assert checkpoint.has_more is False
+    assert {doc.id for doc in docs} == {"drv-A:f1", "drv-A:f2"}
+
+    by_id = {doc.id: doc for doc in docs}
+    assert by_id["drv-A:f1"].blob == b"hello sharepoint"
+    assert by_id["drv-A:f1"].extension == ".txt"
+    assert by_id["drv-A:f1"].size_bytes == 16
+    assert by_id["drv-A:f1"].source == "sharepoint"
+    assert by_id["drv-A:f1"].metadata["drive"] == "Documents"
+    assert by_id["drv-A:f1"].metadata["drive_id"] == "drv-A"
+    assert by_id["drv-A:f1"].metadata["drive_item_id"] == "f1"
+    assert by_id["drv-A:f2"].semantic_identifier == "report.md"
+    assert by_id["drv-A:f2"].extension == ".md"
+
+
+def test_load_from_checkpoint_filters_by_modified_window():
+    connector, _jan, feb = _build_connector_with_tree()
+
+    # Only include files modified strictly after mid-January -> just report.md (Feb).
+    start = datetime(2026, 1, 15, tzinfo=timezone.utc).timestamp()
+    end = datetime(2026, 3, 1, tzinfo=timezone.utc).timestamp()
+
+    docs, _ = _collect(
+        connector.load_from_checkpoint(start, end, connector.build_dummy_checkpoint())
+    )
+
+    assert [doc.id for doc in docs] == ["drv-A:f2"]
+
+
+def test_retrieve_all_slim_docs_lists_ids_without_download():
+    connector, _jan, _feb = _build_connector_with_tree()
+
+    batches = list(connector.retrieve_all_slim_docs_perm_sync())
+    ids = [doc.id for batch in batches for doc in batch]
+
+    assert sorted(ids) == ["drv-A:f1", "drv-A:f2"]
+
+
+def test_document_ids_are_unique_across_drives_with_colliding_item_ids():
+    # Graph driveItem IDs are unique only within a single drive; two libraries
+    # under the same site can legitimately yield items with identical IDs.
+    jan = datetime(2026, 1, 1, 12, tzinfo=timezone.utc)
+
+    file_a = _FakeDriveItem("same-id", "a.txt", b"A", jan, size=1)
+    root_a = _FakeDriveItem("rootA", "root", children=[file_a])
+    drive_a = _FakeDrive("LibraryA", root_a, drive_id="drv-A")
+
+    file_b = _FakeDriveItem("same-id", "b.txt", b"B", jan, size=1)
+    root_b = _FakeDriveItem("rootB", "root", children=[file_b])
+    drive_b = _FakeDrive("LibraryB", root_b, drive_id="drv-B")
+
+    site = _FakeSite([drive_a, drive_b])
+    connector = SharePointConnector(batch_size=10)
+    connector.graph_client = _FakeGraphClient(site)
+    connector._site_url = "https://contoso.sharepoint.com/sites/MySite"
+
+    docs, _ = _collect(
+        connector.load_from_checkpoint(0.0, 9e12, connector.build_dummy_checkpoint())
+    )
+    ids = {doc.id for doc in docs}
+    assert ids == {"drv-A:same-id", "drv-B:same-id"}
+
+    slim_ids = [doc.id for batch in connector.retrieve_all_slim_docs_perm_sync() for doc in batch]
+    assert sorted(slim_ids) == ["drv-A:same-id", "drv-B:same-id"]
--- a/web/src/assets/svg/data-source/sharepoint.svg
+++ b/web/src/assets/svg/data-source/sharepoint.svg
@@ -0,0 +1,9 @@
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 48 48" width="48" height="48">
+  <circle cx="20" cy="14" r="11" fill="#036C70"/>
+  <circle cx="31" cy="25" r="10" fill="#1A9BA1"/>
+  <circle cx="22.5" cy="36.5" r="8.5" fill="#37C6D0"/>
+  <path d="M22 11H10.5A1.5 1.5 0 0 0 9 12.5v18A1.5 1.5 0 0 0 10.5 32H22a1.5 1.5 0 0 0 1.5-1.5v-18A1.5 1.5 0 0 0 22 11z" opacity=".1"/>
+  <path d="M21 12H10.5A1.5 1.5 0 0 0 9 13.5v18A1.5 1.5 0 0 0 10.5 33H21a1.5 1.5 0 0 0 1.5-1.5v-18A1.5 1.5 0 0 0 21 12z" opacity=".2"/>
+  <rect x="2" y="14" width="22" height="22" rx="1.5" fill="#03787C"/>
+  <path d="M10.2 24.6c-.5-.3-.9-.7-1.2-1.1a2.6 2.6 0 0 1-.4-1.5c0-.8.3-1.5.9-2 .6-.5 1.4-.8 2.5-.8 1 0 1.8.1 2.5.4v1.9a4 4 0 0 0-2.3-.6c-.4 0-.8.1-1 .3-.3.1-.4.4-.4.7 0 .2.1.4.3.6.2.2.6.4 1.2.6.9.3 1.6.7 2 1.1.4.4.6 1 .6 1.6 0 .9-.3 1.5-.9 2-.6.5-1.5.7-2.6.7-.5 0-1-.1-1.5-.2a4 4 0 0 1-1.1-.4v-2c.4.3.8.5 1.3.7.5.2.9.2 1.3.2.5 0 .8-.1 1-.3.2-.1.3-.4.3-.7 0-.3-.1-.5-.4-.7-.2-.2-.7-.5-1.4-.8z" fill="#fff"/>
+</svg>
--- a/web/src/locales/en.ts
+++ b/web/src/locales/en.ts
@@ -1238,6 +1238,10 @@ Example: Virtual Hosted Style`,
        'Upload the OAuth JSON generated from Google Console. If it only contains client credentials, run the browser-based verification once to mint long-lived refresh tokens.',
      dropboxDescription:
        'Connect your Dropbox to sync files and folders from a chosen account.',
+      sharepointDescription:
+        'Connect a SharePoint site via Microsoft Graph to sync its document libraries.',
+      sharepointSiteUrlTip:
+        'Full URL of the SharePoint site to index, e.g. https://contoso.sharepoint.com/sites/MySite. Requires an Azure AD app with Sites.Read.All and Files.Read.All application permissions (admin consent).',
      bitbucketDescription: 'Connect Bitbucket to sync PR content.',
      bitbucketTopWorkspaceTip:
        'The Bitbucket workspace to index (e.g., "atlassian" from https://bitbucket.org/atlassian/workspace ).',
--- a/web/src/locales/zh.ts
+++ b/web/src/locales/zh.ts
@@ -1099,6 +1099,9 @@ NER：使用 spaCy NER 和基于规则的关键词提取来抽取实体和关系
      gmailTokenTip:
        '请上传由 Google Console 生成的 OAuth JSON。如果仅包含 client credentials，请通过浏览器授权一次以获取长期有效的刷新 Token。',
      dropboxDescription: '连接 Dropbox，同步指定账号下的文件与文件夹。',
+      sharepointDescription: '通过 Microsoft Graph 连接 SharePoint 站点，同步其文档库。',
+      sharepointSiteUrlTip:
+        '要索引的 SharePoint 站点完整 URL，例如 https://contoso.sharepoint.com/sites/MySite。需要具备 Sites.Read.All 与 Files.Read.All 应用权限（管理员同意）的 Azure AD 应用。',
      boxDescription: '连接你的 Box 云盘以同步文件和文件夹。',
      bitbucketDescription: '连接 Bitbucket，同步 PR 内容。',
      bitbucketTopWorkspaceTip:
--- a/web/src/pages/user-setting/data-source/constant/index.tsx
+++ b/web/src/pages/user-setting/data-source/constant/index.tsx
@@ -43,8 +43,8 @@ export enum DataSourceKey {
  POSTGRESQL = 'postgresql',
  REST_API = 'rest_api',
  RSS = 'rss',
+  SHAREPOINT = 'sharepoint',

-  //   SHAREPOINT = 'sharepoint',
  //   SLACK = 'slack',
  //   TEAMS = 'teams',
 }
@@ -213,6 +213,11 @@ export const generateDataSourceInfo = (t: TFunction) => {
      description: t(`setting.${DataSourceKey.MOODLE}Description`),
      icon: <SvgIcon name={'data-source/moodle'} width={38} />,
    },
+    [DataSourceKey.SHAREPOINT]: {
+      name: 'SharePoint',
+      description: t(`setting.${DataSourceKey.SHAREPOINT}Description`),
+      icon: <SvgIcon name={'data-source/sharepoint'} width={38} />,
+    },
    [DataSourceKey.JIRA]: {
      name: 'Jira',
      description: t(`setting.${DataSourceKey.JIRA}Description`),
@@ -654,6 +659,34 @@ export const DataSourceFormFields = {
      required: true,
    },
  ],
+  [DataSourceKey.SHAREPOINT]: [
+    {
+      label: 'Site URL',
+      name: 'config.credentials.site_url',
+      type: FormFieldType.Text,
+      required: true,
+      placeholder: 'https://contoso.sharepoint.com/sites/MySite',
+      tooltip: t('setting.sharepointSiteUrlTip'),
+    },
+    {
+      label: 'Tenant ID',
+      name: 'config.credentials.tenant_id',
+      type: FormFieldType.Text,
+      required: true,
+    },
+    {
+      label: 'Client ID',
+      name: 'config.credentials.client_id',
+      type: FormFieldType.Text,
+      required: true,
+    },
+    {
+      label: 'Client Secret',
+      name: 'config.credentials.client_secret',
+      type: FormFieldType.Password,
+      required: true,
+    },
+  ],
  [DataSourceKey.JIRA]: jiraConstant(t),
  [DataSourceKey.WEBDAV]: [
    {
@@ -1509,6 +1542,18 @@ export const DataSourceFormDefaultValues = {
      },
    },
  },
+  [DataSourceKey.SHAREPOINT]: {
+    name: '',
+    source: DataSourceKey.SHAREPOINT,
+    config: {
+      credentials: {
+        site_url: '',
+        tenant_id: '',
+        client_id: '',
+        client_secret: '',
+      },
+    },
+  },
  [DataSourceKey.JIRA]: {
    name: '',
    source: DataSourceKey.JIRA,