diff --git a/common/data_source/sharepoint_connector.py b/common/data_source/sharepoint_connector.py
index e5684023c1..ab3384d702 100644
--- a/common/data_source/sharepoint_connector.py
+++ b/common/data_source/sharepoint_connector.py
@@ -1,119 +1,270 @@
-"""SharePoint connector"""
+"""SharePoint connector
+
+Ingests files from SharePoint document libraries via the Microsoft Graph API
+(Office365-REST-Python-Client). Authentication uses MSAL client-credentials
+(app-only) flow, so it requires an Azure AD app with the ``Sites.Read.All`` and
+``Files.Read.All`` application permissions (admin-consented).
+
+The connector implements the checkpointed-connector interface used by the sync
+worker: ``load_from_checkpoint`` walks every document library under the
+configured site, downloads each file, and yields blob-based ``Document``
+objects. Incremental syncs are bounded by the file ``lastModifiedDateTime``.
+"""
+
+import logging
+from datetime import datetime, timezone
+from typing import Any, Generator
-from typing import Any
import msal
from office365.graph_client import GraphClient
-from office365.runtime.client_request import ClientRequestException
-from office365.sharepoint.client_context import ClientContext
from common.data_source.config import INDEX_BATCH_SIZE
-from common.data_source.exceptions import ConnectorValidationError, ConnectorMissingCredentialError
+from common.data_source.exceptions import (
+ ConnectorMissingCredentialError,
+ ConnectorValidationError,
+)
from common.data_source.interfaces import (
CheckpointedConnectorWithPermSync,
SecondsSinceUnixEpoch,
- SlimConnectorWithPermSync
+ SlimConnectorWithPermSync,
)
from common.data_source.models import (
- ConnectorCheckpoint
+ ConnectorCheckpoint,
+ ConnectorFailure,
+ Document,
+ DocumentFailure,
+ SlimDocument,
)
+GRAPH_SCOPES = ["https://graph.microsoft.com/.default"]
+
class SharePointConnector(CheckpointedConnectorWithPermSync, SlimConnectorWithPermSync):
- """SharePoint connector for accessing SharePoint sites and documents"""
+ """SharePoint connector for accessing SharePoint sites and documents."""
def __init__(self, batch_size: int = INDEX_BATCH_SIZE) -> None:
self.batch_size = batch_size
- self.sharepoint_client = None
- self.graph_client = None
+ self.graph_client: GraphClient | None = None
+ self._site_url: str | None = None
+
+ # -- credentials ---------------------------------------------------------
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
- """Load SharePoint credentials"""
- try:
- tenant_id = credentials.get("tenant_id")
- client_id = credentials.get("client_id")
- client_secret = credentials.get("client_secret")
- site_url = credentials.get("site_url")
-
- if not all([tenant_id, client_id, client_secret, site_url]):
- raise ConnectorMissingCredentialError("SharePoint credentials are incomplete")
-
- # Create MSAL confidential client
+ """Configure a Microsoft Graph client from app-only credentials.
+
+ The token is acquired lazily through a callback (the way
+ ``GraphClient`` expects it), so this method performs no network call;
+ the first real request triggers ``acquire_token_for_client``.
+ """
+ tenant_id = credentials.get("tenant_id")
+ client_id = credentials.get("client_id")
+ client_secret = credentials.get("client_secret")
+ site_url = credentials.get("site_url")
+
+ if not all([tenant_id, client_id, client_secret, site_url]):
+ raise ConnectorMissingCredentialError("SharePoint credentials are incomplete")
+
+ self._site_url = site_url
+ authority = f"https://login.microsoftonline.com/{tenant_id}"
+
+ def _acquire_token() -> dict[str, Any]:
app = msal.ConfidentialClientApplication(
client_id=client_id,
client_credential=client_secret,
- authority=f"https://login.microsoftonline.com/{tenant_id}"
+ authority=authority,
)
-
- # Get access token
- result = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
-
- if "access_token" not in result:
- raise ConnectorMissingCredentialError("Failed to acquire SharePoint access token")
-
- # Create Graph client
- self.graph_client = GraphClient(result["access_token"])
-
- # Create SharePoint client context
- self.sharepoint_client = ClientContext(site_url).with_access_token(result["access_token"])
-
- return None
- except Exception as e:
- raise ConnectorMissingCredentialError(f"SharePoint: {e}")
+ token = app.acquire_token_for_client(scopes=GRAPH_SCOPES)
+ if "access_token" not in token:
+ detail = token.get("error_description") or token.get("error") or token
+ raise ConnectorMissingCredentialError(
+ f"Failed to acquire SharePoint access token: {detail}"
+ )
+ return token
+
+ self.graph_client = GraphClient(_acquire_token)
+ return None
def validate_connector_settings(self) -> None:
- """Validate SharePoint connector settings"""
- if not self.sharepoint_client or not self.graph_client:
+ """Validate credentials by resolving the configured site."""
+ if self.graph_client is None or not self._site_url:
raise ConnectorMissingCredentialError("SharePoint")
-
+
try:
- # Test connection by getting site info
- site = self.sharepoint_client.site.get().execute_query()
+ site = self.graph_client.sites.get_by_url(self._site_url).execute_query()
if not site:
raise ConnectorValidationError("Failed to access SharePoint site")
- except ClientRequestException as e:
- if "401" in str(e) or "403" in str(e):
- raise ConnectorValidationError("Invalid credentials or insufficient permissions")
- else:
- raise ConnectorValidationError(f"SharePoint validation error: {e}")
+ except ConnectorValidationError:
+ raise
+ except Exception as e:
+ message = str(e)
+ if "401" in message or "403" in message:
+ raise ConnectorValidationError(
+ "Invalid credentials or insufficient permissions for SharePoint"
+ )
+ raise ConnectorValidationError(f"SharePoint validation error: {e}")
- def poll_source(self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch) -> Any:
- """Poll SharePoint for recent documents"""
- # Simplified implementation - in production this would handle actual polling
- return []
+ # -- traversal helpers ---------------------------------------------------
+
+ def _iter_drives(self):
+ site = self.graph_client.sites.get_by_url(self._site_url).execute_query()
+ return site.drives.get().execute_query()
+
+ @staticmethod
+ def _is_folder(drive_item: Any) -> bool:
+ return "folder" in getattr(drive_item, "properties", {})
+
+ def _walk_files(self, root_item: Any) -> Generator[Any, None, None]:
+ """Depth-first walk of a drive yielding file (non-folder) driveItems."""
+ stack = [root_item]
+ while stack:
+ folder = stack.pop()
+ children = folder.children.get().execute_query()
+ for child in children:
+ if self._is_folder(child):
+ stack.append(child)
+ else:
+ yield child
+
+ @staticmethod
+ def _modified_dt(drive_item: Any) -> datetime | None:
+ value = getattr(drive_item, "last_modified_datetime", None)
+ if value is None:
+ value = getattr(drive_item, "properties", {}).get("lastModifiedDateTime")
+ if value is None:
+ return None
+ if isinstance(value, str):
+ try:
+ value = datetime.fromisoformat(value.replace("Z", "+00:00"))
+ except ValueError:
+ return None
+ if value.tzinfo is None:
+ value = value.replace(tzinfo=timezone.utc)
+ return value
+
+ @staticmethod
+ def _composite_doc_id(drive_id: Any, drive_item: Any) -> str:
+ # Graph driveItem IDs are only unique within a single drive. A site can
+ # expose multiple document libraries (drives), so we namespace the item
+ # ID by drive ID to keep document identifiers globally unique.
+ return f"{drive_id}:{drive_item.id}"
+
+ def _drive_item_to_document(self, drive_item: Any, drive_id: Any, drive_name: str) -> Document:
+ name = drive_item.name or str(drive_item.id)
+ content_result = drive_item.get_content().execute_query()
+ blob = content_result.value or b""
+ if isinstance(blob, str):
+ blob = blob.encode("utf-8")
+
+ extension = ""
+ if "." in name:
+ extension = "." + name.rsplit(".", 1)[1]
+
+ size_bytes = getattr(drive_item, "properties", {}).get("size")
+ if not size_bytes:
+ size_bytes = len(blob)
+
+ modified = self._modified_dt(drive_item) or datetime.now(timezone.utc)
+
+ metadata = {"drive": drive_name, "drive_id": str(drive_id), "drive_item_id": str(drive_item.id)}
+ web_url = getattr(drive_item, "web_url", None)
+ if web_url:
+ metadata["web_url"] = web_url
+
+ return Document(
+ id=self._composite_doc_id(drive_id, drive_item),
+ source="sharepoint",
+ semantic_identifier=name,
+ extension=extension,
+ blob=blob,
+ size_bytes=int(size_bytes),
+ doc_updated_at=modified,
+ metadata=metadata,
+ )
+
+ def _generate_documents(
+ self,
+ start: SecondsSinceUnixEpoch,
+ end: SecondsSinceUnixEpoch,
+ ) -> Generator[Document | ConnectorFailure, None, None]:
+ if self.graph_client is None or not self._site_url:
+ raise ConnectorMissingCredentialError("SharePoint")
+
+ for drive in self._iter_drives():
+ drive_name = getattr(drive, "name", None) or getattr(drive, "properties", {}).get("name", "")
+ drive_id = getattr(drive, "id", None) or getattr(drive, "properties", {}).get("id", "")
+ for drive_item in self._walk_files(drive.root):
+ try:
+ modified = self._modified_dt(drive_item)
+ if modified is not None:
+ ts = modified.timestamp()
+ # start is an exclusive lower bound; full reindex passes start=0.
+ if not (start < ts <= end):
+ continue
+ yield self._drive_item_to_document(drive_item, drive_id, drive_name)
+ except Exception as e:
+ logging.exception("SharePoint failed to process drive item")
+ yield ConnectorFailure(
+ failed_document=DocumentFailure(
+ document_id=self._composite_doc_id(drive_id, drive_item)
+ if getattr(drive_item, "id", None) is not None
+ else "unknown",
+ document_link=getattr(drive_item, "web_url", "") or "",
+ ),
+ failure_message=str(e),
+ exception=e,
+ )
+
+ # -- checkpointed connector interface ------------------------------------
def load_from_checkpoint(
self,
start: SecondsSinceUnixEpoch,
end: SecondsSinceUnixEpoch,
checkpoint: ConnectorCheckpoint,
- ) -> Any:
- """Load documents from checkpoint"""
- # Simplified implementation
- return []
+ ) -> Generator[Document | ConnectorFailure, None, ConnectorCheckpoint]:
+ """Yield site files as Documents, skipping those modified outside ``(start, end]``.
+
+ Every document library is enumerated in a single pass, so the returned
+ checkpoint always has ``has_more=False``.
+ """
+ yield from self._generate_documents(start, end)
+ return ConnectorCheckpoint(has_more=False)
def load_from_checkpoint_with_perm_sync(
self,
start: SecondsSinceUnixEpoch,
end: SecondsSinceUnixEpoch,
checkpoint: ConnectorCheckpoint,
- ) -> Any:
- """Load documents from checkpoint with permission sync"""
- # Simplified implementation
- return []
+ ) -> Generator[Document | ConnectorFailure, None, ConnectorCheckpoint]:
+ """Permission-sync entry point (currently carries no ACL data).
+
+ SharePoint ACL -> ExternalAccess mapping is not yet wired through the
+ sync pipeline (the pipeline does not persist ExternalAccess), so this
+ currently yields the same documents as ``load_from_checkpoint``.
+ """
+ return self.load_from_checkpoint(start, end, checkpoint)
def build_dummy_checkpoint(self) -> ConnectorCheckpoint:
- """Build dummy checkpoint"""
- return ConnectorCheckpoint()
+ return ConnectorCheckpoint(has_more=True)
def validate_checkpoint_json(self, checkpoint_json: str) -> ConnectorCheckpoint:
- """Validate checkpoint JSON"""
- # Simplified implementation
- return ConnectorCheckpoint()
+ return ConnectorCheckpoint(has_more=True)
def retrieve_all_slim_docs_perm_sync(
self,
callback: Any = None,
- ) -> Any:
- """Retrieve all simplified documents with permission sync"""
- # Simplified implementation
- return []
+ ) -> Generator[list[SlimDocument], None, None]:
+ """Yield batches of slim documents (ids only) for prune/permission sync."""
+ if self.graph_client is None or not self._site_url:
+ raise ConnectorMissingCredentialError("SharePoint")
+
+ batch: list[SlimDocument] = []
+ for drive in self._iter_drives():
+ drive_id = getattr(drive, "id", None) or getattr(drive, "properties", {}).get("id", "")
+ for drive_item in self._walk_files(drive.root):
+ batch.append(SlimDocument(id=self._composite_doc_id(drive_id, drive_item)))
+ if len(batch) >= self.batch_size:
+ yield batch
+ batch = []
+ if batch:
+ yield batch
diff --git a/rag/svr/sync_data_source.py b/rag/svr/sync_data_source.py
index a5ba395820..3b01014327 100644
--- a/rag/svr/sync_data_source.py
+++ b/rag/svr/sync_data_source.py
@@ -61,6 +61,7 @@ from common.data_source import (
RDBMSConnector,
DingTalkAITableConnector,
RestAPIConnector,
+ SharePointConnector,
)
from common.data_source.models import ConnectorFailure, SeafileSyncScope
from common.data_source.webdav_connector import WebDAVConnector
@@ -932,7 +933,66 @@ class SharePoint(SyncBase):
SOURCE_NAME: str = FileSource.SHAREPOINT
async def _generate(self, task: dict):
- pass
+ self.connector = SharePointConnector(
+ batch_size=self.conf.get("batch_size", INDEX_BATCH_SIZE),
+ )
+
+ credentials = self.conf.get("credentials") or {}
+ self.connector.load_credentials(credentials)
+ self.connector.validate_connector_settings()
+
+ if task["reindex"] == "1" or not task["poll_range_start"]:
+ start_time = 0.0
+ _begin_info = "totally"
+ else:
+ start_time = task["poll_range_start"].timestamp()
+ _begin_info = f"from {task['poll_range_start']}"
+
+ end_time = datetime.now(timezone.utc).timestamp()
+
+ raw_batch_size = self.conf.get("sync_batch_size") or self.conf.get("batch_size") or INDEX_BATCH_SIZE
+ try:
+ batch_size = int(raw_batch_size)
+ except (TypeError, ValueError):
+ batch_size = INDEX_BATCH_SIZE
+ if batch_size <= 0:
+ batch_size = INDEX_BATCH_SIZE
+
+ def document_batches():
+ checkpoint = self.connector.build_dummy_checkpoint()
+ pending_docs = []
+ iterations = 0
+ iteration_limit = 100_000
+
+ while checkpoint.has_more:
+ wrapper = CheckpointOutputWrapper()
+ doc_generator = wrapper(
+ self.connector.load_from_checkpoint(start_time, end_time, checkpoint)
+ )
+ for document, failure, next_checkpoint in doc_generator:
+ if failure is not None:
+ logging.warning(
+ "SharePoint connector failure: %s",
+ getattr(failure, "failure_message", failure),
+ )
+ continue
+ if document is not None:
+ pending_docs.append(document)
+ if len(pending_docs) >= batch_size:
+ yield pending_docs
+ pending_docs = []
+ if next_checkpoint is not None:
+ checkpoint = next_checkpoint
+
+ iterations += 1
+ if iterations > iteration_limit:
+ raise RuntimeError("Too many iterations while loading SharePoint documents.")
+
+ if pending_docs:
+ yield pending_docs
+
+ self.log_connection("SharePoint", self.conf.get("credentials", {}).get("site_url", ""), task)
+ return document_batches()
class Slack(SyncBase):
diff --git a/test/unit_test/data_source/test_sharepoint_connector_unit.py b/test/unit_test/data_source/test_sharepoint_connector_unit.py
new file mode 100644
index 0000000000..ed12b6714b
--- /dev/null
+++ b/test/unit_test/data_source/test_sharepoint_connector_unit.py
@@ -0,0 +1,284 @@
+#
+# Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import importlib.util
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+from types import ModuleType
+
+import pytest
+
+
+def _load_sharepoint_connector_module():
+ """Load sharepoint_connector.py in isolation (avoid the package __init__)."""
+ repo_root = Path(__file__).resolve().parents[3]
+ package_name = "common.data_source"
+ saved_modules = {
+ name: module
+ for name, module in sys.modules.items()
+ if name == package_name or name.startswith(f"{package_name}.")
+ }
+ package_stub = ModuleType(package_name)
+ package_stub.__path__ = [str(repo_root / "common" / "data_source")]
+ sys.modules[package_name] = package_stub
+
+ try:
+ spec = importlib.util.spec_from_file_location(
+ "_sharepoint_connector_under_test",
+ repo_root / "common" / "data_source" / "sharepoint_connector.py",
+ )
+ module = importlib.util.module_from_spec(spec)
+ spec.loader.exec_module(module)
+ return module
+ finally:
+ for name in list(sys.modules):
+ if name == package_name or name.startswith(f"{package_name}."):
+ if name in saved_modules:
+ sys.modules[name] = saved_modules[name]
+ else:
+ sys.modules.pop(name, None)
+
+
+sharepoint_connector = _load_sharepoint_connector_module()
+SharePointConnector = sharepoint_connector.SharePointConnector
+
+
+# --- fakes for the office365 fluent API ------------------------------------
+
+
+class _Query:
+ """Mimics the `.get()` / `.get_by_url()` -> `.execute_query()` chain."""
+
+ def __init__(self, value):
+ self._value = value
+
+ def execute_query(self):
+ return self._value
+
+
+class _Content:
+ def __init__(self, value: bytes):
+ self.value = value
+
+ def execute_query(self):
+ return self
+
+
+class _FakeDriveItem:
+ def __init__(self, item_id, name=None, content=None, modified=None, children=None, size=None):
+ self.id = item_id
+ self.name = name
+ self.web_url = f"https://contoso.sharepoint.com/{item_id}"
+ self.last_modified_datetime = modified
+ self._content = content
+ self._children = children or []
+ self.properties = {}
+ if children is not None:
+ self.properties["folder"] = {"childCount": len(children)}
+ else:
+ self.properties["file"] = {"mimeType": "text/plain"}
+ if size is not None:
+ self.properties["size"] = size
+
+ @property
+ def children(self):
+ return _FakeDrivesAccessor(self._children)
+
+ def get_content(self):
+ return _Content(self._content)
+
+
+class _FakeDrive:
+ def __init__(self, name, root, drive_id=None):
+ self.name = name
+ self.root = root
+ self.id = drive_id or f"drive-{name}"
+ self.properties = {"name": name, "id": self.id}
+
+
+class _FakeDrivesAccessor:
+ def __init__(self, drives):
+ self._drives = drives
+
+ def get(self):
+ return _Query(self._drives)
+
+
+class _FakeSite:
+ def __init__(self, drives):
+ self.drives = _FakeDrivesAccessor(drives)
+
+ def __bool__(self):
+ return True
+
+
+class _FakeSitesAccessor:
+ def __init__(self, site):
+ self._site = site
+
+ def get_by_url(self, url):
+ return _Query(self._site)
+
+
+class _FakeGraphClient:
+ def __init__(self, site):
+ self.sites = _FakeSitesAccessor(site)
+
+
+def _build_connector_with_tree():
+ jan = datetime(2026, 1, 1, 12, tzinfo=timezone.utc)
+ feb = datetime(2026, 2, 1, 12, tzinfo=timezone.utc)
+
+ readme = _FakeDriveItem("f1", "readme.txt", b"hello sharepoint", jan, size=16)
+ nested = _FakeDriveItem("f2", "report.md", b"# Report", feb, size=8)
+ subfolder = _FakeDriveItem("d2", "sub", children=[nested])
+ root = _FakeDriveItem("d1", "root", children=[readme, subfolder])
+ drive = _FakeDrive("Documents", root, drive_id="drv-A")
+ site = _FakeSite([drive])
+
+ connector = SharePointConnector(batch_size=10)
+ connector.graph_client = _FakeGraphClient(site)
+ connector._site_url = "https://contoso.sharepoint.com/sites/MySite"
+ return connector, jan, feb
+
+
+# --- credential loading -----------------------------------------------------
+
+
+def test_load_credentials_incomplete_raises():
+ connector = SharePointConnector()
+ with pytest.raises(sharepoint_connector.ConnectorMissingCredentialError):
+ connector.load_credentials({"tenant_id": "t", "client_id": "c"})
+
+
+def test_load_credentials_sets_graph_client(monkeypatch):
+ captured = {}
+
+ class _FakeApp:
+ def __init__(self, **kwargs):
+ captured.update(kwargs)
+
+ def acquire_token_for_client(self, scopes):
+ return {"access_token": "tok"}
+
+ monkeypatch.setattr(sharepoint_connector.msal, "ConfidentialClientApplication", _FakeApp)
+ monkeypatch.setattr(sharepoint_connector, "GraphClient", lambda token_callback: ("client", token_callback))
+
+ connector = SharePointConnector()
+ result = connector.load_credentials(
+ {
+ "tenant_id": "tenant",
+ "client_id": "client",
+ "client_secret": "secret",
+ "site_url": "https://contoso.sharepoint.com/sites/MySite",
+ }
+ )
+
+ assert result is None
+ assert connector._site_url == "https://contoso.sharepoint.com/sites/MySite"
+ assert connector.graph_client is not None
+
+
+def test_fetch_without_credentials_raises():
+ connector = SharePointConnector()
+ with pytest.raises(sharepoint_connector.ConnectorMissingCredentialError):
+ list(connector.load_from_checkpoint(0.0, 9e12, connector.build_dummy_checkpoint()))
+
+
+# --- document generation ----------------------------------------------------
+
+
+def _collect(generator):
+ """Drain a checkpoint generator, returning (documents, final_checkpoint)."""
+ docs = []
+ try:
+ while True:
+ docs.append(next(generator))
+ except StopIteration as stop:
+ return docs, stop.value
+
+
+def test_load_from_checkpoint_walks_libraries_and_downloads():
+ connector, _jan, _feb = _build_connector_with_tree()
+
+ docs, checkpoint = _collect(
+ connector.load_from_checkpoint(0.0, 9e12, connector.build_dummy_checkpoint())
+ )
+
+ assert checkpoint.has_more is False
+ assert {doc.id for doc in docs} == {"drv-A:f1", "drv-A:f2"}
+
+ by_id = {doc.id: doc for doc in docs}
+ assert by_id["drv-A:f1"].blob == b"hello sharepoint"
+ assert by_id["drv-A:f1"].extension == ".txt"
+ assert by_id["drv-A:f1"].size_bytes == 16
+ assert by_id["drv-A:f1"].source == "sharepoint"
+ assert by_id["drv-A:f1"].metadata["drive"] == "Documents"
+ assert by_id["drv-A:f1"].metadata["drive_id"] == "drv-A"
+ assert by_id["drv-A:f1"].metadata["drive_item_id"] == "f1"
+ assert by_id["drv-A:f2"].semantic_identifier == "report.md"
+ assert by_id["drv-A:f2"].extension == ".md"
+
+
+def test_load_from_checkpoint_filters_by_modified_window():
+ connector, _jan, feb = _build_connector_with_tree()
+
+ # Only include files modified strictly after mid-January -> just report.md (Feb).
+ start = datetime(2026, 1, 15, tzinfo=timezone.utc).timestamp()
+ end = datetime(2026, 3, 1, tzinfo=timezone.utc).timestamp()
+
+ docs, _ = _collect(
+ connector.load_from_checkpoint(start, end, connector.build_dummy_checkpoint())
+ )
+
+ assert [doc.id for doc in docs] == ["drv-A:f2"]
+
+
+def test_retrieve_all_slim_docs_lists_ids_without_download():
+ connector, _jan, _feb = _build_connector_with_tree()
+
+ batches = list(connector.retrieve_all_slim_docs_perm_sync())
+ ids = [doc.id for batch in batches for doc in batch]
+
+ assert sorted(ids) == ["drv-A:f1", "drv-A:f2"]
+
+
+def test_document_ids_are_unique_across_drives_with_colliding_item_ids():
+ # Graph driveItem IDs are unique only within a single drive; two libraries
+ # under the same site can legitimately yield items with identical IDs.
+ jan = datetime(2026, 1, 1, 12, tzinfo=timezone.utc)
+
+ file_a = _FakeDriveItem("same-id", "a.txt", b"A", jan, size=1)
+ root_a = _FakeDriveItem("rootA", "root", children=[file_a])
+ drive_a = _FakeDrive("LibraryA", root_a, drive_id="drv-A")
+
+ file_b = _FakeDriveItem("same-id", "b.txt", b"B", jan, size=1)
+ root_b = _FakeDriveItem("rootB", "root", children=[file_b])
+ drive_b = _FakeDrive("LibraryB", root_b, drive_id="drv-B")
+
+ site = _FakeSite([drive_a, drive_b])
+ connector = SharePointConnector(batch_size=10)
+ connector.graph_client = _FakeGraphClient(site)
+ connector._site_url = "https://contoso.sharepoint.com/sites/MySite"
+
+ docs, _ = _collect(
+ connector.load_from_checkpoint(0.0, 9e12, connector.build_dummy_checkpoint())
+ )
+ ids = {doc.id for doc in docs}
+ assert ids == {"drv-A:same-id", "drv-B:same-id"}
+
+ slim_ids = [doc.id for batch in connector.retrieve_all_slim_docs_perm_sync() for doc in batch]
+ assert sorted(slim_ids) == ["drv-A:same-id", "drv-B:same-id"]
diff --git a/web/src/assets/svg/data-source/sharepoint.svg b/web/src/assets/svg/data-source/sharepoint.svg
new file mode 100644
index 0000000000..fc8ed27b0b
--- /dev/null
+++ b/web/src/assets/svg/data-source/sharepoint.svg
@@ -0,0 +1,9 @@
+
diff --git a/web/src/locales/en.ts b/web/src/locales/en.ts
index 18f988691c..1f8f1fe117 100644
--- a/web/src/locales/en.ts
+++ b/web/src/locales/en.ts
@@ -1238,6 +1238,10 @@ Example: Virtual Hosted Style`,
'Upload the OAuth JSON generated from Google Console. If it only contains client credentials, run the browser-based verification once to mint long-lived refresh tokens.',
dropboxDescription:
'Connect your Dropbox to sync files and folders from a chosen account.',
+ sharepointDescription:
+ 'Connect a SharePoint site via Microsoft Graph to sync its document libraries.',
+ sharepointSiteUrlTip:
+ 'Full URL of the SharePoint site to index, e.g. https://contoso.sharepoint.com/sites/MySite. Requires an Azure AD app with Sites.Read.All and Files.Read.All application permissions (admin consent).',
bitbucketDescription: 'Connect Bitbucket to sync PR content.',
bitbucketTopWorkspaceTip:
'The Bitbucket workspace to index (e.g., "atlassian" from https://bitbucket.org/atlassian/workspace ).',
diff --git a/web/src/locales/zh.ts b/web/src/locales/zh.ts
index 17c368ee19..cd0c6709cb 100644
--- a/web/src/locales/zh.ts
+++ b/web/src/locales/zh.ts
@@ -1099,6 +1099,9 @@ NER:使用 spaCy NER 和基于规则的关键词提取来抽取实体和关系
gmailTokenTip:
'请上传由 Google Console 生成的 OAuth JSON。如果仅包含 client credentials,请通过浏览器授权一次以获取长期有效的刷新 Token。',
dropboxDescription: '连接 Dropbox,同步指定账号下的文件与文件夹。',
+ sharepointDescription: '通过 Microsoft Graph 连接 SharePoint 站点,同步其文档库。',
+ sharepointSiteUrlTip:
+ '要索引的 SharePoint 站点完整 URL,例如 https://contoso.sharepoint.com/sites/MySite。需要具备 Sites.Read.All 与 Files.Read.All 应用权限(管理员同意)的 Azure AD 应用。',
boxDescription: '连接你的 Box 云盘以同步文件和文件夹。',
bitbucketDescription: '连接 Bitbucket,同步 PR 内容。',
bitbucketTopWorkspaceTip:
diff --git a/web/src/pages/user-setting/data-source/constant/index.tsx b/web/src/pages/user-setting/data-source/constant/index.tsx
index f1d2bb4ace..026570eca3 100644
--- a/web/src/pages/user-setting/data-source/constant/index.tsx
+++ b/web/src/pages/user-setting/data-source/constant/index.tsx
@@ -43,8 +43,8 @@ export enum DataSourceKey {
POSTGRESQL = 'postgresql',
REST_API = 'rest_api',
RSS = 'rss',
+ SHAREPOINT = 'sharepoint',
- // SHAREPOINT = 'sharepoint',
// SLACK = 'slack',
// TEAMS = 'teams',
}
@@ -213,6 +213,11 @@ export const generateDataSourceInfo = (t: TFunction) => {
description: t(`setting.${DataSourceKey.MOODLE}Description`),
icon: ,
},
+ [DataSourceKey.SHAREPOINT]: {
+ name: 'SharePoint',
+ description: t(`setting.${DataSourceKey.SHAREPOINT}Description`),
+ icon: ,
+ },
[DataSourceKey.JIRA]: {
name: 'Jira',
description: t(`setting.${DataSourceKey.JIRA}Description`),
@@ -654,6 +659,34 @@ export const DataSourceFormFields = {
required: true,
},
],
+ [DataSourceKey.SHAREPOINT]: [
+ {
+ label: 'Site URL',
+ name: 'config.credentials.site_url',
+ type: FormFieldType.Text,
+ required: true,
+ placeholder: 'https://contoso.sharepoint.com/sites/MySite',
+ tooltip: t('setting.sharepointSiteUrlTip'),
+ },
+ {
+ label: 'Tenant ID',
+ name: 'config.credentials.tenant_id',
+ type: FormFieldType.Text,
+ required: true,
+ },
+ {
+ label: 'Client ID',
+ name: 'config.credentials.client_id',
+ type: FormFieldType.Text,
+ required: true,
+ },
+ {
+ label: 'Client Secret',
+ name: 'config.credentials.client_secret',
+ type: FormFieldType.Password,
+ required: true,
+ },
+ ],
[DataSourceKey.JIRA]: jiraConstant(t),
[DataSourceKey.WEBDAV]: [
{
@@ -1509,6 +1542,18 @@ export const DataSourceFormDefaultValues = {
},
},
},
+ [DataSourceKey.SHAREPOINT]: {
+ name: '',
+ source: DataSourceKey.SHAREPOINT,
+ config: {
+ credentials: {
+ site_url: '',
+ tenant_id: '',
+ client_id: '',
+ client_secret: '',
+ },
+ },
+ },
[DataSourceKey.JIRA]: {
name: '',
source: DataSourceKey.JIRA,