mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 15:31:05 +08:00
feat: implement SharePoint data source connector (#15190)
### What problem does this PR solve?

Closes #15189.

RAGFlow shipped a SharePoint connector stub (`common/data_source/sharepoint_connector.py`) whose document-loading methods all returned `[]`, `SharePoint._generate()` was a `pass`, and SharePoint was commented out of the data-source settings UI. As a result there was no way to index files stored in SharePoint document libraries.

This PR implements the connector end to end on top of Microsoft Graph (Office365-REST-Python-Client).

**Backend**

- `common/data_source/sharepoint_connector.py`
  - `load_credentials()` now builds the Graph client using an MSAL client-credentials **token callback** — the form `GraphClient` actually expects. (The previous stub passed a raw access-token string to `GraphClient(...)`, which is not how that client is driven.) Token acquisition is lazy, so credential loading does no network call.
  - `validate_connector_settings()` resolves the configured site via Graph.
  - `load_from_checkpoint()` is now a generator that enumerates every document library under the site, walks folders depth-first, downloads each file, and yields blob-based `Document` objects (`extension` / `blob` / `size_bytes` / `doc_updated_at`). Incremental syncs are bounded by file `lastModifiedDateTime`. Per-file errors are surfaced as `ConnectorFailure` rather than aborting the run.
  - `retrieve_all_slim_docs_perm_sync()` yields id-only `SlimDocument` batches (no downloads) and the checkpoint helpers return proper checkpoints.
  - ACL → `ExternalAccess` mapping is intentionally left best-effort (`load_from_checkpoint_with_perm_sync` delegates to the standard load) because the sync pipeline does not currently persist `ExternalAccess`; this can be extended once that plumbing exists.
- `rag/svr/sync_data_source.py`
  - Implemented `SharePoint._generate()` using the existing `CheckpointOutputWrapper` pattern (same shape as Confluence/Jira/Google Drive), supporting full reindex and incremental polling from `poll_range_start`.
- `SharePointConnector` is already exported from `common/data_source/__init__.py`.

**Frontend (`web/`)**

- Enabled the `SHAREPOINT` data-source enum and added its form fields (`site_url`, `tenant_id`, `client_id`, `client_secret`), default values, display metadata, and a SharePoint icon.
- Added `sharepointDescription` / `sharepointSiteUrlTip` to `en.ts` and `zh.ts`.

**Tests**

- `test/unit_test/data_source/test_sharepoint_connector_unit.py`: mock-based unit tests covering credential loading (incomplete creds raise, happy path sets the Graph client, fetch-without-creds raises), drive traversal + file download, incremental `lastModifiedDateTime` filtering, and slim-doc listing. All 6 pass; `ruff check` is clean.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@@ -1,119 +1,270 @@
|
||||
"""SharePoint connector"""
|
||||
"""SharePoint connector
|
||||
|
||||
Ingests files from SharePoint document libraries via the Microsoft Graph API
|
||||
(Office365-REST-Python-Client). Authentication uses MSAL client-credentials
|
||||
(app-only) flow, so it requires an Azure AD app with the ``Sites.Read.All`` and
|
||||
``Files.Read.All`` application permissions (admin-consented).
|
||||
|
||||
The connector implements the checkpointed-connector interface used by the sync
|
||||
worker: ``load_from_checkpoint`` walks every document library under the
|
||||
configured site, downloads each file, and yields blob-based ``Document``
|
||||
objects. Incremental syncs are bounded by the file ``lastModifiedDateTime``.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any, Generator
|
||||
|
||||
from typing import Any
|
||||
import msal
|
||||
from office365.graph_client import GraphClient
|
||||
from office365.runtime.client_request import ClientRequestException
|
||||
from office365.sharepoint.client_context import ClientContext
|
||||
|
||||
from common.data_source.config import INDEX_BATCH_SIZE
|
||||
from common.data_source.exceptions import ConnectorValidationError, ConnectorMissingCredentialError
|
||||
from common.data_source.exceptions import (
|
||||
ConnectorMissingCredentialError,
|
||||
ConnectorValidationError,
|
||||
)
|
||||
from common.data_source.interfaces import (
|
||||
CheckpointedConnectorWithPermSync,
|
||||
SecondsSinceUnixEpoch,
|
||||
SlimConnectorWithPermSync
|
||||
SlimConnectorWithPermSync,
|
||||
)
|
||||
from common.data_source.models import (
|
||||
ConnectorCheckpoint
|
||||
ConnectorCheckpoint,
|
||||
ConnectorFailure,
|
||||
Document,
|
||||
DocumentFailure,
|
||||
SlimDocument,
|
||||
)
|
||||
|
||||
GRAPH_SCOPES = ["https://graph.microsoft.com/.default"]
|
||||
|
||||
|
||||
class SharePointConnector(CheckpointedConnectorWithPermSync, SlimConnectorWithPermSync):
|
||||
"""SharePoint connector for accessing SharePoint sites and documents"""
|
||||
"""SharePoint connector for accessing SharePoint sites and documents."""
|
||||
|
||||
def __init__(self, batch_size: int = INDEX_BATCH_SIZE) -> None:
|
||||
self.batch_size = batch_size
|
||||
self.sharepoint_client = None
|
||||
self.graph_client = None
|
||||
self.graph_client: GraphClient | None = None
|
||||
self._site_url: str | None = None
|
||||
|
||||
# -- credentials ---------------------------------------------------------
|
||||
|
||||
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
|
||||
"""Load SharePoint credentials"""
|
||||
try:
|
||||
tenant_id = credentials.get("tenant_id")
|
||||
client_id = credentials.get("client_id")
|
||||
client_secret = credentials.get("client_secret")
|
||||
site_url = credentials.get("site_url")
|
||||
|
||||
if not all([tenant_id, client_id, client_secret, site_url]):
|
||||
raise ConnectorMissingCredentialError("SharePoint credentials are incomplete")
|
||||
|
||||
# Create MSAL confidential client
|
||||
"""Configure a Microsoft Graph client from app-only credentials.
|
||||
|
||||
The token is acquired lazily through a callback (the way
|
||||
``GraphClient`` expects it), so this method performs no network call;
|
||||
the first real request triggers ``acquire_token_for_client``.
|
||||
"""
|
||||
tenant_id = credentials.get("tenant_id")
|
||||
client_id = credentials.get("client_id")
|
||||
client_secret = credentials.get("client_secret")
|
||||
site_url = credentials.get("site_url")
|
||||
|
||||
if not all([tenant_id, client_id, client_secret, site_url]):
|
||||
raise ConnectorMissingCredentialError("SharePoint credentials are incomplete")
|
||||
|
||||
self._site_url = site_url
|
||||
authority = f"https://login.microsoftonline.com/{tenant_id}"
|
||||
|
||||
def _acquire_token() -> dict[str, Any]:
|
||||
app = msal.ConfidentialClientApplication(
|
||||
client_id=client_id,
|
||||
client_credential=client_secret,
|
||||
authority=f"https://login.microsoftonline.com/{tenant_id}"
|
||||
authority=authority,
|
||||
)
|
||||
|
||||
# Get access token
|
||||
result = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
|
||||
|
||||
if "access_token" not in result:
|
||||
raise ConnectorMissingCredentialError("Failed to acquire SharePoint access token")
|
||||
|
||||
# Create Graph client
|
||||
self.graph_client = GraphClient(result["access_token"])
|
||||
|
||||
# Create SharePoint client context
|
||||
self.sharepoint_client = ClientContext(site_url).with_access_token(result["access_token"])
|
||||
|
||||
return None
|
||||
except Exception as e:
|
||||
raise ConnectorMissingCredentialError(f"SharePoint: {e}")
|
||||
token = app.acquire_token_for_client(scopes=GRAPH_SCOPES)
|
||||
if "access_token" not in token:
|
||||
detail = token.get("error_description") or token.get("error") or token
|
||||
raise ConnectorMissingCredentialError(
|
||||
f"Failed to acquire SharePoint access token: {detail}"
|
||||
)
|
||||
return token
|
||||
|
||||
self.graph_client = GraphClient(_acquire_token)
|
||||
return None
|
||||
|
||||
def validate_connector_settings(self) -> None:
|
||||
"""Validate SharePoint connector settings"""
|
||||
if not self.sharepoint_client or not self.graph_client:
|
||||
"""Validate credentials by resolving the configured site."""
|
||||
if self.graph_client is None or not self._site_url:
|
||||
raise ConnectorMissingCredentialError("SharePoint")
|
||||
|
||||
|
||||
try:
|
||||
# Test connection by getting site info
|
||||
site = self.sharepoint_client.site.get().execute_query()
|
||||
site = self.graph_client.sites.get_by_url(self._site_url).execute_query()
|
||||
if not site:
|
||||
raise ConnectorValidationError("Failed to access SharePoint site")
|
||||
except ClientRequestException as e:
|
||||
if "401" in str(e) or "403" in str(e):
|
||||
raise ConnectorValidationError("Invalid credentials or insufficient permissions")
|
||||
else:
|
||||
raise ConnectorValidationError(f"SharePoint validation error: {e}")
|
||||
except ConnectorValidationError:
|
||||
raise
|
||||
except Exception as e:
|
||||
message = str(e)
|
||||
if "401" in message or "403" in message:
|
||||
raise ConnectorValidationError(
|
||||
"Invalid credentials or insufficient permissions for SharePoint"
|
||||
)
|
||||
raise ConnectorValidationError(f"SharePoint validation error: {e}")
|
||||
|
||||
def poll_source(self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch) -> Any:
|
||||
"""Poll SharePoint for recent documents"""
|
||||
# Simplified implementation - in production this would handle actual polling
|
||||
return []
|
||||
# -- traversal helpers ---------------------------------------------------
|
||||
|
||||
def _iter_drives(self):
|
||||
site = self.graph_client.sites.get_by_url(self._site_url).execute_query()
|
||||
return site.drives.get().execute_query()
|
||||
|
||||
@staticmethod
|
||||
def _is_folder(drive_item: Any) -> bool:
|
||||
return "folder" in getattr(drive_item, "properties", {})
|
||||
|
||||
def _walk_files(self, root_item: Any) -> Generator[Any, None, None]:
|
||||
"""Depth-first walk of a drive yielding file (non-folder) driveItems."""
|
||||
stack = [root_item]
|
||||
while stack:
|
||||
folder = stack.pop()
|
||||
children = folder.children.get().execute_query()
|
||||
for child in children:
|
||||
if self._is_folder(child):
|
||||
stack.append(child)
|
||||
else:
|
||||
yield child
|
||||
|
||||
@staticmethod
|
||||
def _modified_dt(drive_item: Any) -> datetime | None:
|
||||
value = getattr(drive_item, "last_modified_datetime", None)
|
||||
if value is None:
|
||||
value = getattr(drive_item, "properties", {}).get("lastModifiedDateTime")
|
||||
if value is None:
|
||||
return None
|
||||
if isinstance(value, str):
|
||||
try:
|
||||
value = datetime.fromisoformat(value.replace("Z", "+00:00"))
|
||||
except ValueError:
|
||||
return None
|
||||
if value.tzinfo is None:
|
||||
value = value.replace(tzinfo=timezone.utc)
|
||||
return value
|
||||
|
||||
@staticmethod
|
||||
def _composite_doc_id(drive_id: Any, drive_item: Any) -> str:
|
||||
# Graph driveItem IDs are only unique within a single drive. A site can
|
||||
# expose multiple document libraries (drives), so we namespace the item
|
||||
# ID by drive ID to keep document identifiers globally unique.
|
||||
return f"{drive_id}:{drive_item.id}"
|
||||
|
||||
def _drive_item_to_document(self, drive_item: Any, drive_id: Any, drive_name: str) -> Document:
    """Download one file and wrap it in a blob-based ``Document``.

    Args:
        drive_item: The file's driveItem; its content is fetched here.
        drive_id: Identifier of the drive the item belongs to.
        drive_name: Name of that drive, recorded in the metadata.

    Returns:
        A ``Document`` whose id is the composite ``drive_id:item_id``.
    """
    title = drive_item.name or str(drive_item.id)

    payload = drive_item.get_content().execute_query().value or b""
    if isinstance(payload, str):
        payload = payload.encode("utf-8")

    # Everything after the last dot is the extension; no dot means none.
    _stem, dot, suffix = title.rpartition(".")
    file_extension = f".{suffix}" if dot else ""

    # Use the reported size when present, otherwise measure the download.
    reported_size = getattr(drive_item, "properties", {}).get("size")
    byte_count = reported_size if reported_size else len(payload)

    # Items without a usable timestamp are stamped with the current UTC time.
    updated_at = self._modified_dt(drive_item) or datetime.now(timezone.utc)

    doc_metadata = {
        "drive": drive_name,
        "drive_id": str(drive_id),
        "drive_item_id": str(drive_item.id),
    }
    link = getattr(drive_item, "web_url", None)
    if link:
        doc_metadata["web_url"] = link

    return Document(
        id=self._composite_doc_id(drive_id, drive_item),
        source="sharepoint",
        semantic_identifier=title,
        extension=file_extension,
        blob=payload,
        size_bytes=int(byte_count),
        doc_updated_at=updated_at,
        metadata=doc_metadata,
    )
|
||||
|
||||
def _generate_documents(
    self,
    start: SecondsSinceUnixEpoch,
    end: SecondsSinceUnixEpoch,
) -> Generator[Document | ConnectorFailure, None, None]:
    """Yield a ``Document`` (or a ``ConnectorFailure``) for each file of the site.

    Files whose modification time is known are emitted only when
    ``start < modified <= end``; files without a known modification time are
    always emitted. Any exception raised while filtering or converting a
    single item is logged and reported as a ``ConnectorFailure`` so the
    remaining items are still processed.

    Raises:
        ConnectorMissingCredentialError: If credentials were never loaded.
    """
    if self.graph_client is None or not self._site_url:
        raise ConnectorMissingCredentialError("SharePoint")

    def _drive_field(drive_obj: Any, key: str) -> Any:
        # Prefer the attribute; fall back to the raw ``properties`` mapping.
        return getattr(drive_obj, key, None) or getattr(drive_obj, "properties", {}).get(key, "")

    for library in self._iter_drives():
        library_name = _drive_field(library, "name")
        library_id = _drive_field(library, "id")
        for entry in self._walk_files(library.root):
            try:
                changed_at = self._modified_dt(entry)
                # start is an exclusive lower bound; full reindex passes start=0.
                if changed_at is not None and not (start < changed_at.timestamp() <= end):
                    continue
                yield self._drive_item_to_document(entry, library_id, library_name)
            except Exception as exc:
                logging.exception("SharePoint failed to process drive item")
                has_item_id = getattr(entry, "id", None) is not None
                failed = DocumentFailure(
                    document_id=self._composite_doc_id(library_id, entry) if has_item_id else "unknown",
                    document_link=getattr(entry, "web_url", "") or "",
                )
                yield ConnectorFailure(
                    failed_document=failed,
                    failure_message=str(exc),
                    exception=exc,
                )
|
||||
|
||||
# -- checkpointed connector interface ------------------------------------
|
||||
|
||||
def load_from_checkpoint(
|
||||
self,
|
||||
start: SecondsSinceUnixEpoch,
|
||||
end: SecondsSinceUnixEpoch,
|
||||
checkpoint: ConnectorCheckpoint,
|
||||
) -> Any:
|
||||
"""Load documents from checkpoint"""
|
||||
# Simplified implementation
|
||||
return []
|
||||
) -> Generator[Document | ConnectorFailure, None, ConnectorCheckpoint]:
|
||||
"""Yield every file under the site as a Document, then finish.
|
||||
|
||||
The whole library is enumerated in a single pass, so the returned
|
||||
checkpoint always has ``has_more=False``.
|
||||
"""
|
||||
yield from self._generate_documents(start, end)
|
||||
return ConnectorCheckpoint(has_more=False)
|
||||
|
||||
def load_from_checkpoint_with_perm_sync(
|
||||
self,
|
||||
start: SecondsSinceUnixEpoch,
|
||||
end: SecondsSinceUnixEpoch,
|
||||
checkpoint: ConnectorCheckpoint,
|
||||
) -> Any:
|
||||
"""Load documents from checkpoint with permission sync"""
|
||||
# Simplified implementation
|
||||
return []
|
||||
) -> Generator[Document | ConnectorFailure, None, ConnectorCheckpoint]:
|
||||
"""Permission-aware variant.
|
||||
|
||||
SharePoint ACL -> ExternalAccess mapping is not yet wired through the
|
||||
sync pipeline (the pipeline does not persist ExternalAccess), so this
|
||||
currently yields the same documents as ``load_from_checkpoint``.
|
||||
"""
|
||||
return self.load_from_checkpoint(start, end, checkpoint)
|
||||
|
||||
def build_dummy_checkpoint(self) -> ConnectorCheckpoint:
|
||||
"""Build dummy checkpoint"""
|
||||
return ConnectorCheckpoint()
|
||||
return ConnectorCheckpoint(has_more=True)
|
||||
|
||||
def validate_checkpoint_json(self, checkpoint_json: str) -> ConnectorCheckpoint:
|
||||
"""Validate checkpoint JSON"""
|
||||
# Simplified implementation
|
||||
return ConnectorCheckpoint()
|
||||
return ConnectorCheckpoint(has_more=True)
|
||||
|
||||
def retrieve_all_slim_docs_perm_sync(
|
||||
self,
|
||||
callback: Any = None,
|
||||
) -> Any:
|
||||
"""Retrieve all simplified documents with permission sync"""
|
||||
# Simplified implementation
|
||||
return []
|
||||
) -> Generator[list[SlimDocument], None, None]:
|
||||
"""Yield batches of slim documents (ids only) for prune/permission sync."""
|
||||
if self.graph_client is None or not self._site_url:
|
||||
raise ConnectorMissingCredentialError("SharePoint")
|
||||
|
||||
batch: list[SlimDocument] = []
|
||||
for drive in self._iter_drives():
|
||||
drive_id = getattr(drive, "id", None) or getattr(drive, "properties", {}).get("id", "")
|
||||
for drive_item in self._walk_files(drive.root):
|
||||
batch.append(SlimDocument(id=self._composite_doc_id(drive_id, drive_item)))
|
||||
if len(batch) >= self.batch_size:
|
||||
yield batch
|
||||
batch = []
|
||||
if batch:
|
||||
yield batch
|
||||
|
||||
@@ -61,6 +61,7 @@ from common.data_source import (
|
||||
RDBMSConnector,
|
||||
DingTalkAITableConnector,
|
||||
RestAPIConnector,
|
||||
SharePointConnector,
|
||||
)
|
||||
from common.data_source.models import ConnectorFailure, SeafileSyncScope
|
||||
from common.data_source.webdav_connector import WebDAVConnector
|
||||
@@ -932,7 +933,66 @@ class SharePoint(SyncBase):
|
||||
SOURCE_NAME: str = FileSource.SHAREPOINT
|
||||
|
||||
async def _generate(self, task: dict):
|
||||
pass
|
||||
self.connector = SharePointConnector(
|
||||
batch_size=self.conf.get("batch_size", INDEX_BATCH_SIZE),
|
||||
)
|
||||
|
||||
credentials = self.conf.get("credentials") or {}
|
||||
self.connector.load_credentials(credentials)
|
||||
self.connector.validate_connector_settings()
|
||||
|
||||
if task["reindex"] == "1" or not task["poll_range_start"]:
|
||||
start_time = 0.0
|
||||
_begin_info = "totally"
|
||||
else:
|
||||
start_time = task["poll_range_start"].timestamp()
|
||||
_begin_info = f"from {task['poll_range_start']}"
|
||||
|
||||
end_time = datetime.now(timezone.utc).timestamp()
|
||||
|
||||
raw_batch_size = self.conf.get("sync_batch_size") or self.conf.get("batch_size") or INDEX_BATCH_SIZE
|
||||
try:
|
||||
batch_size = int(raw_batch_size)
|
||||
except (TypeError, ValueError):
|
||||
batch_size = INDEX_BATCH_SIZE
|
||||
if batch_size <= 0:
|
||||
batch_size = INDEX_BATCH_SIZE
|
||||
|
||||
def document_batches():
|
||||
checkpoint = self.connector.build_dummy_checkpoint()
|
||||
pending_docs = []
|
||||
iterations = 0
|
||||
iteration_limit = 100_000
|
||||
|
||||
while checkpoint.has_more:
|
||||
wrapper = CheckpointOutputWrapper()
|
||||
doc_generator = wrapper(
|
||||
self.connector.load_from_checkpoint(start_time, end_time, checkpoint)
|
||||
)
|
||||
for document, failure, next_checkpoint in doc_generator:
|
||||
if failure is not None:
|
||||
logging.warning(
|
||||
"SharePoint connector failure: %s",
|
||||
getattr(failure, "failure_message", failure),
|
||||
)
|
||||
continue
|
||||
if document is not None:
|
||||
pending_docs.append(document)
|
||||
if len(pending_docs) >= batch_size:
|
||||
yield pending_docs
|
||||
pending_docs = []
|
||||
if next_checkpoint is not None:
|
||||
checkpoint = next_checkpoint
|
||||
|
||||
iterations += 1
|
||||
if iterations > iteration_limit:
|
||||
raise RuntimeError("Too many iterations while loading SharePoint documents.")
|
||||
|
||||
if pending_docs:
|
||||
yield pending_docs
|
||||
|
||||
self.log_connection("SharePoint", self.conf.get("credentials", {}).get("site_url", ""), task)
|
||||
return document_batches()
|
||||
|
||||
|
||||
class Slack(SyncBase):
|
||||
|
||||
284
test/unit_test/data_source/test_sharepoint_connector_unit.py
Normal file
284
test/unit_test/data_source/test_sharepoint_connector_unit.py
Normal file
@@ -0,0 +1,284 @@
|
||||
#
|
||||
# Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import importlib.util
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from types import ModuleType
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
def _load_sharepoint_connector_module():
    """Import ``sharepoint_connector.py`` by file path, bypassing the package ``__init__``.

    While the file executes, ``sys.modules`` holds a bare stub for
    ``common.data_source`` whose ``__path__`` points at the real directory.
    Afterwards every ``common.data_source`` entry is put back to what it was
    before the call (or removed if it did not exist), even when loading fails.
    """
    pkg = "common.data_source"
    source_dir = Path(__file__).resolve().parents[3] / "common" / "data_source"

    def _in_package(module_name):
        return module_name == pkg or module_name.startswith(f"{pkg}.")

    previous = {
        module_name: loaded
        for module_name, loaded in sys.modules.items()
        if _in_package(module_name)
    }

    stub = ModuleType(pkg)
    stub.__path__ = [str(source_dir)]
    sys.modules[pkg] = stub

    try:
        spec = importlib.util.spec_from_file_location(
            "_sharepoint_connector_under_test",
            source_dir / "sharepoint_connector.py",
        )
        connector_module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(connector_module)
        return connector_module
    finally:
        # Snapshot the keys first: entries are replaced or removed while looping.
        for module_name in [key for key in sys.modules if _in_package(key)]:
            if module_name in previous:
                sys.modules[module_name] = previous[module_name]
            else:
                sys.modules.pop(module_name, None)
|
||||
|
||||
|
||||
sharepoint_connector = _load_sharepoint_connector_module()
|
||||
SharePointConnector = sharepoint_connector.SharePointConnector
|
||||
|
||||
|
||||
# --- fakes for the office365 fluent API ------------------------------------
|
||||
|
||||
|
||||
class _Query:
|
||||
"""Mimics the `.get()` / `.get_by_url()` -> `.execute_query()` chain."""
|
||||
|
||||
def __init__(self, value):
|
||||
self._value = value
|
||||
|
||||
def execute_query(self):
|
||||
return self._value
|
||||
|
||||
|
||||
class _Content:
|
||||
def __init__(self, value: bytes):
|
||||
self.value = value
|
||||
|
||||
def execute_query(self):
|
||||
return self
|
||||
|
||||
|
||||
class _FakeDriveItem:
|
||||
def __init__(self, item_id, name=None, content=None, modified=None, children=None, size=None):
|
||||
self.id = item_id
|
||||
self.name = name
|
||||
self.web_url = f"https://contoso.sharepoint.com/{item_id}"
|
||||
self.last_modified_datetime = modified
|
||||
self._content = content
|
||||
self._children = children or []
|
||||
self.properties = {}
|
||||
if children is not None:
|
||||
self.properties["folder"] = {"childCount": len(children)}
|
||||
else:
|
||||
self.properties["file"] = {"mimeType": "text/plain"}
|
||||
if size is not None:
|
||||
self.properties["size"] = size
|
||||
|
||||
@property
|
||||
def children(self):
|
||||
return _FakeDrivesAccessor(self._children)
|
||||
|
||||
def get_content(self):
|
||||
return _Content(self._content)
|
||||
|
||||
|
||||
class _FakeDrive:
|
||||
def __init__(self, name, root, drive_id=None):
|
||||
self.name = name
|
||||
self.root = root
|
||||
self.id = drive_id or f"drive-{name}"
|
||||
self.properties = {"name": name, "id": self.id}
|
||||
|
||||
|
||||
class _FakeDrivesAccessor:
    """Collection fake: ``get()`` returns a ``_Query`` resolving to the wrapped list."""

    def __init__(self, drives):
        self._items = drives

    def get(self):
        pending = _Query(self._items)
        return pending
|
||||
|
||||
|
||||
class _FakeSite:
    """Site fake exposing its drives through ``.drives.get().execute_query()``."""

    def __init__(self, drives):
        drives_accessor = _FakeDrivesAccessor(drives)
        self.drives = drives_accessor

    def __bool__(self):
        # Always truthy, whatever drives the site holds.
        return True
|
||||
|
||||
|
||||
class _FakeSitesAccessor:
    """``sites`` fake: ``get_by_url`` resolves to the one configured site, ignoring the URL."""

    def __init__(self, site):
        self._only_site = site

    def get_by_url(self, url):
        # The URL is accepted for signature compatibility but never inspected.
        return _Query(self._only_site)
|
||||
|
||||
|
||||
class _FakeGraphClient:
    """Graph client fake whose ``sites`` accessor always resolves to ``site``."""

    def __init__(self, site):
        sites_accessor = _FakeSitesAccessor(site)
        self.sites = sites_accessor
|
||||
|
||||
|
||||
def _build_connector_with_tree():
    """Return ``(connector, jan, feb)`` wired to a fake site with one library.

    Layout of the ``Documents`` drive (id ``drv-A``):
        root/readme.txt      modified ``jan``
        root/sub/report.md   modified ``feb``
    """
    jan = datetime(2026, 1, 1, 12, tzinfo=timezone.utc)
    feb = datetime(2026, 2, 1, 12, tzinfo=timezone.utc)

    top_level_file = _FakeDriveItem("f1", "readme.txt", b"hello sharepoint", jan, size=16)
    nested_file = _FakeDriveItem("f2", "report.md", b"# Report", feb, size=8)
    nested_dir = _FakeDriveItem("d2", "sub", children=[nested_file])
    root_dir = _FakeDriveItem("d1", "root", children=[top_level_file, nested_dir])
    library = _FakeDrive("Documents", root_dir, drive_id="drv-A")

    connector = SharePointConnector(batch_size=10)
    connector.graph_client = _FakeGraphClient(_FakeSite([library]))
    connector._site_url = "https://contoso.sharepoint.com/sites/MySite"
    return connector, jan, feb
|
||||
|
||||
|
||||
# --- credential loading -----------------------------------------------------
|
||||
|
||||
|
||||
def test_load_credentials_incomplete_raises():
    """Credentials lacking ``client_secret`` and ``site_url`` are rejected."""
    partial_credentials = {"tenant_id": "t", "client_id": "c"}
    expected_error = sharepoint_connector.ConnectorMissingCredentialError
    connector = SharePointConnector()
    with pytest.raises(expected_error):
        connector.load_credentials(partial_credentials)
|
||||
|
||||
|
||||
def test_load_credentials_sets_graph_client(monkeypatch):
    """Complete credentials configure a Graph client with a lazy token callback.

    Besides the stored site URL and client, this checks that MSAL is not
    touched while loading credentials, and that invoking the callback handed
    to ``GraphClient`` builds the MSAL app from the supplied credentials and
    returns the token dict.
    """
    captured = {}

    class _FakeApp:
        def __init__(self, **kwargs):
            captured.update(kwargs)

        def acquire_token_for_client(self, scopes):
            return {"access_token": "tok"}

    monkeypatch.setattr(sharepoint_connector.msal, "ConfidentialClientApplication", _FakeApp)
    # Keep the callback reachable so the test can invoke it afterwards.
    monkeypatch.setattr(sharepoint_connector, "GraphClient", lambda token_callback: ("client", token_callback))

    connector = SharePointConnector()
    result = connector.load_credentials(
        {
            "tenant_id": "tenant",
            "client_id": "client",
            "client_secret": "secret",
            "site_url": "https://contoso.sharepoint.com/sites/MySite",
        }
    )

    assert result is None
    assert connector._site_url == "https://contoso.sharepoint.com/sites/MySite"
    assert connector.graph_client is not None

    # Loading credentials must not construct the MSAL application yet.
    assert captured == {}

    _label, token_callback = connector.graph_client
    assert token_callback() == {"access_token": "tok"}
    assert captured["client_id"] == "client"
    assert captured["client_credential"] == "secret"
    assert captured["authority"] == "https://login.microsoftonline.com/tenant"
|
||||
|
||||
|
||||
def test_fetch_without_credentials_raises():
    """Draining ``load_from_checkpoint`` on an unconfigured connector raises."""
    unconfigured = SharePointConnector()
    expected_error = sharepoint_connector.ConnectorMissingCredentialError
    with pytest.raises(expected_error):
        checkpoint = unconfigured.build_dummy_checkpoint()
        # The generator body only runs once it is iterated.
        list(unconfigured.load_from_checkpoint(0.0, 9e12, checkpoint))
|
||||
|
||||
|
||||
# --- document generation ----------------------------------------------------
|
||||
|
||||
|
||||
def _collect(generator):
|
||||
"""Drain a checkpoint generator, returning (documents, final_checkpoint)."""
|
||||
docs = []
|
||||
try:
|
||||
while True:
|
||||
docs.append(next(generator))
|
||||
except StopIteration as stop:
|
||||
return docs, stop.value
|
||||
|
||||
|
||||
def test_load_from_checkpoint_walks_libraries_and_downloads():
    """A full-range load returns the top-level and the nested file, with content and metadata."""
    connector = _build_connector_with_tree()[0]

    documents, final_checkpoint = _collect(
        connector.load_from_checkpoint(0.0, 9e12, connector.build_dummy_checkpoint())
    )

    assert final_checkpoint.has_more is False

    docs_by_id = {document.id: document for document in documents}
    assert set(docs_by_id) == {"drv-A:f1", "drv-A:f2"}

    readme = docs_by_id["drv-A:f1"]
    assert readme.blob == b"hello sharepoint"
    assert readme.extension == ".txt"
    assert readme.size_bytes == 16
    assert readme.source == "sharepoint"
    assert readme.metadata["drive"] == "Documents"
    assert readme.metadata["drive_id"] == "drv-A"
    assert readme.metadata["drive_item_id"] == "f1"

    report = docs_by_id["drv-A:f2"]
    assert report.semantic_identifier == "report.md"
    assert report.extension == ".md"
|
||||
|
||||
|
||||
def test_load_from_checkpoint_filters_by_modified_window():
    """Only files modified inside the ``(start, end]`` window are returned."""
    connector, _jan, _feb = _build_connector_with_tree()

    # Window opens mid-January, so the January readme is excluded and only
    # report.md (February) remains.
    window_start = datetime(2026, 1, 15, tzinfo=timezone.utc).timestamp()
    window_end = datetime(2026, 3, 1, tzinfo=timezone.utc).timestamp()

    checkpoint = connector.build_dummy_checkpoint()
    documents, _final = _collect(connector.load_from_checkpoint(window_start, window_end, checkpoint))

    returned_ids = [document.id for document in documents]
    assert returned_ids == ["drv-A:f2"]
|
||||
|
||||
|
||||
def test_retrieve_all_slim_docs_lists_ids_without_download():
    """Slim-document batches contain the composite id of every file in the tree."""
    connector = _build_connector_with_tree()[0]

    collected_ids = []
    for batch in list(connector.retrieve_all_slim_docs_perm_sync()):
        collected_ids.extend(slim.id for slim in batch)

    collected_ids.sort()
    assert collected_ids == ["drv-A:f1", "drv-A:f2"]
|
||||
|
||||
|
||||
def test_document_ids_are_unique_across_drives_with_colliding_item_ids():
    """Two libraries holding an item with the same id still yield distinct document ids."""
    # A driveItem ID is unique only within its own drive, so two libraries of
    # the same site may legitimately contain items with identical IDs.
    stamp = datetime(2026, 1, 1, 12, tzinfo=timezone.utc)

    libraries = []
    for letter in ("A", "B"):
        colliding_file = _FakeDriveItem("same-id", f"{letter.lower()}.txt", letter.encode(), stamp, size=1)
        library_root = _FakeDriveItem(f"root{letter}", "root", children=[colliding_file])
        libraries.append(_FakeDrive(f"Library{letter}", library_root, drive_id=f"drv-{letter}"))

    connector = SharePointConnector(batch_size=10)
    connector.graph_client = _FakeGraphClient(_FakeSite(libraries))
    connector._site_url = "https://contoso.sharepoint.com/sites/MySite"

    documents, _final = _collect(
        connector.load_from_checkpoint(0.0, 9e12, connector.build_dummy_checkpoint())
    )
    full_ids = {document.id for document in documents}
    assert full_ids == {"drv-A:same-id", "drv-B:same-id"}

    slim_ids = []
    for batch in connector.retrieve_all_slim_docs_perm_sync():
        slim_ids.extend(slim.id for slim in batch)
    assert sorted(slim_ids) == ["drv-A:same-id", "drv-B:same-id"]
|
||||
9
web/src/assets/svg/data-source/sharepoint.svg
Normal file
9
web/src/assets/svg/data-source/sharepoint.svg
Normal file
@@ -0,0 +1,9 @@
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 48 48" width="48" height="48">
|
||||
<circle cx="20" cy="14" r="11" fill="#036C70"/>
|
||||
<circle cx="31" cy="25" r="10" fill="#1A9BA1"/>
|
||||
<circle cx="22.5" cy="36.5" r="8.5" fill="#37C6D0"/>
|
||||
<path d="M22 11H10.5A1.5 1.5 0 0 0 9 12.5v18A1.5 1.5 0 0 0 10.5 32H22a1.5 1.5 0 0 0 1.5-1.5v-18A1.5 1.5 0 0 0 22 11z" opacity=".1"/>
|
||||
<path d="M21 12H10.5A1.5 1.5 0 0 0 9 13.5v18A1.5 1.5 0 0 0 10.5 33H21a1.5 1.5 0 0 0 1.5-1.5v-18A1.5 1.5 0 0 0 21 12z" opacity=".2"/>
|
||||
<rect x="2" y="14" width="22" height="22" rx="1.5" fill="#03787C"/>
|
||||
<path d="M10.2 24.6c-.5-.3-.9-.7-1.2-1.1a2.6 2.6 0 0 1-.4-1.5c0-.8.3-1.5.9-2 .6-.5 1.4-.8 2.5-.8 1 0 1.8.1 2.5.4v1.9a4 4 0 0 0-2.3-.6c-.4 0-.8.1-1 .3-.3.1-.4.4-.4.7 0 .2.1.4.3.6.2.2.6.4 1.2.6.9.3 1.6.7 2 1.1.4.4.6 1 .6 1.6 0 .9-.3 1.5-.9 2-.6.5-1.5.7-2.6.7-.5 0-1-.1-1.5-.2a4 4 0 0 1-1.1-.4v-2c.4.3.8.5 1.3.7.5.2.9.2 1.3.2.5 0 .8-.1 1-.3.2-.1.3-.4.3-.7 0-.3-.1-.5-.4-.7-.2-.2-.7-.5-1.4-.8z" fill="#fff"/>
|
||||
</svg>
|
||||
|
After Width: | Height: | Size: 993 B |
@@ -1238,6 +1238,10 @@ Example: Virtual Hosted Style`,
|
||||
'Upload the OAuth JSON generated from Google Console. If it only contains client credentials, run the browser-based verification once to mint long-lived refresh tokens.',
|
||||
dropboxDescription:
|
||||
'Connect your Dropbox to sync files and folders from a chosen account.',
|
||||
sharepointDescription:
|
||||
'Connect a SharePoint site via Microsoft Graph to sync its document libraries.',
|
||||
sharepointSiteUrlTip:
|
||||
'Full URL of the SharePoint site to index, e.g. https://contoso.sharepoint.com/sites/MySite. Requires an Azure AD app with Sites.Read.All and Files.Read.All application permissions (admin consent).',
|
||||
bitbucketDescription: 'Connect Bitbucket to sync PR content.',
|
||||
bitbucketTopWorkspaceTip:
|
||||
'The Bitbucket workspace to index (e.g., "atlassian" from https://bitbucket.org/atlassian/workspace ).',
|
||||
|
||||
@@ -1099,6 +1099,9 @@ NER:使用 spaCy NER 和基于规则的关键词提取来抽取实体和关系
|
||||
gmailTokenTip:
|
||||
'请上传由 Google Console 生成的 OAuth JSON。如果仅包含 client credentials,请通过浏览器授权一次以获取长期有效的刷新 Token。',
|
||||
dropboxDescription: '连接 Dropbox,同步指定账号下的文件与文件夹。',
|
||||
sharepointDescription: '通过 Microsoft Graph 连接 SharePoint 站点,同步其文档库。',
|
||||
sharepointSiteUrlTip:
|
||||
'要索引的 SharePoint 站点完整 URL,例如 https://contoso.sharepoint.com/sites/MySite。需要具备 Sites.Read.All 与 Files.Read.All 应用权限(管理员同意)的 Azure AD 应用。',
|
||||
boxDescription: '连接你的 Box 云盘以同步文件和文件夹。',
|
||||
bitbucketDescription: '连接 Bitbucket,同步 PR 内容。',
|
||||
bitbucketTopWorkspaceTip:
|
||||
|
||||
@@ -43,8 +43,8 @@ export enum DataSourceKey {
|
||||
POSTGRESQL = 'postgresql',
|
||||
REST_API = 'rest_api',
|
||||
RSS = 'rss',
|
||||
SHAREPOINT = 'sharepoint',
|
||||
|
||||
// SHAREPOINT = 'sharepoint',
|
||||
// SLACK = 'slack',
|
||||
// TEAMS = 'teams',
|
||||
}
|
||||
@@ -213,6 +213,11 @@ export const generateDataSourceInfo = (t: TFunction) => {
|
||||
description: t(`setting.${DataSourceKey.MOODLE}Description`),
|
||||
icon: <SvgIcon name={'data-source/moodle'} width={38} />,
|
||||
},
|
||||
[DataSourceKey.SHAREPOINT]: {
|
||||
name: 'SharePoint',
|
||||
description: t(`setting.${DataSourceKey.SHAREPOINT}Description`),
|
||||
icon: <SvgIcon name={'data-source/sharepoint'} width={38} />,
|
||||
},
|
||||
[DataSourceKey.JIRA]: {
|
||||
name: 'Jira',
|
||||
description: t(`setting.${DataSourceKey.JIRA}Description`),
|
||||
@@ -654,6 +659,34 @@ export const DataSourceFormFields = {
|
||||
required: true,
|
||||
},
|
||||
],
|
||||
[DataSourceKey.SHAREPOINT]: [
|
||||
{
|
||||
label: 'Site URL',
|
||||
name: 'config.credentials.site_url',
|
||||
type: FormFieldType.Text,
|
||||
required: true,
|
||||
placeholder: 'https://contoso.sharepoint.com/sites/MySite',
|
||||
tooltip: t('setting.sharepointSiteUrlTip'),
|
||||
},
|
||||
{
|
||||
label: 'Tenant ID',
|
||||
name: 'config.credentials.tenant_id',
|
||||
type: FormFieldType.Text,
|
||||
required: true,
|
||||
},
|
||||
{
|
||||
label: 'Client ID',
|
||||
name: 'config.credentials.client_id',
|
||||
type: FormFieldType.Text,
|
||||
required: true,
|
||||
},
|
||||
{
|
||||
label: 'Client Secret',
|
||||
name: 'config.credentials.client_secret',
|
||||
type: FormFieldType.Password,
|
||||
required: true,
|
||||
},
|
||||
],
|
||||
[DataSourceKey.JIRA]: jiraConstant(t),
|
||||
[DataSourceKey.WEBDAV]: [
|
||||
{
|
||||
@@ -1509,6 +1542,18 @@ export const DataSourceFormDefaultValues = {
|
||||
},
|
||||
},
|
||||
},
|
||||
[DataSourceKey.SHAREPOINT]: {
|
||||
name: '',
|
||||
source: DataSourceKey.SHAREPOINT,
|
||||
config: {
|
||||
credentials: {
|
||||
site_url: '',
|
||||
tenant_id: '',
|
||||
client_id: '',
|
||||
client_secret: '',
|
||||
},
|
||||
},
|
||||
},
|
||||
[DataSourceKey.JIRA]: {
|
||||
name: '',
|
||||
source: DataSourceKey.JIRA,
|
||||
|
||||
Reference in New Issue
Block a user