feat: implement SharePoint data source connector (#15190)

### What problem does this PR solve?

Closes #15189.

RAGFlow shipped a SharePoint connector stub
(`common/data_source/sharepoint_connector.py`) whose document-loading
methods all returned `[]`, `SharePoint._generate()` was a `pass`, and
SharePoint was commented out of the data-source settings UI. As a result
there was no way to index files stored in SharePoint document libraries.

This PR implements the connector end to end on top of Microsoft Graph
(Office365-REST-Python-Client).

**Backend**

- `common/data_source/sharepoint_connector.py`
- `load_credentials()` now builds the Graph client using an MSAL
client-credentials **token callback** — the form `GraphClient` actually
expects. (The previous stub passed a raw access-token string to
`GraphClient(...)`, which is not how that client is driven.) Token
acquisition is lazy, so credential loading does no network call.
- `validate_connector_settings()` resolves the configured site via
Graph.
- `load_from_checkpoint()` is now a generator that enumerates every
document library under the site, walks folders depth-first, downloads
each file, and yields blob-based `Document` objects (`extension` /
`blob` / `size_bytes` / `doc_updated_at`). Incremental syncs are bounded
by file `lastModifiedDateTime`. Per-file errors are surfaced as
`ConnectorFailure` rather than aborting the run.
- `retrieve_all_slim_docs_perm_sync()` yields id-only `SlimDocument`
batches (no downloads) and the checkpoint helpers return proper
checkpoints.
- ACL → `ExternalAccess` mapping is intentionally left best-effort
(`load_from_checkpoint_with_perm_sync` delegates to the standard load)
because the sync pipeline does not currently persist `ExternalAccess`;
this can be extended once that plumbing exists.
- `rag/svr/sync_data_source.py`
- Implemented `SharePoint._generate()` using the existing
`CheckpointOutputWrapper` pattern (same shape as Confluence/Jira/Google
Drive), supporting full reindex and incremental polling from
`poll_range_start`.
- `SharePointConnector` is already exported from
`common/data_source/__init__.py`.

**Frontend (`web/`)**

- Enabled the `SHAREPOINT` data-source enum and added its form fields
(`site_url`, `tenant_id`, `client_id`, `client_secret`), default values,
display metadata, and a SharePoint icon.
- Added `sharepointDescription` / `sharepointSiteUrlTip` to `en.ts` and
`zh.ts`.

**Tests**

- `test/unit_test/data_source/test_sharepoint_connector_unit.py`:
mock-based unit tests covering credential loading (incomplete creds
raise, happy path sets the Graph client, fetch-without-creds raises),
drive traversal + file download, incremental `lastModifiedDateTime`
filtering, slim-doc listing, and document-ID uniqueness across drives.
All 7 pass; `ruff check` is clean.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
web-dev0521
2026-05-27 23:26:08 -06:00
committed by GitHub
parent 0aff6a3f32
commit c4c4e228e3
7 changed files with 627 additions and 71 deletions

View File

@@ -1,119 +1,270 @@
"""SharePoint connector"""
"""SharePoint connector
Ingests files from SharePoint document libraries via the Microsoft Graph API
(Office365-REST-Python-Client). Authentication uses MSAL client-credentials
(app-only) flow, so it requires an Azure AD app with the ``Sites.Read.All`` and
``Files.Read.All`` application permissions (admin-consented).
The connector implements the checkpointed-connector interface used by the sync
worker: ``load_from_checkpoint`` walks every document library under the
configured site, downloads each file, and yields blob-based ``Document``
objects. Incremental syncs are bounded by the file ``lastModifiedDateTime``.
"""
import logging
from datetime import datetime, timezone
from typing import Any, Generator
from typing import Any
import msal
from office365.graph_client import GraphClient
from office365.runtime.client_request import ClientRequestException
from office365.sharepoint.client_context import ClientContext
from common.data_source.config import INDEX_BATCH_SIZE
from common.data_source.exceptions import ConnectorValidationError, ConnectorMissingCredentialError
from common.data_source.exceptions import (
ConnectorMissingCredentialError,
ConnectorValidationError,
)
from common.data_source.interfaces import (
CheckpointedConnectorWithPermSync,
SecondsSinceUnixEpoch,
SlimConnectorWithPermSync
SlimConnectorWithPermSync,
)
from common.data_source.models import (
ConnectorCheckpoint
ConnectorCheckpoint,
ConnectorFailure,
Document,
DocumentFailure,
SlimDocument,
)
GRAPH_SCOPES = ["https://graph.microsoft.com/.default"]
class SharePointConnector(CheckpointedConnectorWithPermSync, SlimConnectorWithPermSync):
"""SharePoint connector for accessing SharePoint sites and documents"""
"""SharePoint connector for accessing SharePoint sites and documents."""
def __init__(self, batch_size: int = INDEX_BATCH_SIZE) -> None:
    """Create an unauthenticated connector.

    Args:
        batch_size: Number of slim documents grouped into one yielded batch.
    """
    # Client handles and the site URL stay unset until credentials are loaded.
    self.sharepoint_client = None
    self.graph_client: GraphClient | None = None
    self._site_url: str | None = None
    self.batch_size = batch_size
# -- credentials ---------------------------------------------------------
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
    """Configure a Microsoft Graph client from app-only credentials.

    The token is acquired lazily through a callback (the way
    ``GraphClient`` expects it), so this method performs no network call;
    the first real request triggers ``acquire_token_for_client``.

    Args:
        credentials: Mapping that must provide non-empty ``tenant_id``,
            ``client_id``, ``client_secret`` and ``site_url`` values.

    Returns:
        Always ``None``.

    Raises:
        ConnectorMissingCredentialError: If any required value is missing or
            empty. The token callback raises the same error later when the
            token response carries no ``access_token``.
    """
    tenant_id = credentials.get("tenant_id")
    client_id = credentials.get("client_id")
    client_secret = credentials.get("client_secret")
    site_url = credentials.get("site_url")
    if not all([tenant_id, client_id, client_secret, site_url]):
        raise ConnectorMissingCredentialError("SharePoint credentials are incomplete")

    self._site_url = site_url
    authority = f"https://login.microsoftonline.com/{tenant_id}"

    # The MSAL application is created on the first callback invocation and
    # then reused, so repeated token requests share one application object
    # (and whatever token cache it maintains) instead of building a new
    # application every time the Graph client asks for a token.
    msal_app = None

    def _acquire_token() -> dict[str, Any]:
        nonlocal msal_app
        if msal_app is None:
            msal_app = msal.ConfidentialClientApplication(
                client_id=client_id,
                client_credential=client_secret,
                authority=authority,
            )
        token = msal_app.acquire_token_for_client(scopes=GRAPH_SCOPES)
        if "access_token" not in token:
            # Prefer the most descriptive field of the error response.
            detail = token.get("error_description") or token.get("error") or token
            raise ConnectorMissingCredentialError(
                f"Failed to acquire SharePoint access token: {detail}"
            )
        return token

    self.graph_client = GraphClient(_acquire_token)
    return None
def validate_connector_settings(self) -> None:
    """Validate credentials by resolving the configured site.

    Raises:
        ConnectorMissingCredentialError: If no Graph client or site URL has
            been configured yet.
        ConnectorValidationError: If the site lookup returns a falsy result
            or fails for any reason.
    """
    if self.graph_client is None or not self._site_url:
        raise ConnectorMissingCredentialError("SharePoint")

    try:
        site = self.graph_client.sites.get_by_url(self._site_url).execute_query()
        if not site:
            raise ConnectorValidationError("Failed to access SharePoint site")
    except ConnectorValidationError:
        # Raised just above; propagate without re-wrapping.
        raise
    except Exception as e:
        message = str(e)
        # A 401/403 appearing in the error text is reported as an auth problem.
        if "401" in message or "403" in message:
            raise ConnectorValidationError(
                "Invalid credentials or insufficient permissions for SharePoint"
            ) from e
        raise ConnectorValidationError(f"SharePoint validation error: {e}") from e
def poll_source(self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch) -> Any:
    """Placeholder polling hook: ignores the time window and returns an empty list."""
    no_documents: list[Any] = []
    return no_documents
# -- traversal helpers ---------------------------------------------------
def _iter_drives(self):
    """Resolve the configured site and return the result of querying its drives."""
    site_query = self.graph_client.sites.get_by_url(self._site_url)
    resolved_site = site_query.execute_query()
    drives_query = resolved_site.drives.get()
    return drives_query.execute_query()
@staticmethod
def _is_folder(drive_item: Any) -> bool:
    """Return True when the item's ``properties`` mapping has a ``folder`` key."""
    item_props = getattr(drive_item, "properties", {})
    return "folder" in item_props
def _walk_files(self, root_item: Any) -> Generator[Any, None, None]:
    """Depth-first walk of a drive yielding file (non-folder) driveItems.

    Files of a folder are yielded in listing order; sub-folders are pushed
    on a stack and visited afterwards, most recently pushed first.
    """
    pending = [root_item]
    while pending:
        current = pending.pop()
        # NOTE(review): only the items returned by this single query are
        # visited - confirm the client returns every page for large folders.
        for entry in current.children.get().execute_query():
            if not self._is_folder(entry):
                yield entry
                continue
            pending.append(entry)
@staticmethod
def _modified_dt(drive_item: Any) -> datetime | None:
value = getattr(drive_item, "last_modified_datetime", None)
if value is None:
value = getattr(drive_item, "properties", {}).get("lastModifiedDateTime")
if value is None:
return None
if isinstance(value, str):
try:
value = datetime.fromisoformat(value.replace("Z", "+00:00"))
except ValueError:
return None
if value.tzinfo is None:
value = value.replace(tzinfo=timezone.utc)
return value
@staticmethod
def _composite_doc_id(drive_id: Any, drive_item: Any) -> str:
    """Build the document identifier ``<drive id>:<item id>``."""
    # Graph driveItem IDs are only unique within a single drive. A site can
    # expose multiple document libraries (drives), so the item ID is
    # namespaced by the drive ID to keep document identifiers globally unique.
    return "{}:{}".format(drive_id, drive_item.id)
def _drive_item_to_document(self, drive_item: Any, drive_id: Any, drive_name: str) -> Document:
    """Download a drive item and wrap its content and metadata in a ``Document``.

    Args:
        drive_item: The file driveItem to download.
        drive_id: Identifier of the drive that contains the item.
        drive_name: Display name of that drive, stored in the metadata.
    """
    display_name = drive_item.name or str(drive_item.id)

    payload = drive_item.get_content().execute_query().value or b""
    if isinstance(payload, str):
        payload = payload.encode("utf-8")

    # Everything after the last dot (when there is one) becomes the extension.
    _, dot, suffix = display_name.rpartition(".")
    extension = f".{suffix}" if dot else ""

    # Fall back to the downloaded length when no (non-zero) size is reported.
    size_bytes = getattr(drive_item, "properties", {}).get("size") or len(payload)

    modified_at = self._modified_dt(drive_item)
    if modified_at is None:
        modified_at = datetime.now(timezone.utc)

    metadata = {
        "drive": drive_name,
        "drive_id": str(drive_id),
        "drive_item_id": str(drive_item.id),
    }
    link = getattr(drive_item, "web_url", None)
    if link:
        metadata["web_url"] = link

    return Document(
        id=self._composite_doc_id(drive_id, drive_item),
        source="sharepoint",
        semantic_identifier=display_name,
        extension=extension,
        blob=payload,
        size_bytes=int(size_bytes),
        doc_updated_at=modified_at,
        metadata=metadata,
    )
def _generate_documents(
    self,
    start: SecondsSinceUnixEpoch,
    end: SecondsSinceUnixEpoch,
) -> Generator[Document | ConnectorFailure, None, None]:
    """Yield a ``Document`` for every file modified inside ``(start, end]``.

    Files whose modification time cannot be determined are always included.
    An error while handling one file is logged and yielded as a
    ``ConnectorFailure``; iteration then continues with the next file.

    Raises:
        ConnectorMissingCredentialError: If credentials were not loaded.
    """
    if self.graph_client is None or not self._site_url:
        raise ConnectorMissingCredentialError("SharePoint")

    for drive in self._iter_drives():
        drive_name = getattr(drive, "name", None) or getattr(drive, "properties", {}).get("name", "")
        drive_id = getattr(drive, "id", None) or getattr(drive, "properties", {}).get("id", "")

        for drive_item in self._walk_files(drive.root):
            try:
                modified = self._modified_dt(drive_item)
                # start is an exclusive lower bound; full reindex passes start=0.
                outside_window = modified is not None and not (start < modified.timestamp() <= end)
                if outside_window:
                    continue
                yield self._drive_item_to_document(drive_item, drive_id, drive_name)
            except Exception as e:
                logging.exception("SharePoint failed to process drive item")
                if getattr(drive_item, "id", None) is not None:
                    failed_id = self._composite_doc_id(drive_id, drive_item)
                else:
                    failed_id = "unknown"
                yield ConnectorFailure(
                    failed_document=DocumentFailure(
                        document_id=failed_id,
                        document_link=getattr(drive_item, "web_url", "") or "",
                    ),
                    failure_message=str(e),
                    exception=e,
                )
# -- checkpointed connector interface ------------------------------------
def load_from_checkpoint(
    self,
    start: SecondsSinceUnixEpoch,
    end: SecondsSinceUnixEpoch,
    checkpoint: ConnectorCheckpoint,
) -> Generator[Document | ConnectorFailure, None, ConnectorCheckpoint]:
    """Yield every file under the site as a Document, then finish.

    The whole library is enumerated in a single pass, so the returned
    checkpoint always has ``has_more=False``. The incoming ``checkpoint``
    is not consulted.

    Args:
        start: Exclusive lower bound (epoch seconds) on the modification time.
        end: Inclusive upper bound (epoch seconds) on the modification time.
        checkpoint: Accepted for interface compatibility; unused.
    """
    yield from self._generate_documents(start, end)
    return ConnectorCheckpoint(has_more=False)
def load_from_checkpoint_with_perm_sync(
    self,
    start: SecondsSinceUnixEpoch,
    end: SecondsSinceUnixEpoch,
    checkpoint: ConnectorCheckpoint,
) -> Generator[Document | ConnectorFailure, None, ConnectorCheckpoint]:
    """Permission-aware variant.

    SharePoint ACL -> ExternalAccess mapping is not yet wired through the
    sync pipeline (the pipeline does not persist ExternalAccess), so this
    currently yields the same documents as ``load_from_checkpoint``.
    """
    return self.load_from_checkpoint(start, end, checkpoint)
def build_dummy_checkpoint(self) -> ConnectorCheckpoint:
    """Return the initial checkpoint for a run, flagged ``has_more=True``."""
    return ConnectorCheckpoint(has_more=True)
def validate_checkpoint_json(self, checkpoint_json: str) -> ConnectorCheckpoint:
    """Return a fresh ``has_more=True`` checkpoint.

    The supplied ``checkpoint_json`` is not parsed or inspected.
    """
    return ConnectorCheckpoint(has_more=True)
def retrieve_all_slim_docs_perm_sync(
    self,
    callback: Any = None,
) -> Generator[list[SlimDocument], None, None]:
    """Yield batches of slim documents (ids only) for prune/permission sync.

    No file content is downloaded. Batches hold up to ``self.batch_size``
    entries; a final partial batch is yielded when any entries remain.

    Args:
        callback: Accepted for interface compatibility; unused.

    Raises:
        ConnectorMissingCredentialError: If credentials were not loaded.
    """
    if self.graph_client is None or not self._site_url:
        raise ConnectorMissingCredentialError("SharePoint")

    batch: list[SlimDocument] = []
    for drive in self._iter_drives():
        drive_id = getattr(drive, "id", None) or getattr(drive, "properties", {}).get("id", "")
        for drive_item in self._walk_files(drive.root):
            batch.append(SlimDocument(id=self._composite_doc_id(drive_id, drive_item)))
            if len(batch) >= self.batch_size:
                yield batch
                batch = []
    if batch:
        yield batch

View File

@@ -61,6 +61,7 @@ from common.data_source import (
RDBMSConnector,
DingTalkAITableConnector,
RestAPIConnector,
SharePointConnector,
)
from common.data_source.models import ConnectorFailure, SeafileSyncScope
from common.data_source.webdav_connector import WebDAVConnector
@@ -932,7 +933,66 @@ class SharePoint(SyncBase):
SOURCE_NAME: str = FileSource.SHAREPOINT
async def _generate(self, task: dict):
    """Build the SharePoint connector and return a generator of document batches.

    A full reindex (``task["reindex"] == "1"`` or no ``poll_range_start``)
    starts from timestamp 0; otherwise the window starts at
    ``task["poll_range_start"]``. The window always ends at the current time.

    Args:
        task: Sync task; reads ``reindex`` and ``poll_range_start``.

    Returns:
        A generator yielding lists of documents of at most the configured
        batch size.
    """
    self.connector = SharePointConnector(
        batch_size=self.conf.get("batch_size", INDEX_BATCH_SIZE),
    )
    # ``credentials`` may be absent or None in the stored configuration.
    credentials = self.conf.get("credentials") or {}
    self.connector.load_credentials(credentials)
    self.connector.validate_connector_settings()

    if task["reindex"] == "1" or not task["poll_range_start"]:
        start_time = 0.0
    else:
        start_time = task["poll_range_start"].timestamp()
    end_time = datetime.now(timezone.utc).timestamp()

    # Fall back to the default for missing, non-numeric or non-positive values.
    raw_batch_size = self.conf.get("sync_batch_size") or self.conf.get("batch_size") or INDEX_BATCH_SIZE
    try:
        batch_size = int(raw_batch_size)
    except (TypeError, ValueError):
        batch_size = INDEX_BATCH_SIZE
    if batch_size <= 0:
        batch_size = INDEX_BATCH_SIZE

    def document_batches():
        checkpoint = self.connector.build_dummy_checkpoint()
        pending_docs = []
        iterations = 0
        # Safety valve against a checkpoint that never reports completion.
        iteration_limit = 100_000

        while checkpoint.has_more:
            wrapper = CheckpointOutputWrapper()
            doc_generator = wrapper(
                self.connector.load_from_checkpoint(start_time, end_time, checkpoint)
            )
            for document, failure, next_checkpoint in doc_generator:
                if failure is not None:
                    # Failures are logged and skipped; the run continues.
                    logging.warning(
                        "SharePoint connector failure: %s",
                        getattr(failure, "failure_message", failure),
                    )
                    continue
                if document is not None:
                    pending_docs.append(document)
                    if len(pending_docs) >= batch_size:
                        yield pending_docs
                        pending_docs = []
                if next_checkpoint is not None:
                    checkpoint = next_checkpoint

            iterations += 1
            if iterations > iteration_limit:
                raise RuntimeError("Too many iterations while loading SharePoint documents.")

        if pending_docs:
            yield pending_docs

    # Reuse the None-safe ``credentials`` mapping rather than re-reading conf.
    self.log_connection("SharePoint", credentials.get("site_url", ""), task)
    return document_batches()
class Slack(SyncBase):

View File

@@ -0,0 +1,284 @@
#
# Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import importlib.util
import sys
from datetime import datetime, timezone
from pathlib import Path
from types import ModuleType
import pytest
def _load_sharepoint_connector_module():
    """Load sharepoint_connector.py in isolation (avoid the package __init__).

    A stub ``common.data_source`` package is registered while the module
    executes; afterwards every ``common.data_source*`` entry in
    ``sys.modules`` is restored to its previous value or removed.
    """
    package_name = "common.data_source"
    package_prefix = f"{package_name}."
    source_dir = Path(__file__).resolve().parents[3] / "common" / "data_source"

    def _in_package(module_name):
        return module_name == package_name or module_name.startswith(package_prefix)

    saved_modules = {name: module for name, module in sys.modules.items() if _in_package(name)}

    package_stub = ModuleType(package_name)
    package_stub.__path__ = [str(source_dir)]
    sys.modules[package_name] = package_stub
    try:
        spec = importlib.util.spec_from_file_location(
            "_sharepoint_connector_under_test",
            source_dir / "sharepoint_connector.py",
        )
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
        return module
    finally:
        # Snapshot the names first: the mapping is mutated while restoring.
        for name in [n for n in sys.modules if _in_package(n)]:
            if name in saved_modules:
                sys.modules[name] = saved_modules[name]
            else:
                sys.modules.pop(name, None)
sharepoint_connector = _load_sharepoint_connector_module()
SharePointConnector = sharepoint_connector.SharePointConnector
# --- fakes for the office365 fluent API ------------------------------------
class _Query:
    """Mimics the `.get()` / `.get_by_url()` -> `.execute_query()` chain."""

    def __init__(self, value):
        self._result = value

    def execute_query(self):
        """Return the value this query was built with."""
        return self._result
class _Content:
    """Fake download result: exposes the bytes as ``value`` and executes to itself."""

    def __init__(self, value: bytes):
        self.value = value

    def execute_query(self):
        """Return this same object, mirroring the fluent query API."""
        return self
class _FakeDriveItem:
    """Fake driveItem; passing ``children`` (even an empty list) makes it a folder."""

    def __init__(self, item_id, name=None, content=None, modified=None, children=None, size=None):
        self.id = item_id
        self.name = name
        self.web_url = f"https://contoso.sharepoint.com/{item_id}"
        self.last_modified_datetime = modified
        self._content = content
        self._children = children or []

        # The facet key ("folder" or "file") is what the connector inspects.
        if children is None:
            props = {"file": {"mimeType": "text/plain"}}
        else:
            props = {"folder": {"childCount": len(children)}}
        if size is not None:
            props["size"] = size
        self.properties = props

    @property
    def children(self):
        """Accessor whose ``get().execute_query()`` returns the child items."""
        return _FakeDrivesAccessor(self._children)

    def get_content(self):
        """Return a fake download query wrapping the stored content."""
        return _Content(self._content)
class _FakeDrive:
    """Fake drive (document library) exposing ``name``, ``root``, ``id`` and ``properties``."""

    def __init__(self, name, root, drive_id=None):
        # Derive an id from the name when none is supplied.
        resolved_id = drive_id or f"drive-{name}"
        self.name = name
        self.root = root
        self.id = resolved_id
        self.properties = {"name": name, "id": resolved_id}
class _FakeDrivesAccessor:
    """Fake collection accessor whose ``get()`` returns a query over fixed items."""

    def __init__(self, drives):
        self._items = drives

    def get(self):
        """Return a query that resolves to the stored items."""
        return _Query(self._items)
class _FakeSite:
    """Fake site exposing a ``drives`` accessor; always truthy."""

    def __init__(self, drives):
        self.drives = _FakeDrivesAccessor(drives)

    def __bool__(self):
        # The connector treats a falsy site as a validation failure.
        return True
class _FakeSitesAccessor:
    """Fake ``sites`` accessor that resolves every URL to the same site."""

    def __init__(self, site):
        self._resolved_site = site

    def get_by_url(self, url):
        """Return a query for the stored site; ``url`` is ignored."""
        return _Query(self._resolved_site)
class _FakeGraphClient:
    """Fake Graph client exposing only the ``sites`` accessor."""

    def __init__(self, site):
        sites_accessor = _FakeSitesAccessor(site)
        self.sites = sites_accessor
def _build_connector_with_tree():
    """Return ``(connector, jan, feb)`` for a one-drive site.

    Layout: ``root/readme.txt`` (modified ``jan``) and
    ``root/sub/report.md`` (modified ``feb``) in drive ``drv-A``.
    """
    jan = datetime(2026, 1, 1, 12, tzinfo=timezone.utc)
    feb = datetime(2026, 2, 1, 12, tzinfo=timezone.utc)

    subfolder = _FakeDriveItem(
        "d2",
        "sub",
        children=[_FakeDriveItem("f2", "report.md", b"# Report", feb, size=8)],
    )
    root = _FakeDriveItem(
        "d1",
        "root",
        children=[
            _FakeDriveItem("f1", "readme.txt", b"hello sharepoint", jan, size=16),
            subfolder,
        ],
    )
    site = _FakeSite([_FakeDrive("Documents", root, drive_id="drv-A")])

    connector = SharePointConnector(batch_size=10)
    connector.graph_client = _FakeGraphClient(site)
    connector._site_url = "https://contoso.sharepoint.com/sites/MySite"
    return connector, jan, feb
# --- credential loading -----------------------------------------------------
def test_load_credentials_incomplete_raises():
    """Credentials lacking ``client_secret`` and ``site_url`` are rejected."""
    partial_credentials = {"tenant_id": "t", "client_id": "c"}
    connector = SharePointConnector()
    with pytest.raises(sharepoint_connector.ConnectorMissingCredentialError):
        connector.load_credentials(partial_credentials)
def test_load_credentials_sets_graph_client(monkeypatch):
    """Complete credentials install a Graph client with a lazy token callback."""
    captured = {}

    class _FakeApp:
        def __init__(self, **kwargs):
            captured.update(kwargs)

        def acquire_token_for_client(self, scopes):
            captured["scopes"] = scopes
            return {"access_token": "tok"}

    monkeypatch.setattr(sharepoint_connector.msal, "ConfidentialClientApplication", _FakeApp)
    # Keep the callback reachable so the test can invoke it below.
    monkeypatch.setattr(sharepoint_connector, "GraphClient", lambda token_callback: ("client", token_callback))

    connector = SharePointConnector()
    result = connector.load_credentials(
        {
            "tenant_id": "tenant",
            "client_id": "client",
            "client_secret": "secret",
            "site_url": "https://contoso.sharepoint.com/sites/MySite",
        }
    )

    assert result is None
    assert connector._site_url == "https://contoso.sharepoint.com/sites/MySite"
    assert connector.graph_client is not None
    # Loading credentials must not touch MSAL; only the callback does.
    assert captured == {}

    marker, token_callback = connector.graph_client
    assert marker == "client"
    assert token_callback() == {"access_token": "tok"}
    assert captured["client_id"] == "client"
    assert captured["client_credential"] == "secret"
    assert captured["authority"] == "https://login.microsoftonline.com/tenant"
    assert captured["scopes"] == sharepoint_connector.GRAPH_SCOPES
def test_fetch_without_credentials_raises():
    """Draining the loader before credentials are loaded raises."""
    connector = SharePointConnector()
    checkpoint = connector.build_dummy_checkpoint()
    with pytest.raises(sharepoint_connector.ConnectorMissingCredentialError):
        list(connector.load_from_checkpoint(0.0, 9e12, checkpoint))
# --- document generation ----------------------------------------------------
def _collect(generator):
    """Drain a checkpoint generator, returning (documents, final_checkpoint)."""
    docs = []
    while True:
        try:
            item = next(generator)
        except StopIteration as stop:
            # The generator's return value travels on the StopIteration.
            return docs, stop.value
        docs.append(item)
def test_load_from_checkpoint_walks_libraries_and_downloads():
    """A full load visits nested folders and fills in every Document field."""
    connector, _jan, _feb = _build_connector_with_tree()
    generator = connector.load_from_checkpoint(0.0, 9e12, connector.build_dummy_checkpoint())
    docs, checkpoint = _collect(generator)

    assert checkpoint.has_more is False
    assert {doc.id for doc in docs} == {"drv-A:f1", "drv-A:f2"}

    by_id = {doc.id: doc for doc in docs}
    readme = by_id["drv-A:f1"]
    assert readme.blob == b"hello sharepoint"
    assert readme.extension == ".txt"
    assert readme.size_bytes == 16
    assert readme.source == "sharepoint"
    assert readme.metadata["drive"] == "Documents"
    assert readme.metadata["drive_id"] == "drv-A"
    assert readme.metadata["drive_item_id"] == "f1"

    report = by_id["drv-A:f2"]
    assert report.semantic_identifier == "report.md"
    assert report.extension == ".md"
def test_load_from_checkpoint_filters_by_modified_window():
    """Only files modified inside the requested window are returned."""
    connector, _jan, _feb = _build_connector_with_tree()
    # Only include files modified strictly after mid-January -> just report.md (Feb).
    window_start = datetime(2026, 1, 15, tzinfo=timezone.utc).timestamp()
    window_end = datetime(2026, 3, 1, tzinfo=timezone.utc).timestamp()
    generator = connector.load_from_checkpoint(
        window_start, window_end, connector.build_dummy_checkpoint()
    )
    docs, _ = _collect(generator)
    assert [doc.id for doc in docs] == ["drv-A:f2"]
def test_retrieve_all_slim_docs_lists_ids_without_download():
    """Slim listing returns the composite id of every file in the tree."""
    connector, _jan, _feb = _build_connector_with_tree()
    ids = []
    for batch in connector.retrieve_all_slim_docs_perm_sync():
        ids.extend(doc.id for doc in batch)
    assert sorted(ids) == ["drv-A:f1", "drv-A:f2"]
def test_document_ids_are_unique_across_drives_with_colliding_item_ids():
    """Items sharing an item id in different drives get distinct document ids."""
    # Graph driveItem IDs are unique only within a single drive; two libraries
    # under the same site can legitimately yield items with identical IDs.
    jan = datetime(2026, 1, 1, 12, tzinfo=timezone.utc)

    def _single_file_drive(suffix, file_name, content):
        shared_id_file = _FakeDriveItem("same-id", file_name, content, jan, size=1)
        root = _FakeDriveItem(f"root{suffix}", "root", children=[shared_id_file])
        return _FakeDrive(f"Library{suffix}", root, drive_id=f"drv-{suffix}")

    drive_a = _single_file_drive("A", "a.txt", b"A")
    drive_b = _single_file_drive("B", "b.txt", b"B")

    connector = SharePointConnector(batch_size=10)
    connector.graph_client = _FakeGraphClient(_FakeSite([drive_a, drive_b]))
    connector._site_url = "https://contoso.sharepoint.com/sites/MySite"

    docs, _ = _collect(
        connector.load_from_checkpoint(0.0, 9e12, connector.build_dummy_checkpoint())
    )
    assert {doc.id for doc in docs} == {"drv-A:same-id", "drv-B:same-id"}

    slim_ids = [
        doc.id
        for batch in connector.retrieve_all_slim_docs_perm_sync()
        for doc in batch
    ]
    assert sorted(slim_ids) == ["drv-A:same-id", "drv-B:same-id"]

View File

@@ -0,0 +1,9 @@
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 48 48" width="48" height="48">
<circle cx="20" cy="14" r="11" fill="#036C70"/>
<circle cx="31" cy="25" r="10" fill="#1A9BA1"/>
<circle cx="22.5" cy="36.5" r="8.5" fill="#37C6D0"/>
<path d="M22 11H10.5A1.5 1.5 0 0 0 9 12.5v18A1.5 1.5 0 0 0 10.5 32H22a1.5 1.5 0 0 0 1.5-1.5v-18A1.5 1.5 0 0 0 22 11z" opacity=".1"/>
<path d="M21 12H10.5A1.5 1.5 0 0 0 9 13.5v18A1.5 1.5 0 0 0 10.5 33H21a1.5 1.5 0 0 0 1.5-1.5v-18A1.5 1.5 0 0 0 21 12z" opacity=".2"/>
<rect x="2" y="14" width="22" height="22" rx="1.5" fill="#03787C"/>
<path d="M10.2 24.6c-.5-.3-.9-.7-1.2-1.1a2.6 2.6 0 0 1-.4-1.5c0-.8.3-1.5.9-2 .6-.5 1.4-.8 2.5-.8 1 0 1.8.1 2.5.4v1.9a4 4 0 0 0-2.3-.6c-.4 0-.8.1-1 .3-.3.1-.4.4-.4.7 0 .2.1.4.3.6.2.2.6.4 1.2.6.9.3 1.6.7 2 1.1.4.4.6 1 .6 1.6 0 .9-.3 1.5-.9 2-.6.5-1.5.7-2.6.7-.5 0-1-.1-1.5-.2a4 4 0 0 1-1.1-.4v-2c.4.3.8.5 1.3.7.5.2.9.2 1.3.2.5 0 .8-.1 1-.3.2-.1.3-.4.3-.7 0-.3-.1-.5-.4-.7-.2-.2-.7-.5-1.4-.8z" fill="#fff"/>
</svg>

After

Width:  |  Height:  |  Size: 993 B

View File

@@ -1238,6 +1238,10 @@ Example: Virtual Hosted Style`,
'Upload the OAuth JSON generated from Google Console. If it only contains client credentials, run the browser-based verification once to mint long-lived refresh tokens.',
dropboxDescription:
'Connect your Dropbox to sync files and folders from a chosen account.',
sharepointDescription:
'Connect a SharePoint site via Microsoft Graph to sync its document libraries.',
sharepointSiteUrlTip:
'Full URL of the SharePoint site to index, e.g. https://contoso.sharepoint.com/sites/MySite. Requires an Azure AD app with Sites.Read.All and Files.Read.All application permissions (admin consent).',
bitbucketDescription: 'Connect Bitbucket to sync PR content.',
bitbucketTopWorkspaceTip:
'The Bitbucket workspace to index (e.g., "atlassian" from https://bitbucket.org/atlassian/workspace ).',

View File

@@ -1099,6 +1099,9 @@ NER使用 spaCy NER 和基于规则的关键词提取来抽取实体和关系
gmailTokenTip:
'请上传由 Google Console 生成的 OAuth JSON。如果仅包含 client credentials请通过浏览器授权一次以获取长期有效的刷新 Token。',
dropboxDescription: '连接 Dropbox同步指定账号下的文件与文件夹。',
sharepointDescription: '通过 Microsoft Graph 连接 SharePoint 站点,同步其文档库。',
sharepointSiteUrlTip:
'要索引的 SharePoint 站点完整 URL例如 https://contoso.sharepoint.com/sites/MySite。需要具备 Sites.Read.All 与 Files.Read.All 应用权限(管理员同意)的 Azure AD 应用。',
boxDescription: '连接你的 Box 云盘以同步文件和文件夹。',
bitbucketDescription: '连接 Bitbucket同步 PR 内容。',
bitbucketTopWorkspaceTip:

View File

@@ -43,8 +43,8 @@ export enum DataSourceKey {
POSTGRESQL = 'postgresql',
REST_API = 'rest_api',
RSS = 'rss',
SHAREPOINT = 'sharepoint',
// SHAREPOINT = 'sharepoint',
// SLACK = 'slack',
// TEAMS = 'teams',
}
@@ -213,6 +213,11 @@ export const generateDataSourceInfo = (t: TFunction) => {
description: t(`setting.${DataSourceKey.MOODLE}Description`),
icon: <SvgIcon name={'data-source/moodle'} width={38} />,
},
[DataSourceKey.SHAREPOINT]: {
name: 'SharePoint',
description: t(`setting.${DataSourceKey.SHAREPOINT}Description`),
icon: <SvgIcon name={'data-source/sharepoint'} width={38} />,
},
[DataSourceKey.JIRA]: {
name: 'Jira',
description: t(`setting.${DataSourceKey.JIRA}Description`),
@@ -654,6 +659,34 @@ export const DataSourceFormFields = {
required: true,
},
],
[DataSourceKey.SHAREPOINT]: [
{
label: 'Site URL',
name: 'config.credentials.site_url',
type: FormFieldType.Text,
required: true,
placeholder: 'https://contoso.sharepoint.com/sites/MySite',
tooltip: t('setting.sharepointSiteUrlTip'),
},
{
label: 'Tenant ID',
name: 'config.credentials.tenant_id',
type: FormFieldType.Text,
required: true,
},
{
label: 'Client ID',
name: 'config.credentials.client_id',
type: FormFieldType.Text,
required: true,
},
{
label: 'Client Secret',
name: 'config.credentials.client_secret',
type: FormFieldType.Password,
required: true,
},
],
[DataSourceKey.JIRA]: jiraConstant(t),
[DataSourceKey.WEBDAV]: [
{
@@ -1509,6 +1542,18 @@ export const DataSourceFormDefaultValues = {
},
},
},
[DataSourceKey.SHAREPOINT]: {
name: '',
source: DataSourceKey.SHAREPOINT,
config: {
credentials: {
site_url: '',
tenant_id: '',
client_id: '',
client_secret: '',
},
},
},
[DataSourceKey.JIRA]: {
name: '',
source: DataSourceKey.JIRA,