From 98f2a2e60b6a6da3906628a157c1751ee439e5ca Mon Sep 17 00:00:00 2001 From: web-dev0521 Date: Thu, 4 Jun 2026 07:06:01 -0600 Subject: [PATCH] feat(connectors): add Azure Blob Storage data source connector (#15466) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What problem does this PR solve? Closes #15465. RAGFlow supports S3, Google Cloud Storage, R2, and OCI as data sources but not Azure Blob Storage, leaving Azure users without a way to index container objects into a knowledge base. This adds a first-class Azure Blob Storage data-source connector — distinct from RAGFlow's existing Azure storage *backends* (`rag/utils/azure_sas_conn.py`, `rag/utils/azure_spn_conn.py`) which store RAGFlow's own files. **Highlights** - `common/data_source/azure_blob_connector.py`: new `AzureBlobConnector` (`CheckpointedConnectorWithPermSync` + `SlimConnectorWithPermSync`). - Uses the existing `azure-storage-blob` dependency (already in `pyproject.toml`). - Three auth modes, tried in order of precedence: 1. **Account key** — `account_name` + `account_key` + `container_name`. 2. **Connection string** — `connection_string` + `container_name`. 3. **SAS token** — `container_url` + `sas_token` (same shape as `RAGFlowAzureSasBlob`). - ETag fingerprint stored per blob in `AzureBlobCheckpoint.etags` — unchanged blobs (same ETag as last run) are skipped without a download. Only new/modified blobs are fetched. - Optional `prefix` scopes indexing to a virtual folder. - `validate_connector_settings()` probes `get_container_properties()` and maps `AuthenticationFailed / 403 / ContainerNotFound` to typed connector exceptions. - Slim-doc IDs are blob names so prune reconciles correctly. - `common/constants.py`, `common/data_source/config.py`, `common/data_source/__init__.py`: register `azure_blob` in `FileSource` / `DocumentSource` and export `AzureBlobConnector`. - `rag/svr/sync_data_source.py`: new `AzureBlob(SyncBase)` class routed through `load_from_checkpoint` (ETag fingerprint owns change-detection) and added to `func_factory`. - Frontend: - `web/src/pages/user-setting/data-source/constant/index.tsx`: new `DataSourceKey.AZURE_BLOB`, auth-mode selector (account key / connection string / SAS token), all credential fields, prefix + batch-size, `syncDeletedFiles` capability, default form values, tile entry with icon. - `web/src/locales/{en,zh}.ts`: description + per-field tooltips for all 9 new keys. - `web/src/assets/svg/data-source/azure-blob.svg`: Azure-branded stacked-cylinders icon. **Verification** - `npm run build` (vite + esbuild) passes (37 s). ### Type of change - [x] New Feature (non-breaking change which adds functionality) --- common/constants.py | 1 + common/data_source/__init__.py | 2 + common/data_source/azure_blob_connector.py | 437 ++++++++++++++++++ common/data_source/config.py | 1 + rag/svr/sync_data_source.py | 54 +++ web/src/assets/svg/data-source/azure-blob.svg | 17 + web/src/locales/en.ts | 18 + web/src/locales/zh.ts | 18 + .../data-source/constant/index.tsx | 144 ++++++ 9 files changed, 692 insertions(+) create mode 100644 common/data_source/azure_blob_connector.py create mode 100644 web/src/assets/svg/data-source/azure-blob.svg diff --git a/common/constants.py b/common/constants.py index c98794e5e9..d0f32c4039 100644 --- a/common/constants.py +++ b/common/constants.py @@ -156,6 +156,7 @@ class FileSource(StrEnum): DINGTALK_AI_TABLE = "dingtalk_ai_table" ONEDRIVE = "onedrive" OUTLOOK = "outlook" + AZURE_BLOB = "azure_blob" class PipelineTaskType(StrEnum): diff --git a/common/data_source/__init__.py b/common/data_source/__init__.py index bfd365e095..c6ae1caf01 100644 --- a/common/data_source/__init__.py +++ b/common/data_source/__init__.py @@ -36,6 +36,7 @@ from .jira.connector import JiraConnector from .sharepoint_connector import SharePointConnector from .onedrive_connector import OneDriveConnector from .outlook_connector import OutlookConnector +from .azure_blob_connector import AzureBlobConnector from .teams_connector import TeamsConnector from .moodle_connector import MoodleConnector from .airtable_connector import AirtableConnector @@ -71,6 +72,7 @@ __all__ = [ "SharePointConnector", "OneDriveConnector", "OutlookConnector", + "AzureBlobConnector", "TeamsConnector", "MoodleConnector", "BlobType", diff --git a/common/data_source/azure_blob_connector.py b/common/data_source/azure_blob_connector.py new file mode 100644 index 0000000000..771aa13f5b --- /dev/null +++ b/common/data_source/azure_blob_connector.py @@ -0,0 +1,437 @@ +"""Azure Blob Storage data-source connector. + +Ingests blobs from a user's Azure container into a RAGFlow knowledge +base. This is distinct from RAGFlow's own Azure storage *backend* +(``rag/utils/azure_sas_conn.py``, ``rag/utils/azure_spn_conn.py``), +which stores RAGFlow's own files. + +Auth supports three mutually exclusive modes, selected explicitly by the +caller-supplied ``auth_mode`` (the UI hides the other modes' fields but +does not clear them, so we must not guess from whichever field happens to +be populated). When ``auth_mode`` is absent (older configs / direct API +callers) we fall back to field precedence: + + 1. **Connection string** — ``connection_string`` credential; one line, + everything embedded. Good for dev / testing. + 2. **Account key** — ``account_name`` + ``account_key``; maps to the + same underlying SAS-less AccountKey credential. + 3. **SAS token** — ``container_url`` + ``sas_token``; the shape that + ``RAGFlowAzureSasBlob`` already uses. + +Incremental runs are scoped by the poll time window +(``since_epoch`` < last-modified <= ``until_epoch``). +Each blob's ETag is also emitted as the document fingerprint, which the +indexing pipeline persists as ``content_hash`` so unchanged blobs are not +re-embedded. The connector itself keeps no cross-run ETag state. +""" + +from __future__ import annotations + +import logging +from datetime import datetime, timezone +from typing import Any, Generator + +from common.data_source.config import INDEX_BATCH_SIZE +from common.data_source.exceptions import ( + ConnectorMissingCredentialError, + ConnectorValidationError, + InsufficientPermissionsError, + UnexpectedValidationError, +) +from common.data_source.interfaces import ( + CheckpointedConnectorWithPermSync, + SecondsSinceUnixEpoch, + SlimConnectorWithPermSync, +) +from common.data_source.models import ConnectorCheckpoint, SlimDocument + +logger = logging.getLogger(__name__) + +# Extensions we ingest; mirrors the same set used by the OneDrive +# connector so behaviour is consistent across all file-based sources. +_SUPPORTED_EXTENSIONS = { + ".pdf", ".docx", ".doc", ".xlsx", ".xls", + ".pptx", ".ppt", ".txt", ".md", ".csv", + ".html", ".htm", ".json", ".xml", +} + +_AZURE_ENDPOINT_SUFFIX = "blob.core.windows.net" + + +class AzureBlobCheckpoint(ConnectorCheckpoint): + """Checkpoint marker for the Azure Blob connector. + + The connector keeps no cross-run state of its own: a single + ``load_from_checkpoint`` pass lists the container once and sets + ``has_more=False``. Incremental scoping comes from the poll time + window, and per-blob change detection from the document fingerprint + (ETag) the pipeline persists as ``content_hash``. + """ + + +class AzureBlobConnector(CheckpointedConnectorWithPermSync, SlimConnectorWithPermSync): + """Azure Blob Storage data-source connector. + + Authenticates with one of three credential modes (connection string, + account key, or SAS token), chosen by ``auth_mode``, and enumerates + blobs in the configured container under an optional prefix. Each blob's + ETag is surfaced as the document fingerprint so the pipeline can skip + re-embedding unchanged blobs across runs. + """ + + def __init__( + self, + batch_size: int = INDEX_BATCH_SIZE, + prefix: str | None = None, + allow_images: bool = False, + auth_mode: str | None = None, + ) -> None: + self.batch_size = batch_size + self.prefix = (prefix or "").lstrip("/") + self.allow_images = allow_images + # Explicitly selected credential mode: "connection_string", + # "account_key", or "sas_token". Empty falls back to precedence. + self.auth_mode = (auth_mode or "").strip().lower() + self._container_client = None + + # ------------------------------------------------------------------ + # Auth + # ------------------------------------------------------------------ + + def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None: + from azure.storage.blob import BlobServiceClient, ContainerClient + + conn_str = credentials.get("connection_string") + account_name = credentials.get("account_name") + account_key = credentials.get("account_key") + container_url = (credentials.get("container_url") or "").rstrip("/") + sas_token = credentials.get("sas_token") + container_name = credentials.get("container_name") or "" + + # Honor the explicitly selected auth mode. The UI hides inactive + # credential fields but does not clear them, so a user who fills one + # mode and then switches can leave stale values behind; selecting by + # field precedence would then authenticate with the wrong mode. + # Fall back to precedence only when no auth_mode was supplied. + mode = self.auth_mode + if not mode: + if conn_str: + mode = "connection_string" + elif account_name and account_key: + mode = "account_key" + elif container_url and sas_token: + mode = "sas_token" + + try: + if mode == "connection_string": + if not conn_str: + raise ConnectorMissingCredentialError( + "Azure Blob: connection_string is required for the connection_string auth mode" + ) + if not container_name: + raise ConnectorMissingCredentialError( + "Azure Blob: container_name is required together with connection_string" + ) + svc = BlobServiceClient.from_connection_string(conn_str) + self._container_client = svc.get_container_client(container_name) + elif mode == "account_key": + if not (account_name and account_key): + raise ConnectorMissingCredentialError( + "Azure Blob: account_name and account_key are required for the account_key auth mode" + ) + if not container_name: + raise ConnectorMissingCredentialError( + "Azure Blob: container_name is required together with account_name + account_key" + ) + account_url = f"https://{account_name}.{_AZURE_ENDPOINT_SUFFIX}" + svc = BlobServiceClient( + account_url=account_url, + credential=account_key, + ) + self._container_client = svc.get_container_client(container_name) + elif mode == "sas_token": + if not (container_url and sas_token): + raise ConnectorMissingCredentialError( + "Azure Blob: container_url and sas_token are required for the sas_token auth mode" + ) + # mirrors RAGFlowAzureSasBlob; strip a leading "?" so we + # never produce a double-"?" that breaks SAS auth. + normalized_sas = str(sas_token).lstrip("?") + full_url = f"{container_url}?{normalized_sas}" + self._container_client = ContainerClient.from_container_url(full_url) + else: + raise ConnectorMissingCredentialError( + "Azure Blob credentials are incomplete. Provide one of: " + "(a) connection_string + container_name, " + "(b) account_name + account_key + container_name, " + "(c) container_url + sas_token." + ) + except ConnectorMissingCredentialError: + raise + except Exception as exc: + raise ConnectorMissingCredentialError( + f"Failed to initialise Azure Blob client: {exc}" + ) from exc + + return None + + # ------------------------------------------------------------------ + # Validation + # ------------------------------------------------------------------ + + def validate_connector_settings(self) -> None: + if self._container_client is None: + raise ConnectorMissingCredentialError("Azure Blob") + + try: + # get_container_properties() costs one API call; it returns + # the ETag and last-modified of the container, proving both + # the credential and the container name are valid. + self._container_client.get_container_properties() + except Exception as exc: + msg = str(exc) + code = getattr(getattr(exc, "error_code", None), "value", None) or getattr(exc, "error_code", "") + if "AuthenticationFailed" in msg or "InvalidAuthenticationInfo" in msg: + raise ConnectorMissingCredentialError( + f"Azure Blob credential rejected: {msg[:300]}" + ) from exc + if "AuthorizationPermissionMismatch" in msg or "403" in msg: + raise InsufficientPermissionsError( + f"Azure Blob: insufficient permissions on container: {msg[:300]}" + ) from exc + if "ContainerNotFound" in msg or "404" in msg: + raise ConnectorValidationError( + f"Azure Blob: container not found: {msg[:300]}" + ) from exc + raise UnexpectedValidationError( + f"Azure Blob validation failed ({code}): {msg[:300]}" + ) from exc + + # ------------------------------------------------------------------ + # Checkpoint helpers + # ------------------------------------------------------------------ + + def build_dummy_checkpoint(self) -> AzureBlobCheckpoint: + return AzureBlobCheckpoint(has_more=True) + + def validate_checkpoint_json(self, checkpoint_json: str) -> AzureBlobCheckpoint: + try: + return AzureBlobCheckpoint.model_validate_json(checkpoint_json) + except Exception: + return self.build_dummy_checkpoint() + + # ------------------------------------------------------------------ + # Core data loading + # ------------------------------------------------------------------ + + def poll_source( + self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch + ) -> Any: + return self._iter_documents(since_epoch=start, until_epoch=end) + + def load_from_checkpoint( + self, + start: SecondsSinceUnixEpoch, + end: SecondsSinceUnixEpoch, + checkpoint: ConnectorCheckpoint, + ) -> Any: + if not isinstance(checkpoint, AzureBlobCheckpoint): + checkpoint = self.build_dummy_checkpoint() + since = start if start else None + until = end if end else None + return self._iter_documents( + checkpoint=checkpoint, since_epoch=since, until_epoch=until + ) + + def load_from_checkpoint_with_perm_sync( + self, + start: SecondsSinceUnixEpoch, + end: SecondsSinceUnixEpoch, + checkpoint: ConnectorCheckpoint, + ) -> Any: + return self.load_from_checkpoint(start, end, checkpoint) + + def retrieve_all_slim_docs_perm_sync( + self, + callback: Any = None, + ) -> Generator[list[SlimDocument], None, None]: + """Yield batches of slim documents for prune / permission sync.""" + if self._container_client is None: + raise ConnectorMissingCredentialError("Azure Blob") + + batch: list[SlimDocument] = [] + try: + for blob_props in self._container_client.list_blobs(name_starts_with=self.prefix or None): + name = blob_props.name + if not _has_supported_extension(name, self.allow_images): + continue + if callback: + callback(name, name) + batch.append(SlimDocument(id=name)) + if len(batch) >= self.batch_size: + yield batch + batch = [] + except Exception as exc: + raise UnexpectedValidationError( + f"Azure Blob prune listing failed: {exc}" + ) from exc + + if batch: + yield batch + + # ------------------------------------------------------------------ + # Internal document iteration + # ------------------------------------------------------------------ + + def _iter_documents( + self, + checkpoint: AzureBlobCheckpoint | None = None, + since_epoch: float | None = None, + until_epoch: float | None = None, + ): + from common.data_source.models import Document + + if self._container_client is None: + raise ConnectorMissingCredentialError("Azure Blob") + + batch: list[Document] = [] + + try: + for blob_props in self._container_client.list_blobs( + name_starts_with=self.prefix or None + ): + name: str = blob_props.name + + if not _has_supported_extension(name, self.allow_images): + continue + + # Raw ETag (always present); Azure updates it on every + # write. Emitted below as the document fingerprint so the + # pipeline persists it as content_hash and skips re-embedding + # unchanged blobs across runs. + current_etag = (blob_props.etag or "").strip('"') + + # Time-window filter: strict lower bound, inclusive upper + # bound (``since_epoch`` < last-modified <= ``until_epoch``). + # Excluding last-modified == since_epoch (the prior run's + # watermark, which that run already yielded) avoids stable + # duplicate re-fetches on the boundary — matching the + # Salesforce connector's ``> since``. Enforcing the upper + # bound keeps blobs modified mid-run from leaking into this + # window; they're picked up by the next run (whose lower bound + # is this run's upper bound), so an update can never fall into + # a gap between windows. + last_modified: datetime | None = blob_props.last_modified + if last_modified: + ts = last_modified.timestamp() + if since_epoch and ts <= since_epoch: + continue + if until_epoch and ts > until_epoch: + continue + + # Download blob content. A blob that was deleted between the + # listing and this fetch is genuinely gone — skip it. Any + # other failure (throttling, transient 5xx, network) must + # abort the run: the sync framework advances its watermark + # from successfully yielded docs, so silently skipping a + # transiently-failed blob while newer blobs succeed would + # move the watermark past it and drop it permanently. + try: + blob_client = self._container_client.get_blob_client(name) + data = blob_client.download_blob().readall() + except Exception as exc: + if _is_blob_gone(exc): + logger.warning( + "Azure Blob: %s vanished between listing and fetch; skipping", + name, + ) + continue + raise UnexpectedValidationError( + f"Azure Blob: failed to download {name}: {exc}" + ) from exc + + doc_updated_at = ( + last_modified.astimezone(timezone.utc) + if last_modified + else datetime.now(timezone.utc) + ) + + ext = _extension(name) + doc = Document( + id=name, + source="azure_blob", + semantic_identifier=name, + extension=ext, + blob=data, + doc_updated_at=doc_updated_at, + size_bytes=len(data), + fingerprint=current_etag or None, + metadata={ + "container": _container_name(self._container_client), + "etag": current_etag, + "prefix": self.prefix, + }, + ) + batch.append(doc) + + if len(batch) >= self.batch_size: + yield batch + batch = [] + except UnexpectedValidationError: + raise + except Exception as exc: + raise UnexpectedValidationError( + f"Azure Blob listing failed: {exc}" + ) from exc + + if batch: + yield batch + + if checkpoint is not None: + checkpoint.has_more = False + + +# ---------------------------------------------------------------------- +# Module-level helpers +# ---------------------------------------------------------------------- + +def _extension(name: str) -> str: + if "." not in name: + return "" + return "." + name.rsplit(".", 1)[-1].lower() + + +def _has_supported_extension(name: str, allow_images: bool) -> bool: + ext = _extension(name) + if ext in _SUPPORTED_EXTENSIONS: + return True + if allow_images and ext in {".png", ".jpg", ".jpeg", ".webp", ".gif", ".bmp", ".tiff"}: + return True + return False + + +def _is_blob_gone(exc: Exception) -> bool: + """True when a download failed because the blob no longer exists. + + Azure raises ``ResourceNotFoundError`` (status 404, error code + ``BlobNotFound``) when a blob listed moments earlier has since been + deleted. That is not data loss — the blob is gone — so it is safe to + skip. Detected by attribute and string so we need not import the Azure + exception type at module load. + """ + if getattr(exc, "status_code", None) == 404: + return True + code = getattr(exc, "error_code", "") or "" + if "BlobNotFound" in str(code): + return True + msg = str(exc) + return "BlobNotFound" in msg or "ResourceNotFound" in msg + + +def _container_name(client: Any) -> str: + """Extract the container name from a ContainerClient without + importing the Azure SDK at module level.""" + try: + return client.container_name + except AttributeError: + return "" diff --git a/common/data_source/config.py b/common/data_source/config.py index d0238c505e..a790df1a67 100644 --- a/common/data_source/config.py +++ b/common/data_source/config.py @@ -71,6 +71,7 @@ class DocumentSource(str, Enum): DINGTALK_AI_TABLE = "dingtalk_ai_table" ONEDRIVE = "onedrive" OUTLOOK = "outlook" + AZURE_BLOB = "azure_blob" class FileOrigin(str, Enum): diff --git a/rag/svr/sync_data_source.py b/rag/svr/sync_data_source.py index cfc86a2d27..7278736710 100644 --- a/rag/svr/sync_data_source.py +++ b/rag/svr/sync_data_source.py @@ -63,6 +63,7 @@ from common.data_source import ( RestAPIConnector, OneDriveConnector, OutlookConnector, + AzureBlobConnector, TeamsConnector, SlackConnector, SharePointConnector, @@ -1128,6 +1129,58 @@ class Outlook(SyncBase): return wrapper() +class AzureBlob(SyncBase): + SOURCE_NAME: str = FileSource.AZURE_BLOB + + async def _generate(self, task: dict): + raw_batch_size = self.conf.get("batch_size", INDEX_BATCH_SIZE) + try: + batch_size = int(raw_batch_size) + except (TypeError, ValueError): + batch_size = INDEX_BATCH_SIZE + if batch_size <= 0: + batch_size = INDEX_BATCH_SIZE + + self.connector = AzureBlobConnector( + batch_size=batch_size, + prefix=self.conf.get("prefix") or None, + allow_images=bool(self.conf.get("allow_images", False)), + auth_mode=self.conf.get("auth_mode"), + ) + credentials = self.conf.get("credentials") or {} + self.connector.load_credentials(credentials) + + # Route through load_from_checkpoint so incremental runs are scoped + # by the poll time window; per-blob ETags ride along as document + # fingerprints (content_hash) so unchanged blobs aren't re-embedded. + if task["reindex"] == "1" or not task["poll_range_start"]: + start_ts = 0.0 + else: + start_ts = task["poll_range_start"].timestamp() + end_ts = datetime.now(timezone.utc).timestamp() + checkpoint = self.connector.build_dummy_checkpoint() + document_batch_generator = self.connector.load_from_checkpoint( + start_ts, end_ts, checkpoint + ) + + container_hint = ( + credentials.get("container_name") + or credentials.get("container_url", "").rstrip("/").rsplit("/", 1)[-1] + or "" + ) + self.log_connection( + "Azure Blob", + f"{container_hint}/{self.conf.get('prefix', '') or ''}", + task, + ) + + def wrapper(): + for document_batch in document_batch_generator: + yield document_batch + + return wrapper() + + class Slack(SyncBase): SOURCE_NAME: str = FileSource.SLACK @@ -1990,6 +2043,7 @@ func_factory = { FileSource.SHAREPOINT: SharePoint, FileSource.ONEDRIVE: OneDrive, FileSource.OUTLOOK: Outlook, + FileSource.AZURE_BLOB: AzureBlob, FileSource.SLACK: Slack, FileSource.TEAMS: Teams, FileSource.MOODLE: Moodle, diff --git a/web/src/assets/svg/data-source/azure-blob.svg b/web/src/assets/svg/data-source/azure-blob.svg new file mode 100644 index 0000000000..7219d3446d --- /dev/null +++ b/web/src/assets/svg/data-source/azure-blob.svg @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + + + diff --git a/web/src/locales/en.ts b/web/src/locales/en.ts index 68878961d1..75d1aaeb0d 100644 --- a/web/src/locales/en.ts +++ b/web/src/locales/en.ts @@ -1407,6 +1407,24 @@ Example: Virtual Hosted Style`, 'Mail folder to sync (e.g. inbox, sentitems, archive). Defaults to inbox.', outlookUserIdsTip: 'Comma-separated UPNs or object IDs of mailboxes to sync. Leave blank to sync every mailbox in the tenant (requires User.Read.All).', + azure_blobDescription: + 'Index blobs from an Azure Blob Storage container into a knowledge base. Supports account-key, connection-string, and SAS-token auth. Unchanged blobs are skipped via ETag fingerprinting.', + azureBlobAuthModeTip: + 'Choose the authentication method. Account Key and Connection String require container_name; SAS Token requires container_url + sas_token.', + azureBlobAccountNameTip: + 'Azure storage account name (e.g. mystorageaccount). Required for account-key auth.', + azureBlobAccountKeyTip: + 'Storage account access key (Base64-encoded). Required for account-key auth.', + azureBlobConnectionStringTip: + 'Full Azure Storage connection string (DefaultEndpointsProtocol=https;AccountName=...;...). Required for connection-string auth.', + azureBlobContainerUrlTip: + 'Full HTTPS URL of the container (e.g. https://account.blob.core.windows.net/container). Required for SAS-token auth.', + azureBlobSasTokenTip: + 'SAS query string (without the leading "?"). Required for SAS-token auth.', + azureBlobContainerNameTip: + 'Name of the container to index. Required for account-key and connection-string auth.', + azureBlobPrefixTip: + 'Optional blob name prefix to limit indexing to a virtual folder (e.g. documents/reports/). Leave blank to index the entire container.', restApiQueryParamsTip: 'Key=value pairs (one per line) sent as URL query parameters. Use this instead of embedding params in the URL.', restApiHeadersTip: diff --git a/web/src/locales/zh.ts b/web/src/locales/zh.ts index 78f570cf4b..21deb62450 100644 --- a/web/src/locales/zh.ts +++ b/web/src/locales/zh.ts @@ -1119,6 +1119,24 @@ NER:使用 spaCy NER 和基于规则的关键词提取来抽取实体和关系 '要同步的邮件文件夹(例如 inbox、sentitems、archive),默认为 inbox。', outlookUserIdsTip: '要同步的邮箱 UPN 或对象 ID 列表(逗号分隔)。留空则同步租户内的所有邮箱(需要 User.Read.All 权限)。', + azure_blobDescription: + '将 Azure Blob 存储容器中的文件索引到知识库。支持账户密钥、连接字符串和 SAS 令牌三种认证方式,通过 ETag 指纹跳过未变更的文件。', + azureBlobAuthModeTip: + '选择认证方式。账户密钥和连接字符串需填写容器名称;SAS 令牌方式需填写容器 URL 和 SAS 令牌。', + azureBlobAccountNameTip: + 'Azure 存储账户名称(例如 mystorageaccount),账户密钥认证时必填。', + azureBlobAccountKeyTip: + '存储账户访问密钥(Base64 编码),账户密钥认证时必填。', + azureBlobConnectionStringTip: + '完整的 Azure 存储连接字符串(DefaultEndpointsProtocol=https;AccountName=...;...),连接字符串认证时必填。', + azureBlobContainerUrlTip: + '容器的完整 HTTPS 地址(例如 https://account.blob.core.windows.net/container),SAS 令牌认证时必填。', + azureBlobSasTokenTip: + 'SAS 查询字符串(不含开头的"?"),SAS 令牌认证时必填。', + azureBlobContainerNameTip: + '要索引的容器名称,账户密钥和连接字符串认证时必填。', + azureBlobPrefixTip: + '可选的 Blob 名称前缀,用于限定索引范围(例如 documents/reports/)。留空则索引整个容器。', teamsDescription: '通过 Microsoft Graph 连接 Microsoft Teams,同步频道帖子与回复。', teamsTenantIdTip: diff --git a/web/src/pages/user-setting/data-source/constant/index.tsx b/web/src/pages/user-setting/data-source/constant/index.tsx index 17d66b97b3..82ef99ab0e 100644 --- a/web/src/pages/user-setting/data-source/constant/index.tsx +++ b/web/src/pages/user-setting/data-source/constant/index.tsx @@ -45,6 +45,7 @@ export enum DataSourceKey { RSS = 'rss', ONEDRIVE = 'onedrive', OUTLOOK = 'outlook', + AZURE_BLOB = 'azure_blob', TEAMS = 'teams', SLACK = 'slack', SHAREPOINT = 'sharepoint', @@ -137,6 +138,9 @@ export const DataSourceFeatureVisibilityMap: Partial< [DataSourceKey.OUTLOOK]: { syncDeletedFiles: true, }, + [DataSourceKey.AZURE_BLOB]: { + syncDeletedFiles: true, + }, [DataSourceKey.TEAMS]: { syncDeletedFiles: true, }, @@ -335,6 +339,11 @@ export const generateDataSourceInfo = (t: TFunction) => { description: t(`setting.${DataSourceKey.OUTLOOK}Description`), icon: , }, + [DataSourceKey.AZURE_BLOB]: { + name: 'Azure Blob Storage', + description: t(`setting.${DataSourceKey.AZURE_BLOB}Description`), + icon: , + }, }; }; @@ -515,6 +524,124 @@ export const DataSourceFormFields = { }, }, ], + [DataSourceKey.AZURE_BLOB]: [ + { + label: 'Auth Mode', + name: 'config.auth_mode', + type: FormFieldType.Select, + required: true, + options: [ + { label: 'Account Key', value: 'account_key' }, + { label: 'Connection String', value: 'connection_string' }, + { label: 'SAS Token', value: 'sas_token' }, + ], + tooltip: t('setting.azureBlobAuthModeTip'), + }, + { + label: 'Account Name', + name: 'config.credentials.account_name', + type: FormFieldType.Text, + required: false, + placeholder: 'mystorageaccount', + tooltip: t('setting.azureBlobAccountNameTip'), + shouldRender: (values: any) => + values?.config?.auth_mode === 'account_key', + customValidate: (val: string, values: any) => + values?.config?.auth_mode === 'account_key' && !(val ?? '').trim() + ? 'Account name is required for account key auth' + : true, + }, + { + label: 'Account Key', + name: 'config.credentials.account_key', + type: FormFieldType.Password, + required: false, + tooltip: t('setting.azureBlobAccountKeyTip'), + shouldRender: (values: any) => + values?.config?.auth_mode === 'account_key', + customValidate: (val: string, values: any) => + values?.config?.auth_mode === 'account_key' && !val + ? 'Account key is required for account key auth' + : true, + }, + { + label: 'Connection String', + name: 'config.credentials.connection_string', + type: FormFieldType.Password, + required: false, + tooltip: t('setting.azureBlobConnectionStringTip'), + shouldRender: (values: any) => + values?.config?.auth_mode === 'connection_string', + customValidate: (val: string, values: any) => + values?.config?.auth_mode === 'connection_string' && !val + ? 'Connection string is required for connection string auth' + : true, + }, + { + label: 'Container URL', + name: 'config.credentials.container_url', + type: FormFieldType.Text, + required: false, + placeholder: 'https://account.blob.core.windows.net/container', + tooltip: t('setting.azureBlobContainerUrlTip'), + shouldRender: (values: any) => values?.config?.auth_mode === 'sas_token', + customValidate: (val: string, values: any) => + values?.config?.auth_mode === 'sas_token' && !(val ?? '').trim() + ? 'Container URL is required for SAS token auth' + : true, + }, + { + label: 'SAS Token', + name: 'config.credentials.sas_token', + type: FormFieldType.Password, + required: false, + tooltip: t('setting.azureBlobSasTokenTip'), + shouldRender: (values: any) => values?.config?.auth_mode === 'sas_token', + customValidate: (val: string, values: any) => + values?.config?.auth_mode === 'sas_token' && !val + ? 'SAS token is required for SAS token auth' + : true, + }, + { + label: 'Container Name', + name: 'config.credentials.container_name', + type: FormFieldType.Text, + required: false, + placeholder: 'my-container', + tooltip: t('setting.azureBlobContainerNameTip'), + shouldRender: (values: any) => + values?.config?.auth_mode === 'account_key' || + values?.config?.auth_mode === 'connection_string', + customValidate: (val: string, values: any) => { + const mode = values?.config?.auth_mode; + if ( + (mode === 'account_key' || mode === 'connection_string') && + !(val ?? '').trim() + ) { + return 'Container name is required for this auth mode'; + } + return true; + }, + }, + { + label: 'Prefix (optional)', + name: 'config.prefix', + type: FormFieldType.Text, + required: false, + placeholder: 'documents/reports/', + tooltip: t('setting.azureBlobPrefixTip'), + }, + { + label: 'Batch Size', + name: 'config.batch_size', + type: FormFieldType.Number, + required: false, + validation: { + min: 1, + message: 'Batch Size must be at least 1', + }, + }, + ], [DataSourceKey.RSS]: [ { label: 'Feed URL', @@ -1982,6 +2109,23 @@ export const DataSourceFormDefaultValues = { }, }, }, + [DataSourceKey.AZURE_BLOB]: { + name: '', + source: DataSourceKey.AZURE_BLOB, + config: { + auth_mode: 'account_key', + prefix: '', + batch_size: 2, + credentials: { + account_name: '', + account_key: '', + connection_string: '', + container_url: '', + sas_token: '', + container_name: '', + }, + }, + }, [DataSourceKey.REST_API]: { name: '', source: DataSourceKey.REST_API,