feat: implement Slack data source connector (#15188)

### What problem does this PR solve?

Closes #15187.

RAGFlow shipped a Slack connector (`common/data_source/slack_connector.py`) but it was never usable: `Slack._generate()` in the sync worker was a `pass` stub, the connector's document-generating code was incompatible with the current data model, and Slack was commented out of the data-source settings UI. As a result, teams had no way to index Slack channels/threads into a knowledge base. This PR completes the connector end to end.

**Backend**

- `common/data_source/slack_connector.py`
  - Rewrote `thread_to_doc` to produce a blob-based `Document` (`extension`/`blob`/`size_bytes`). The previous implementation built the doc with a `sections=[...]` argument and omitted the now-required `blob`/`extension`/`size_bytes` fields, so it raised a validation error against the current `Document` model. Thread messages are now cleaned and flattened into a single UTF-8 text blob.
  - Added `load_from_state()` / `poll_source(start, end)` generators. The connector's checkpoint interface is a no-op stub, so both full and incremental syncs run through a single channel-iterating generator built on the existing module helpers (`get_channels`, `filter_channels`, `get_channel_messages`, `_process_message`), with per-channel thread de-duplication.
- `rag/svr/sync_data_source.py`
  - Implemented `Slack._generate()`. Credentials are loaded via `StaticCredentialsProvider` (the connector requires `slack_bot_token` and does not support `load_credentials`). Supports full reindex and incremental polling from `poll_range_start`, plus the optional channel filter. Modeled on the Confluence/Dropbox wrappers.
- `SlackConnector` was already exported from `common/data_source/__init__.py`.

**Frontend (`web/`)**

- Enabled the `SLACK` data-source enum and added its form fields (Slack bot token + optional channel filter), default values, display metadata, and a Slack icon.
- Added `slackDescription` / `slackBotTokenTip` / `slackChannelsTip` strings to `en.ts` and `zh.ts`.

**Tests**

- `test/unit_test/data_source/test_slack_connector_unit.py`: unit tests covering credential loading (`load_credentials` raises, `set_credentials_provider` initializes clients, missing credentials raises) and document generation (standalone message + flattened thread, blob/extension/size_bytes/metadata, and the incremental poll time window). All 5 pass; `ruff check` is clean.

Required Slack scopes: `channels:read`, `channels:history`, `users:read`.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
2026-06-29 15:31:05 +08:00 · 2026-05-28 01:46:07 -06:00
parent 7e83643536
commit 5de021ebb4
8 changed files with 428 additions and 19 deletions
--- a/common/data_source/slack_connector.py
+++ b/common/data_source/slack_connector.py
@@ -37,7 +37,6 @@ from common.data_source.models import (
    Document,
    DocumentFailure,
    SlimDocument,
-    TextSection,
    SecondsSinceUnixEpoch,
    GenerateSlimDocumentOutput, MessageType, SlackMessageFilterReason, ChannelType, ThreadType, ProcessedSlackMessage,
    CheckpointOutput
@@ -201,7 +200,10 @@ def thread_to_doc(
        ]
        valid_experts = [expert for expert in experts if expert]

-    first_message = slack_cleaner.index_clean(cast(str, thread[0]["text"]))
+    cleaned_messages = [
+        slack_cleaner.index_clean(cast(str, m["text"])) for m in thread
+    ]
+    first_message = cleaned_messages[0] if cleaned_messages else ""
    snippet = (
        first_message[:50].rstrip() + "..."
        if len(first_message) > 50
@@ -212,21 +214,22 @@ def thread_to_doc(
        "\n", " "
    )

+    # The Document model is blob-based (no sections), so flatten the thread's
+    # cleaned messages into a single UTF-8 text blob.
+    content = "\n\n".join(cleaned_messages)
+    blob = content.encode("utf-8")
+
    return Document(
        id=_build_doc_id(channel_id=channel_id, thread_ts=thread[0]["ts"]),
-        sections=[
-            TextSection(
-                link=get_message_link(event=m, client=client, channel_id=channel_id),
-                text=slack_cleaner.index_clean(cast(str, m["text"])),
-            )
-            for m in thread
-        ],
        source="slack",
        semantic_identifier=doc_sem_id,
+        extension=".txt",
+        blob=blob,
+        size_bytes=len(blob),
        doc_updated_at=get_latest_message_time(thread),
        primary_owners=valid_experts,
        metadata={"Channel": channel["name"]},
-        external_access=channel_access,
+        externale_access=channel_access,
    )


@@ -540,6 +543,79 @@ class SlackConnector(
            callback=callback,
        )

+    def _fetch_document_batches(
+        self,
+        oldest: str | None = None,
+        latest: str | None = None,
+        callback: Any = None,
+    ) -> Generator[list[Document], None, None]:
+        """Iterate the configured channels and yield batches of thread documents.
+
+        The checkpoint interface is not implemented in this connector, so both
+        full and incremental syncs run through this generator. ``oldest`` /
+        ``latest`` are Slack epoch-second strings used to bound the
+        conversations history for incremental polling.
+
+        Args:
+            oldest: Lower time bound, forwarded unchanged to
+                ``get_channel_messages``. NOTE(review): ``None`` presumably
+                means "no lower bound" -- confirm in ``get_channel_messages``.
+            latest: Upper time bound, forwarded unchanged to
+                ``get_channel_messages`` (same caveat for ``None``).
+            callback: Passed straight through to ``get_channel_messages``;
+                this method never calls it itself.
+
+        Yields:
+            Lists of ``Document``. A list is emitted as soon as it holds at
+            least ``self.batch_size`` documents; whatever is left after the
+            last channel is emitted as a final, possibly smaller, list.
+
+        Raises:
+            ConnectorMissingCredentialError: If ``self.client`` or
+                ``self.text_cleaner`` is ``None``.
+        """
+        # Both attributes must be populated before any Slack call is made.
+        if self.client is None or self.text_cleaner is None:
+            raise ConnectorMissingCredentialError("Slack")
+
+        # Fetch the channel list once, then narrow it with the connector's
+        # own channel filter settings.
+        all_channels = get_channels(self.client)
+        filtered_channels = filter_channels(
+            all_channels, self.channels, self.channel_regex_enabled
+        )
+
+        # The accumulator lives outside the channel loop, so a single yielded
+        # batch may contain documents from more than one channel.
+        batch: list[Document] = []
+        for channel in filtered_channels:
+            # De-duplication state is reset per channel.
+            seen_thread_ts: set[str] = set()
+            for message_batch in get_channel_messages(
+                client=self.client,
+                channel=channel,
+                oldest=oldest,
+                latest=latest,
+                callback=callback,
+            ):
+                for message in message_batch:
+                    # No channel access information is supplied on this path
+                    # (``channel_access=None``).
+                    processed = _process_message(
+                        message=message,
+                        client=self.client,
+                        channel=channel,
+                        slack_cleaner=self.text_cleaner,
+                        user_cache=self.user_cache,
+                        seen_thread_ts=seen_thread_ts,
+                        channel_access=None,
+                    )
+
+                    # Recorded before the failure check, so a timestamp is
+                    # remembered even when processing of that message failed.
+                    # NOTE(review): presumably ``_process_message`` uses this
+                    # set to skip threads it has already handled -- confirm.
+                    if processed.thread_or_message_ts:
+                        seen_thread_ts.add(processed.thread_or_message_ts)
+
+                    # Failures are logged and skipped; they do not abort the
+                    # sync and are not surfaced to the caller.
+                    if processed.failure is not None:
+                        logging.warning(
+                            "Slack message processing failure: %s",
+                            processed.failure.failure_message,
+                        )
+                        continue
+
+                    if processed.doc is not None:
+                        batch.append(processed.doc)
+                        if len(batch) >= self.batch_size:
+                            yield batch
+                            # Rebind rather than clear, so the list already
+                            # handed to the consumer is left untouched.
+                            batch = []
+
+        # Flush the trailing partial batch, if any.
+        if batch:
+            yield batch
+
+    def load_from_state(self) -> Generator[list[Document], None, None]:
+        """Full sync: ingest every accessible channel message/thread.
+
+        Delegates to ``_fetch_document_batches`` with no ``oldest`` /
+        ``latest`` bounds and no callback, and returns that generator as is.
+        """
+        return self._fetch_document_batches()
+
+    def poll_source(
+        self,
+        start: SecondsSinceUnixEpoch,
+        end: SecondsSinceUnixEpoch,
+    ) -> Generator[list[Document], None, None]:
+        """Incremental sync bounded by a [start, end] epoch-seconds window.
+
+        ``start`` and ``end`` are converted with ``str()`` and handed to
+        ``_fetch_document_batches`` as its ``oldest`` / ``latest`` bounds; the
+        resulting generator is returned as is.
+        NOTE(review): whether the bounds are treated as inclusive is decided
+        downstream (``get_channel_messages`` / the Slack API) -- confirm.
+        """
+        return self._fetch_document_batches(oldest=str(start), latest=str(end))
+
    def load_from_checkpoint(
        self,
        start: SecondsSinceUnixEpoch,
@@ -602,6 +678,16 @@ class SlackConnector(
                    f"Slack API returned a failure: {error_msg}"
                )

+            # 3) Confirm users:read scope is available (required by thread_to_doc)
+            users_resp = self.fast_client.users_info(user="USLACKBOT")
+            if not users_resp.get("ok", False):
+                error_msg = users_resp.get("error", "")
+                if error_msg in ("missing_scope", "not_allowed_token_type"):
+                    raise InsufficientPermissionsError(
+                        "Slack bot token lacks the 'users:read' scope required to look up message senders. "
+                        "Please add 'users:read' to your Slack app's OAuth scopes."
+                    )
+
        except SlackApiError as e:
            slack_error = e.response.get("error", "")
            if slack_error == "ratelimited":