feat: implement Slack data source connector (#15188)

### What problem does this PR solve?

Closes #15187.

RAGFlow shipped a Slack connector
(`common/data_source/slack_connector.py`) but it was never usable:
`Slack._generate()` in the sync worker was a `pass` stub, the
connector's document-generating code was incompatible with the current
data model,
and Slack was commented out of the data-source settings UI. As a result,
teams had no way to index Slack channels/threads into a knowledge base.

This PR completes the connector end to end.

**Backend**

- `common/data_source/slack_connector.py`
  - Rewrote `thread_to_doc` to produce a blob-based `Document`
    (`extension`/`blob`/`size_bytes`). The previous implementation built the
    doc with a `sections=[...]` argument and omitted the now-required
    `blob`/`extension`/`size_bytes` fields, so it raised a validation error
    against the current `Document` model. Thread messages are now cleaned
    and flattened into a single UTF-8 text blob.
  - Added `load_from_state()` / `poll_source(start, end)` generators. The
    connector's checkpoint interface is a no-op stub, so both full and
    incremental syncs run through a single channel-iterating generator built
    on the existing module helpers (`get_channels`, `filter_channels`,
    `get_channel_messages`, `_process_message`), with per-channel thread
    de-duplication.
- `rag/svr/sync_data_source.py`
  - Implemented `Slack._generate()`. Credentials are loaded via
    `StaticCredentialsProvider` (the connector requires `slack_bot_token`
    and does not support `load_credentials`). Supports full reindex and
    incremental polling from `poll_range_start`, plus the optional channel
    filter. Modeled on the Confluence/Dropbox wrappers.
- `SlackConnector` was already exported from
  `common/data_source/__init__.py`.

**Frontend (`web/`)**

- Enabled the `SLACK` data-source enum and added its form fields (Slack
bot token + optional channel filter), default values, display metadata,
and a Slack icon.
- Added `slackDescription` / `slackBotTokenTip` / `slackChannelsTip`
strings to `en.ts` and `zh.ts`.

**Tests**

- `test/unit_test/data_source/test_slack_connector_unit.py`: unit tests
covering credential loading (`load_credentials` raises,
`set_credentials_provider` initializes clients, missing credentials
raises) and document generation (standalone message + flattened thread,
blob/extension/size_bytes/metadata, and the incremental poll time
window). All 5 pass; `ruff check` is clean.

Required Slack scopes: `channels:read`, `channels:history`,
`users:read`.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
web-dev0521
2026-05-28 01:46:07 -06:00
committed by GitHub
parent 7e83643536
commit 5de021ebb4
8 changed files with 428 additions and 19 deletions

View File

@@ -37,7 +37,6 @@ from common.data_source.models import (
Document,
DocumentFailure,
SlimDocument,
TextSection,
SecondsSinceUnixEpoch,
GenerateSlimDocumentOutput, MessageType, SlackMessageFilterReason, ChannelType, ThreadType, ProcessedSlackMessage,
CheckpointOutput
@@ -201,7 +200,10 @@ def thread_to_doc(
]
valid_experts = [expert for expert in experts if expert]
first_message = slack_cleaner.index_clean(cast(str, thread[0]["text"]))
cleaned_messages = [
slack_cleaner.index_clean(cast(str, m["text"])) for m in thread
]
first_message = cleaned_messages[0] if cleaned_messages else ""
snippet = (
first_message[:50].rstrip() + "..."
if len(first_message) > 50
@@ -212,21 +214,22 @@ def thread_to_doc(
"\n", " "
)
# The Document model is blob-based (no sections), so flatten the thread's
# cleaned messages into a single UTF-8 text blob.
content = "\n\n".join(cleaned_messages)
blob = content.encode("utf-8")
return Document(
id=_build_doc_id(channel_id=channel_id, thread_ts=thread[0]["ts"]),
sections=[
TextSection(
link=get_message_link(event=m, client=client, channel_id=channel_id),
text=slack_cleaner.index_clean(cast(str, m["text"])),
)
for m in thread
],
source="slack",
semantic_identifier=doc_sem_id,
extension=".txt",
blob=blob,
size_bytes=len(blob),
doc_updated_at=get_latest_message_time(thread),
primary_owners=valid_experts,
metadata={"Channel": channel["name"]},
external_access=channel_access,
externale_access=channel_access,
)
@@ -540,6 +543,79 @@ class SlackConnector(
callback=callback,
)
def _fetch_document_batches(
    self,
    oldest: str | None = None,
    latest: str | None = None,
    callback: Any = None,
) -> Generator[list[Document], None, None]:
    """Walk the selected channels and yield thread documents in batches.

    Both the full sync and the incremental sync are served by this single
    generator. Every message returned by ``get_channel_messages`` is run
    through ``_process_message``; resulting documents are accumulated and
    emitted once ``self.batch_size`` of them have been collected, with a
    final partial batch emitted after the last channel.

    Args:
        oldest: Forwarded unchanged to ``get_channel_messages`` as the
            lower bound of the history window (``None`` for no bound).
        latest: Forwarded unchanged to ``get_channel_messages`` as the
            upper bound of the history window (``None`` for no bound).
        callback: Forwarded unchanged to ``get_channel_messages``.

    Yields:
        Non-empty lists of ``Document`` objects.

    Raises:
        ConnectorMissingCredentialError: If ``self.client`` or
            ``self.text_cleaner`` has not been initialised. Because this
            is a generator, the error surfaces on first iteration.
    """
    if self.client is None or self.text_cleaner is None:
        raise ConnectorMissingCredentialError("Slack")

    selected_channels = filter_channels(
        get_channels(self.client), self.channels, self.channel_regex_enabled
    )

    pending: list[Document] = []
    for channel in selected_channels:
        # Thread de-duplication is tracked per channel, so start fresh here.
        threads_seen: set[str] = set()
        message_batches = get_channel_messages(
            client=self.client,
            channel=channel,
            oldest=oldest,
            latest=latest,
            callback=callback,
        )
        for message_batch in message_batches:
            for message in message_batch:
                outcome = _process_message(
                    message=message,
                    client=self.client,
                    channel=channel,
                    slack_cleaner=self.text_cleaner,
                    user_cache=self.user_cache,
                    seen_thread_ts=threads_seen,
                    channel_access=None,
                )

                # Record the timestamp even when processing failed, so the
                # same thread is not picked up again from a later reply.
                handled_ts = outcome.thread_or_message_ts
                if handled_ts:
                    threads_seen.add(handled_ts)

                failure = outcome.failure
                if failure is not None:
                    logging.warning(
                        "Slack message processing failure: %s",
                        failure.failure_message,
                    )
                    continue

                document = outcome.doc
                if document is None:
                    continue

                pending.append(document)
                if len(pending) < self.batch_size:
                    continue
                yield pending
                pending = []

    # Flush whatever is left after the last channel.
    if pending:
        yield pending
def load_from_state(self) -> Generator[list[Document], None, None]:
    """Run a full sync.

    Delegates to ``_fetch_document_batches`` with no ``oldest``/``latest``
    bounds and returns the resulting generator of document batches.
    """
    unbounded_batches = self._fetch_document_batches()
    return unbounded_batches
def poll_source(
    self,
    start: SecondsSinceUnixEpoch,
    end: SecondsSinceUnixEpoch,
) -> Generator[list[Document], None, None]:
    """Run an incremental sync over the ``[start, end]`` window.

    ``start`` and ``end`` are converted with ``str()`` and passed to
    ``_fetch_document_batches`` as ``oldest`` and ``latest`` respectively;
    the resulting generator of document batches is returned.
    """
    oldest_bound = str(start)
    latest_bound = str(end)
    return self._fetch_document_batches(oldest=oldest_bound, latest=latest_bound)
def load_from_checkpoint(
self,
start: SecondsSinceUnixEpoch,
@@ -602,6 +678,16 @@ class SlackConnector(
f"Slack API returned a failure: {error_msg}"
)
# 3) Confirm users:read scope is available (required by thread_to_doc)
users_resp = self.fast_client.users_info(user="USLACKBOT")
if not users_resp.get("ok", False):
error_msg = users_resp.get("error", "")
if error_msg in ("missing_scope", "not_allowed_token_type"):
raise InsufficientPermissionsError(
"Slack bot token lacks the 'users:read' scope required to look up message senders. "
"Please add 'users:read' to your Slack app's OAuth scopes."
)
except SlackApiError as e:
slack_error = e.response.get("error", "")
if slack_error == "ratelimited":