mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 15:31:05 +08:00
feat: implement Slack data source connector (#15188)
### What problem does this PR solve? Closes #15187. RAGFlow shipped a Slack connector (`common/data_source/slack_connector.py`) but it was never usable: `Slack._generate()` in the sync worker was a `pass` stub, the connector's document-generating code was incompatible with the current data model, and Slack was commented out of the data-source settings UI. As a result, teams had no way to index Slack channels/threads into a knowledge base. This PR completes the connector end to end. **Backend** - `common/data_source/slack_connector.py` - Rewrote `thread_to_doc` to produce a blob-based `Document` (`extension`/`blob`/`size_bytes`). The previous implementation built the doc with a `sections=[...]` argument and omitted the now-required `blob`/`extension`/ `size_bytes` fields, so it raised a validation error against the current `Document` model. Thread messages are now cleaned and flattened into a single UTF-8 text blob. - Added `load_from_state()` / `poll_source(start, end)` generators. The connector's checkpoint interface is a no-op stub, so both full and incremental syncs run through a single channel-iterating generator built on the existing module helpers (`get_channels`, `filter_channels`, `get_channel_messages`, `_process_message`), with per-channel thread de-duplication. - `rag/svr/sync_data_source.py` - Implemented `Slack._generate()`. Credentials are loaded via `StaticCredentialsProvider` (the connector requires `slack_bot_token` and does not support `load_credentials`). Supports full reindex and incremental polling from `poll_range_start`, plus the optional channel filter. Modeled on the Confluence/Dropbox wrappers. - `SlackConnector` was already exported from `common/data_source/__init__.py`. **Frontend (`web/`)** - Enabled the `SLACK` data-source enum and added its form fields (Slack bot token + optional channel filter), default values, display metadata, and a Slack icon. - Added `slackDescription` / `slackBotTokenTip` / `slackChannelsTip` strings to `en.ts` and `zh.ts`. **Tests** - `test/unit_test/data_source/test_slack_connector_unit.py`: unit tests covering credential loading (`load_credentials` raises, `set_credentials_provider` initializes clients, missing credentials raises) and document generation (standalone message + flattened thread, blob/extension/size_bytes/metadata, and the incremental poll time window). All 5 pass; `ruff check` is clean. Required Slack scopes: `channels:read`, `channels:history`, `users:read`. ### Type of change - [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@@ -37,7 +37,6 @@ from common.data_source.models import (
|
||||
Document,
|
||||
DocumentFailure,
|
||||
SlimDocument,
|
||||
TextSection,
|
||||
SecondsSinceUnixEpoch,
|
||||
GenerateSlimDocumentOutput, MessageType, SlackMessageFilterReason, ChannelType, ThreadType, ProcessedSlackMessage,
|
||||
CheckpointOutput
|
||||
@@ -201,7 +200,10 @@ def thread_to_doc(
|
||||
]
|
||||
valid_experts = [expert for expert in experts if expert]
|
||||
|
||||
first_message = slack_cleaner.index_clean(cast(str, thread[0]["text"]))
|
||||
cleaned_messages = [
|
||||
slack_cleaner.index_clean(cast(str, m["text"])) for m in thread
|
||||
]
|
||||
first_message = cleaned_messages[0] if cleaned_messages else ""
|
||||
snippet = (
|
||||
first_message[:50].rstrip() + "..."
|
||||
if len(first_message) > 50
|
||||
@@ -212,21 +214,22 @@ def thread_to_doc(
|
||||
"\n", " "
|
||||
)
|
||||
|
||||
# The Document model is blob-based (no sections), so flatten the thread's
|
||||
# cleaned messages into a single UTF-8 text blob.
|
||||
content = "\n\n".join(cleaned_messages)
|
||||
blob = content.encode("utf-8")
|
||||
|
||||
return Document(
|
||||
id=_build_doc_id(channel_id=channel_id, thread_ts=thread[0]["ts"]),
|
||||
sections=[
|
||||
TextSection(
|
||||
link=get_message_link(event=m, client=client, channel_id=channel_id),
|
||||
text=slack_cleaner.index_clean(cast(str, m["text"])),
|
||||
)
|
||||
for m in thread
|
||||
],
|
||||
source="slack",
|
||||
semantic_identifier=doc_sem_id,
|
||||
extension=".txt",
|
||||
blob=blob,
|
||||
size_bytes=len(blob),
|
||||
doc_updated_at=get_latest_message_time(thread),
|
||||
primary_owners=valid_experts,
|
||||
metadata={"Channel": channel["name"]},
|
||||
external_access=channel_access,
|
||||
externale_access=channel_access,
|
||||
)
|
||||
|
||||
|
||||
@@ -540,6 +543,79 @@ class SlackConnector(
|
||||
callback=callback,
|
||||
)
|
||||
|
||||
def _fetch_document_batches(
|
||||
self,
|
||||
oldest: str | None = None,
|
||||
latest: str | None = None,
|
||||
callback: Any = None,
|
||||
) -> Generator[list[Document], None, None]:
|
||||
"""Iterate the configured channels and yield batches of thread documents.
|
||||
|
||||
The checkpoint interface is not implemented in this connector, so both
|
||||
full and incremental syncs run through this generator. ``oldest`` /
|
||||
``latest`` are Slack epoch-second strings used to bound the
|
||||
conversations history for incremental polling.
|
||||
"""
|
||||
if self.client is None or self.text_cleaner is None:
|
||||
raise ConnectorMissingCredentialError("Slack")
|
||||
|
||||
all_channels = get_channels(self.client)
|
||||
filtered_channels = filter_channels(
|
||||
all_channels, self.channels, self.channel_regex_enabled
|
||||
)
|
||||
|
||||
batch: list[Document] = []
|
||||
for channel in filtered_channels:
|
||||
seen_thread_ts: set[str] = set()
|
||||
for message_batch in get_channel_messages(
|
||||
client=self.client,
|
||||
channel=channel,
|
||||
oldest=oldest,
|
||||
latest=latest,
|
||||
callback=callback,
|
||||
):
|
||||
for message in message_batch:
|
||||
processed = _process_message(
|
||||
message=message,
|
||||
client=self.client,
|
||||
channel=channel,
|
||||
slack_cleaner=self.text_cleaner,
|
||||
user_cache=self.user_cache,
|
||||
seen_thread_ts=seen_thread_ts,
|
||||
channel_access=None,
|
||||
)
|
||||
|
||||
if processed.thread_or_message_ts:
|
||||
seen_thread_ts.add(processed.thread_or_message_ts)
|
||||
|
||||
if processed.failure is not None:
|
||||
logging.warning(
|
||||
"Slack message processing failure: %s",
|
||||
processed.failure.failure_message,
|
||||
)
|
||||
continue
|
||||
|
||||
if processed.doc is not None:
|
||||
batch.append(processed.doc)
|
||||
if len(batch) >= self.batch_size:
|
||||
yield batch
|
||||
batch = []
|
||||
|
||||
if batch:
|
||||
yield batch
|
||||
|
||||
def load_from_state(self) -> Generator[list[Document], None, None]:
|
||||
"""Full sync: ingest every accessible channel message/thread."""
|
||||
return self._fetch_document_batches()
|
||||
|
||||
def poll_source(
|
||||
self,
|
||||
start: SecondsSinceUnixEpoch,
|
||||
end: SecondsSinceUnixEpoch,
|
||||
) -> Generator[list[Document], None, None]:
|
||||
"""Incremental sync bounded by a [start, end] epoch-seconds window."""
|
||||
return self._fetch_document_batches(oldest=str(start), latest=str(end))
|
||||
|
||||
def load_from_checkpoint(
|
||||
self,
|
||||
start: SecondsSinceUnixEpoch,
|
||||
@@ -602,6 +678,16 @@ class SlackConnector(
|
||||
f"Slack API returned a failure: {error_msg}"
|
||||
)
|
||||
|
||||
# 3) Confirm users:read scope is available (required by thread_to_doc)
|
||||
users_resp = self.fast_client.users_info(user="USLACKBOT")
|
||||
if not users_resp.get("ok", False):
|
||||
error_msg = users_resp.get("error", "")
|
||||
if error_msg in ("missing_scope", "not_allowed_token_type"):
|
||||
raise InsufficientPermissionsError(
|
||||
"Slack bot token lacks the 'users:read' scope required to look up message senders. "
|
||||
"Please add 'users:read' to your Slack app's OAuth scopes."
|
||||
)
|
||||
|
||||
except SlackApiError as e:
|
||||
slack_error = e.response.get("error", "")
|
||||
if slack_error == "ratelimited":
|
||||
|
||||
Reference in New Issue
Block a user