mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 23:41:12 +08:00
feat: implement Slack data source connector (#15188)
### What problem does this PR solve?

Closes #15187.

RAGFlow shipped a Slack connector (`common/data_source/slack_connector.py`) but it was never usable: `Slack._generate()` in the sync worker was a `pass` stub, the connector's document-generating code was incompatible with the current data model, and Slack was commented out of the data-source settings UI. As a result, teams had no way to index Slack channels/threads into a knowledge base.

This PR completes the connector end to end.

**Backend**

- `common/data_source/slack_connector.py`
  - Rewrote `thread_to_doc` to produce a blob-based `Document` (`extension`/`blob`/`size_bytes`). The previous implementation built the doc with a `sections=[...]` argument and omitted the now-required `blob`/`extension`/`size_bytes` fields, so it raised a validation error against the current `Document` model. Thread messages are now cleaned and flattened into a single UTF-8 text blob.
  - Added `load_from_state()` / `poll_source(start, end)` generators. The connector's checkpoint interface is a no-op stub, so both full and incremental syncs run through a single channel-iterating generator built on the existing module helpers (`get_channels`, `filter_channels`, `get_channel_messages`, `_process_message`), with per-channel thread de-duplication.
- `rag/svr/sync_data_source.py`
  - Implemented `Slack._generate()`. Credentials are loaded via `StaticCredentialsProvider` (the connector requires `slack_bot_token` and does not support `load_credentials`). Supports full reindex and incremental polling from `poll_range_start`, plus the optional channel filter. Modeled on the Confluence/Dropbox wrappers.
- `SlackConnector` was already exported from `common/data_source/__init__.py`.

**Frontend (`web/`)**

- Enabled the `SLACK` data-source enum and added its form fields (Slack bot token + optional channel filter), default values, display metadata, and a Slack icon.
- Added `slackDescription` / `slackBotTokenTip` / `slackChannelsTip` strings to `en.ts` and `zh.ts`.

**Tests**

- `test/unit_test/data_source/test_slack_connector_unit.py`: unit tests covering credential loading (`load_credentials` raises, `set_credentials_provider` initializes clients, missing credentials raises) and document generation (standalone message + flattened thread, blob/extension/size_bytes/metadata, and the incremental poll time window). All 5 pass; `ruff check` is clean.

Required Slack scopes: `channels:read`, `channels:history`, `users:read`.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@@ -64,16 +64,19 @@ def test_documents_upload_and_list(rest_client, create_dataset, tmp_path):
|
||||
assert any(doc["name"] == fp.name for doc in list_payload["data"]["docs"]), list_payload
|
||||
|
||||
|
||||
def _upload_files(rest_client, dataset_id, file_paths):
|
||||
def _upload_files(rest_client, dataset_id, file_paths, timeout=None):
|
||||
with ExitStack() as stack:
|
||||
files = [("file", (fp.name, stack.enter_context(fp.open("rb")))) for fp in file_paths]
|
||||
return rest_client.post(f"/datasets/{dataset_id}/documents", files=files)
|
||||
kwargs = {"files": files}
|
||||
if timeout is not None:
|
||||
kwargs["timeout"] = timeout
|
||||
return rest_client.post(f"/datasets/{dataset_id}/documents", **kwargs)
|
||||
|
||||
|
||||
def _seed_documents(rest_client, create_dataset, tmp_path, count=5):
|
||||
def _seed_documents(rest_client, create_dataset, tmp_path, count=5, timeout=None):
|
||||
dataset_id = create_dataset("dataset_list_contract")
|
||||
file_paths = [create_txt_file(tmp_path / f"ragflow_test_upload_{i}.txt") for i in range(count)]
|
||||
res = _upload_files(rest_client, dataset_id, file_paths)
|
||||
res = _upload_files(rest_client, dataset_id, file_paths, timeout=timeout)
|
||||
assert res.status_code == 200
|
||||
payload = res.json()
|
||||
assert payload["code"] == 0, payload
|
||||
@@ -1166,7 +1169,9 @@ def test_documents_delete_invalid_dataset_partial_duplicate_repeat_and_cross_dat
|
||||
|
||||
@pytest.mark.p2
|
||||
def test_documents_delete_concurrent_and_bulk_contract(rest_client, create_dataset, tmp_path):
|
||||
dataset_id, uploaded_docs = _seed_documents(rest_client, create_dataset, tmp_path, count=60)
|
||||
dataset_id, uploaded_docs = _seed_documents(
|
||||
rest_client, create_dataset, tmp_path, count=60, timeout=120
|
||||
)
|
||||
document_ids = [doc["id"] for doc in uploaded_docs]
|
||||
|
||||
with ThreadPoolExecutor(max_workers=8) as executor:
|
||||
@@ -1192,9 +1197,15 @@ def test_documents_delete_concurrent_and_bulk_contract(rest_client, create_datas
|
||||
assert list_after_payload["code"] == 0, list_after_payload
|
||||
assert list_after_payload["data"]["total"] == 0, list_after_payload
|
||||
|
||||
bulk_dataset_id, bulk_docs = _seed_documents(rest_client, create_dataset, tmp_path, count=120)
|
||||
bulk_dataset_id, bulk_docs = _seed_documents(
|
||||
rest_client, create_dataset, tmp_path, count=120, timeout=120
|
||||
)
|
||||
bulk_ids = [doc["id"] for doc in bulk_docs]
|
||||
bulk_delete_res = rest_client.delete(f"/datasets/{bulk_dataset_id}/documents", json={"ids": bulk_ids})
|
||||
bulk_delete_res = rest_client.delete(
|
||||
f"/datasets/{bulk_dataset_id}/documents",
|
||||
json={"ids": bulk_ids},
|
||||
timeout=120,
|
||||
)
|
||||
assert bulk_delete_res.status_code == 200
|
||||
bulk_delete_payload = bulk_delete_res.json()
|
||||
assert bulk_delete_payload["code"] == 0, bulk_delete_payload
|
||||
|
||||
Reference in New Issue
Block a user