feat: implement Slack data source connector (#15188)

### What problem does this PR solve?

Closes #15187.

RAGFlow shipped a Slack connector (`common/data_source/slack_connector.py`) but it was never usable: `Slack._generate()` in the sync worker was a `pass` stub, the connector's document-generating code was incompatible with the current data model, and Slack was commented out of the data-source settings UI. As a result, teams had no way to index Slack channels/threads into a knowledge base. This PR completes the connector end to end.

**Backend**

- `common/data_source/slack_connector.py`
  - Rewrote `thread_to_doc` to produce a blob-based `Document` (`extension`/`blob`/`size_bytes`). The previous implementation built the doc with a `sections=[...]` argument and omitted the now-required `blob`/`extension`/`size_bytes` fields, so it raised a validation error against the current `Document` model. Thread messages are now cleaned and flattened into a single UTF-8 text blob.
  - Added `load_from_state()` / `poll_source(start, end)` generators. The connector's checkpoint interface is a no-op stub, so both full and incremental syncs run through a single channel-iterating generator built on the existing module helpers (`get_channels`, `filter_channels`, `get_channel_messages`, `_process_message`), with per-channel thread de-duplication.
- `rag/svr/sync_data_source.py`
  - Implemented `Slack._generate()`. Credentials are loaded via `StaticCredentialsProvider` (the connector requires `slack_bot_token` and does not support `load_credentials`). Supports full reindex and incremental polling from `poll_range_start`, plus the optional channel filter. Modeled on the Confluence/Dropbox wrappers.
- `SlackConnector` was already exported from `common/data_source/__init__.py`.

**Frontend (`web/`)**

- Enabled the `SLACK` data-source enum and added its form fields (Slack bot token + optional channel filter), default values, display metadata, and a Slack icon.
- Added `slackDescription` / `slackBotTokenTip` / `slackChannelsTip` strings to `en.ts` and `zh.ts`.

**Tests**

- `test/unit_test/data_source/test_slack_connector_unit.py`: unit tests covering credential loading (`load_credentials` raises, `set_credentials_provider` initializes clients, missing credentials raises) and document generation (standalone message + flattened thread, blob/extension/size_bytes/metadata, and the incremental poll time window). All 5 pass; `ruff check` is clean.

Required Slack scopes: `channels:read`, `channels:history`, `users:read`.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
2026-06-29 23:41:12 +08:00 · 2026-05-28 01:46:07 -06:00
parent 7e83643536
commit 5de021ebb4
8 changed files with 428 additions and 19 deletions
--- a/test/testcases/restful_api/test_documents.py
+++ b/test/testcases/restful_api/test_documents.py
@@ -64,16 +64,19 @@ def test_documents_upload_and_list(rest_client, create_dataset, tmp_path):
    assert any(doc["name"] == fp.name for doc in list_payload["data"]["docs"]), list_payload


-def _upload_files(rest_client, dataset_id, file_paths):
+def _upload_files(rest_client, dataset_id, file_paths, timeout=None):
    with ExitStack() as stack:
        files = [("file", (fp.name, stack.enter_context(fp.open("rb")))) for fp in file_paths]
-        return rest_client.post(f"/datasets/{dataset_id}/documents", files=files)
+        kwargs = {"files": files}
+        if timeout is not None:
+            kwargs["timeout"] = timeout
+        return rest_client.post(f"/datasets/{dataset_id}/documents", **kwargs)


-def _seed_documents(rest_client, create_dataset, tmp_path, count=5):
+def _seed_documents(rest_client, create_dataset, tmp_path, count=5, timeout=None):
    dataset_id = create_dataset("dataset_list_contract")
    file_paths = [create_txt_file(tmp_path / f"ragflow_test_upload_{i}.txt") for i in range(count)]
-    res = _upload_files(rest_client, dataset_id, file_paths)
+    res = _upload_files(rest_client, dataset_id, file_paths, timeout=timeout)
    assert res.status_code == 200
    payload = res.json()
    assert payload["code"] == 0, payload
@@ -1166,7 +1169,9 @@ def test_documents_delete_invalid_dataset_partial_duplicate_repeat_and_cross_dat

@pytest.mark.p2
 def test_documents_delete_concurrent_and_bulk_contract(rest_client, create_dataset, tmp_path):
-    dataset_id, uploaded_docs = _seed_documents(rest_client, create_dataset, tmp_path, count=60)
+    dataset_id, uploaded_docs = _seed_documents(
+        rest_client, create_dataset, tmp_path, count=60, timeout=120
+    )
    document_ids = [doc["id"] for doc in uploaded_docs]

    with ThreadPoolExecutor(max_workers=8) as executor:
@@ -1192,9 +1197,15 @@ def test_documents_delete_concurrent_and_bulk_contract(rest_client, create_datas
    assert list_after_payload["code"] == 0, list_after_payload
    assert list_after_payload["data"]["total"] == 0, list_after_payload

-    bulk_dataset_id, bulk_docs = _seed_documents(rest_client, create_dataset, tmp_path, count=120)
+    bulk_dataset_id, bulk_docs = _seed_documents(
+        rest_client, create_dataset, tmp_path, count=120, timeout=120
+    )
    bulk_ids = [doc["id"] for doc in bulk_docs]
-    bulk_delete_res = rest_client.delete(f"/datasets/{bulk_dataset_id}/documents", json={"ids": bulk_ids})
+    bulk_delete_res = rest_client.delete(
+        f"/datasets/{bulk_dataset_id}/documents",
+        json={"ids": bulk_ids},
+        timeout=120,
+    )
    assert bulk_delete_res.status_code == 200
    bulk_delete_payload = bulk_delete_res.json()
    assert bulk_delete_payload["code"] == 0, bulk_delete_payload