mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 23:41:12 +08:00
### What problem does this PR solve? Closes #15187. RAGFlow shipped a Slack connector (`common/data_source/slack_connector.py`) but it was never usable: `Slack._generate()` in the sync worker was a `pass` stub, the connector's document-generating code was incompatible with the current data model, and Slack was commented out of the data-source settings UI. As a result, teams had no way to index Slack channels/threads into a knowledge base. This PR completes the connector end to end. **Backend** - `common/data_source/slack_connector.py` - Rewrote `thread_to_doc` to produce a blob-based `Document` (`extension`/`blob`/`size_bytes`). The previous implementation built the doc with a `sections=[...]` argument and omitted the now-required `blob`/`extension`/ `size_bytes` fields, so it raised a validation error against the current `Document` model. Thread messages are now cleaned and flattened into a single UTF-8 text blob. - Added `load_from_state()` / `poll_source(start, end)` generators. The connector's checkpoint interface is a no-op stub, so both full and incremental syncs run through a single channel-iterating generator built on the existing module helpers (`get_channels`, `filter_channels`, `get_channel_messages`, `_process_message`), with per-channel thread de-duplication. - `rag/svr/sync_data_source.py` - Implemented `Slack._generate()`. Credentials are loaded via `StaticCredentialsProvider` (the connector requires `slack_bot_token` and does not support `load_credentials`). Supports full reindex and incremental polling from `poll_range_start`, plus the optional channel filter. Modeled on the Confluence/Dropbox wrappers. - `SlackConnector` was already exported from `common/data_source/__init__.py`. **Frontend (`web/`)** - Enabled the `SLACK` data-source enum and added its form fields (Slack bot token + optional channel filter), default values, display metadata, and a Slack icon. 
- Added `slackDescription` / `slackBotTokenTip` / `slackChannelsTip` strings to `en.ts` and `zh.ts`. **Tests** - `test/unit_test/data_source/test_slack_connector_unit.py`: unit tests covering credential loading (`load_credentials` raises, `set_credentials_provider` initializes clients, missing credentials raises) and document generation (standalone message + flattened thread, blob/extension/size_bytes/metadata, and the incremental poll time window). All 5 pass; `ruff check` is clean. Required Slack scopes: `channels:read`, `channels:history`, `users:read`. ### Type of change - [x] New Feature (non-breaking change which adds functionality)
1668 lines
78 KiB
Python
1668 lines
78 KiB
Python
#
|
|
# Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
import string
|
|
from contextlib import ExitStack
|
|
from pathlib import Path
|
|
import uuid
|
|
|
|
from openpyxl import Workbook
|
|
import pytest
|
|
import requests
|
|
from requests_toolbelt import MultipartEncoder
|
|
from test.testcases.configs import DEFAULT_PARSER_CONFIG, DOCUMENT_NAME_LIMIT, HOST_ADDRESS, INVALID_API_TOKEN, INVALID_ID_32, VERSION
|
|
from test.testcases.restful_api.helpers.client import RestClient
|
|
from test.testcases.utils import compare_by_hash
|
|
from test.testcases.utils.file_utils import (
|
|
create_docx_file,
|
|
create_eml_file,
|
|
create_excel_file,
|
|
create_html_file,
|
|
create_image_file,
|
|
create_json_file,
|
|
create_md_file,
|
|
create_pdf_file,
|
|
create_ppt_file,
|
|
)
|
|
from utils import wait_for
|
|
from utils.file_utils import create_txt_file
|
|
|
|
|
|
@pytest.mark.p1
def test_documents_upload_and_list(rest_client, create_dataset, tmp_path):
    """Upload one text file to a fresh dataset and check that the listing endpoint reports it."""
    dataset_id = create_dataset("dataset_upload_list")
    source_file = create_txt_file(tmp_path / "upload_and_list.txt")

    # Upload the file as a multipart "file" part.
    with source_file.open("rb") as handle:
        upload_res = rest_client.post(
            f"/datasets/{dataset_id}/documents",
            files=[("file", (source_file.name, handle))],
        )
    assert upload_res.status_code == 200
    upload_body = upload_res.json()
    assert upload_body["code"] == 0, upload_body
    assert upload_body["data"][0]["dataset_id"] == dataset_id, upload_body

    # The uploaded name must show up among the listed documents.
    listing_res = rest_client.get(f"/datasets/{dataset_id}/documents")
    assert listing_res.status_code == 200
    listing_body = listing_res.json()
    assert listing_body["code"] == 0, listing_body
    listing = listing_body["data"]
    assert listing["total"] >= 1, listing_body
    assert any(entry["name"] == source_file.name for entry in listing["docs"]), listing_body
|
|
|
|
|
|
def _upload_files(rest_client, dataset_id, file_paths, timeout=None):
    """POST the given files to ``/datasets/{dataset_id}/documents`` in one multipart request.

    Each path is opened in binary mode and sent as a ``("file", (name, handle))``
    part. All handles stay open for the duration of the request and are closed
    when the function returns. ``timeout`` is forwarded to ``rest_client.post``
    only when it is not ``None``.

    Returns whatever ``rest_client.post`` returns.
    """
    with ExitStack() as open_handles:
        parts = []
        for path in file_paths:
            parts.append(("file", (path.name, open_handles.enter_context(path.open("rb")))))

        request_kwargs = {"files": parts}
        if timeout is not None:
            request_kwargs["timeout"] = timeout
        # The request must be issued inside the ExitStack so the handles are still open.
        return rest_client.post(f"/datasets/{dataset_id}/documents", **request_kwargs)
|
|
|
|
|
|
def _seed_documents(rest_client, create_dataset, tmp_path, count=5, timeout=None):
    """Create the ``dataset_list_contract`` dataset and upload ``count`` generated text files to it.

    The files are named ``ragflow_test_upload_<i>.txt`` under ``tmp_path``.
    Asserts that the upload succeeded and returned exactly ``count`` entries.

    Returns ``(dataset_id, uploaded_entries)`` where ``uploaded_entries`` is the
    ``data`` list from the upload response.
    """
    dataset_id = create_dataset("dataset_list_contract")

    seeded_paths = []
    for index in range(count):
        seeded_paths.append(create_txt_file(tmp_path / f"ragflow_test_upload_{index}.txt"))

    upload_res = _upload_files(rest_client, dataset_id, seeded_paths, timeout=timeout)
    assert upload_res.status_code == 200
    body = upload_res.json()
    assert body["code"] == 0, body
    assert len(body["data"]) == count, body
    return dataset_id, body["data"]
|
|
|
|
|
|
def _seed_documents_for_update(rest_client, create_dataset, tmp_path):
    """Create the ``dataset_update_contract`` dataset holding two generated text files.

    The files are ``ragflow_test_upload_0.txt`` and ``ragflow_test_upload_1.txt``.
    Asserts the upload succeeded and returns ``(dataset_id, uploaded_entries)``,
    where ``uploaded_entries`` is the ``data`` field of the upload response.
    """
    dataset_id = create_dataset("dataset_update_contract")
    first_path = create_txt_file(tmp_path / "ragflow_test_upload_0.txt")
    second_path = create_txt_file(tmp_path / "ragflow_test_upload_1.txt")

    upload_res = _upload_files(rest_client, dataset_id, [first_path, second_path])
    assert upload_res.status_code == 200
    body = upload_res.json()
    assert body["code"] == 0, body
    return dataset_id, body["data"]
|
|
|
|
|
|
def _assert_docs_sorted(docs, key, reverse):
    """Assert that ``docs`` are already ordered by ``doc.get(key)``.

    ``reverse=True`` means descending order is expected, ``reverse=False`` ascending.
    Raises ``AssertionError`` when the observed order differs from the sorted order.
    """
    observed = []
    for doc in docs:
        observed.append(doc.get(key))
    expected = sorted(observed, reverse=reverse)
    assert observed == expected
|
|
|
|
|
|
@wait_for(200, 1, "Document parsing timeout in RESTful document tests")
def _wait_document_runs(rest_client, dataset_id, document_ids, expected_run="DONE"):
    """Report whether every document in ``document_ids`` currently has ``run == expected_run``.

    Performs a single listing request and returns ``False`` on a non-200 status,
    a non-zero body code, a missing document, or a mismatching ``run`` value.
    Otherwise returns ``True``.

    NOTE(review): the ``wait_for`` decorator is defined outside this file;
    presumably it re-invokes this check until it returns True or times out — confirm.
    """
    # Request a page large enough to contain every awaited document (at least 100).
    page_size = max(100, len(document_ids))
    res = rest_client.get(f"/datasets/{dataset_id}/documents", params={"page_size": page_size})
    if res.status_code != 200:
        return False

    body = res.json()
    if body["code"] != 0:
        return False

    listed = {entry["id"]: entry for entry in body["data"]["docs"]}
    return all(
        bool(listed.get(doc_id)) and listed[doc_id].get("run") == expected_run
        for doc_id in document_ids
    )
|
|
|
|
|
|
def _download_document_to_file(rest_client, dataset_id, document_id, save_path):
    """Fetch a document and, if the response is a binary payload, write it to ``save_path``.

    The body is saved only when the status is 200 and the ``Content-Type`` header
    starts with ``application/octet-stream``. The response object is returned in
    every case so callers can inspect failures.
    """
    res = rest_client.get(f"/datasets/{dataset_id}/documents/{document_id}", timeout=60)
    if res.status_code != 200:
        return res

    content_type = res.headers.get("Content-Type", "")
    if content_type.startswith("application/octet-stream"):
        save_path.write_bytes(res.content)
    return res
|
|
|
|
|
|
def _create_table_excel(path, rows):
    """Write ``rows`` (an iterable of iterables of cell values) to a new workbook at ``path``.

    Values are placed on the workbook's active sheet starting at row 1, column 1.
    Returns ``path`` unchanged.
    """
    workbook = Workbook()
    sheet = workbook.active
    # Worksheet coordinates are 1-based, hence start=1 on both axes.
    for row_number, row_values in enumerate(rows, start=1):
        for column_number, cell_value in enumerate(row_values, start=1):
            sheet.cell(row=row_number, column=column_number, value=cell_value)
    workbook.save(path)
    return path
|
|
|
|
|
|
@pytest.mark.p1
def test_documents_list_requires_auth(create_dataset):
    """Listing documents without a token or with an invalid token must return 401."""
    dataset_id = create_dataset("dataset_list_auth")
    unauthenticated_clients = {
        "missing token": RestClient(token=None),
        "invalid token": RestClient(token=INVALID_API_TOKEN),
    }
    for scenario_name, client in unauthenticated_clients.items():
        res = client.get(f"/datasets/{dataset_id}/documents")
        assert res.status_code == 401, (scenario_name, res.text)
        body = res.json()
        assert body["code"] == 401, (scenario_name, body)
        assert body["message"] == "<Unauthorized '401: Unauthorized'>", (scenario_name, body)
|
|
|
|
|
|
@pytest.mark.p1
def test_documents_upload_requires_auth(create_dataset, tmp_path):
    """Uploading a document without a token or with an invalid token must return 401."""
    dataset_id = create_dataset("dataset_upload_auth")
    source_file = create_txt_file(tmp_path / "upload_auth.txt")
    unauthenticated_clients = {
        "missing token": RestClient(token=None),
        "invalid token": RestClient(token=INVALID_API_TOKEN),
    }
    for scenario_name, client in unauthenticated_clients.items():
        # Re-open the file for every scenario so each request reads it from the start.
        with source_file.open("rb") as handle:
            res = client.post(
                f"/datasets/{dataset_id}/documents",
                files=[("file", (source_file.name, handle))],
            )
        assert res.status_code == 401, (scenario_name, res.text)
        body = res.json()
        assert body["code"] == 401, (scenario_name, body)
        assert body["message"] == "<Unauthorized '401: Unauthorized'>", (scenario_name, body)
|
|
|
|
|
|
@pytest.mark.p2
def test_documents_list_default_concurrent_and_filters_contract(rest_client, create_dataset, tmp_path):
    """Check the default listing, 30 concurrent listings, and the query-filter matrix on five seeded documents."""
    dataset_id, uploaded_docs = _seed_documents(rest_client, create_dataset, tmp_path)
    first_doc = uploaded_docs[0]
    first_id = first_doc["id"]
    first_name = first_doc["name"]

    # Default listing returns all five seeded documents.
    default_res = rest_client.get(f"/datasets/{dataset_id}/documents")
    assert default_res.status_code == 200
    default_body = default_res.json()
    assert default_body["code"] == 0, default_body
    assert default_body["data"]["total"] == 5, default_body
    assert len(default_body["data"]["docs"]) == 5, default_body

    # Thirty listings issued through a pool of five workers must all succeed.
    with ThreadPoolExecutor(max_workers=5) as pool:
        pending = [pool.submit(rest_client.get, f"/datasets/{dataset_id}/documents") for _ in range(30)]
        finished = list(as_completed(pending))
    assert len(finished) == 30, finished
    assert all(future.result().json()["code"] == 0 for future in pending)

    # Each entry: (query params, expected body code, expected len(docs), expected total).
    filter_cases = (
        ({"create_time_from": "9999999999000"}, 0, 0, 5),
        ({"create_time_to": "1"}, 0, 0, 5),
        ({"create_time_from": "0", "create_time_to": "9999999999000"}, 0, 5, 5),
        ({"keywords": None}, 0, 5, 5),
        ({"keywords": ""}, 0, 5, 5),
        ({"keywords": "0"}, 0, 1, 1),
        ({"keywords": "ragflow_test_upload"}, 0, 5, 5),
        ({"keywords": "unknown"}, 0, 0, 0),
        ({"name": None}, 0, 5, 5),
        ({"name": ""}, 0, 5, 5),
        ({"name": first_name}, 0, 1, 1),
        ({"id": None}, 0, 5, 5),
        ({"id": ""}, 0, 5, 5),
        ({"id": first_id}, 0, 1, 1),
        ({"id": first_id, "name": first_name}, 0, 1, 1),
        ({"id": first_id, "name": "ragflow_test_upload_1.txt"}, 0, 0, 0),
        ({"run": ["UNSTART"]}, 0, 5, 5),
    )
    for params, expected_code, expected_docs, expected_total in filter_cases:
        res = rest_client.get(f"/datasets/{dataset_id}/documents", params=params)
        assert res.status_code == 200, (params, res.text)
        body = res.json()
        assert body["code"] == expected_code, (params, body)
        assert body["data"]["total"] == expected_total, (params, body)
        assert len(body["data"]["docs"]) == expected_docs, (params, body)
|
|
|
|
|
|
@pytest.mark.p2
def test_documents_list_error_and_sorting_contract(rest_client, create_dataset, tmp_path):
    """Check listing error responses, page sizes, and sort order on five seeded documents."""
    dataset_id, uploaded_docs = _seed_documents(rest_client, create_dataset, tmp_path)
    first_id = uploaded_docs[0]["id"]
    documents_url = f"/datasets/{dataset_id}/documents"

    # Each entry: (label, request path, query params, expected body code,
    # substring that must occur in the response message).
    failing_requests = [
        ("invalid dataset empty", "/datasets//documents", None, 102, "lacks permission for dataset 'documents'"),
        ("invalid dataset id", "/datasets/invalid_dataset_id/documents", None, 102, "You don't own the dataset invalid_dataset_id."),
        ("invalid params ignored", documents_url, {"a": "b"}, 0, ""),
        ("metadata json invalid", documents_url, {"metadata_condition": "{bad json"}, 102, "metadata_condition must be valid JSON"),
        ("metadata json non-object", documents_url, {"metadata_condition": "[1]"}, 102, "metadata_condition must be an object"),
        ("name unknown", documents_url, {"name": "unknown.txt"}, 102, "You don't own the document unknown.txt."),
        ("id unknown", documents_url, {"id": "unknown.txt"}, 102, "You don't own the document unknown.txt."),
        ("name+id unknown name", documents_url, {"id": first_id, "name": "unknown"}, 102, "You don't own the document unknown."),
        ("name+id unknown id", documents_url, {"id": "id", "name": "ragflow_test_upload_0.txt"}, 102, "You don't own the document id."),
        ("run invalid", documents_url, {"run": ["INVALID_STATUS"]}, 102, "Invalid filter run status conditions: INVALID_STATUS"),
        ("orderby invalid", documents_url, {"orderby": "unknown"}, 100, "Document' has no attribute 'unknown'"),
        ("page invalid number", documents_url, {"page": -1, "page_size": 2}, 100, "1064"),
        ("page invalid type", documents_url, {"page": "a", "page_size": 2}, 100, "invalid literal for int()"),
        ("page_size invalid number", documents_url, {"page_size": -1}, 100, "1064"),
        ("page_size invalid type", documents_url, {"page_size": "a"}, 100, "invalid literal for int()"),
    ]
    for case_name, path, params, expected_code, expected_message in failing_requests:
        res = rest_client.get(path, params=params)
        # Errors are reported in the body code; the HTTP status is expected to stay 200.
        assert res.status_code == 200, (case_name, res.text)
        body = res.json()
        assert body["code"] == expected_code, (case_name, body)
        assert expected_message in body["message"], (case_name, body)

    # Each entry: (query params, expected number of docs on the returned page).
    paging_cases = (
        ({"page": None, "page_size": 2}, 2),
        ({"page": 1, "page_size": 2}, 2),
        ({"page": 2, "page_size": 2}, 2),
        ({"page": 3, "page_size": 2}, 1),
        ({"page": "3", "page_size": 2}, 1),
        ({"page_size": None}, 5),
        ({"page_size": 1}, 1),
        ({"page_size": 6}, 5),
        ({"page_size": "1"}, 1),
    )
    for params, expected_total in paging_cases:
        res = rest_client.get(documents_url, params=params)
        assert res.status_code == 200, (params, res.text)
        body = res.json()
        assert body["code"] == 0, (params, body)
        assert len(body["data"]["docs"]) == expected_total, (params, body)
        assert body["data"]["total"] == 5, (params, body)

    # Each entry: (query params, field the result must be ordered by, expect descending?).
    ordering_cases = (
        ({"orderby": None}, "create_time", True),
        ({"orderby": "create_time"}, "create_time", True),
        ({"orderby": "update_time"}, "update_time", True),
        ({"orderby": "name", "desc": "False"}, "name", False),
        ({"desc": None}, "create_time", True),
        ({"desc": "true"}, "create_time", True),
        ({"desc": "True"}, "create_time", True),
        ({"desc": True}, "create_time", True),
        ({"desc": "false"}, "create_time", False),
        ({"desc": "False"}, "create_time", False),
        ({"desc": False}, "create_time", False),
        ({"desc": "False", "orderby": "update_time"}, "update_time", False),
        ({"desc": "unknown"}, "create_time", True),
    )
    for params, expected_key, expected_desc in ordering_cases:
        res = rest_client.get(documents_url, params=params)
        assert res.status_code == 200, (params, res.text)
        body = res.json()
        assert body["code"] == 0, (params, body)
        _assert_docs_sorted(body["data"]["docs"], expected_key, expected_desc)
|
|
|
|
|
|
@pytest.mark.p2
def test_documents_upload_missing_file(rest_client, create_dataset):
    """A POST to the documents endpoint with no file part must be rejected with code 101."""
    dataset_id = create_dataset("dataset_upload_missing")
    upload_res = rest_client.post(f"/datasets/{dataset_id}/documents")
    assert upload_res.status_code == 200
    body = upload_res.json()
    assert body["code"] == 101, body
    assert body["message"] == "No file part!", body
|
|
|
|
|
|
@pytest.mark.p2
def test_documents_upload_contract_matrix(rest_client, create_dataset, tmp_path):
    """Exercise the successful upload paths against one dataset.

    Covers: a plain upload, several file extensions, an empty file, duplicate
    names, a maximum-length name, punctuation in the name, a 20-file batch and
    20 parallel single-file uploads.
    """
    dataset_id = create_dataset("dataset_upload_contract")

    # Baseline: one plain-text file.
    baseline_res = _upload_files(rest_client, dataset_id, [create_txt_file(tmp_path / "ragflow_test.txt")])
    assert baseline_res.status_code == 200
    baseline_body = baseline_res.json()
    assert baseline_body["code"] == 0, baseline_body
    assert baseline_body["data"][0]["dataset_id"] == dataset_id, baseline_body
    assert baseline_body["data"][0]["name"] == "ragflow_test.txt", baseline_body

    # Every listed extension is accepted; the content is generated by create_txt_file in all cases.
    accepted_extensions = ("docx", "xlsx", "pptx", "jpg", "pdf", "txt", "md", "json", "eml", "html")
    for ext in accepted_extensions:
        typed_file = create_txt_file(tmp_path / f"ragflow_test_file_type.{ext}")
        res = _upload_files(rest_client, dataset_id, [typed_file])
        assert res.status_code == 200, (ext, res.text)
        body = res.json()
        assert body["code"] == 0, (ext, body)
        assert body["data"][0]["name"] == typed_file.name, (ext, body)

    # A zero-byte file is accepted and reported with size 0.
    empty_file = tmp_path / "empty.txt"
    empty_file.touch()
    empty_res = _upload_files(rest_client, dataset_id, [empty_file])
    assert empty_res.status_code == 200
    empty_body = empty_res.json()
    assert empty_body["code"] == 0, empty_body
    assert empty_body["data"][0]["size"] == 0, empty_body

    # The same file twice in one request: the second copy gets a "(1)" suffix.
    duplicate_file = create_txt_file(tmp_path / "duplicate.txt")
    duplicate_res = _upload_files(rest_client, dataset_id, [duplicate_file, duplicate_file])
    assert duplicate_res.status_code == 200
    duplicate_body = duplicate_res.json()
    assert duplicate_body["code"] == 0, duplicate_body
    assert [entry["name"] for entry in duplicate_body["data"]] == ["duplicate.txt", "duplicate(1).txt"], duplicate_body

    # Further uploads of that file continue the numbering at (2), (3), (4).
    for index in range(3):
        expected_name = f"duplicate({index + 2}).txt"
        repeat_res = _upload_files(rest_client, dataset_id, [duplicate_file])
        assert repeat_res.status_code == 200
        repeat_body = repeat_res.json()
        assert repeat_body["code"] == 0, (index, repeat_body)
        assert repeat_body["data"][0]["name"] == expected_name, (index, repeat_body)

    # A name of exactly DOCUMENT_NAME_LIMIT characters (including ".txt") is accepted.
    longest_name_file = create_txt_file(tmp_path / f"{'a' * (DOCUMENT_NAME_LIMIT - 4)}.txt")
    longest_name_res = _upload_files(rest_client, dataset_id, [longest_name_file])
    assert longest_name_res.status_code == 200
    longest_name_body = longest_name_res.json()
    assert longest_name_body["code"] == 0, longest_name_body
    assert longest_name_body["data"][0]["name"] == longest_name_file.name, longest_name_body

    # All ASCII punctuation in the name, with the characters in illegal_chars replaced by "_".
    illegal_chars = '<>:"/\\|?*'
    replacement_table = str.maketrans({char: "_" for char in illegal_chars})
    safe_filename = string.punctuation.translate(replacement_table)
    punctuation_file = tmp_path / f"{safe_filename}.txt"
    punctuation_file.write_text("Sample text content")
    punctuation_res = _upload_files(rest_client, dataset_id, [punctuation_file])
    assert punctuation_res.status_code == 200
    punctuation_body = punctuation_res.json()
    assert punctuation_body["code"] == 0, punctuation_body
    assert punctuation_body["data"][0]["name"] == punctuation_file.name, punctuation_body

    # Twenty files in a single request.
    batch_paths = [create_txt_file(tmp_path / f"ragflow_test_multi_{i}.txt") for i in range(20)]
    batch_res = _upload_files(rest_client, dataset_id, batch_paths)
    assert batch_res.status_code == 200
    batch_body = batch_res.json()
    assert batch_body["code"] == 0, batch_body
    assert len(batch_body["data"]) == 20, batch_body

    # Twenty single-file uploads through a pool of five workers.
    with ThreadPoolExecutor(max_workers=5) as pool:
        pending = [
            pool.submit(_upload_files, rest_client, dataset_id, [create_txt_file(tmp_path / f"parallel_upload_{i}.txt")])
            for i in range(20)
        ]
        finished = list(as_completed(pending))
    assert len(finished) == 20, finished
    assert all(future.result().json()["code"] == 0 for future in pending)
|
|
|
|
|
|
@pytest.mark.p2
def test_documents_upload_error_contract(rest_client, create_dataset, tmp_path):
    """Check upload rejections: unknown dataset, unsupported file types, missing file part, empty filename."""
    # Upload into a dataset id that does not exist.
    unknown_dataset_res = _upload_files(
        rest_client,
        "invalid_dataset_id",
        [create_txt_file(tmp_path / "invalid_dataset.txt")],
    )
    assert unknown_dataset_res.status_code == 200
    unknown_dataset_body = unknown_dataset_res.json()
    assert unknown_dataset_body["code"] == 102, unknown_dataset_body
    assert unknown_dataset_body["message"] == "Can't find the dataset with ID invalid_dataset_id!", unknown_dataset_body

    # Unsupported extensions, each tried against its own fresh dataset.
    for file_type in ("exe", "unknown"):
        unsupported_file = tmp_path / f"ragflow_test.{file_type}"
        unsupported_file.touch()
        target_dataset = create_dataset(f"dataset_upload_unsupported_{file_type}")
        res = _upload_files(rest_client, target_dataset, [unsupported_file])
        assert res.status_code == 200, (file_type, res.text)
        body = res.json()
        assert body["code"] == 500, (file_type, body)
        assert body["message"] == f"ragflow_test.{file_type}: This type of file has not been supported yet!", (file_type, body)

    dataset_id = create_dataset("dataset_upload_missing_empty")

    # Request without any file part.
    no_part_res = rest_client.post(f"/datasets/{dataset_id}/documents")
    assert no_part_res.status_code == 200
    no_part_body = no_part_res.json()
    assert no_part_body["code"] == 101, no_part_body
    assert no_part_body["message"] == "No file part!", no_part_body

    # File part with an empty filename; sent with requests directly so the multipart body can be hand-built.
    named_file = create_txt_file(tmp_path / "filename_empty.txt")
    with named_file.open("rb") as handle:
        encoder = MultipartEncoder(fields=(("file", ("", handle)),))
        empty_name_res = requests.post(
            url=f"{HOST_ADDRESS}/api/{VERSION}/datasets/{dataset_id}/documents",
            headers={"Content-Type": encoder.content_type, "Authorization": f"Bearer {rest_client.token}"},
            data=encoder,
            timeout=30,
        )
    assert empty_name_res.status_code == 200
    empty_name_body = empty_name_res.json()
    assert empty_name_body["code"] == 101, empty_name_body
    assert empty_name_body["message"] == "No file selected!", empty_name_body
|
|
|
|
|
|
@pytest.mark.p2
def test_documents_update_patch_and_delete(rest_client, create_document):
    """Rename a document, delete it, and confirm it no longer appears in the listing."""
    dataset_id, document_id = create_document("update_target.txt")
    collection_url = f"/datasets/{dataset_id}/documents"

    # Rename through PATCH on the single-document URL.
    rename_res = rest_client.patch(f"/datasets/{dataset_id}/documents/{document_id}", json={"name": "updated_target.txt"})
    assert rename_res.status_code == 200
    rename_body = rename_res.json()
    assert rename_body["code"] == 0, rename_body
    assert rename_body["data"]["name"] == "updated_target.txt", rename_body

    # Delete by id through the collection URL.
    removal_res = rest_client.delete(collection_url, json={"ids": [document_id]})
    assert removal_res.status_code == 200
    removal_body = removal_res.json()
    assert removal_body["code"] == 0, removal_body
    assert removal_body["data"]["deleted"] == 1, removal_body

    # The deleted id must be absent from the listing.
    listing_res = rest_client.get(collection_url)
    assert listing_res.status_code == 200
    listing_body = listing_res.json()
    assert listing_body["code"] == 0, listing_body
    assert all(entry["id"] != document_id for entry in listing_body["data"]["docs"]), listing_body
|
|
|
|
|
|
@pytest.mark.p2
def test_documents_update_requires_auth(create_document):
    """Patching a document without a token or with an invalid token must return 401."""
    dataset_id, document_id = create_document("update_auth_target.txt")
    unauthenticated_clients = {
        "missing token": RestClient(token=None),
        "invalid token": RestClient(token=INVALID_API_TOKEN),
    }
    for scenario_name, client in unauthenticated_clients.items():
        res = client.patch(
            f"/datasets/{dataset_id}/documents/{document_id}",
            json={"name": "updated_auth_target.txt"},
        )
        assert res.status_code == 401, (scenario_name, res.text)
        payload = res.json()
        assert payload["code"] == 401, (scenario_name, payload)
        assert payload["message"] == "<Unauthorized '401: Unauthorized'>", (scenario_name, payload)
|
|
|
|
|
|
@pytest.mark.p2
def test_documents_update_name_contract(rest_client, create_dataset, tmp_path):
    """Apply a sequence of renames to the first seeded document and check each outcome.

    The cases run in order against the same document, so every successful
    rename changes the starting state for the cases after it.
    """
    dataset_id, uploaded_docs = _seed_documents_for_update(rest_client, create_dataset, tmp_path)
    first_document_id = uploaded_docs[0]["id"]
    document_url = f"/datasets/{dataset_id}/documents/{first_document_id}"

    long_name = f"{'a' * (DOCUMENT_NAME_LIMIT - 4)}.txt"
    # Each entry: (new name, expected body code, exact expected message on failure).
    rename_attempts = [
        ("new_name.txt", 0, ""),
        (long_name, 0, ""),
        (0, 102, "Field: <name> - Message: <Input should be a valid string> - Value: <0>"),
        (None, 100, "AttributeError('NoneType' object has no attribute 'encode')"),
        ("", 101, "The extension of file can't be changed"),
        ("ragflow_test_upload_0", 101, "The extension of file can't be changed"),
        ("ragflow_test_upload_1.txt", 102, "Duplicated document name in the same dataset."),
        ("RAGFLOW_TEST_UPLOAD_1.TXT", 0, ""),
    ]
    for name, expected_code, expected_message in rename_attempts:
        res = rest_client.patch(document_url, json={"name": name})
        assert res.status_code == 200, (name, res.text)
        body = res.json()
        assert body["code"] == expected_code, (name, body)

        if expected_code != 0:
            assert body["message"] == expected_message, (name, body)
            continue

        # Success: the new name is echoed back and is also visible through the listing.
        assert body["data"]["name"] == name, (name, body)
        list_res = rest_client.get(f"/datasets/{dataset_id}/documents", params={"id": first_document_id})
        assert list_res.status_code == 200, (name, list_res.text)
        list_body = list_res.json()
        assert list_body["code"] == 0, (name, list_body)
        assert list_body["data"]["docs"][0]["name"] == name, (name, list_body)
|
|
|
|
|
|
@pytest.mark.p2
def test_documents_update_invalid_dataset_and_document_contract(rest_client, create_dataset, tmp_path):
    """A rename addressed to an unknown dataset id or an unknown document id must fail with code 102."""
    dataset_id, uploaded_docs = _seed_documents_for_update(rest_client, create_dataset, tmp_path)
    first_document_id = uploaded_docs[0]["id"]
    rename_payload = {"name": "new_name.txt"}

    # Real document id under a dataset id that is not the caller's.
    wrong_dataset_res = rest_client.patch(f"/datasets/{INVALID_ID_32}/documents/{first_document_id}", json=rename_payload)
    assert wrong_dataset_res.status_code == 200
    wrong_dataset_body = wrong_dataset_res.json()
    assert wrong_dataset_body["code"] == 102, wrong_dataset_body
    assert "You don't own the dataset." in wrong_dataset_body["message"], wrong_dataset_body

    # Real dataset id with a document id that does not belong to it.
    wrong_document_res = rest_client.patch(f"/datasets/{dataset_id}/documents/{INVALID_ID_32}", json=rename_payload)
    assert wrong_document_res.status_code == 200
    wrong_document_body = wrong_document_res.json()
    assert wrong_document_body["code"] == 102, wrong_document_body
    assert wrong_document_body["message"] == "The dataset doesn't own the document.", wrong_document_body
|
|
|
|
|
|
@pytest.mark.p2
def test_documents_update_chunk_method_contract(rest_client, create_dataset, tmp_path):
    """Patch ``chunk_method`` with every accepted value and with invalid values, checking each response."""
    dataset_id, uploaded_docs = _seed_documents_for_update(rest_client, create_dataset, tmp_path)
    first_document_id = uploaded_docs[0]["id"]
    document_url = f"/datasets/{dataset_id}/documents/{first_document_id}"

    accepted_methods = (
        "naive",
        "manual",
        "qa",
        "table",
        "paper",
        "book",
        "laws",
        "presentation",
        "picture",
        "one",
        "knowledge_graph",
        "email",
        "tag",
    )
    # Each entry: (chunk_method, expected body code, exact expected message on failure).
    chunk_method_cases = [(method, 0, "") for method in accepted_methods]
    chunk_method_cases += [
        ("", 102, "`chunk_method` (empty string) is not valid"),
        (
            "other_chunk_method",
            102,
            "Field: <chunk_method> - Message: <`chunk_method` other_chunk_method doesn't exist> - Value: <other_chunk_method>",
        ),
    ]

    for chunk_method, expected_code, expected_message in chunk_method_cases:
        res = rest_client.patch(document_url, json={"chunk_method": chunk_method})
        assert res.status_code == 200, (chunk_method, res.text)
        body = res.json()
        assert body["code"] == expected_code, (chunk_method, body)

        if expected_code != 0:
            assert body["message"] == expected_message, (chunk_method, body)
            continue

        # Success: the listing must report the new chunk method.
        list_res = rest_client.get(f"/datasets/{dataset_id}/documents", params={"id": first_document_id})
        assert list_res.status_code == 200, (chunk_method, list_res.text)
        list_body = list_res.json()
        assert list_body["code"] == 0, (chunk_method, list_body)
        assert list_body["data"]["docs"][0]["chunk_method"] == chunk_method, (chunk_method, list_body)
|
|
|
|
|
|
@pytest.mark.p2
|
|
def test_documents_update_meta_fields_contract(rest_client, create_dataset, tmp_path):
|
|
dataset_id, uploaded_docs = _seed_documents_for_update(rest_client, create_dataset, tmp_path)
|
|
first_document_id = uploaded_docs[0]["id"]
|
|
|
|
meta_fields_cases = [
|
|
({"test": "test"}, 0, ""),
|
|
({"author": "alice", "year": 2024}, 0, ""),
|
|
({"tags": ["tag1", "tag2"]}, 0, ""),
|
|
({"count": 42, "price": 19.99}, 0, ""),
|
|
("test", 102, "Field: <meta_fields> - Message: <Input should be a valid dictionary> - Value: <test>"),
|
|
([], 102, "Field: <meta_fields> - Message: <Input should be a valid dictionary> - Value: <[]>"),
|
|
({"tags": [{"x": {"a": "b"}}]}, 102, "Field: <meta_fields> - Message: <The type is not supported in list: [{'x': {'a': 'b'}}]> - Value: <{'tags': [{'x': {'a': 'b'}}]}>"),
|
|
({"tags": [{"x": 1}]}, 102, "Field: <meta_fields> - Message: <The type is not supported in list: [{'x': 1}]> - Value: <{'tags': [{'x': 1}]}>"),
|
|
({"obj": {"x": 1}}, 102, "Field: <meta_fields> - Message: <The type is not supported: {'x': 1}> - Value: <{'obj': {'x': 1}}>"),
|
|
({"tags": [2, 1]}, 0, ""),
|
|
]
|
|
for meta_fields, expected_code, expected_message in meta_fields_cases:
|
|
res = rest_client.patch(
|
|
f"/datasets/{dataset_id}/documents/{first_document_id}",
|
|
json={"meta_fields": meta_fields},
|
|
)
|
|
assert res.status_code == 200, (meta_fields, res.text)
|
|
body = res.json()
|
|
assert body["code"] == expected_code, (meta_fields, body)
|
|
if expected_code == 0:
|
|
list_res = rest_client.get(f"/datasets/{dataset_id}/documents", params={"id": first_document_id})
|
|
assert list_res.status_code == 200, (meta_fields, list_res.text)
|
|
list_body = list_res.json()
|
|
assert list_body["code"] == 0, (meta_fields, list_body)
|
|
assert list_body["data"]["docs"][0]["meta_fields"] == meta_fields, (meta_fields, list_body)
|
|
else:
|
|
assert expected_message in body["message"] or body["message"] == expected_message, (meta_fields, body)
|
|
|
|
invalid_meta_doc_res = rest_client.patch(
|
|
f"/datasets/{dataset_id}/documents/invalid_doc_id_12345678901234567890",
|
|
json={"meta_fields": {"author": "alice"}},
|
|
)
|
|
assert invalid_meta_doc_res.status_code == 200
|
|
invalid_meta_doc_body = invalid_meta_doc_res.json()
|
|
assert invalid_meta_doc_body["code"] == 102, invalid_meta_doc_body
|
|
assert "The dataset doesn't own the document." in invalid_meta_doc_body["message"], invalid_meta_doc_body
|
|
|
|
|
|
@pytest.mark.p2
def test_documents_update_invalid_field_and_guard_contract(rest_client, create_dataset, tmp_path):
    """Check how document PATCH treats guarded and legacy read-only fields.

    Two groups of payloads are sent to the first seeded document:

    * guarded payloads, each of which must be answered with business code 102
      and a message containing the listed text;
    * legacy fields, each of which must be answered with either code 0 (and a
      ``data`` key in the body) or code 102 (and a message mentioning
      "invalid").
    """
    dataset_id, uploaded_docs = _seed_documents_for_update(rest_client, create_dataset, tmp_path)
    first_document_id = uploaded_docs[0]["id"]
    document_url = f"/datasets/{dataset_id}/documents/{first_document_id}"

    # Every guarded payload is expected to be rejected with the same code.
    rejected_code = 102
    guarded_payloads = (
        ({"chunk_count": 1}, "Can't change `chunk_count`."),
        ({"token_count": 1}, "Can't change `token_count`."),
        ({"chunk_count": 100}, "Can't change `chunk_count`."),
        ({"token_count": 100}, "Can't change `token_count`."),
        ({"progress": 2.0}, "Field: <progress> - Message: <Input should be less than or equal to 1> - Value: <2.0>"),
        ({"progress": 1.0}, "Can't change `progress`."),
        ({"meta_fields": []}, "Field: <meta_fields> - Message: <Input should be a valid dictionary> - Value: <[]>"),
    )
    for payload, expected_message in guarded_payloads:
        res = rest_client.patch(document_url, json=payload)
        assert res.status_code == 200, (payload, res.text)
        body = res.json()
        assert body["code"] == rejected_code, (payload, body)
        # Substring containment already covers the exact-match case.
        assert expected_message in body["message"], (payload, body)

    legacy_fields = (
        ("create_date", "Fri, 14 Mar 2025 16:53:42 GMT"),
        ("create_time", 1),
        ("created_by", "ragflow_test"),
        ("dataset_id", "ragflow_test"),
        ("id", "ragflow_test"),
        ("location", "ragflow_test.txt"),
        ("process_begin_at", 1),
        ("process_duration", 1.0),
        ("progress_msg", "ragflow_test"),
        ("run", "ragflow_test"),
        ("size", 1),
        ("source_type", "ragflow_test"),
        ("thumbnail", "ragflow_test"),
        ("type", "ragflow_test"),
        ("update_date", "Fri, 14 Mar 2025 16:33:17 GMT"),
        ("update_time", 1),
    )
    for field_name, field_value in legacy_fields:
        payload = {field_name: field_value}
        res = rest_client.patch(document_url, json=payload)
        assert res.status_code == 200, (payload, res.text)
        body = res.json()
        code = body["code"]
        assert code in (0, 102), (payload, body)
        if code == 0:
            assert "data" in body, (payload, body)
        else:
            assert "invalid" in body["message"].lower(), (payload, body)
|
|
|
|
|
|
@pytest.mark.p2
def test_documents_update_parser_config_contract(rest_client, create_dataset, tmp_path):
    """Exercise ``parser_config`` validation on document PATCH.

    Each case patches the first seeded document with ``chunk_method`` set to
    ``"naive"`` and the given ``parser_config``. Accepted configs are read
    back through the document list and compared key by key (one level of
    nesting); an empty config is expected to read back as
    ``DEFAULT_PARSER_CONFIG``. Rejected configs must return the exact
    validation message.
    """
    dataset_id, uploaded_docs = _seed_documents_for_update(rest_client, create_dataset, tmp_path)
    first_document_id = uploaded_docs[0]["id"]
    document_url = f"/datasets/{dataset_id}/documents/{first_document_id}"
    list_url = f"/datasets/{dataset_id}/documents"

    full_config = {
        "layout_recognize": "DeepDOC",
        "chunk_token_num": 512,
        "delimiter": "\n",
        "auto_keywords": 0,
        "auto_questions": 0,
        "html4excel": False,
        "topn_tags": 3,
        "raptor": {
            "use_raptor": True,
            "prompt": "Please summarize the following paragraphs. Be careful with the numbers, do not make things up. Paragraphs as following:\n {cluster_content}\nThe above is the content you need to summarize.",
            "max_token": 256,
            "threshold": 0.1,
            "max_cluster": 64,
            "random_seed": 0,
        },
        "graphrag": {
            "use_graphrag": True,
            "entity_types": ["organization", "person", "geo", "event", "category"],
            "method": "light",
            "batch_chunk_token_size": 4096,
        },
    }

    # (parser_config, expected business code, expected message when rejected)
    cases = [
        ({}, 0, ""),
        (full_config, 0, ""),
        ({"chunk_token_num": -1}, 102, "Field: <parser_config.chunk_token_num> - Message: <Input should be greater than or equal to 1> - Value: <-1>"),
        ({"chunk_token_num": 0}, 102, "Field: <parser_config.chunk_token_num> - Message: <Input should be greater than or equal to 1> - Value: <0>"),
        ({"chunk_token_num": 100000000}, 102, "Field: <parser_config.chunk_token_num> - Message: <Input should be less than or equal to 2048> - Value: <100000000>"),
        ({"chunk_token_num": 3.14}, 102, "Field: <parser_config.chunk_token_num> - Message: <Input should be a valid integer> - Value: <3.14>"),
        ({"chunk_token_num": "1024"}, 102, "Field: <parser_config.chunk_token_num> - Message: <Input should be a valid integer> - Value: <1024>"),
        ({"layout_recognize": "DeepDOC"}, 0, ""),
        ({"layout_recognize": "Naive"}, 0, ""),
        ({"html4excel": True}, 0, ""),
        ({"html4excel": False}, 0, ""),
        ({"html4excel": 1}, 102, "Field: <parser_config.html4excel> - Message: <Input should be a valid boolean> - Value: <1>"),
        ({"delimiter": ""}, 102, "Field: <parser_config.delimiter> - Message: <String should have at least 1 character> - Value: <>"),
        ({"delimiter": "`##`"}, 0, ""),
        ({"delimiter": 1}, 102, "Field: <parser_config.delimiter> - Message: <Input should be a valid string> - Value: <1>"),
        ({"task_page_size": -1}, 102, "Field: <parser_config.task_page_size> - Message: <Input should be greater than or equal to 1> - Value: <-1>"),
        ({"task_page_size": 0}, 102, "Field: <parser_config.task_page_size> - Message: <Input should be greater than or equal to 1> - Value: <0>"),
        ({"task_page_size": 100000000}, 0, ""),
        ({"task_page_size": 3.14}, 102, "Field: <parser_config.task_page_size> - Message: <Input should be a valid integer> - Value: <3.14>"),
        ({"task_page_size": "1024"}, 102, "Field: <parser_config.task_page_size> - Message: <Input should be a valid integer> - Value: <1024>"),
        ({"raptor": {"use_raptor": {"a": "b"}}}, 102, "Field: <parser_config.raptor.use_raptor> - Message: <Input should be a valid boolean> - Value: <{'a': 'b'}>"),
        ({"raptor": {"use_raptor": False}}, 0, ""),
        ({"invalid_key": "invalid_value"}, 102, "Field: <parser_config.invalid_key> - Message: <Extra inputs are not permitted> - Value: <invalid_value>"),
        ({"auto_keywords": -1}, 102, "Field: <parser_config.auto_keywords> - Message: <Input should be greater than or equal to 0> - Value: <-1>"),
        ({"auto_keywords": 32}, 0, ""),
        ({"auto_keywords": "1024"}, 102, "Field: <parser_config.auto_keywords> - Message: <Input should be a valid integer> - Value: <1024>"),
        ({"auto_keywords": 3.14}, 102, "Field: <parser_config.auto_keywords> - Message: <Input should be a valid integer> - Value: <3.14>"),
        ({"auto_questions": -1}, 102, "Field: <parser_config.auto_questions> - Message: <Input should be greater than or equal to 0> - Value: <-1>"),
        ({"auto_questions": 10}, 0, ""),
        ({"auto_questions": 3.14}, 102, "Field: <parser_config.auto_questions> - Message: <Input should be a valid integer> - Value: <3.14>"),
        ({"auto_questions": "1024"}, 102, "Field: <parser_config.auto_questions> - Message: <Input should be a valid integer> - Value: <1024>"),
        ({"topn_tags": -1}, 102, "Field: <parser_config.topn_tags> - Message: <Input should be greater than or equal to 1> - Value: <-1>"),
        ({"topn_tags": 10}, 0, ""),
        ({"topn_tags": 3.14}, 102, "Field: <parser_config.topn_tags> - Message: <Input should be a valid integer> - Value: <3.14>"),
        ({"topn_tags": "1024"}, 102, "Field: <parser_config.topn_tags> - Message: <Input should be a valid integer> - Value: <1024>"),
    ]

    for parser_config, expected_code, expected_message in cases:
        res = rest_client.patch(
            document_url,
            json={"chunk_method": "naive", "parser_config": parser_config},
        )
        assert res.status_code == 200, (parser_config, res.text)
        body = res.json()
        assert body["code"] == expected_code, (parser_config, body)

        if expected_code != 0:
            # Rejections must carry the exact validation message.
            assert body["message"] == expected_message, (parser_config, body)
            continue

        # Accepted: read the stored config back through the list endpoint.
        list_res = rest_client.get(list_url, params={"id": first_document_id})
        assert list_res.status_code == 200, (parser_config, list_res.text)
        list_body = list_res.json()
        assert list_body["code"] == 0, (parser_config, list_body)
        stored_config = list_body["data"]["docs"][0]["parser_config"]

        if not parser_config:
            assert stored_config == DEFAULT_PARSER_CONFIG, (parser_config, list_body)
            continue

        for key, value in parser_config.items():
            if not isinstance(value, dict):
                assert stored_config[key] == value, (parser_config, list_body)
                continue
            # Nested sections are compared one level deep, key by key.
            for sub_key, sub_value in value.items():
                assert stored_config[key][sub_key] == sub_value, (parser_config, list_body)
|
|
|
|
|
|
@pytest.mark.p2
def test_documents_parse_and_stop(rest_client, create_document):
    """Start parsing one document, then immediately ask the server to stop it.

    The parse request must succeed. The stop request may either succeed
    (code 0) or report code 102 with an "already completed" message.
    """
    dataset_id, document_id = create_document("parse_target.txt")
    request_json = {"document_ids": [document_id]}

    parse_res = rest_client.post(f"/datasets/{dataset_id}/documents/parse", json=request_json)
    assert parse_res.status_code == 200
    parse_payload = parse_res.json()
    assert parse_payload["code"] == 0, parse_payload

    stop_res = rest_client.post(f"/datasets/{dataset_id}/documents/stop", json=request_json)
    assert stop_res.status_code == 200
    stop_payload = stop_res.json()
    # Depending on timing this can be immediate stop success or "already completed".
    stop_code = stop_payload["code"]
    assert stop_code in (0, 102), stop_payload
    if stop_code == 102:
        assert "already completed" in stop_payload["message"], stop_payload
|
|
|
|
|
|
@pytest.mark.p2
def test_documents_metadata_batch_update_contract(rest_client, create_dataset, tmp_path):
    """Cover the batch metadata PATCH endpoint end to end.

    In order, the test checks: rejection of missing/invalid tokens (HTTP 401),
    an unknown dataset id, request-shape validation errors, and then a
    sequence of successful requests against five seeded documents whose
    ``updated`` / ``matched_docs`` counters are compared exactly.
    """
    dataset_id, uploaded_docs = _seed_documents(rest_client, create_dataset, tmp_path, count=5)
    document_ids = [doc["id"] for doc in uploaded_docs]
    metadatas_url = f"/datasets/{dataset_id}/documents/metadatas"

    # --- authentication ---------------------------------------------------
    unauthenticated_clients = (
        ("missing token", RestClient(token=None)),
        ("invalid token", RestClient(token=INVALID_API_TOKEN)),
    )
    for scenario_name, client in unauthenticated_clients:
        res = client.patch(
            metadatas_url,
            json={"selector": {"document_ids": document_ids[:1]}, "updates": [], "deletes": []},
        )
        assert res.status_code == 401, (scenario_name, res.text)
        payload = res.json()
        assert payload["code"] == 401, (scenario_name, payload)
        assert payload["message"] == "<Unauthorized '401: Unauthorized'>", (scenario_name, payload)

    # --- unknown dataset --------------------------------------------------
    invalid_dataset_res = rest_client.patch(
        "/datasets/invalid_dataset_id/documents/metadatas",
        json={"selector": {"document_ids": []}, "updates": [], "deletes": []},
    )
    assert invalid_dataset_res.status_code == 200
    invalid_dataset_payload = invalid_dataset_res.json()
    assert invalid_dataset_payload["code"] == 102, invalid_dataset_payload
    assert invalid_dataset_payload["message"] == "You don't own the dataset invalid_dataset_id.", invalid_dataset_payload

    # --- request validation -----------------------------------------------
    wrong_dataset_scenario = "document ids wrong dataset"
    validation_cases = [
        ("selector not object", {"selector": [1], "updates": [], "deletes": []}, 102, "selector must be an object."),
        ("updates not list", {"selector": {}, "updates": {"key": "value"}, "deletes": []}, 102, "updates and deletes must be lists."),
        ("metadata condition not object", {"selector": {"metadata_condition": [1]}, "updates": [], "deletes": []}, 102, "metadata_condition must be an object."),
        ("document ids not list", {"selector": {"document_ids": "doc-1"}, "updates": [], "deletes": []}, 102, "document_ids must be a list."),
        ("update missing key", {"selector": {}, "updates": [{"key": ""}], "deletes": []}, 102, "Each update requires key and value."),
        ("delete missing key", {"selector": {}, "updates": [], "deletes": [{"x": "y"}]}, 102, "Each delete requires key."),
        (
            wrong_dataset_scenario,
            {"selector": {"document_ids": ["doc-does-not-exist-1", "doc-does-not-exist-2"]}, "updates": [{"key": "author", "value": "test"}], "deletes": []},
            102,
            f"These documents do not belong to dataset {dataset_id}: ",
        ),
    ]
    for scenario_name, payload, expected_code, expected_message in validation_cases:
        res = rest_client.patch(metadatas_url, json=payload)
        assert res.status_code == 200, (scenario_name, res.text)
        body = res.json()
        assert body["code"] == expected_code, (scenario_name, body)
        if scenario_name != wrong_dataset_scenario:
            assert body["message"] == expected_message, (scenario_name, body)
            continue
        # The offending ids follow the prefix; compare them as a set so the
        # check does not depend on the order in which they are listed.
        message = body["message"]
        assert message.startswith(expected_message), (scenario_name, body)
        listed_ids = message[len(expected_message) :].split(", ")
        assert set(listed_ids) == {"doc-does-not-exist-1", "doc-does-not-exist-2"}, (scenario_name, body)

    # --- successful updates (order matters: later requests rely on the
    # metadata written by earlier ones) ------------------------------------
    every_doc = {"updated": 5, "matched_docs": 5}
    no_doc = {"updated": 0, "matched_docs": 0}
    successful_requests = (
        # update by explicit ids
        (
            {
                "selector": {"document_ids": document_ids},
                "updates": [{"key": "author", "value": "test_author"}, {"key": "status", "value": "processed"}],
                "deletes": [],
            },
            every_doc,
        ),
        # update filtered by a metadata condition
        (
            {
                "selector": {
                    "document_ids": document_ids,
                    "metadata_condition": {"conditions": [{"comparison_operator": "is", "name": "status", "value": "processed"}]},
                },
                "updates": [{"key": "author", "value": "filtered_author"}],
                "deletes": [],
            },
            every_doc,
        ),
        # delete a key
        (
            {
                "selector": {"document_ids": document_ids},
                "updates": [],
                "deletes": [{"key": "author"}],
            },
            every_doc,
        ),
        # update and delete in one request
        (
            {
                "selector": {"document_ids": document_ids},
                "updates": [{"key": "author", "value": "new_author"}],
                "deletes": [{"key": "status"}],
            },
            every_doc,
        ),
        # empty id list
        (
            {"selector": {"document_ids": []}, "updates": [{"key": "author", "value": "test"}], "deletes": []},
            no_doc,
        ),
        # condition that matches nothing
        (
            {
                "selector": {
                    "document_ids": document_ids,
                    "metadata_condition": {"conditions": [{"comparison_operator": "is", "name": "nonexistent_key", "value": "nonexistent_value"}]},
                },
                "updates": [{"key": "author", "value": "test"}],
                "deletes": [],
            },
            no_doc,
        ),
    )
    for request_json, expected_data in successful_requests:
        res = rest_client.patch(metadatas_url, json=request_json)
        assert res.status_code == 200
        result = res.json()
        assert result["code"] == 0, result
        assert result["data"] == expected_data, result
|
|
|
|
|
|
@pytest.mark.p2
def test_document_metadata_config_contract(rest_client, create_document):
    """Cover PUT ``.../documents/{id}/metadata/config``.

    Checks, in order: HTTP 401 for missing/invalid tokens, code 101 when the
    ``metadata`` key is absent, code 102 for an unknown dataset and for an
    unknown document, and finally that a valid payload is echoed back under
    ``data.parser_config.metadata``.
    """
    dataset_id, document_id = create_document("document_metadata_config_contract.txt")
    config_url = f"/datasets/{dataset_id}/documents/{document_id}/metadata/config"
    author_only = {"metadata": {"author": "alice"}}

    unauthenticated_clients = (
        ("missing token", RestClient(token=None)),
        ("invalid token", RestClient(token=INVALID_API_TOKEN)),
    )
    for scenario_name, client in unauthenticated_clients:
        res = client.put(config_url, json={"metadata": {"author": "alice"}})
        assert res.status_code == 401, (scenario_name, res.text)
        payload = res.json()
        assert payload["code"] == 401, (scenario_name, payload)
        assert payload["message"] == "<Unauthorized '401: Unauthorized'>", (scenario_name, payload)

    # Empty body: the ``metadata`` key is mandatory.
    missing_payload_res = rest_client.put(config_url, json={})
    assert missing_payload_res.status_code == 200
    missing_payload = missing_payload_res.json()
    assert missing_payload["code"] == 101, missing_payload
    assert missing_payload["message"] == "metadata is required", missing_payload

    # Unknown dataset id.
    invalid_dataset_res = rest_client.put(
        f"/datasets/{INVALID_ID_32}/documents/{document_id}/metadata/config",
        json=author_only,
    )
    assert invalid_dataset_res.status_code == 200
    invalid_dataset_payload = invalid_dataset_res.json()
    assert invalid_dataset_payload["code"] == 102, invalid_dataset_payload
    assert invalid_dataset_payload["message"] == "You don't own the dataset.", invalid_dataset_payload

    # Unknown document id inside a dataset we do own.
    invalid_document_res = rest_client.put(
        f"/datasets/{dataset_id}/documents/{INVALID_ID_32}/metadata/config",
        json=author_only,
    )
    assert invalid_document_res.status_code == 200
    invalid_document_payload = invalid_document_res.json()
    assert invalid_document_payload["code"] == 102, invalid_document_payload
    expected_not_found = f"Document {INVALID_ID_32} not found in dataset {dataset_id}"
    assert invalid_document_payload["message"] == expected_not_found, invalid_document_payload

    # Happy path: the stored metadata config must equal what was sent.
    update_payload = {"metadata": {"author": "alice", "tags": ["one", "two"]}}
    update_res = rest_client.put(config_url, json=update_payload)
    assert update_res.status_code == 200
    update_body = update_res.json()
    assert update_body["code"] == 0, update_body
    stored_metadata = update_body["data"]["parser_config"]["metadata"]
    assert stored_metadata == update_payload["metadata"], update_body
|
|
|
|
|
|
@pytest.mark.p2
def test_documents_metadata_update_path(rest_client, create_document):
    """Smoke-test a single-document metadata update through the batch endpoint.

    Exactly one document must be matched and at least one update reported.
    """
    dataset_id, document_id = create_document("metadata_target.txt")
    request_json = {
        "selector": {"document_ids": [document_id]},
        "updates": [{"key": "author", "value": "qa"}],
        "deletes": [],
    }

    res = rest_client.patch(f"/datasets/{dataset_id}/documents/metadatas", json=request_json)

    assert res.status_code == 200
    payload = res.json()
    assert payload["code"] == 0, payload
    counters = payload["data"]
    assert counters["matched_docs"] == 1, payload
    assert counters["updated"] >= 1, payload
|
|
|
|
|
|
@pytest.mark.p2
def test_documents_delete_contract_matrix(rest_client, create_dataset, tmp_path):
    """Run the document DELETE endpoint through a matrix of payload shapes.

    Each scenario seeds three fresh documents, sends the payload built from
    their ids, and checks the business code, the error message (for
    failures) or the reported ``deleted`` count (for successes), and finally
    how many documents the list endpoint still reports.

    Scenario tuple layout:
    ``(name, payload_builder, expected_code, expected_message, expected_total)``
    where ``expected_total`` is the number of documents expected to remain.
    """
    scenarios = [
        ("empty object", lambda ids: {}, 102, "should either provide doc ids or set delete_all(true)", 3),
        ("empty ids", lambda ids: {"ids": []}, 102, "should either provide doc ids or set delete_all(true)", 3),
        ("invalid id only", lambda ids: {"ids": ["invalid_id"]}, 102, "These documents do not belong to dataset", 3),
        ("not json object", lambda ids: "not json", 101, "Invalid request payload: expected object, got str", 3),
        ("delete one", lambda ids: {"ids": ids[:1]}, 0, "", 2),
        ("delete all by ids", lambda ids: {"ids": ids}, 0, "", 0),
        ("delete_all flag", lambda ids: {"delete_all": True}, 0, "", 0),
    ]
    for scenario_name, payload_builder, expected_code, expected_message, expected_total in scenarios:
        dataset_id, uploaded_docs = _seed_documents(rest_client, create_dataset, tmp_path, count=3)
        document_ids = [doc["id"] for doc in uploaded_docs]
        payload = payload_builder(document_ids)

        res = rest_client.delete(f"/datasets/{dataset_id}/documents", json=payload)
        assert res.status_code == 200, (scenario_name, res.text)
        body = res.json()
        assert body["code"] == expected_code, (scenario_name, body)
        if expected_code != 0:
            assert expected_message in body["message"], (scenario_name, body)
        else:
            # The reported count must match this scenario exactly: everything
            # that was seeded minus what is expected to remain. Accepting
            # "1 or 3" for every scenario would let a wrong count slip through.
            expected_deleted = len(document_ids) - expected_total
            assert body["data"]["deleted"] == expected_deleted, (scenario_name, body)

        list_res = rest_client.get(f"/datasets/{dataset_id}/documents", params={"page_size": 10})
        assert list_res.status_code == 200, (scenario_name, list_res.text)
        list_payload = list_res.json()
        assert list_payload["code"] == 0, (scenario_name, list_payload)
        assert list_payload["data"]["total"] == expected_total, (scenario_name, list_payload)
|
|
|
|
|
|
@pytest.mark.p2
def test_documents_delete_requires_auth(rest_client, create_dataset, tmp_path):
    """Document DELETE must answer HTTP 401 for missing or invalid tokens.

    A real document is uploaded first so the delete request targets an
    existing id.
    """
    dataset_id = create_dataset("dataset_delete_auth")
    documents_url = f"/datasets/{dataset_id}/documents"

    file_path = create_txt_file(tmp_path / "delete_auth_target.txt")
    with file_path.open("rb") as file_obj:
        upload_res = rest_client.post(documents_url, files=[("file", (file_path.name, file_obj))])
    assert upload_res.status_code == 200
    upload_payload = upload_res.json()
    assert upload_payload["code"] == 0, upload_payload
    document_id = upload_payload["data"][0]["id"]

    unauthenticated_clients = (
        ("missing token", RestClient(token=None)),
        ("invalid token", RestClient(token=INVALID_API_TOKEN)),
    )
    for scenario_name, client in unauthenticated_clients:
        res = client.delete(documents_url, json={"ids": [document_id]})
        assert res.status_code == 401, (scenario_name, res.text)
        body = res.json()
        assert body["code"] == 401, (scenario_name, body)
        assert body["message"] == "<Unauthorized '401: Unauthorized'>", (scenario_name, body)
|
|
|
|
|
|
@pytest.mark.p2
def test_documents_delete_invalid_dataset_partial_duplicate_repeat_and_cross_dataset(rest_client, create_dataset, tmp_path):
    """Cover the error paths of document DELETE.

    In order: unknown dataset id, id lists containing one invalid id at the
    start/middle/end, an id that belongs to another dataset, duplicated ids,
    and a repeated delete of ids that were already removed. The other
    dataset must still hold its single document at the end.
    """
    dataset_id, uploaded_docs = _seed_documents(rest_client, create_dataset, tmp_path, count=3)
    document_ids = [doc["id"] for doc in uploaded_docs]
    other_dataset_id, other_docs = _seed_documents(rest_client, create_dataset, tmp_path, count=1)
    other_document_id = other_docs[0]["id"]
    documents_url = f"/datasets/{dataset_id}/documents"

    # Unknown dataset.
    invalid_dataset_res = rest_client.delete(
        "/datasets/invalid_dataset_id/documents",
        json={"ids": document_ids[:1]},
    )
    assert invalid_dataset_res.status_code == 200
    invalid_dataset_payload = invalid_dataset_res.json()
    assert invalid_dataset_payload["code"] == 102, invalid_dataset_payload
    assert "You don't own the dataset invalid_dataset_id." in invalid_dataset_payload["message"], invalid_dataset_payload

    # One invalid id mixed in at the front, in the middle, and at the end.
    mixed_id_lists = (
        ["invalid_id"] + document_ids,
        document_ids[:1] + ["invalid_id"] + document_ids[1:],
        document_ids + ["invalid_id"],
    )
    for ids in mixed_id_lists:
        payload = {"ids": ids}
        res = rest_client.delete(documents_url, json=payload)
        assert res.status_code == 200, (payload, res.text)
        body = res.json()
        assert body["code"] == 102, (payload, body)
        assert "These documents do not belong to dataset" in body["message"], (payload, body)

    # A valid id, but owned by a different dataset.
    cross_dataset_res = rest_client.delete(documents_url, json={"ids": [other_document_id]})
    assert cross_dataset_res.status_code == 200
    cross_dataset_payload = cross_dataset_res.json()
    assert cross_dataset_payload["code"] == 102, cross_dataset_payload
    assert f"These documents do not belong to dataset {dataset_id}" in cross_dataset_payload["message"], cross_dataset_payload

    # Every id listed twice.
    duplicate_res = rest_client.delete(documents_url, json={"ids": document_ids + document_ids})
    assert duplicate_res.status_code == 200
    duplicate_payload = duplicate_res.json()
    assert duplicate_payload["code"] == 101, duplicate_payload
    assert "Field: <ids> - Message: <Duplicate ids:" in duplicate_payload["message"], duplicate_payload

    # First delete succeeds; repeating it must fail.
    all_ids = {"ids": document_ids}
    delete_once_res = rest_client.delete(documents_url, json=all_ids)
    assert delete_once_res.status_code == 200
    delete_once_payload = delete_once_res.json()
    assert delete_once_payload["code"] == 0, delete_once_payload

    delete_twice_res = rest_client.delete(documents_url, json=all_ids)
    assert delete_twice_res.status_code == 200
    delete_twice_payload = delete_twice_res.json()
    assert delete_twice_payload["code"] == 102, delete_twice_payload
    assert "or Document not found" in delete_twice_payload["message"], delete_twice_payload

    # None of the above may have touched the other dataset.
    other_list_res = rest_client.get(f"/datasets/{other_dataset_id}/documents")
    assert other_list_res.status_code == 200
    other_list_payload = other_list_res.json()
    assert other_list_payload["code"] == 0, other_list_payload
    assert other_list_payload["data"]["total"] == 1, other_list_payload
|
|
|
|
|
|
@pytest.mark.p2
def test_documents_delete_concurrent_and_bulk_contract(rest_client, create_dataset, tmp_path):
    """Delete documents concurrently (one id per request) and in one bulk call.

    Part one seeds 60 documents and deletes each through its own request on
    an eight-worker thread pool. Part two seeds 120 documents and deletes
    them with a single request. Both datasets must be empty afterwards.
    """
    # --- part one: one request per document, issued concurrently ----------
    dataset_id, uploaded_docs = _seed_documents(rest_client, create_dataset, tmp_path, count=60, timeout=120)
    document_ids = [doc["id"] for doc in uploaded_docs]
    documents_url = f"/datasets/{dataset_id}/documents"

    with ThreadPoolExecutor(max_workers=8) as executor:
        futures = [executor.submit(rest_client.delete, documents_url, json={"ids": [doc_id]}) for doc_id in document_ids]
    _responses = list(as_completed(futures))
    assert len(_responses) == len(document_ids), _responses

    for future in futures:
        response = future.result()
        assert response.status_code == 200, response.text
        payload = response.json()
        assert payload["code"] == 0, payload

    list_after_concurrent = rest_client.get(documents_url)
    assert list_after_concurrent.status_code == 200
    list_after_payload = list_after_concurrent.json()
    assert list_after_payload["code"] == 0, list_after_payload
    assert list_after_payload["data"]["total"] == 0, list_after_payload

    # --- part two: every id in a single request ---------------------------
    bulk_dataset_id, bulk_docs = _seed_documents(rest_client, create_dataset, tmp_path, count=120, timeout=120)
    bulk_url = f"/datasets/{bulk_dataset_id}/documents"
    bulk_ids = [doc["id"] for doc in bulk_docs]

    bulk_delete_res = rest_client.delete(bulk_url, json={"ids": bulk_ids}, timeout=120)
    assert bulk_delete_res.status_code == 200
    bulk_delete_payload = bulk_delete_res.json()
    assert bulk_delete_payload["code"] == 0, bulk_delete_payload
    assert bulk_delete_payload["data"]["deleted"] == 120, bulk_delete_payload

    bulk_list_res = rest_client.get(bulk_url)
    assert bulk_list_res.status_code == 200
    bulk_list_payload = bulk_list_res.json()
    assert bulk_list_payload["code"] == 0, bulk_list_payload
    assert bulk_list_payload["data"]["total"] == 0, bulk_list_payload
|
|
|
|
|
|
@pytest.mark.p2
def test_documents_parse_requires_auth(create_document):
    """The parse endpoint must answer HTTP 401 for missing or invalid tokens."""
    dataset_id, document_id = create_document("parse_auth_target.txt")
    parse_url = f"/datasets/{dataset_id}/documents/parse"
    payload = {"document_ids": [document_id]}

    unauthenticated_clients = (
        ("missing token", RestClient(token=None)),
        ("invalid token", RestClient(token=INVALID_API_TOKEN)),
    )
    for scenario_name, client in unauthenticated_clients:
        res = client.post(parse_url, json=payload)
        assert res.status_code == 401, (scenario_name, res.text)
        body = res.json()
        assert body["code"] == 401, (scenario_name, body)
        assert body["message"] == "<Unauthorized '401: Unauthorized'>", (scenario_name, body)
|
|
|
|
|
|
@pytest.mark.p2
def test_documents_parse_contract_matrix(rest_client, create_dataset, tmp_path):
    """Run the parse endpoint through a matrix of payload shapes.

    Each scenario seeds three fresh documents and posts the payload built
    from their ids. Failing scenarios are checked for business code and
    message. Successful scenarios wait for the targeted documents to reach
    run state ``DONE`` and then verify their processing fields through the
    list endpoint.

    Scenario tuple layout:
    ``(name, payload_builder, expected_code, expected_message)``.
    """
    scenarios = [
        ("empty ids", lambda ids: {"document_ids": []}, 102, "`document_ids` is required"),
        ("invalid id", lambda ids: {"document_ids": ["invalid_id"]}, 102, "Documents not found: ['invalid_id']"),
        ("special invalid id", lambda ids: {"document_ids": ["\\n!?。;!?\"'"]}, 102, "Documents not found:"),
        ("not json object", lambda ids: "not json", 100, "object has no attribute"),
        ("parse one", lambda ids: {"document_ids": ids[:1]}, 0, ""),
        ("parse all", lambda ids: {"document_ids": ids}, 0, ""),
    ]
    for scenario_name, payload_builder, expected_code, expected_message in scenarios:
        dataset_id, uploaded_docs = _seed_documents(rest_client, create_dataset, tmp_path, count=3)
        doc_ids = [doc["id"] for doc in uploaded_docs]
        payload = payload_builder(doc_ids)

        res = rest_client.post(f"/datasets/{dataset_id}/documents/parse", json=payload, timeout=60)
        assert res.status_code == 200, (scenario_name, res.text)
        body = res.json()
        assert body["code"] == expected_code, (scenario_name, body)
        if expected_code != 0:
            assert expected_message in body["message"], (scenario_name, body)
            continue

        target_ids = payload["document_ids"]
        _wait_document_runs(rest_client, dataset_id, target_ids, expected_run="DONE")

        detail_res = rest_client.get(f"/datasets/{dataset_id}/documents", params={"page_size": 10})
        # Validate the list response before indexing into it, so a failed
        # listing is reported as such instead of as a KeyError/TypeError.
        assert detail_res.status_code == 200, (scenario_name, detail_res.text)
        detail_payload = detail_res.json()
        assert detail_payload["code"] == 0, (scenario_name, detail_payload)

        docs = {doc["id"]: doc for doc in detail_payload["data"]["docs"]}
        for doc_id in target_ids:
            assert doc_id in docs, (scenario_name, doc_id, detail_payload)
            doc = docs[doc_id]
            assert doc["run"] == "DONE", (scenario_name, doc)
            assert doc["process_begin_at"], (scenario_name, doc)
            assert doc["process_duration"] >= 0, (scenario_name, doc)
            assert doc["progress"] >= 0, (scenario_name, doc)
            assert "Task done" in doc["progress_msg"], (scenario_name, doc)
|
|
|
|
|
|
@pytest.mark.p2
def test_documents_parse_invalid_dataset_partial_duplicate_and_repeated(rest_client, create_dataset, tmp_path):
    """Cover edge cases of the parse endpoint.

    In order: an empty and an unknown dataset id in the path, id lists with
    one invalid id at the start/middle/end, a list with every id duplicated
    (accepted, with a duplicate warning among ``data.errors``), and a
    repeated parse of documents that have already finished.
    """
    dataset_id, uploaded_docs = _seed_documents(rest_client, create_dataset, tmp_path, count=3)
    doc_ids = [doc["id"] for doc in uploaded_docs]
    parse_url = f"/datasets/{dataset_id}/documents/parse"

    # Bad dataset segment in the path. An empty id yields "/datasets//documents/parse".
    for bad_dataset in ("", "invalid_dataset_id"):
        path = f"/datasets/{bad_dataset}/documents/parse"
        res = rest_client.post(path, json={"document_ids": doc_ids})
        assert res.status_code == 200, (bad_dataset, res.text)
        body = res.json()
        if bad_dataset:
            assert body["code"] == 102, (bad_dataset, body)
            assert "You don't own the dataset" in body["message"], (bad_dataset, body)
        else:
            assert body["code"] == 100, (bad_dataset, body)
            assert "Method Not Allowed" in body["message"], (bad_dataset, body)

    # One invalid id mixed in at the front, in the middle, and at the end.
    mixed_id_lists = (
        ["invalid_id"] + doc_ids,
        doc_ids[:1] + ["invalid_id"] + doc_ids[1:],
        doc_ids + ["invalid_id"],
    )
    for ids in mixed_id_lists:
        payload = {"document_ids": ids}
        res = rest_client.post(parse_url, json=payload, timeout=60)
        assert res.status_code == 200, (payload, res.text)
        body = res.json()
        assert body["code"] == 102, (payload, body)
        assert body["message"] == "Documents not found: ['invalid_id']", (payload, body)

    # Duplicated ids are accepted: each document is counted once and a
    # duplicate notice appears among the reported errors.
    duplicate_res = rest_client.post(parse_url, json={"document_ids": doc_ids + doc_ids}, timeout=60)
    assert duplicate_res.status_code == 200
    duplicate_payload = duplicate_res.json()
    assert duplicate_payload["code"] == 0, duplicate_payload
    duplicate_data = duplicate_payload["data"]
    assert duplicate_data["success_count"] == len(doc_ids), duplicate_payload
    assert any("Duplicate document ids:" in err for err in duplicate_data.get("errors", [])), duplicate_payload
    _wait_document_runs(rest_client, dataset_id, doc_ids, expected_run="DONE")

    # Parsing already-finished documents again must still succeed.
    repeated_res = rest_client.post(parse_url, json={"document_ids": doc_ids}, timeout=60)
    assert repeated_res.status_code == 200
    repeated_payload = repeated_res.json()
    assert repeated_payload["code"] == 0, repeated_payload
    _wait_document_runs(rest_client, dataset_id, doc_ids, expected_run="DONE")
|
|
|
|
|
|
@pytest.mark.p2
def test_documents_parse_chunks_and_scaled_bulk_contract(rest_client, create_dataset, tmp_path):
    """Exercise the parse endpoint for one document, a bulk batch and concurrent callers.

    Three phases, each on a freshly seeded dataset:

    1. One document is parsed, then its ``/chunks`` listing must report a
       positive ``chunk_count`` and a non-empty ``chunks`` list.
    2. Twenty documents are parsed in a single request.
    3. Twenty documents are parsed by one request each, submitted through an
       eight-worker thread pool.

    Every parse response must be HTTP 200 with ``code == 0``; after each phase
    ``_wait_document_runs`` is called with ``expected_run="DONE"``.
    """

    def parse_url(ds_id):
        # Local shorthand for the parse endpoint of a dataset.
        return f"/datasets/{ds_id}/documents/parse"

    # Phase 1: single document, followed by a look at its chunks.
    solo_dataset, solo_docs = _seed_documents(rest_client, create_dataset, tmp_path, count=1)
    solo_id = solo_docs[0]["id"]
    solo_parse = rest_client.post(parse_url(solo_dataset), json={"document_ids": [solo_id]}, timeout=60)
    assert solo_parse.status_code == 200
    solo_parse_body = solo_parse.json()
    assert solo_parse_body["code"] == 0, solo_parse_body
    _wait_document_runs(rest_client, solo_dataset, [solo_id], expected_run="DONE")

    chunks_reply = rest_client.get(f"/datasets/{solo_dataset}/documents/{solo_id}/chunks")
    assert chunks_reply.status_code == 200, chunks_reply.text
    chunks_body = chunks_reply.json()
    assert chunks_body["code"] == 0, chunks_body
    chunk_data = chunks_body["data"]
    assert chunk_data["doc"]["chunk_count"] > 0, chunks_body
    assert len(chunk_data["chunks"]) > 0, chunks_body

    # Phase 2: twenty documents in one request.
    batch_dataset, batch_docs = _seed_documents(rest_client, create_dataset, tmp_path, count=20)
    batch_ids = [entry["id"] for entry in batch_docs]
    batch_parse = rest_client.post(parse_url(batch_dataset), json={"document_ids": batch_ids}, timeout=60)
    assert batch_parse.status_code == 200
    batch_parse_body = batch_parse.json()
    assert batch_parse_body["code"] == 0, batch_parse_body
    _wait_document_runs(rest_client, batch_dataset, batch_ids, expected_run="DONE")

    # Phase 3: one request per document, fanned out over a thread pool.
    fanout_dataset, fanout_docs = _seed_documents(rest_client, create_dataset, tmp_path, count=20)
    fanout_ids = [entry["id"] for entry in fanout_docs]
    pending = []
    with ThreadPoolExecutor(max_workers=8) as pool:
        for one_id in fanout_ids:
            task = pool.submit(
                rest_client.post,
                parse_url(fanout_dataset),
                json={"document_ids": [one_id]},
                timeout=60,
            )
            pending.append(task)
    finished = list(as_completed(pending))
    assert len(finished) == len(fanout_ids), finished
    for task in pending:
        reply = task.result()
        assert reply.status_code == 200, reply.text
        reply_body = reply.json()
        assert reply_body["code"] == 0, reply_body
    _wait_document_runs(rest_client, fanout_dataset, fanout_ids, expected_run="DONE")
|
|
|
|
|
|
@pytest.mark.p2
def test_documents_stop_parse_requires_auth(rest_client, create_document):
    """Check that the stop-parse endpoint rejects callers without a valid token.

    A parse is first started with the authenticated ``rest_client``. The stop
    endpoint is then called with a client built with ``token=None`` and one
    built with ``INVALID_API_TOKEN``; both must get HTTP 401, a body ``code``
    of 401 and the exact Unauthorized message.
    """
    dataset_id, document_id = create_document("stop_parse_auth_target.txt")
    request_json = {"document_ids": [document_id]}

    started = rest_client.post(f"/datasets/{dataset_id}/documents/parse", json=request_json)
    assert started.status_code == 200
    assert started.json()["code"] == 0, started.json()

    unauthenticated = (
        ("missing token", RestClient(token=None)),
        ("invalid token", RestClient(token=INVALID_API_TOKEN)),
    )
    stop_url = f"/datasets/{dataset_id}/documents/stop"
    for label, anonymous_client in unauthenticated:
        reply = anonymous_client.post(stop_url, json={"document_ids": [document_id]})
        assert reply.status_code == 401, (label, reply.text)
        reply_body = reply.json()
        assert reply_body["code"] == 401, (label, reply_body)
        assert reply_body["message"] == "<Unauthorized '401: Unauthorized'>", (label, reply_body)
|
|
|
|
|
|
@pytest.mark.p2
def test_documents_stop_parse_contract_matrix(rest_client, create_dataset, tmp_path):
    """Walk the stop-parse endpoint through invalid, subset, duplicate and repeated requests.

    Six documents are seeded and a parse is requested for all of them. The stop
    endpoint is then called with:

    * a table of invalid payloads, each with its expected ``code`` and a
      fragment that must appear in ``message``;
    * the first three document ids (must return ``code == 0``);
    * those three ids duplicated (``code == 0`` plus a "Duplicate document
      ids:" entry in ``data.errors``);
    * the same three ids once more, where either ``code == 0`` or ``code == 102``
      with the "not started or already completed" message is accepted.
    """
    dataset_id, seeded = _seed_documents(rest_client, create_dataset, tmp_path, count=6)
    all_ids = [entry["id"] for entry in seeded]
    head_ids = all_ids[:3]
    stop_url = f"/datasets/{dataset_id}/documents/stop"

    started = rest_client.post(f"/datasets/{dataset_id}/documents/parse", json={"document_ids": all_ids}, timeout=60)
    assert started.status_code == 200
    started_body = started.json()
    assert started_body["code"] == 0, started_body

    # (label, request body, expected code, fragment expected in the message)
    rejected_cases = (
        ("empty ids", {"document_ids": []}, 102, "`document_ids` is required"),
        ("invalid id", {"document_ids": ["invalid_id"]}, 102, "Documents not found: ['invalid_id']"),
        ("special invalid id", {"document_ids": ["\\n!?。;!?\"'"]}, 102, "Documents not found:"),
        ("not json object", "not json", 100, "object has no attribute"),
    )
    for label, request_json, want_code, want_fragment in rejected_cases:
        reply = rest_client.post(stop_url, json=request_json, timeout=60)
        assert reply.status_code == 200, (label, reply.text)
        reply_body = reply.json()
        assert reply_body["code"] == want_code, (label, reply_body)
        assert want_fragment in reply_body["message"], (label, reply_body)

    # Stop only the first three documents.
    subset_reply = rest_client.post(stop_url, json={"document_ids": head_ids}, timeout=60)
    assert subset_reply.status_code == 200
    subset_body = subset_reply.json()
    assert subset_body["code"] == 0, subset_body
    assert subset_body["data"]["success_count"] >= 0, subset_body

    # The same three ids listed twice must be reported as duplicates.
    dup_reply = rest_client.post(stop_url, json={"document_ids": head_ids + head_ids}, timeout=60)
    assert dup_reply.status_code == 200
    dup_body = dup_reply.json()
    assert dup_body["code"] == 0, dup_body
    dup_errors = dup_body["data"].get("errors", [])
    assert any("Duplicate document ids:" in message for message in dup_errors), dup_body

    # A repeated stop may succeed or report that nothing is running.
    again_reply = rest_client.post(stop_url, json={"document_ids": head_ids}, timeout=60)
    assert again_reply.status_code == 200
    again_body = again_reply.json()
    assert again_body["code"] in (0, 102), again_body
    if again_body["code"] == 102:
        assert "Can't stop parsing document that has not started or already completed" in again_body["message"], again_body
|
|
|
|
|
|
@pytest.mark.p2
def test_documents_stop_parse_invalid_dataset_partial_and_scaled_concurrency(rest_client, create_dataset, tmp_path):
    """Cover stop-parse with bad dataset ids, partially invalid id lists and concurrent callers.

    Twenty-five documents are seeded and a parse is requested for all of them.
    The test then checks that:

    * an empty dataset id yields ``code == 100`` ("Method Not Allowed") and an
      unknown one yields ``code == 102`` ("You don't own the dataset");
    * an ``invalid_id`` placed first, in the middle or last among three real
      ids yields ``code == 102`` with the exact "Documents not found" message;
    * one stop request per document, submitted through an eight-worker thread
      pool, all return HTTP 200 with ``code == 0``;
    * a final stop request for every document returns ``code == 0``.
    """
    dataset_id, seeded = _seed_documents(rest_client, create_dataset, tmp_path, count=25)
    all_ids = [entry["id"] for entry in seeded]
    stop_url = f"/datasets/{dataset_id}/documents/stop"

    started = rest_client.post(f"/datasets/{dataset_id}/documents/parse", json={"document_ids": all_ids}, timeout=60)
    assert started.status_code == 200
    assert started.json()["code"] == 0, started.json()

    # (dataset id placed in the URL, expected code, fragment expected in the message)
    dataset_cases = (
        ("", 100, "Method Not Allowed"),
        ("invalid_dataset_id", 102, "You don't own the dataset"),
    )
    for bad_dataset, want_code, want_fragment in dataset_cases:
        # With an empty id this renders as "/datasets//documents/stop".
        reply = rest_client.post(f"/datasets/{bad_dataset}/documents/stop", json={"document_ids": all_ids[:1]})
        assert reply.status_code == 200, (bad_dataset, reply.text)
        reply_body = reply.json()
        assert reply_body["code"] == want_code, (bad_dataset, reply_body)
        assert want_fragment in reply_body["message"], (bad_dataset, reply_body)

    # The invalid id is tried at the front, in the middle and at the end.
    mixed_id_lists = (
        ["invalid_id"] + all_ids[:3],
        all_ids[:1] + ["invalid_id"] + all_ids[1:3],
        all_ids[:3] + ["invalid_id"],
    )
    for id_list in mixed_id_lists:
        request_json = {"document_ids": id_list}
        reply = rest_client.post(stop_url, json=request_json, timeout=60)
        assert reply.status_code == 200, (request_json, reply.text)
        reply_body = reply.json()
        assert reply_body["code"] == 102, (request_json, reply_body)
        assert reply_body["message"] == "Documents not found: ['invalid_id']", (request_json, reply_body)

    # One stop request per document, issued from a thread pool.
    pending = []
    with ThreadPoolExecutor(max_workers=8) as pool:
        for one_id in all_ids:
            pending.append(pool.submit(rest_client.post, stop_url, json={"document_ids": [one_id]}, timeout=60))
    replies = []
    for task in pending:
        replies.append(task.result())
    assert len(replies) == len(all_ids), replies
    assert all(item.status_code == 200 for item in replies)
    assert all(item.json()["code"] == 0 for item in replies)

    # Finally stop everything in a single request.
    final_reply = rest_client.post(stop_url, json={"document_ids": all_ids}, timeout=60)
    assert final_reply.status_code == 200
    final_body = final_reply.json()
    assert final_body["code"] == 0, final_body
    assert final_body["data"]["success_count"] >= 0, final_body
|
|
|
|
|
|
@pytest.mark.p2
def test_documents_download_requires_auth_and_invalid_id_contract(rest_client, create_document, tmp_path):
    """Check download rejection for unauthenticated callers and mismatched ids.

    Clients built with ``token=None`` and with ``INVALID_API_TOKEN`` must get
    HTTP 401 and a body ``code`` of 401. With the authenticated client, an
    unknown document id and an unknown dataset id must each return HTTP 200
    with ``code == 102`` and a "The dataset not own the document ..." message
    naming the requested document id.
    """
    dataset_id, document_id = create_document("download_target.txt")

    unauthenticated = (
        ("missing token", RestClient(token=None)),
        ("invalid token", RestClient(token=INVALID_API_TOKEN)),
    )
    for label, anonymous_client in unauthenticated:
        reply = anonymous_client.get(f"/datasets/{dataset_id}/documents/{document_id}")
        assert reply.status_code == 401, (label, reply.text)
        reply_body = reply.json()
        assert reply_body["code"] == 401, (label, reply_body)

    # (target file name, dataset id to request, document id to request)
    mismatch_cases = (
        ("invalid_doc_download.txt", dataset_id, "invalid_document_id"),
        ("invalid_dataset_download.txt", "invalid_dataset_id", document_id),
    )
    for file_name, wanted_dataset, wanted_document in mismatch_cases:
        target = tmp_path / file_name
        reply = _download_document_to_file(rest_client, wanted_dataset, wanted_document, target)
        assert reply.status_code == 200
        reply_body = reply.json()
        assert reply_body["code"] == 102, reply_body
        assert f"The dataset not own the document {wanted_document}." in reply_body["message"], reply_body
|
|
|
|
|
|
@pytest.mark.p2
def test_documents_download_filetype_repeat_and_concurrent_contract(rest_client, create_dataset, tmp_path):
    """Verify that downloads return the uploaded bytes across file types, repeats and threads.

    One file per extension in ``creators`` is generated under ``tmp_path`` and
    uploaded to a new dataset. The test then checks, using ``compare_by_hash``
    against the source file, that:

    * every uploaded document downloads once with HTTP 200;
    * the first uploaded document downloads identically five times in a row;
    * all documents download correctly when requested through an eight-worker
      thread pool, each to its own target file.
    """
    dataset_id = create_dataset("dataset_download_contract")

    # Extension -> factory that writes a sample file of that type to the given path.
    creators = {
        "docx": create_docx_file,
        "xlsx": create_excel_file,
        "pptx": create_ppt_file,
        "png": create_image_file,
        "pdf": create_pdf_file,
        "txt": create_txt_file,
        "md": create_md_file,
        "json": create_json_file,
        "eml": create_eml_file,
        "html": create_html_file,
    }

    # Pairs of (local source path, id assigned by the upload endpoint).
    uploaded = []
    for ext, creator in creators.items():
        source_path = Path(creator(tmp_path / f"download_type_{ext}.{ext}"))
        with source_path.open("rb") as file_obj:
            upload_res = rest_client.post(
                f"/datasets/{dataset_id}/documents",
                files=[("file", (source_path.name, file_obj))],
            )
        assert upload_res.status_code == 200, (ext, upload_res.text)
        upload_payload = upload_res.json()
        assert upload_payload["code"] == 0, (ext, upload_payload)
        uploaded.append((source_path, upload_payload["data"][0]["id"]))

    # Single download of every file type.
    for source_path, document_id in uploaded:
        target_path = tmp_path / f"download_once_{source_path.name}"
        download_res = _download_document_to_file(rest_client, dataset_id, document_id, target_path)
        assert download_res.status_code == 200, (source_path.name, download_res.text)
        assert compare_by_hash(source_path, target_path), source_path.name

    # Repeated download of the same document.
    first_source, first_document_id = uploaded[0]
    for index in range(5):
        repeated_path = tmp_path / f"download_repeat_{index}_{first_source.name}"
        repeated_res = _download_document_to_file(rest_client, dataset_id, first_document_id, repeated_path)
        assert repeated_res.status_code == 200, (index, repeated_res.text)
        assert compare_by_hash(first_source, repeated_path), index

    # Concurrent downloads, one distinct target file per document.
    with ThreadPoolExecutor(max_workers=8) as executor:
        futures = [
            executor.submit(
                _download_document_to_file,
                rest_client,
                dataset_id,
                document_id,
                tmp_path / f"download_concurrent_{i}_{source_path.name}",
            )
            for i, (source_path, document_id) in enumerate(uploaded)
        ]
    assert len(futures) == len(uploaded), futures
    for (source_path, _document_id), future in zip(uploaded, futures):
        # result() re-raises any exception raised in the worker thread, so a
        # failed download is reported here instead of as a missing file below.
        concurrent_res = future.result()
        assert concurrent_res.status_code == 200, (source_path.name, concurrent_res.text)
    for i, (source_path, _document_id) in enumerate(uploaded):
        downloaded_path = tmp_path / f"download_concurrent_{i}_{source_path.name}"
        assert downloaded_path.exists(), source_path.name
        assert compare_by_hash(source_path, downloaded_path), source_path.name
|
|
|
|
|
|
@pytest.mark.p2
def test_documents_table_parser_chat_patterns(rest_client, clear_datasets, tmp_path):
    """Ask chat questions against a dataset created with ``chunk_method="table"``.

    The test creates the dataset, uploads two small Excel workbooks (an
    employee table and a product table) in one request, parses them, then
    creates a chat bound to the dataset and a session in that chat. Each
    question is posted to ``/chat/completions`` with ``stream=False``; every
    reply must be HTTP 200 with ``code == 0`` and a non-blank string answer.
    """
    dataset_name = f"table_parser_dataset_contract_{uuid.uuid4().hex[:8]}"
    create_dataset_res = rest_client.post("/datasets", json={"name": dataset_name, "chunk_method": "table"})
    assert create_dataset_res.status_code == 200
    create_dataset_payload = create_dataset_res.json()
    assert create_dataset_payload["code"] == 0, create_dataset_payload
    dataset_id = create_dataset_payload["data"]["id"]

    # First row of each workbook is the header row.
    employee_rows = [
        ["employee_id", "name", "department", "salary"],
        ["E001", "Alice Johnson", "Engineering", "95000"],
        ["E002", "Bob Smith", "Marketing", "65000"],
        ["E003", "Carol Williams", "Engineering", "88000"],
    ]
    product_rows = [
        ["product", "price", "category"],
        ["Laptop", "999", "Electronics"],
        ["Keyboard", "79", "Electronics"],
        ["Desk", "299", "Furniture"],
    ]
    excel_a = _create_table_excel(tmp_path / "table_a.xlsx", employee_rows)
    excel_b = _create_table_excel(tmp_path / "table_b.xlsx", product_rows)

    # Both workbooks stay open for the duration of the multipart upload.
    with ExitStack() as stack:
        files = []
        for workbook in (excel_a, excel_b):
            handle = stack.enter_context(workbook.open("rb"))
            files.append(("file", (workbook.name, handle)))
        upload_res = rest_client.post(f"/datasets/{dataset_id}/documents", files=files)
    assert upload_res.status_code == 200
    upload_payload = upload_res.json()
    assert upload_payload["code"] == 0, upload_payload
    document_ids = [doc["id"] for doc in upload_payload["data"]]

    parse_res = rest_client.post(f"/datasets/{dataset_id}/documents/parse", json={"document_ids": document_ids}, timeout=60)
    assert parse_res.status_code == 200
    parse_payload = parse_res.json()
    assert parse_payload["code"] == 0, parse_payload
    _wait_document_runs(rest_client, dataset_id, document_ids, expected_run="DONE")

    # Chat assistant bound to the table dataset.
    prompt_config = {
        "system": "Use table knowledge to answer questions.",
        "parameters": [{"key": "knowledge", "optional": True, "value": "Answer with table evidence."}],
    }
    chat_payload = {
        "name": f"table_parser_chat_{uuid.uuid4().hex[:8]}",
        "dataset_ids": [dataset_id],
        "prompt_config": prompt_config,
    }
    create_chat_res = rest_client.post("/chats", json=chat_payload)
    assert create_chat_res.status_code == 200
    create_chat_payload = create_chat_res.json()
    assert create_chat_payload["code"] == 0, create_chat_payload
    chat_id = create_chat_payload["data"]["id"]

    create_session_res = rest_client.post(f"/chats/{chat_id}/sessions", json={"name": "table_parser_session"})
    assert create_session_res.status_code == 200
    create_session_payload = create_session_res.json()
    assert create_session_payload["code"] == 0, create_session_payload
    session_id = create_session_payload["data"]["id"]

    questions = (
        "show me column of product",
        "which product has price 79",
        "How many rows in the dataset?",
        "Show me all employees in Engineering department",
    )
    for question in questions:
        request_json = {
            "chat_id": chat_id,
            "session_id": session_id,
            "messages": [{"role": "user", "content": question}],
            "stream": False,
        }
        completion_res = rest_client.post("/chat/completions", json=request_json, timeout=60)
        assert completion_res.status_code == 200, (question, completion_res.text)
        completion_payload = completion_res.json()
        assert completion_payload["code"] == 0, (question, completion_payload)
        answer = completion_payload["data"]["answer"]
        assert isinstance(answer, str) and answer.strip(), (question, completion_payload)
|