From 036ed5b236b1bcde9cd70e29c51ea29a9011dec3 Mon Sep 17 00:00:00 2001 From: Idriss Sbaaoui <112825897+6ba3i@users.noreply.github.com> Date: Tue, 26 May 2026 13:24:22 +0800 Subject: [PATCH] Feat: add new tests and testcases for restful api suite (#15230) ### What problem does this PR solve? extend restful api suite ### Type of change - [x] New Feature (non-breaking change which adds functionality) - [x] Other (please describe): test --- test/testcases/restful_api/test_datasets.py | 975 +++++++++++++++++++- 1 file changed, 974 insertions(+), 1 deletion(-) diff --git a/test/testcases/restful_api/test_datasets.py b/test/testcases/restful_api/test_datasets.py index 4c8581a3fa..82ffdbc6e0 100644 --- a/test/testcases/restful_api/test_datasets.py +++ b/test/testcases/restful_api/test_datasets.py @@ -16,11 +16,14 @@ from concurrent.futures import ThreadPoolExecutor, as_completed import os +import uuid import pytest from configs import DATASET_NAME_LIMIT, DEFAULT_PARSER_CONFIG +from test.testcases.configs import INVALID_API_TOKEN +from test.testcases.restful_api.helpers.client import RestClient from test.testcases.utils import encode_avatar -from test.testcases.utils.file_utils import create_image_file +from test.testcases.utils.file_utils import create_image_file, create_txt_file @pytest.mark.p1 @@ -580,6 +583,468 @@ def test_dataset_update_concurrent_contract(rest_client, clear_datasets): assert payload["code"] == 0, (index, payload) +@pytest.mark.p1 +def test_dataset_update_requires_auth_contract(rest_client, clear_datasets): + create_res = rest_client.post("/datasets", json={"name": "dataset_update_auth_contract"}) + assert create_res.status_code == 200 + create_payload = create_res.json() + assert create_payload["code"] == 0, create_payload + dataset_id = create_payload["data"]["id"] + + for scenario_name, client in (("missing token", RestClient(token=None)), ("invalid token", RestClient(token=INVALID_API_TOKEN))): + res = client.put(f"/datasets/{dataset_id}", 
json={"name": "dataset_update_auth_invalid"}) + assert res.status_code == 401, (scenario_name, res.text) + payload = res.json() + assert payload["code"] == 401, (scenario_name, payload) + assert payload["message"] == "", (scenario_name, payload) + + +@pytest.mark.p2 +def test_dataset_update_content_type_and_payload_contract(rest_client, clear_datasets): + create_res = rest_client.post("/datasets", json={"name": "dataset_update_payload_contract"}) + assert create_res.status_code == 200 + create_payload = create_res.json() + assert create_payload["code"] == 0, create_payload + dataset_id = create_payload["data"]["id"] + + bad_content_type = "text/xml" + bad_content_type_res = rest_client.put( + f"/datasets/{dataset_id}", + data='{"name": "bad_content_type"}', + headers={"Content-Type": bad_content_type}, + ) + assert bad_content_type_res.status_code == 200 + bad_content_type_payload = bad_content_type_res.json() + assert bad_content_type_payload["code"] == 101, bad_content_type_payload + assert ( + f"Unsupported content type: Expected application/json, got {bad_content_type}" in bad_content_type_payload["message"] + ), bad_content_type_payload + + malformed_json_res = rest_client.put(f"/datasets/{dataset_id}", data="a") + assert malformed_json_res.status_code == 200 + malformed_json_payload = malformed_json_res.json() + assert malformed_json_payload["code"] == 101, malformed_json_payload + assert "Malformed JSON syntax: Missing commas/brackets or invalid encoding" in malformed_json_payload["message"], malformed_json_payload + + invalid_payload_type_res = rest_client.put(f"/datasets/{dataset_id}", data='"a"') + assert invalid_payload_type_res.status_code == 200 + invalid_payload_type_payload = invalid_payload_type_res.json() + assert invalid_payload_type_payload["code"] == 101, invalid_payload_type_payload + assert "Invalid request payload: expected object, got str" in invalid_payload_type_payload["message"], invalid_payload_type_payload + + empty_payload_res = 
rest_client.put(f"/datasets/{dataset_id}", json={}) + assert empty_payload_res.status_code == 200 + empty_payload = empty_payload_res.json() + assert empty_payload["code"] == 102, empty_payload + assert empty_payload["message"] == "No properties were modified", empty_payload + + unset_payload_res = rest_client.put(f"/datasets/{dataset_id}") + assert unset_payload_res.status_code == 200 + unset_payload = unset_payload_res.json() + assert unset_payload["code"] == 101, unset_payload + assert "Malformed JSON syntax: Missing commas/brackets or invalid encoding" in unset_payload["message"], unset_payload + + +@pytest.mark.p2 +def test_dataset_update_identifier_validation_contract(rest_client): + payload = {"name": "dataset_update_identifier_validation"} + + not_uuid_res = rest_client.put("/datasets/not_uuid", json=payload) + assert not_uuid_res.status_code == 200 + not_uuid_payload = not_uuid_res.json() + assert not_uuid_payload["code"] == 101, not_uuid_payload + assert "Invalid UUID1 format" in not_uuid_payload["message"], not_uuid_payload + + not_uuid1_res = rest_client.put(f"/datasets/{uuid.uuid4().hex}", json=payload) + assert not_uuid1_res.status_code == 200 + not_uuid1_payload = not_uuid1_res.json() + assert not_uuid1_payload["code"] == 101, not_uuid1_payload + assert "Invalid UUID1 format" in not_uuid1_payload["message"], not_uuid1_payload + + wrong_uuid_res = rest_client.put("/datasets/d94a8dc02c9711f0930f7fbc369eab6d", json=payload) + assert wrong_uuid_res.status_code == 200 + wrong_uuid_payload = wrong_uuid_res.json() + assert wrong_uuid_payload["code"] == 102, wrong_uuid_payload + assert "lacks permission for dataset" in wrong_uuid_payload["message"], wrong_uuid_payload + + +@pytest.mark.p2 +def test_dataset_update_avatar_invalid_and_none_contract(rest_client, clear_datasets, tmp_path): + create_res = rest_client.post("/datasets", json={"name": "dataset_update_avatar_invalid_contract"}) + assert create_res.status_code == 200 + create_payload = 
create_res.json() + assert create_payload["code"] == 0, create_payload + dataset_id = create_payload["data"]["id"] + + exceed_res = rest_client.put( + f"/datasets/{dataset_id}", + json={"avatar": "a" * 65536}, + ) + assert exceed_res.status_code == 200 + exceed_payload = exceed_res.json() + assert exceed_payload["code"] == 101, exceed_payload + assert "String should have at most 65535 characters" in exceed_payload["message"], exceed_payload + + image_path = create_image_file(tmp_path / "dataset_update_avatar_invalid.png") + encoded_avatar = encode_avatar(image_path) + invalid_prefix_cases = [ + ("", "Missing MIME prefix. Expected format: data:;base64,"), + ("data:image/png;base64", "Missing MIME prefix. Expected format: data:;base64,"), + ("invalid_mine_prefix:image/png;base64,", "Invalid MIME prefix format. Must start with 'data:'"), + ("data:unsupported_mine_type;base64,", "Unsupported MIME type. Allowed: ['image/jpeg', 'image/png']"), + ] + for prefix, expected_message in invalid_prefix_cases: + res = rest_client.put( + f"/datasets/{dataset_id}", + json={"avatar": f"{prefix}{encoded_avatar}"}, + ) + assert res.status_code == 200 + payload = res.json() + assert payload["code"] == 101, payload + assert expected_message in payload["message"], payload + + none_res = rest_client.put(f"/datasets/{dataset_id}", json={"avatar": None}) + assert none_res.status_code == 200 + none_payload = none_res.json() + assert none_payload["code"] == 0, none_payload + + list_res = rest_client.get("/datasets", params={"id": dataset_id}) + assert list_res.status_code == 200 + list_payload = list_res.json() + assert list_payload["code"] == 0, list_payload + assert list_payload["data"][0]["avatar"] is None, list_payload + + +@pytest.mark.p2 +def test_dataset_update_description_validation_contract(rest_client, clear_datasets): + create_res = rest_client.post("/datasets", json={"name": "dataset_update_description_contract"}) + assert create_res.status_code == 200 + create_payload = 
create_res.json() + assert create_payload["code"] == 0, create_payload + dataset_id = create_payload["data"]["id"] + + exceeds_limit_res = rest_client.put( + f"/datasets/{dataset_id}", + json={"description": "a" * 65536}, + ) + assert exceeds_limit_res.status_code == 200 + exceeds_limit_payload = exceeds_limit_res.json() + assert exceeds_limit_payload["code"] == 101, exceeds_limit_payload + assert "String should have at most 65535 characters" in exceeds_limit_payload["message"], exceeds_limit_payload + + none_res = rest_client.put(f"/datasets/{dataset_id}", json={"description": None}) + assert none_res.status_code == 200 + none_payload = none_res.json() + assert none_payload["code"] == 0, none_payload + + list_res = rest_client.get("/datasets", params={"id": dataset_id}) + assert list_res.status_code == 200 + list_payload = list_res.json() + assert list_payload["code"] == 0, list_payload + assert list_payload["data"][0]["description"] is None, list_payload + + +@pytest.mark.p2 +def test_dataset_update_name_invalid_and_duplicate_contract(rest_client, clear_datasets): + first_res = rest_client.post("/datasets", json={"name": "dataset_update_name_invalid_first"}) + assert first_res.status_code == 200 + first_payload = first_res.json() + assert first_payload["code"] == 0, first_payload + first_dataset_id = first_payload["data"]["id"] + + second_res = rest_client.post("/datasets", json={"name": "dataset_update_name_invalid_second"}) + assert second_res.status_code == 200 + second_payload = second_res.json() + assert second_payload["code"] == 0, second_payload + + invalid_cases = [ + ("", "String should have at least 1 character"), + (" ", "String should have at least 1 character"), + ("a" * (DATASET_NAME_LIMIT + 1), f"String should have at most {DATASET_NAME_LIMIT} characters"), + (0, "Input should be a valid string"), + (None, "Input should be a valid string"), + ] + for name, expected_message in invalid_cases: + res = rest_client.put(f"/datasets/{first_dataset_id}", 
json={"name": name}) + assert res.status_code == 200 + payload = res.json() + assert payload["code"] == 101, payload + assert expected_message in payload["message"], payload + + duplicated_res = rest_client.put( + f"/datasets/{first_dataset_id}", + json={"name": second_payload["data"]["name"]}, + ) + assert duplicated_res.status_code == 200 + duplicated_payload = duplicated_res.json() + assert duplicated_payload["code"] == 102, duplicated_payload + assert f"Dataset name '{second_payload['data']['name']}' already exists" == duplicated_payload["message"], duplicated_payload + + +@pytest.mark.p2 +def test_dataset_update_embedding_model_invalid_and_none_contract(rest_client, clear_datasets): + create_res = rest_client.post("/datasets", json={"name": "dataset_update_embedding_invalid_contract"}) + assert create_res.status_code == 200 + create_payload = create_res.json() + assert create_payload["code"] == 0, create_payload + dataset_id = create_payload["data"]["id"] + + invalid_cases = [ + ("unknown@ZHIPU-AI", "Unsupported model: "), + ("embedding-3@unknown", "Unsupported model: "), + ("text-embedding-v3@Tongyi-Qianwen", "Unauthorized model: "), + ("text-embedding-3-small@OpenAI", "Unauthorized model: "), + ] + for embedding_model, expected_message in invalid_cases: + res = rest_client.put( + f"/datasets/{dataset_id}", + json={"embedding_model": embedding_model}, + ) + assert res.status_code == 200 + payload = res.json() + assert payload["code"] == 102, payload + assert payload["message"] == expected_message, payload + + none_res = rest_client.put(f"/datasets/{dataset_id}", json={"embedding_model": None}) + assert none_res.status_code == 200 + none_payload = none_res.json() + assert none_payload["code"] == 0, none_payload + + list_res = rest_client.get("/datasets", params={"id": dataset_id}) + assert list_res.status_code == 200 + list_payload = list_res.json() + assert list_payload["code"] == 0, list_payload + assert list_payload["data"][0]["embedding_model"] == 
"BAAI/bge-small-en-v1.5@Builtin", list_payload + + +@pytest.mark.p2 +def test_dataset_update_permission_invalid_and_none_contract(rest_client, clear_datasets): + create_res = rest_client.post("/datasets", json={"name": "dataset_update_permission_invalid_contract"}) + assert create_res.status_code == 200 + create_payload = create_res.json() + assert create_payload["code"] == 0, create_payload + dataset_id = create_payload["data"]["id"] + + invalid_permissions = ["", "unknown", [], "ME", "TEAM", " ME "] + for permission in invalid_permissions: + res = rest_client.put(f"/datasets/{dataset_id}", json={"permission": permission}) + assert res.status_code == 200 + payload = res.json() + assert payload["code"] == 101, payload + assert "Input should be 'me' or 'team'" in payload["message"], payload + + none_res = rest_client.put(f"/datasets/{dataset_id}", json={"permission": None}) + assert none_res.status_code == 200 + none_payload = none_res.json() + assert none_payload["code"] == 101, none_payload + assert "Input should be 'me' or 'team'" in none_payload["message"], none_payload + + +@pytest.mark.p2 +def test_dataset_update_chunk_method_invalid_contract(rest_client, clear_datasets): + create_res = rest_client.post("/datasets", json={"name": "dataset_update_chunk_method_invalid_contract"}) + assert create_res.status_code == 200 + create_payload = create_res.json() + assert create_payload["code"] == 0, create_payload + dataset_id = create_payload["data"]["id"] + + expected_chunk_message = ( + "Input should be 'naive', 'book', 'email', 'laws', 'manual', 'one', 'paper', " + "'picture', 'presentation', 'qa', 'table', 'tag' or 'resume'" + ) + for chunk_method in ("", "unknown", []): + res = rest_client.put(f"/datasets/{dataset_id}", json={"chunk_method": chunk_method}) + assert res.status_code == 200 + payload = res.json() + assert payload["code"] == 101, payload + assert expected_chunk_message in payload["message"], payload + + none_res = 
rest_client.put(f"/datasets/{dataset_id}", json={"chunk_method": None}) + assert none_res.status_code == 200 + none_payload = none_res.json() + assert none_payload["code"] == 101, none_payload + assert expected_chunk_message in none_payload["message"], none_payload + + +@pytest.mark.p2 +def test_dataset_update_pagerank_invalid_and_none_contract(rest_client, clear_datasets): + create_res = rest_client.post("/datasets", json={"name": "dataset_update_pagerank_invalid_contract"}) + assert create_res.status_code == 200 + create_payload = create_res.json() + assert create_payload["code"] == 0, create_payload + dataset_id = create_payload["data"]["id"] + + for pagerank, expected_message in ((-1, "Input should be greater than or equal to 0"), (101, "Input should be less than or equal to 100")): + res = rest_client.put(f"/datasets/{dataset_id}", json={"pagerank": pagerank}) + assert res.status_code == 200 + payload = res.json() + assert payload["code"] == 101, payload + assert expected_message in payload["message"], payload + + none_res = rest_client.put(f"/datasets/{dataset_id}", json={"pagerank": None}) + assert none_res.status_code == 200 + none_payload = none_res.json() + assert none_payload["code"] == 101, none_payload + assert "Input should be a valid integer" in none_payload["message"], none_payload + + +@pytest.mark.p2 +def test_dataset_update_parser_config_defaults_contract(rest_client, clear_datasets): + create_res = rest_client.post("/datasets", json={"name": "dataset_update_parser_defaults_contract"}) + assert create_res.status_code == 200 + create_payload = create_res.json() + assert create_payload["code"] == 0, create_payload + dataset_id = create_payload["data"]["id"] + + empty_res = rest_client.put(f"/datasets/{dataset_id}", json={"parser_config": {}}) + assert empty_res.status_code == 200 + empty_payload = empty_res.json() + assert empty_payload["code"] == 0, empty_payload + + none_res = rest_client.put(f"/datasets/{dataset_id}", json={"parser_config": 
None}) + assert none_res.status_code == 200 + none_payload = none_res.json() + assert none_payload["code"] == 0, none_payload + + list_res = rest_client.get("/datasets", params={"id": dataset_id}) + assert list_res.status_code == 200 + list_payload = list_res.json() + assert list_payload["code"] == 0, list_payload + assert list_payload["data"][0]["parser_config"] == DEFAULT_PARSER_CONFIG, list_payload + + +@pytest.mark.p2 +def test_dataset_update_parser_config_invalid_contract(rest_client, clear_datasets): + create_res = rest_client.post("/datasets", json={"name": "dataset_update_parser_invalid_contract"}) + assert create_res.status_code == 200 + create_payload = create_res.json() + assert create_payload["code"] == 0, create_payload + dataset_id = create_payload["data"]["id"] + + invalid_cases = [ + ({"auto_keywords": -1}, "Input should be greater than or equal to 0"), + ({"auto_keywords": 33}, "Input should be less than or equal to 32"), + ({"auto_keywords": 3.14}, "Input should be a valid integer"), + ({"auto_keywords": "string"}, "Input should be a valid integer"), + ({"auto_questions": -1}, "Input should be greater than or equal to 0"), + ({"auto_questions": 11}, "Input should be less than or equal to 10"), + ({"auto_questions": 3.14}, "Input should be a valid integer"), + ({"auto_questions": "string"}, "Input should be a valid integer"), + ({"chunk_token_num": 0}, "Input should be greater than or equal to 1"), + ({"chunk_token_num": 2049}, "Input should be less than or equal to 2048"), + ({"chunk_token_num": 3.14}, "Input should be a valid integer"), + ({"chunk_token_num": "string"}, "Input should be a valid integer"), + ({"delimiter": ""}, "String should have at least 1 character"), + ({"html4excel": "string"}, "Input should be a valid boolean"), + ({"tag_kb_ids": "1,2"}, "Input should be a valid list"), + ({"tag_kb_ids": [1, 2]}, "Input should be a valid string"), + ({"topn_tags": 0}, "Input should be greater than or equal to 1"), + ({"topn_tags": 11}, 
"Input should be less than or equal to 10"), + ({"topn_tags": 3.14}, "Input should be a valid integer"), + ({"topn_tags": "string"}, "Input should be a valid integer"), + ({"filename_embd_weight": -1}, "Input should be greater than or equal to 0"), + ({"filename_embd_weight": 1.1}, "Input should be less than or equal to 1"), + ({"filename_embd_weight": "string"}, "Input should be a valid number"), + ({"task_page_size": 0}, "Input should be greater than or equal to 1"), + ({"task_page_size": 3.14}, "Input should be a valid integer"), + ({"task_page_size": "string"}, "Input should be a valid integer"), + ({"pages": "1,2"}, "Input should be a valid list"), + ({"pages": ["1,2"]}, "Input should be a valid list"), + ({"pages": [["string1", "string2"]]}, "Input should be a valid integer"), + ({"graphrag": {"use_graphrag": "string"}}, "Input should be a valid boolean"), + ({"graphrag": {"entity_types": "1,2"}}, "Input should be a valid list"), + ({"graphrag": {"entity_types": [1, 2]}}, "nput should be a valid string"), + ({"graphrag": {"method": "unknown"}}, "Input should be 'light', 'general' or 'ner'"), + ({"graphrag": {"method": None}}, "Input should be 'light', 'general' or 'ner'"), + ({"graphrag": {"community": "string"}}, "Input should be a valid boolean"), + ({"graphrag": {"resolution": "string"}}, "Input should be a valid boolean"), + ({"raptor": {"use_raptor": "string"}}, "Input should be a valid boolean"), + ({"raptor": {"prompt": ""}}, "String should have at least 1 character"), + ({"raptor": {"prompt": " "}}, "String should have at least 1 character"), + ({"raptor": {"max_token": 0}}, "Input should be greater than or equal to 1"), + ({"raptor": {"max_token": 2049}}, "Input should be less than or equal to 2048"), + ({"raptor": {"max_token": 3.14}}, "Input should be a valid integer"), + ({"raptor": {"max_token": "string"}}, "Input should be a valid integer"), + ({"raptor": {"threshold": -0.1}}, "Input should be greater than or equal to 0"), + ({"raptor": 
{"threshold": 1.1}}, "Input should be less than or equal to 1"), + ({"raptor": {"threshold": "string"}}, "Input should be a valid number"), + ({"raptor": {"max_cluster": 0}}, "Input should be greater than or equal to 1"), + ({"raptor": {"max_cluster": 1025}}, "Input should be less than or equal to 1024"), + ({"raptor": {"max_cluster": 3.14}}, "Input should be a valid integer"), + ({"raptor": {"max_cluster": "string"}}, "Input should be a valid integer"), + ({"raptor": {"random_seed": -1}}, "Input should be greater than or equal to 0"), + ({"raptor": {"random_seed": 3.14}}, "Input should be a valid integer"), + ({"raptor": {"random_seed": "string"}}, "Input should be a valid integer"), + ({"raptor": {"clustering_method": "unknown"}}, "Input should be 'gmm' or 'ahc'"), + ({"raptor": {"clustering_method": None}}, "Input should be 'gmm' or 'ahc'"), + ({"raptor": {"tree_builder": "ahc"}}, "Input should be 'raptor' or 'psi'"), + ({"raptor": {"tree_builder": None}}, "Input should be 'raptor' or 'psi'"), + ({"delimiter": "a" * 65536}, "Parser config exceeds size limit (max 65,535 characters)"), + ] + for parser_config, expected_message in invalid_cases: + res = rest_client.put( + f"/datasets/{dataset_id}", + json={"parser_config": parser_config}, + ) + assert res.status_code == 200 + payload = res.json() + assert payload["code"] == 101, payload + assert expected_message in payload["message"], payload + + +@pytest.mark.p2 +def test_dataset_update_field_unset_and_unsupported_contract(rest_client, clear_datasets): + create_res = rest_client.post("/datasets", json={"name": "dataset_update_field_unset_contract"}) + assert create_res.status_code == 200 + create_payload = create_res.json() + assert create_payload["code"] == 0, create_payload + dataset_id = create_payload["data"]["id"] + + list_res = rest_client.get("/datasets", params={"id": dataset_id}) + assert list_res.status_code == 200 + list_payload = list_res.json() + assert list_payload["code"] == 0, list_payload + 
original_data = list_payload["data"][0] + + name_update_res = rest_client.put(f"/datasets/{dataset_id}", json={"name": "dataset_update_field_unset_renamed"}) + assert name_update_res.status_code == 200 + name_update_payload = name_update_res.json() + assert name_update_payload["code"] == 0, name_update_payload + + after_list_res = rest_client.get("/datasets", params={"id": dataset_id}) + assert after_list_res.status_code == 200 + after_list_payload = after_list_res.json() + assert after_list_payload["code"] == 0, after_list_payload + assert after_list_payload["data"][0]["avatar"] == original_data["avatar"], after_list_payload + assert after_list_payload["data"][0]["description"] == original_data["description"], after_list_payload + assert after_list_payload["data"][0]["embedding_model"] == original_data["embedding_model"], after_list_payload + assert after_list_payload["data"][0]["permission"] == original_data["permission"], after_list_payload + assert after_list_payload["data"][0]["chunk_method"] == original_data["chunk_method"], after_list_payload + assert after_list_payload["data"][0]["pagerank"] == original_data["pagerank"], after_list_payload + assert after_list_payload["data"][0]["parser_config"] == original_data["parser_config"], after_list_payload + + unsupported_field_payloads = [ + {"id": "id"}, + {"tenant_id": "e57c1966f99211efb41e9e45646e0111"}, + {"created_by": "created_by"}, + {"create_date": "Tue, 11 Mar 2025 13:37:23 GMT"}, + {"create_time": 1741671443322}, + {"update_date": "Tue, 11 Mar 2025 13:37:23 GMT"}, + {"update_time": 1741671443339}, + {"document_count": 1}, + {"chunk_count": 1}, + {"token_num": 1}, + {"status": "1"}, + {"unknown_field": "unknown_field"}, + ] + for payload_data in unsupported_field_payloads: + res = rest_client.put(f"/datasets/{dataset_id}", json=payload_data) + assert res.status_code == 200 + payload = res.json() + assert payload["code"] == 101, payload + assert "Extra inputs are not permitted" in payload["message"], 
payload + + @pytest.mark.p2 @pytest.mark.parametrize( "name, expected_fragment", @@ -1277,6 +1742,514 @@ def test_dataset_list_ordering_and_pagination(rest_client, clear_datasets): assert list_payload.get("total_datasets", 0) >= 3, list_payload +@pytest.mark.p2 +def test_dataset_delete_contract_matrix(rest_client, clear_datasets): + ids = [] + for i in range(3): + create_res = rest_client.post("/datasets", json={"name": f"dataset_delete_matrix_{i}"}) + assert create_res.status_code == 200 + create_payload = create_res.json() + assert create_payload["code"] == 0, create_payload + ids.append(create_payload["data"]["id"]) + + for scenario_name, client in (("missing token", RestClient(token=None)), ("invalid token", RestClient(token=INVALID_API_TOKEN))): + auth_res = client.delete("/datasets", json={"ids": [ids[0]]}) + assert auth_res.status_code == 401, (scenario_name, auth_res.text) + auth_payload = auth_res.json() + assert auth_payload["code"] == 401, (scenario_name, auth_payload) + assert auth_payload["message"] == "", (scenario_name, auth_payload) + + bad_content_type = "text/xml" + bad_content_type_res = rest_client.delete( + "/datasets", + data='{"ids": []}', + headers={"Content-Type": bad_content_type}, + ) + assert bad_content_type_res.status_code == 200 + bad_content_type_payload = bad_content_type_res.json() + assert bad_content_type_payload["code"] == 101, bad_content_type_payload + assert ( + f"Unsupported content type: Expected application/json, got {bad_content_type}" in bad_content_type_payload["message"] + ), bad_content_type_payload + + malformed_json_res = rest_client.delete("/datasets", data="a") + assert malformed_json_res.status_code == 200 + malformed_json_payload = malformed_json_res.json() + assert malformed_json_payload["code"] == 101, malformed_json_payload + assert "Malformed JSON syntax: Missing commas/brackets or invalid encoding" in malformed_json_payload["message"], malformed_json_payload + + invalid_payload_type_res = 
rest_client.delete("/datasets", data='"a"') + assert invalid_payload_type_res.status_code == 200 + invalid_payload_type_payload = invalid_payload_type_res.json() + assert invalid_payload_type_payload["code"] == 101, invalid_payload_type_payload + assert "Invalid request payload: expected object, got str" in invalid_payload_type_payload["message"], invalid_payload_type_payload + + unset_payload_res = rest_client.delete("/datasets") + assert unset_payload_res.status_code == 200 + unset_payload = unset_payload_res.json() + assert unset_payload["code"] == 101, unset_payload + assert "Malformed JSON syntax: Missing commas/brackets or invalid encoding" in unset_payload["message"], unset_payload + + single_delete_res = rest_client.delete("/datasets", json={"ids": [ids[0]]}) + assert single_delete_res.status_code == 200 + single_delete_payload = single_delete_res.json() + assert single_delete_payload["code"] == 0, single_delete_payload + + list_after_single = rest_client.get("/datasets") + assert list_after_single.status_code == 200 + list_after_single_payload = list_after_single.json() + assert list_after_single_payload["code"] == 0, list_after_single_payload + assert len(list_after_single_payload["data"]) == 2, list_after_single_payload + + ids_empty_res = rest_client.delete("/datasets", json={"ids": []}) + assert ids_empty_res.status_code == 200 + ids_empty_payload = ids_empty_res.json() + assert ids_empty_payload["code"] == 0, ids_empty_payload + + ids_none_res = rest_client.delete("/datasets", json={"ids": None}) + assert ids_none_res.status_code == 200 + ids_none_payload = ids_none_res.json() + assert ids_none_payload["code"] == 0, ids_none_payload + + id_not_uuid_res = rest_client.delete("/datasets", json={"ids": ["not_uuid"]}) + assert id_not_uuid_res.status_code == 200 + id_not_uuid_payload = id_not_uuid_res.json() + assert id_not_uuid_payload["code"] == 101, id_not_uuid_payload + assert "Invalid UUID1 format" in id_not_uuid_payload["message"], id_not_uuid_payload 
+ + id_not_uuid1_res = rest_client.delete("/datasets", json={"ids": [uuid.uuid4().hex]}) + assert id_not_uuid1_res.status_code == 200 + id_not_uuid1_payload = id_not_uuid1_res.json() + assert id_not_uuid1_payload["code"] == 101, id_not_uuid1_payload + assert "Invalid UUID1 format" in id_not_uuid1_payload["message"], id_not_uuid1_payload + + id_wrong_uuid_res = rest_client.delete("/datasets", json={"ids": ["d94a8dc02c9711f0930f7fbc369eab6d"]}) + assert id_wrong_uuid_res.status_code == 200 + id_wrong_uuid_payload = id_wrong_uuid_res.json() + assert id_wrong_uuid_payload["code"] == 102, id_wrong_uuid_payload + assert "lacks permission for dataset" in id_wrong_uuid_payload["message"], id_wrong_uuid_payload + + list_res = rest_client.get("/datasets") + assert list_res.status_code == 200 + list_payload = list_res.json() + assert list_payload["code"] == 0, list_payload + remaining_ids = [dataset["id"] for dataset in list_payload["data"]] + + for invalid_ids in ( + ["d94a8dc02c9711f0930f7fbc369eab6d"] + remaining_ids, + remaining_ids[:1] + ["d94a8dc02c9711f0930f7fbc369eab6d"] + remaining_ids[1:], + remaining_ids + ["d94a8dc02c9711f0930f7fbc369eab6d"], + ): + partial_invalid_res = rest_client.delete("/datasets", json={"ids": invalid_ids}) + assert partial_invalid_res.status_code == 200 + partial_invalid_payload = partial_invalid_res.json() + assert partial_invalid_payload["code"] == 102, partial_invalid_payload + assert "lacks permission for dataset" in partial_invalid_payload["message"], partial_invalid_payload + + duplicate_ids_res = rest_client.delete("/datasets", json={"ids": remaining_ids + remaining_ids}) + assert duplicate_ids_res.status_code == 200 + duplicate_ids_payload = duplicate_ids_res.json() + assert duplicate_ids_payload["code"] == 101, duplicate_ids_payload + assert "Duplicate ids:" in duplicate_ids_payload["message"], duplicate_ids_payload + + repeated_delete_payload = {"ids": remaining_ids} + first_delete_res = rest_client.delete("/datasets", 
json=repeated_delete_payload) + assert first_delete_res.status_code == 200 + first_delete_payload = first_delete_res.json() + assert first_delete_payload["code"] == 0, first_delete_payload + + second_delete_res = rest_client.delete("/datasets", json=repeated_delete_payload) + assert second_delete_res.status_code == 200 + second_delete_payload = second_delete_res.json() + assert second_delete_payload["code"] == 102, second_delete_payload + assert "lacks permission for dataset" in second_delete_payload["message"], second_delete_payload + + unsupported_field_res = rest_client.delete("/datasets", json={"unknown_field": "unknown_field"}) + assert unsupported_field_res.status_code == 200 + unsupported_field_payload = unsupported_field_res.json() + assert unsupported_field_payload["code"] == 101, unsupported_field_payload + assert "Extra inputs are not permitted" in unsupported_field_payload["message"], unsupported_field_payload + + +@pytest.mark.p3 +def test_dataset_delete_bulk_and_concurrent_contract(rest_client, clear_datasets): + bulk_ids = [] + for i in range(1000): + create_res = rest_client.post("/datasets", json={"name": f"dataset_delete_bulk_{i}"}) + assert create_res.status_code == 200 + create_payload = create_res.json() + assert create_payload["code"] == 0, create_payload + bulk_ids.append(create_payload["data"]["id"]) + + bulk_delete_res = rest_client.delete("/datasets", json={"ids": bulk_ids}) + assert bulk_delete_res.status_code == 200 + bulk_delete_payload = bulk_delete_res.json() + assert bulk_delete_payload["code"] == 0, bulk_delete_payload + + list_after_bulk_delete = rest_client.get("/datasets") + assert list_after_bulk_delete.status_code == 200 + list_after_bulk_delete_payload = list_after_bulk_delete.json() + assert list_after_bulk_delete_payload["code"] == 0, list_after_bulk_delete_payload + assert len(list_after_bulk_delete_payload["data"]) == 0, list_after_bulk_delete_payload + + concurrent_ids = [] + for i in range(100): + create_res = 
rest_client.post("/datasets", json={"name": f"dataset_delete_concurrent_{i}"}) + assert create_res.status_code == 200 + create_payload = create_res.json() + assert create_payload["code"] == 0, create_payload + concurrent_ids.append(create_payload["data"]["id"]) + + with ThreadPoolExecutor(max_workers=5) as executor: + futures = [executor.submit(rest_client.delete, "/datasets", json={"ids": [dataset_id]}) for dataset_id in concurrent_ids] + + responses = list(as_completed(futures)) + assert len(responses) == len(concurrent_ids), responses + for future in futures: + res = future.result() + assert res.status_code == 200, res.text + payload = res.json() + assert payload["code"] == 0, payload + + +@pytest.mark.p1 +def test_dataset_list_requires_auth_contract(rest_client, clear_datasets): + rest_client.post("/datasets", json={"name": "dataset_list_auth_contract"}) + + for scenario_name, client in (("missing token", RestClient(token=None)), ("invalid token", RestClient(token=INVALID_API_TOKEN))): + res = client.get("/datasets") + assert res.status_code == 401, (scenario_name, res.text) + payload = res.json() + assert payload["code"] == 401, (scenario_name, payload) + assert payload["message"] == "", (scenario_name, payload) + + +@pytest.mark.p2 +def test_dataset_list_query_contract_matrix(rest_client, clear_datasets): + dataset_ids = [] + for i in range(5): + create_res = rest_client.post("/datasets", json={"name": f"dataset_{i}"}) + assert create_res.status_code == 200 + create_payload = create_res.json() + assert create_payload["code"] == 0, create_payload + dataset_ids.append(create_payload["data"]["id"]) + + params_unset_res = rest_client.get("/datasets") + assert params_unset_res.status_code == 200 + params_unset_payload = params_unset_res.json() + assert params_unset_payload["code"] == 0, params_unset_payload + assert len(params_unset_payload["data"]) == 5, params_unset_payload + + params_empty_res = rest_client.get("/datasets", params={}) + assert 
params_empty_res.status_code == 200 + params_empty_payload = params_empty_res.json() + assert params_empty_payload["code"] == 0, params_empty_payload + assert len(params_empty_payload["data"]) == 5, params_empty_payload + + for params, expected_size in ( + ({"page": 2, "page_size": 2}, 2), + ({"page": 3, "page_size": 2}, 1), + ({"page": 4, "page_size": 2}, 0), + ({"page": "2", "page_size": 2}, 2), + ({"page": 1, "page_size": 10}, 5), + ): + page_res = rest_client.get("/datasets", params=params) + assert page_res.status_code == 200 + page_payload = page_res.json() + assert page_payload["code"] == 0, page_payload + assert len(page_payload["data"]) == expected_size, page_payload + + for params, expected_fragment in ( + ({"page": 0}, "Input should be greater than or equal to 1"), + ({"page": "a"}, "Input should be a valid integer, unable to parse string as an integer"), + ): + page_invalid_res = rest_client.get("/datasets", params=params) + assert page_invalid_res.status_code == 200 + page_invalid_payload = page_invalid_res.json() + assert page_invalid_payload["code"] == 101, page_invalid_payload + assert expected_fragment in page_invalid_payload["message"], page_invalid_payload + + page_none_res = rest_client.get("/datasets", params={"page": None}) + assert page_none_res.status_code == 200 + page_none_payload = page_none_res.json() + assert page_none_payload["code"] == 0, page_none_payload + assert len(page_none_payload["data"]) == 5, page_none_payload + + for params, expected_size in ( + ({"page_size": 1}, 1), + ({"page_size": 3}, 3), + ({"page_size": 5}, 5), + ({"page_size": 6}, 5), + ({"page_size": "1"}, 1), + ): + page_size_res = rest_client.get("/datasets", params=params) + assert page_size_res.status_code == 200 + page_size_payload = page_size_res.json() + assert page_size_payload["code"] == 0, page_size_payload + assert len(page_size_payload["data"]) == expected_size, page_size_payload + + for params, expected_fragment in ( + ({"page_size": 0}, "Input should be 
greater than or equal to 1"), + ({"page_size": "a"}, "Input should be a valid integer, unable to parse string as an integer"), + ): + page_size_invalid_res = rest_client.get("/datasets", params=params) + assert page_size_invalid_res.status_code == 200 + page_size_invalid_payload = page_size_invalid_res.json() + assert page_size_invalid_payload["code"] == 101, page_size_invalid_payload + assert expected_fragment in page_size_invalid_payload["message"], page_size_invalid_payload + + page_size_none_res = rest_client.get("/datasets", params={"page_size": None}) + assert page_size_none_res.status_code == 200 + page_size_none_payload = page_size_none_res.json() + assert page_size_none_payload["code"] == 0, page_size_none_payload + assert len(page_size_none_payload["data"]) == 5, page_size_none_payload + + for params in ({"orderby": "create_time"}, {"orderby": "update_time"}): + orderby_res = rest_client.get("/datasets", params=params) + assert orderby_res.status_code == 200 + orderby_payload = orderby_res.json() + assert orderby_payload["code"] == 0, orderby_payload + + for params in ( + {"orderby": ""}, + {"orderby": "unknown"}, + {"orderby": "CREATE_TIME"}, + {"orderby": "UPDATE_TIME"}, + {"orderby": " create_time "}, + ): + orderby_invalid_res = rest_client.get("/datasets", params=params) + assert orderby_invalid_res.status_code == 200 + orderby_invalid_payload = orderby_invalid_res.json() + assert orderby_invalid_payload["code"] == 101, orderby_invalid_payload + assert "Input should be 'create_time' or 'update_time'" in orderby_invalid_payload["message"], orderby_invalid_payload + + orderby_none_res = rest_client.get("/datasets", params={"orderby": None}) + assert orderby_none_res.status_code == 200 + orderby_none_payload = orderby_none_res.json() + assert orderby_none_payload["code"] == 0, orderby_none_payload + + for params in ( + {"desc": True}, + {"desc": False}, + {"desc": "true"}, + {"desc": "false"}, + {"desc": 1}, + {"desc": 0}, + {"desc": "yes"}, + {"desc": 
"no"}, + {"desc": "y"}, + {"desc": "n"}, + ): + desc_res = rest_client.get("/datasets", params=params) + assert desc_res.status_code == 200 + desc_payload = desc_res.json() + assert desc_payload["code"] == 0, desc_payload + + for params in ({"desc": 3.14}, {"desc": "unknown"}): + desc_invalid_res = rest_client.get("/datasets", params=params) + assert desc_invalid_res.status_code == 200 + desc_invalid_payload = desc_invalid_res.json() + assert desc_invalid_payload["code"] == 101, desc_invalid_payload + assert "Input should be a valid boolean, unable to interpret input" in desc_invalid_payload["message"], desc_invalid_payload + + desc_none_res = rest_client.get("/datasets", params={"desc": None}) + assert desc_none_res.status_code == 200 + desc_none_payload = desc_none_res.json() + assert desc_none_payload["code"] == 0, desc_none_payload + + name_res = rest_client.get("/datasets", params={"name": "dataset_1"}) + assert name_res.status_code == 200 + name_payload = name_res.json() + assert name_payload["code"] == 0, name_payload + assert len(name_payload["data"]) == 1, name_payload + assert name_payload["data"][0]["name"] == "dataset_1", name_payload + + name_wrong_res = rest_client.get("/datasets", params={"name": "wrong name"}) + assert name_wrong_res.status_code == 200 + name_wrong_payload = name_wrong_res.json() + assert name_wrong_payload["code"] == 102, name_wrong_payload + assert "lacks permission for dataset" in name_wrong_payload["message"], name_wrong_payload + + name_empty_res = rest_client.get("/datasets", params={"name": ""}) + assert name_empty_res.status_code == 200 + name_empty_payload = name_empty_res.json() + assert name_empty_payload["code"] == 0, name_empty_payload + assert len(name_empty_payload["data"]) == 5, name_empty_payload + + name_none_res = rest_client.get("/datasets", params={"name": None}) + assert name_none_res.status_code == 200 + name_none_payload = name_none_res.json() + assert name_none_payload["code"] == 0, name_none_payload + 
assert len(name_none_payload["data"]) == 5, name_none_payload + + id_res = rest_client.get("/datasets", params={"id": dataset_ids[0]}) + assert id_res.status_code == 200 + id_payload = id_res.json() + assert id_payload["code"] == 0, id_payload + assert len(id_payload["data"]) == 1, id_payload + assert id_payload["data"][0]["id"] == dataset_ids[0], id_payload + + id_not_uuid_res = rest_client.get("/datasets", params={"id": "not_uuid"}) + assert id_not_uuid_res.status_code == 200 + id_not_uuid_payload = id_not_uuid_res.json() + assert id_not_uuid_payload["code"] == 101, id_not_uuid_payload + assert "Invalid UUID1 format" in id_not_uuid_payload["message"], id_not_uuid_payload + + id_not_uuid1_res = rest_client.get("/datasets", params={"id": uuid.uuid4().hex}) + assert id_not_uuid1_res.status_code == 200 + id_not_uuid1_payload = id_not_uuid1_res.json() + assert id_not_uuid1_payload["code"] == 101, id_not_uuid1_payload + assert "Invalid UUID1 format" in id_not_uuid1_payload["message"], id_not_uuid1_payload + + id_wrong_uuid_res = rest_client.get("/datasets", params={"id": "d94a8dc02c9711f0930f7fbc369eab6d"}) + assert id_wrong_uuid_res.status_code == 200 + id_wrong_uuid_payload = id_wrong_uuid_res.json() + assert id_wrong_uuid_payload["code"] == 102, id_wrong_uuid_payload + assert "lacks permission for dataset" in id_wrong_uuid_payload["message"], id_wrong_uuid_payload + + id_empty_res = rest_client.get("/datasets", params={"id": ""}) + assert id_empty_res.status_code == 200 + id_empty_payload = id_empty_res.json() + assert id_empty_payload["code"] == 101, id_empty_payload + assert "Invalid UUID1 format" in id_empty_payload["message"], id_empty_payload + + id_none_res = rest_client.get("/datasets", params={"id": None}) + assert id_none_res.status_code == 200 + id_none_payload = id_none_res.json() + assert id_none_payload["code"] == 0, id_none_payload + assert len(id_none_payload["data"]) == 5, id_none_payload + + name_id_match_res = rest_client.get("/datasets", 
params={"id": dataset_ids[0], "name": "dataset_0"}) + assert name_id_match_res.status_code == 200 + name_id_match_payload = name_id_match_res.json() + assert name_id_match_payload["code"] == 0, name_id_match_payload + assert len(name_id_match_payload["data"]) == 1, name_id_match_payload + + name_id_mismatch_res = rest_client.get("/datasets", params={"id": dataset_ids[0], "name": "dataset_1"}) + assert name_id_mismatch_res.status_code == 200 + name_id_mismatch_payload = name_id_mismatch_res.json() + assert name_id_mismatch_payload["code"] == 0, name_id_mismatch_payload + assert len(name_id_mismatch_payload["data"]) == 0, name_id_mismatch_payload + + for dataset_id, name in ((dataset_ids[0], "wrong_name"), (uuid.uuid1().hex, "dataset_0")): + name_id_wrong_res = rest_client.get("/datasets", params={"id": dataset_id, "name": name}) + assert name_id_wrong_res.status_code == 200 + name_id_wrong_payload = name_id_wrong_res.json() + assert name_id_wrong_payload["code"] == 102, name_id_wrong_payload + assert "lacks permission for dataset" in name_id_wrong_payload["message"], name_id_wrong_payload + + unsupported_field_res = rest_client.get("/datasets", params={"unknown_field": "unknown_field"}) + assert unsupported_field_res.status_code == 200 + unsupported_field_payload = unsupported_field_res.json() + assert unsupported_field_payload["code"] == 101, unsupported_field_payload + assert "Extra inputs are not permitted" in unsupported_field_payload["message"], unsupported_field_payload + + +@pytest.mark.p3 +def test_dataset_list_concurrent_contract(rest_client, clear_datasets): + for i in range(5): + create_res = rest_client.post("/datasets", json={"name": f"dataset_list_concurrent_{i}"}) + assert create_res.status_code == 200 + create_payload = create_res.json() + assert create_payload["code"] == 0, create_payload + + with ThreadPoolExecutor(max_workers=5) as executor: + futures = [executor.submit(rest_client.get, "/datasets") for _ in range(100)] + responses = 
list(as_completed(futures)) + assert len(responses) == 100, responses + for future in futures: + res = future.result() + assert res.status_code == 200, res.text + payload = res.json() + assert payload["code"] == 0, payload + + +@pytest.mark.p2 +def test_dataset_get_contract(rest_client, create_dataset): + dataset_id = create_dataset("dataset_get_success") + + success_res = rest_client.get(f"/datasets/{dataset_id}") + assert success_res.status_code == 200 + success_payload = success_res.json() + assert success_payload["code"] == 0, success_payload + assert success_payload["data"]["id"] == dataset_id, success_payload + + invalid_id_res = rest_client.get("/datasets/invalid_dataset_id") + assert invalid_id_res.status_code == 200 + invalid_id_payload = invalid_id_res.json() + assert invalid_id_payload["code"] != 0, invalid_id_payload + + unauthorized_res = RestClient(token=INVALID_API_TOKEN).get(f"/datasets/{dataset_id}") + assert unauthorized_res.status_code == 401 + unauthorized_payload = unauthorized_res.json() + assert unauthorized_payload["code"] == 401, unauthorized_payload + + nonexistent_res = rest_client.get(f"/datasets/{'0' * 32}") + assert nonexistent_res.status_code == 200 + nonexistent_payload = nonexistent_res.json() + assert nonexistent_payload["code"] != 0, nonexistent_payload + + +@pytest.mark.p2 +def test_dataset_metadata_summary_contract(rest_client, create_dataset, tmp_path): + dataset_id = create_dataset("dataset_metadata_summary") + document_ids = [] + for i in range(3): + fp = create_txt_file(tmp_path / f"metadata_summary_{i}.txt") + with fp.open("rb") as file_obj: + upload_res = rest_client.post( + f"/datasets/{dataset_id}/documents", + files=[("file", (fp.name, file_obj))], + ) + assert upload_res.status_code == 200 + upload_payload = upload_res.json() + assert upload_payload["code"] == 0, upload_payload + document_ids.append(upload_payload["data"][0]["id"]) + + payloads = [ + {"tags": ["foo", "bar"], "author": "alice"}, + {"tags": ["foo"], 
"author": "bob"}, + {"tags": ["bar", "baz"], "author": ""}, + ] + for document_id, meta_fields in zip(document_ids, payloads): + update_res = rest_client.patch( + f"/datasets/{dataset_id}/documents/{document_id}", + json={"meta_fields": meta_fields}, + ) + assert update_res.status_code == 200 + update_payload = update_res.json() + assert update_payload["code"] == 0, update_payload + + success_res = rest_client.get(f"/datasets/{dataset_id}/metadata/summary") + assert success_res.status_code == 200 + success_payload = success_res.json() + assert success_payload["code"] == 0, success_payload + assert "summary" in success_payload["data"], success_payload + + summary = success_payload["data"]["summary"] + counts = {} + for key, field_data in summary.items(): + counts[key] = {str(k): v for k, v in field_data["values"]} + assert counts["tags"]["foo"] == 2, counts + assert counts["tags"]["bar"] == 2, counts + assert counts["tags"]["baz"] == 1, counts + assert counts["author"]["alice"] == 1, counts + assert counts["author"]["bob"] == 1, counts + assert "None" not in counts["author"], counts + + invalid_dataset_id = f"invalid_{dataset_id}" + invalid_dataset_res = rest_client.get(f"/datasets/{invalid_dataset_id}/metadata/summary") + assert invalid_dataset_res.status_code == 200 + invalid_dataset_payload = invalid_dataset_res.json() + assert invalid_dataset_payload["code"] == 102, invalid_dataset_payload + assert invalid_dataset_payload["message"] == f"You don't own the dataset {invalid_dataset_id}. ", invalid_dataset_payload + + nonexistent_res = rest_client.get(f"/datasets/{'0' * 32}/metadata/summary") + assert nonexistent_res.status_code == 200 + nonexistent_payload = nonexistent_res.json() + assert nonexistent_payload["code"] == 102, nonexistent_payload + + @pytest.mark.p2 def test_dataset_search_endpoint(rest_client, ensure_parsed_document): dataset_id, _ = ensure_parsed_document()