From 0940f1a13593e2d8e383cec2f0cb9a6ff324ee7d Mon Sep 17 00:00:00 2001 From: Idriss Sbaaoui <112825897+6ba3i@users.noreply.github.com> Date: Thu, 28 May 2026 11:03:12 +0800 Subject: [PATCH] Feat: add new tests and tescases for restful api suite (#15299) ### What problem does this PR solve? extend restful api suite ### Type of change - [x] New Feature (non-breaking change which adds functionality) - [x] Other (please describe): test --- test/testcases/restful_api/test_datasets.py | 78 +++ .../restful_api/test_document_raw_routes.py | 22 + test/testcases/restful_api/test_documents.py | 505 +++++++++++++++++- test/testcases/restful_api/test_retrieval.py | 39 ++ 4 files changed, 643 insertions(+), 1 deletion(-) diff --git a/test/testcases/restful_api/test_datasets.py b/test/testcases/restful_api/test_datasets.py index 021fc998b4..fb4fe73c24 100644 --- a/test/testcases/restful_api/test_datasets.py +++ b/test/testcases/restful_api/test_datasets.py @@ -2211,6 +2211,84 @@ def test_dataset_get_contract(rest_client, create_dataset): @pytest.mark.p2 +def test_dataset_metadata_config_get_and_update_contract(rest_client, create_dataset): + dataset_id = create_dataset("dataset_metadata_config_contract") + + success_res = rest_client.get(f"/datasets/{dataset_id}/metadata/config") + assert success_res.status_code == 200 + success_payload = success_res.json() + assert success_payload["code"] == 0, success_payload + assert success_payload["data"] == {"metadata": [], "built_in_metadata": []}, success_payload + + for scenario_name, client in (("missing token", RestClient(token=None)), ("invalid token", RestClient(token=INVALID_API_TOKEN))): + get_res = client.get(f"/datasets/{dataset_id}/metadata/config") + assert get_res.status_code == 401, (scenario_name, get_res.text) + get_payload = get_res.json() + assert get_payload["code"] == 401, (scenario_name, get_payload) + assert get_payload["message"] == "", (scenario_name, get_payload) + + put_res = client.put( + 
f"/datasets/{dataset_id}/metadata/config", + json={"metadata": [], "built_in_metadata": []}, + ) + assert put_res.status_code == 401, (scenario_name, put_res.text) + put_payload = put_res.json() + assert put_payload["code"] == 401, (scenario_name, put_payload) + assert put_payload["message"] == "", (scenario_name, put_payload) + + invalid_dataset_res = rest_client.get("/datasets/invalid_dataset_id/metadata/config") + assert invalid_dataset_res.status_code == 200 + invalid_dataset_payload = invalid_dataset_res.json() + assert invalid_dataset_payload["code"] == 102, invalid_dataset_payload + assert "lacks permission for dataset 'invalid_dataset_id'" in invalid_dataset_payload["message"], invalid_dataset_payload + + update_payload = { + "metadata": [ + {"key": "author", "type": "string", "description": "Author name"}, + {"key": "tags", "type": "list", "description": "Tag list", "enum": ["foo", "bar"]}, + ], + "built_in_metadata": [ + {"key": "size", "type": "number", "description": "File size"}, + ], + } + normalized_update_payload = { + "metadata": [ + {"key": "author", "type": "string", "description": "Author name", "enum": None}, + {"key": "tags", "type": "list", "description": "Tag list", "enum": ["foo", "bar"]}, + ], + "built_in_metadata": [ + {"key": "size", "type": "number", "description": "File size", "enum": None}, + ], + } + update_res = rest_client.put(f"/datasets/{dataset_id}/metadata/config", json=update_payload) + assert update_res.status_code == 200 + update_body = update_res.json() + assert update_body["code"] == 0, update_body + assert update_body["data"] == normalized_update_payload, update_body + + refetch_res = rest_client.get(f"/datasets/{dataset_id}/metadata/config") + assert refetch_res.status_code == 200 + refetch_payload = refetch_res.json() + assert refetch_payload["code"] == 0, refetch_payload + assert refetch_payload["data"] == normalized_update_payload, refetch_payload + + missing_payload_res = 
rest_client.put(f"/datasets/{dataset_id}/metadata/config", json={}) + assert missing_payload_res.status_code == 200 + missing_payload = missing_payload_res.json() + assert missing_payload["code"] == 0, missing_payload + assert missing_payload["data"] == {"metadata": [], "built_in_metadata": []}, missing_payload + + invalid_update_dataset_res = rest_client.put( + "/datasets/invalid_dataset_id/metadata/config", + json={"metadata": [], "built_in_metadata": []}, + ) + assert invalid_update_dataset_res.status_code == 200 + invalid_update_dataset_payload = invalid_update_dataset_res.json() + assert invalid_update_dataset_payload["code"] == 102, invalid_update_dataset_payload + assert "lacks permission for dataset 'invalid_dataset_id'" in invalid_update_dataset_payload["message"], invalid_update_dataset_payload + + +@pytest.mark.p2 def test_dataset_metadata_summary_contract(rest_client, create_dataset, tmp_path): dataset_id = create_dataset("dataset_metadata_summary") document_ids = [] diff --git a/test/testcases/restful_api/test_document_raw_routes.py b/test/testcases/restful_api/test_document_raw_routes.py index 07f65230df..d1ff2520f1 100644 --- a/test/testcases/restful_api/test_document_raw_routes.py +++ b/test/testcases/restful_api/test_document_raw_routes.py @@ -15,6 +15,8 @@ # import pytest +from test.testcases.configs import INVALID_API_TOKEN +from test.testcases.restful_api.helpers.client import RestClient @pytest.mark.p2 @@ -26,6 +28,26 @@ def test_document_image_invalid_id_contract(rest_client_noauth): assert payload["message"] == "Image not found.", payload +@pytest.mark.p2 +def test_document_download_by_id_requires_auth(create_document): + _dataset_id, document_id = create_document("document_raw_download_auth.txt") + for scenario_name, client in (("missing token", RestClient(token=None)), ("invalid token", RestClient(token=INVALID_API_TOKEN))): + res = client.get(f"/documents/{document_id}") + assert res.status_code == 401, (scenario_name, res.text) + payload = res.json() + 
assert payload["code"] == 401, (scenario_name, payload) + assert payload["message"] == "", (scenario_name, payload) + + +@pytest.mark.p2 +def test_document_download_by_id_invalid_id_contract(rest_client): + res = rest_client.get("/documents/invalid_document_id") + assert res.status_code == 200 + payload = res.json() + assert payload["code"] == 102, payload + assert payload["message"] == "The dataset not own the document invalid_document_id.", payload + + @pytest.mark.p2 def test_document_artifact_requires_auth(rest_client_noauth): res = rest_client_noauth.get("/documents/artifact/not-an-artifact.txt") diff --git a/test/testcases/restful_api/test_documents.py b/test/testcases/restful_api/test_documents.py index d0b7718cf8..05a1743d0c 100644 --- a/test/testcases/restful_api/test_documents.py +++ b/test/testcases/restful_api/test_documents.py @@ -24,7 +24,7 @@ from openpyxl import Workbook import pytest import requests from requests_toolbelt import MultipartEncoder -from test.testcases.configs import DOCUMENT_NAME_LIMIT, HOST_ADDRESS, INVALID_API_TOKEN, VERSION +from test.testcases.configs import DEFAULT_PARSER_CONFIG, DOCUMENT_NAME_LIMIT, HOST_ADDRESS, INVALID_API_TOKEN, INVALID_ID_32, VERSION from test.testcases.restful_api.helpers.client import RestClient from test.testcases.utils import compare_by_hash from test.testcases.utils.file_utils import ( @@ -81,6 +81,19 @@ def _seed_documents(rest_client, create_dataset, tmp_path, count=5): return dataset_id, payload["data"] +def _seed_documents_for_update(rest_client, create_dataset, tmp_path): + dataset_id = create_dataset("dataset_update_contract") + file_paths = [ + create_txt_file(tmp_path / "ragflow_test_upload_0.txt"), + create_txt_file(tmp_path / "ragflow_test_upload_1.txt"), + ] + res = _upload_files(rest_client, dataset_id, file_paths) + assert res.status_code == 200 + payload = res.json() + assert payload["code"] == 0, payload + return dataset_id, payload["data"] + + def _assert_docs_sorted(docs, key, 
reverse): values = [doc.get(key) for doc in docs] assert values == sorted(values, reverse=reverse) @@ -509,6 +522,315 @@ def test_documents_update_patch_and_delete(rest_client, create_document): assert all(doc["id"] != document_id for doc in list_payload["data"]["docs"]), list_payload +@pytest.mark.p2 +def test_documents_update_requires_auth(create_document): + dataset_id, document_id = create_document("update_auth_target.txt") + for scenario_name, client in (("missing token", RestClient(token=None)), ("invalid token", RestClient(token=INVALID_API_TOKEN))): + res = client.patch( + f"/datasets/{dataset_id}/documents/{document_id}", + json={"name": "updated_auth_target.txt"}, + ) + assert res.status_code == 401, (scenario_name, res.text) + body = res.json() + assert body["code"] == 401, (scenario_name, body) + assert body["message"] == "", (scenario_name, body) + + +@pytest.mark.p2 +def test_documents_update_name_contract(rest_client, create_dataset, tmp_path): + dataset_id, uploaded_docs = _seed_documents_for_update(rest_client, create_dataset, tmp_path) + first_document_id = uploaded_docs[0]["id"] + + long_name = f"{'a' * (DOCUMENT_NAME_LIMIT - 4)}.txt" + name_cases = [ + ("new_name.txt", 0, ""), + (long_name, 0, ""), + (0, 102, "Field: - Message: - Value: <0>"), + (None, 100, "AttributeError('NoneType' object has no attribute 'encode')"), + ("", 101, "The extension of file can't be changed"), + ("ragflow_test_upload_0", 101, "The extension of file can't be changed"), + ("ragflow_test_upload_1.txt", 102, "Duplicated document name in the same dataset."), + ("RAGFLOW_TEST_UPLOAD_1.TXT", 0, ""), + ] + for name, expected_code, expected_message in name_cases: + res = rest_client.patch( + f"/datasets/{dataset_id}/documents/{first_document_id}", + json={"name": name}, + ) + assert res.status_code == 200, (name, res.text) + body = res.json() + assert body["code"] == expected_code, (name, body) + if expected_code == 0: + assert body["data"]["name"] == name, (name, body) + 
list_res = rest_client.get(f"/datasets/{dataset_id}/documents", params={"id": first_document_id}) + assert list_res.status_code == 200, (name, list_res.text) + list_body = list_res.json() + assert list_body["code"] == 0, (name, list_body) + assert list_body["data"]["docs"][0]["name"] == name, (name, list_body) + else: + assert body["message"] == expected_message, (name, body) + + +@pytest.mark.p2 +def test_documents_update_invalid_dataset_and_document_contract(rest_client, create_dataset, tmp_path): + dataset_id, uploaded_docs = _seed_documents_for_update(rest_client, create_dataset, tmp_path) + first_document_id = uploaded_docs[0]["id"] + + invalid_dataset_res = rest_client.patch( + f"/datasets/{INVALID_ID_32}/documents/{first_document_id}", + json={"name": "new_name.txt"}, + ) + assert invalid_dataset_res.status_code == 200 + invalid_dataset_body = invalid_dataset_res.json() + assert invalid_dataset_body["code"] == 102, invalid_dataset_body + assert "You don't own the dataset." in invalid_dataset_body["message"], invalid_dataset_body + + invalid_document_res = rest_client.patch( + f"/datasets/{dataset_id}/documents/{INVALID_ID_32}", + json={"name": "new_name.txt"}, + ) + assert invalid_document_res.status_code == 200 + invalid_document_body = invalid_document_res.json() + assert invalid_document_body["code"] == 102, invalid_document_body + assert invalid_document_body["message"] == "The dataset doesn't own the document.", invalid_document_body + + +@pytest.mark.p2 +def test_documents_update_chunk_method_contract(rest_client, create_dataset, tmp_path): + dataset_id, uploaded_docs = _seed_documents_for_update(rest_client, create_dataset, tmp_path) + first_document_id = uploaded_docs[0]["id"] + + chunk_method_cases = [ + ("naive", 0, ""), + ("manual", 0, ""), + ("qa", 0, ""), + ("table", 0, ""), + ("paper", 0, ""), + ("book", 0, ""), + ("laws", 0, ""), + ("presentation", 0, ""), + ("picture", 0, ""), + ("one", 0, ""), + ("knowledge_graph", 0, ""), + ("email", 0, 
""), + ("tag", 0, ""), + ("", 102, "`chunk_method` (empty string) is not valid"), + ( + "other_chunk_method", + 102, + "Field: - Message: <`chunk_method` other_chunk_method doesn't exist> - Value: ", + ), + ] + for chunk_method, expected_code, expected_message in chunk_method_cases: + res = rest_client.patch( + f"/datasets/{dataset_id}/documents/{first_document_id}", + json={"chunk_method": chunk_method}, + ) + assert res.status_code == 200, (chunk_method, res.text) + body = res.json() + assert body["code"] == expected_code, (chunk_method, body) + if expected_code == 0: + list_res = rest_client.get(f"/datasets/{dataset_id}/documents", params={"id": first_document_id}) + assert list_res.status_code == 200, (chunk_method, list_res.text) + list_body = list_res.json() + assert list_body["code"] == 0, (chunk_method, list_body) + assert list_body["data"]["docs"][0]["chunk_method"] == chunk_method, (chunk_method, list_body) + else: + assert body["message"] == expected_message, (chunk_method, body) + + +@pytest.mark.p2 +def test_documents_update_meta_fields_contract(rest_client, create_dataset, tmp_path): + dataset_id, uploaded_docs = _seed_documents_for_update(rest_client, create_dataset, tmp_path) + first_document_id = uploaded_docs[0]["id"] + + meta_fields_cases = [ + ({"test": "test"}, 0, ""), + ({"author": "alice", "year": 2024}, 0, ""), + ({"tags": ["tag1", "tag2"]}, 0, ""), + ({"count": 42, "price": 19.99}, 0, ""), + ("test", 102, "Field: - Message: - Value: "), + ([], 102, "Field: - Message: - Value: <[]>"), + ({"tags": [{"x": {"a": "b"}}]}, 102, "Field: - Message: - Value: <{'tags': [{'x': {'a': 'b'}}]}>"), + ({"tags": [{"x": 1}]}, 102, "Field: - Message: - Value: <{'tags': [{'x': 1}]}>"), + ({"obj": {"x": 1}}, 102, "Field: - Message: - Value: <{'obj': {'x': 1}}>"), + ({"tags": [2, 1]}, 0, ""), + ] + for meta_fields, expected_code, expected_message in meta_fields_cases: + res = rest_client.patch( + f"/datasets/{dataset_id}/documents/{first_document_id}", + 
json={"meta_fields": meta_fields}, + ) + assert res.status_code == 200, (meta_fields, res.text) + body = res.json() + assert body["code"] == expected_code, (meta_fields, body) + if expected_code == 0: + list_res = rest_client.get(f"/datasets/{dataset_id}/documents", params={"id": first_document_id}) + assert list_res.status_code == 200, (meta_fields, list_res.text) + list_body = list_res.json() + assert list_body["code"] == 0, (meta_fields, list_body) + assert list_body["data"]["docs"][0]["meta_fields"] == meta_fields, (meta_fields, list_body) + else: + assert expected_message in body["message"] or body["message"] == expected_message, (meta_fields, body) + + invalid_meta_doc_res = rest_client.patch( + f"/datasets/{dataset_id}/documents/invalid_doc_id_12345678901234567890", + json={"meta_fields": {"author": "alice"}}, + ) + assert invalid_meta_doc_res.status_code == 200 + invalid_meta_doc_body = invalid_meta_doc_res.json() + assert invalid_meta_doc_body["code"] == 102, invalid_meta_doc_body + assert "The dataset doesn't own the document." 
in invalid_meta_doc_body["message"], invalid_meta_doc_body + + +@pytest.mark.p2 +def test_documents_update_invalid_field_and_guard_contract(rest_client, create_dataset, tmp_path): + dataset_id, uploaded_docs = _seed_documents_for_update(rest_client, create_dataset, tmp_path) + first_document_id = uploaded_docs[0]["id"] + + strict_guard_cases = [ + ({"chunk_count": 1}, 102, "Can't change `chunk_count`."), + ({"token_count": 1}, 102, "Can't change `token_count`."), + ({"chunk_count": 100}, 102, "Can't change `chunk_count`."), + ({"token_count": 100}, 102, "Can't change `token_count`."), + ({"progress": 2.0}, 102, "Field: - Message: - Value: <2.0>"), + ({"progress": 1.0}, 102, "Can't change `progress`."), + ({"meta_fields": []}, 102, "Field: - Message: - Value: <[]>"), + ] + for payload, expected_code, expected_message in strict_guard_cases: + res = rest_client.patch( + f"/datasets/{dataset_id}/documents/{first_document_id}", + json=payload, + ) + assert res.status_code == 200, (payload, res.text) + body = res.json() + assert body["code"] == expected_code, (payload, body) + assert expected_message in body["message"] or body["message"] == expected_message, (payload, body) + + legacy_invalid_field_cases = [ + {"create_date": "Fri, 14 Mar 2025 16:53:42 GMT"}, + {"create_time": 1}, + {"created_by": "ragflow_test"}, + {"dataset_id": "ragflow_test"}, + {"id": "ragflow_test"}, + {"location": "ragflow_test.txt"}, + {"process_begin_at": 1}, + {"process_duration": 1.0}, + {"progress_msg": "ragflow_test"}, + {"run": "ragflow_test"}, + {"size": 1}, + {"source_type": "ragflow_test"}, + {"thumbnail": "ragflow_test"}, + {"type": "ragflow_test"}, + {"update_date": "Fri, 14 Mar 2025 16:33:17 GMT"}, + {"update_time": 1}, + ] + for payload in legacy_invalid_field_cases: + res = rest_client.patch( + f"/datasets/{dataset_id}/documents/{first_document_id}", + json=payload, + ) + assert res.status_code == 200, (payload, res.text) + body = res.json() + assert body["code"] in (0, 102), 
(payload, body) + if body["code"] == 102: + assert "invalid" in body["message"].lower(), (payload, body) + else: + assert "data" in body, (payload, body) + + +@pytest.mark.p2 +def test_documents_update_parser_config_contract(rest_client, create_dataset, tmp_path): + dataset_id, uploaded_docs = _seed_documents_for_update(rest_client, create_dataset, tmp_path) + first_document_id = uploaded_docs[0]["id"] + default_parser_config_for_test = { + "layout_recognize": "DeepDOC", + "chunk_token_num": 512, + "delimiter": "\n", + "auto_keywords": 0, + "auto_questions": 0, + "html4excel": False, + "topn_tags": 3, + "raptor": { + "use_raptor": True, + "prompt": "Please summarize the following paragraphs. Be careful with the numbers, do not make things up. Paragraphs as following:\n {cluster_content}\nThe above is the content you need to summarize.", + "max_token": 256, + "threshold": 0.1, + "max_cluster": 64, + "random_seed": 0, + }, + "graphrag": { + "use_graphrag": True, + "entity_types": ["organization", "person", "geo", "event", "category"], + "method": "light", + "batch_chunk_token_size": 4096, + }, + } + + parser_cases = [ + ({}, 0, ""), + (default_parser_config_for_test, 0, ""), + ({"chunk_token_num": -1}, 102, "Field: - Message: - Value: <-1>"), + ({"chunk_token_num": 0}, 102, "Field: - Message: - Value: <0>"), + ({"chunk_token_num": 100000000}, 102, "Field: - Message: - Value: <100000000>"), + ({"chunk_token_num": 3.14}, 102, "Field: - Message: - Value: <3.14>"), + ({"chunk_token_num": "1024"}, 102, "Field: - Message: - Value: <1024>"), + ({"layout_recognize": "DeepDOC"}, 0, ""), + ({"layout_recognize": "Naive"}, 0, ""), + ({"html4excel": True}, 0, ""), + ({"html4excel": False}, 0, ""), + ({"html4excel": 1}, 102, "Field: - Message: - Value: <1>"), + ({"delimiter": ""}, 102, "Field: - Message: - Value: <>"), + ({"delimiter": "`##`"}, 0, ""), + ({"delimiter": 1}, 102, "Field: - Message: - Value: <1>"), + ({"task_page_size": -1}, 102, "Field: - Message: - Value: <-1>"), + 
({"task_page_size": 0}, 102, "Field: - Message: - Value: <0>"), + ({"task_page_size": 100000000}, 0, ""), + ({"task_page_size": 3.14}, 102, "Field: - Message: - Value: <3.14>"), + ({"task_page_size": "1024"}, 102, "Field: - Message: - Value: <1024>"), + ({"raptor": {"use_raptor": {"a": "b"}}}, 102, "Field: - Message: - Value: <{'a': 'b'}>"), + ({"raptor": {"use_raptor": False}}, 0, ""), + ({"invalid_key": "invalid_value"}, 102, "Field: - Message: - Value: "), + ({"auto_keywords": -1}, 102, "Field: - Message: - Value: <-1>"), + ({"auto_keywords": 32}, 0, ""), + ({"auto_keywords": "1024"}, 102, "Field: - Message: - Value: <1024>"), + ({"auto_keywords": 3.14}, 102, "Field: - Message: - Value: <3.14>"), + ({"auto_questions": -1}, 102, "Field: - Message: - Value: <-1>"), + ({"auto_questions": 10}, 0, ""), + ({"auto_questions": 3.14}, 102, "Field: - Message: - Value: <3.14>"), + ({"auto_questions": "1024"}, 102, "Field: - Message: - Value: <1024>"), + ({"topn_tags": -1}, 102, "Field: - Message: - Value: <-1>"), + ({"topn_tags": 10}, 0, ""), + ({"topn_tags": 3.14}, 102, "Field: - Message: - Value: <3.14>"), + ({"topn_tags": "1024"}, 102, "Field: - Message: - Value: <1024>"), + ] + for parser_config, expected_code, expected_message in parser_cases: + res = rest_client.patch( + f"/datasets/{dataset_id}/documents/{first_document_id}", + json={"chunk_method": "naive", "parser_config": parser_config}, + ) + assert res.status_code == 200, (parser_config, res.text) + body = res.json() + assert body["code"] == expected_code, (parser_config, body) + if expected_code == 0: + list_res = rest_client.get(f"/datasets/{dataset_id}/documents", params={"id": first_document_id}) + assert list_res.status_code == 200, (parser_config, list_res.text) + list_body = list_res.json() + assert list_body["code"] == 0, (parser_config, list_body) + doc_parser_config = list_body["data"]["docs"][0]["parser_config"] + if parser_config == {}: + assert doc_parser_config == DEFAULT_PARSER_CONFIG, 
(parser_config, list_body) + else: + for key, value in parser_config.items(): + if isinstance(value, dict): + for sub_key, sub_value in value.items(): + assert doc_parser_config[key][sub_key] == sub_value, (parser_config, list_body) + else: + assert doc_parser_config[key] == value, (parser_config, list_body) + else: + assert body["message"] == expected_message, (parser_config, body) + + @pytest.mark.p2 def test_documents_parse_and_stop(rest_client, create_document): dataset_id, document_id = create_document("parse_target.txt") @@ -533,6 +855,187 @@ def test_documents_parse_and_stop(rest_client, create_document): assert "already completed" in stop_payload["message"], stop_payload +@pytest.mark.p2 +def test_documents_metadata_batch_update_contract(rest_client, create_dataset, tmp_path): + dataset_id, uploaded_docs = _seed_documents(rest_client, create_dataset, tmp_path, count=5) + document_ids = [doc["id"] for doc in uploaded_docs] + + for scenario_name, client in (("missing token", RestClient(token=None)), ("invalid token", RestClient(token=INVALID_API_TOKEN))): + res = client.patch( + f"/datasets/{dataset_id}/documents/metadatas", + json={"selector": {"document_ids": document_ids[:1]}, "updates": [], "deletes": []}, + ) + assert res.status_code == 401, (scenario_name, res.text) + payload = res.json() + assert payload["code"] == 401, (scenario_name, payload) + assert payload["message"] == "", (scenario_name, payload) + + invalid_dataset_res = rest_client.patch( + "/datasets/invalid_dataset_id/documents/metadatas", + json={"selector": {"document_ids": []}, "updates": [], "deletes": []}, + ) + assert invalid_dataset_res.status_code == 200 + invalid_dataset_payload = invalid_dataset_res.json() + assert invalid_dataset_payload["code"] == 102, invalid_dataset_payload + assert invalid_dataset_payload["message"] == "You don't own the dataset invalid_dataset_id.", invalid_dataset_payload + + validation_cases = [ + ("selector not object", {"selector": [1], "updates": [], 
"deletes": []}, 102, "selector must be an object."), + ("updates not list", {"selector": {}, "updates": {"key": "value"}, "deletes": []}, 102, "updates and deletes must be lists."), + ("metadata condition not object", {"selector": {"metadata_condition": [1]}, "updates": [], "deletes": []}, 102, "metadata_condition must be an object."), + ("document ids not list", {"selector": {"document_ids": "doc-1"}, "updates": [], "deletes": []}, 102, "document_ids must be a list."), + ("update missing key", {"selector": {}, "updates": [{"key": ""}], "deletes": []}, 102, "Each update requires key and value."), + ("delete missing key", {"selector": {}, "updates": [], "deletes": [{"x": "y"}]}, 102, "Each delete requires key."), + ( + "document ids wrong dataset", + {"selector": {"document_ids": ["doc-does-not-exist-1", "doc-does-not-exist-2"]}, "updates": [{"key": "author", "value": "test"}], "deletes": []}, + 102, + f"These documents do not belong to dataset {dataset_id}: ", + ), + ] + for scenario_name, payload, expected_code, expected_message in validation_cases: + res = rest_client.patch(f"/datasets/{dataset_id}/documents/metadatas", json=payload) + assert res.status_code == 200, (scenario_name, res.text) + body = res.json() + assert body["code"] == expected_code, (scenario_name, body) + if scenario_name == "document ids wrong dataset": + assert body["message"].startswith(expected_message), (scenario_name, body) + invalid_ids = set(body["message"][len(expected_message) :].split(", ")) + assert invalid_ids == {"doc-does-not-exist-1", "doc-does-not-exist-2"}, (scenario_name, body) + else: + assert body["message"] == expected_message, (scenario_name, body) + + update_by_ids_res = rest_client.patch( + f"/datasets/{dataset_id}/documents/metadatas", + json={ + "selector": {"document_ids": document_ids}, + "updates": [{"key": "author", "value": "test_author"}, {"key": "status", "value": "processed"}], + "deletes": [], + }, + ) + assert update_by_ids_res.status_code == 200 + 
update_by_ids_payload = update_by_ids_res.json() + assert update_by_ids_payload["code"] == 0, update_by_ids_payload + assert update_by_ids_payload["data"] == {"updated": 5, "matched_docs": 5}, update_by_ids_payload + + filtered_update_res = rest_client.patch( + f"/datasets/{dataset_id}/documents/metadatas", + json={ + "selector": { + "document_ids": document_ids, + "metadata_condition": {"conditions": [{"comparison_operator": "is", "name": "status", "value": "processed"}]}, + }, + "updates": [{"key": "author", "value": "filtered_author"}], + "deletes": [], + }, + ) + assert filtered_update_res.status_code == 200 + filtered_update_payload = filtered_update_res.json() + assert filtered_update_payload["code"] == 0, filtered_update_payload + assert filtered_update_payload["data"] == {"updated": 5, "matched_docs": 5}, filtered_update_payload + + delete_metadata_res = rest_client.patch( + f"/datasets/{dataset_id}/documents/metadatas", + json={ + "selector": {"document_ids": document_ids}, + "updates": [], + "deletes": [{"key": "author"}], + }, + ) + assert delete_metadata_res.status_code == 200 + delete_metadata_payload = delete_metadata_res.json() + assert delete_metadata_payload["code"] == 0, delete_metadata_payload + assert delete_metadata_payload["data"] == {"updated": 5, "matched_docs": 5}, delete_metadata_payload + + combined_res = rest_client.patch( + f"/datasets/{dataset_id}/documents/metadatas", + json={ + "selector": {"document_ids": document_ids}, + "updates": [{"key": "author", "value": "new_author"}], + "deletes": [{"key": "status"}], + }, + ) + assert combined_res.status_code == 200 + combined_payload = combined_res.json() + assert combined_payload["code"] == 0, combined_payload + assert combined_payload["data"] == {"updated": 5, "matched_docs": 5}, combined_payload + + empty_ids_res = rest_client.patch( + f"/datasets/{dataset_id}/documents/metadatas", + json={"selector": {"document_ids": []}, "updates": [{"key": "author", "value": "test"}], "deletes": []}, 
+ ) + assert empty_ids_res.status_code == 200 + empty_ids_payload = empty_ids_res.json() + assert empty_ids_payload["code"] == 0, empty_ids_payload + assert empty_ids_payload["data"] == {"updated": 0, "matched_docs": 0}, empty_ids_payload + + no_match_res = rest_client.patch( + f"/datasets/{dataset_id}/documents/metadatas", + json={ + "selector": { + "document_ids": document_ids, + "metadata_condition": {"conditions": [{"comparison_operator": "is", "name": "nonexistent_key", "value": "nonexistent_value"}]}, + }, + "updates": [{"key": "author", "value": "test"}], + "deletes": [], + }, + ) + assert no_match_res.status_code == 200 + no_match_payload = no_match_res.json() + assert no_match_payload["code"] == 0, no_match_payload + assert no_match_payload["data"] == {"updated": 0, "matched_docs": 0}, no_match_payload + + +@pytest.mark.p2 +def test_document_metadata_config_contract(rest_client, create_document): + dataset_id, document_id = create_document("document_metadata_config_contract.txt") + + for scenario_name, client in (("missing token", RestClient(token=None)), ("invalid token", RestClient(token=INVALID_API_TOKEN))): + res = client.put( + f"/datasets/{dataset_id}/documents/{document_id}/metadata/config", + json={"metadata": {"author": "alice"}}, + ) + assert res.status_code == 401, (scenario_name, res.text) + payload = res.json() + assert payload["code"] == 401, (scenario_name, payload) + assert payload["message"] == "", (scenario_name, payload) + + missing_payload_res = rest_client.put(f"/datasets/{dataset_id}/documents/{document_id}/metadata/config", json={}) + assert missing_payload_res.status_code == 200 + missing_payload = missing_payload_res.json() + assert missing_payload["code"] == 101, missing_payload + assert missing_payload["message"] == "metadata is required", missing_payload + + invalid_dataset_res = rest_client.put( + f"/datasets/{INVALID_ID_32}/documents/{document_id}/metadata/config", + json={"metadata": {"author": "alice"}}, + ) + assert 
invalid_dataset_res.status_code == 200 + invalid_dataset_payload = invalid_dataset_res.json() + assert invalid_dataset_payload["code"] == 102, invalid_dataset_payload + assert invalid_dataset_payload["message"] == "You don't own the dataset.", invalid_dataset_payload + + invalid_document_res = rest_client.put( + f"/datasets/{dataset_id}/documents/{INVALID_ID_32}/metadata/config", + json={"metadata": {"author": "alice"}}, + ) + assert invalid_document_res.status_code == 200 + invalid_document_payload = invalid_document_res.json() + assert invalid_document_payload["code"] == 102, invalid_document_payload + assert invalid_document_payload["message"] == f"Document {INVALID_ID_32} not found in dataset {dataset_id}", invalid_document_payload + + update_payload = {"metadata": {"author": "alice", "tags": ["one", "two"]}} + update_res = rest_client.put( + f"/datasets/{dataset_id}/documents/{document_id}/metadata/config", + json=update_payload, + ) + assert update_res.status_code == 200 + update_body = update_res.json() + assert update_body["code"] == 0, update_body + parser_config = update_body["data"]["parser_config"] + assert parser_config["metadata"] == update_payload["metadata"], update_body + + @pytest.mark.p2 def test_documents_metadata_update_path(rest_client, create_document): dataset_id, document_id = create_document("metadata_target.txt") diff --git a/test/testcases/restful_api/test_retrieval.py b/test/testcases/restful_api/test_retrieval.py index 6bfe40b28e..3c7eca7d10 100644 --- a/test/testcases/restful_api/test_retrieval.py +++ b/test/testcases/restful_api/test_retrieval.py @@ -275,6 +275,45 @@ def test_retrieval_vector_similarity_and_top_k_contract(rest_client, ensure_pars assert expected_message in body["message"], (scenario_name, body) +@pytest.mark.p2 +def test_retrieval_document_ids_and_metadata_condition_contract(rest_client, ensure_parsed_document): + dataset_id, document_id = ensure_parsed_document() + + invalid_doc_ids_res = rest_client.post( + 
"/retrieval", + json={"question": "chunk", "dataset_ids": [dataset_id], "document_ids": "bad"}, + ) + assert invalid_doc_ids_res.status_code == 200 + invalid_doc_ids_payload = invalid_doc_ids_res.json() + assert invalid_doc_ids_payload["code"] == 102, invalid_doc_ids_payload + assert invalid_doc_ids_payload["message"] == "`documents` should be a list", invalid_doc_ids_payload + + not_owned_doc_res = rest_client.post( + "/retrieval", + json={"question": "chunk", "dataset_ids": [dataset_id], "document_ids": ["not-owned"]}, + ) + assert not_owned_doc_res.status_code == 200 + not_owned_doc_payload = not_owned_doc_res.json() + assert not_owned_doc_payload["code"] == 102, not_owned_doc_payload + assert not_owned_doc_payload["message"] == "The datasets don't own the document not-owned", not_owned_doc_payload + + metadata_condition_res = rest_client.post( + "/retrieval", + json={ + "question": "chunk", + "dataset_ids": [dataset_id], + "metadata_condition": { + "logic": "and", + "conditions": [{"name": "author", "comparison_operator": "is", "value": "missing"}], + }, + }, + ) + assert metadata_condition_res.status_code == 200 + metadata_condition_payload = metadata_condition_res.json() + assert metadata_condition_payload["code"] == 0, metadata_condition_payload + assert metadata_condition_payload["data"]["chunks"] == [], metadata_condition_payload + + @pytest.mark.p2 def test_retrieval_rerank_unknown_contract(rest_client, ensure_parsed_document): dataset_id, _ = ensure_parsed_document()