mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 23:41:12 +08:00
Feat: add new tests and test cases for the RESTful API suite (#15299)
### What problem does this PR solve? extend restful api suite ### Type of change - [x] New Feature (non-breaking change which adds functionality) - [x] Other (please describe): test
This commit is contained in:
@@ -2211,6 +2211,83 @@ def test_dataset_get_contract(rest_client, create_dataset):
|
||||
|
||||
|
||||
@pytest.mark.p2
def test_dataset_metadata_config_get_and_update_contract(rest_client, create_dataset):
    """Exercise GET and PUT on ``/datasets/{dataset_id}/metadata/config``.

    Steps, in order: read the config of a new dataset, reject unauthenticated
    GET/PUT calls, reject an unknown dataset id on GET, update the config and
    read it back, PUT an empty body, and reject an unknown dataset id on PUT.
    """
    dataset_id = create_dataset("dataset_metadata_config_contract")
    config_url = f"/datasets/{dataset_id}/metadata/config"
    unknown_dataset_url = "/datasets/invalid_dataset_id/metadata/config"
    empty_config = {"metadata": [], "built_in_metadata": []}
    permission_fragment = "lacks permission for dataset 'invalid_dataset_id'"

    # The test expects an empty config right after dataset creation.
    initial_res = rest_client.get(config_url)
    assert initial_res.status_code == 200
    initial_body = initial_res.json()
    assert initial_body["code"] == 0, initial_body
    assert initial_body["data"] == empty_config, initial_body

    # Both a missing and an invalid token must be rejected for GET and PUT.
    unauthenticated_clients = (
        ("missing token", RestClient(token=None)),
        ("invalid token", RestClient(token=INVALID_API_TOKEN)),
    )
    for scenario_name, client in unauthenticated_clients:
        get_res = client.get(config_url)
        assert get_res.status_code == 401, (scenario_name, get_res.text)
        get_body = get_res.json()
        assert get_body["code"] == 401, (scenario_name, get_body)
        assert get_body["message"] == "<Unauthorized '401: Unauthorized'>", (scenario_name, get_body)

        put_res = client.put(config_url, json={"metadata": [], "built_in_metadata": []})
        assert put_res.status_code == 401, (scenario_name, put_res.text)
        put_body = put_res.json()
        assert put_body["code"] == 401, (scenario_name, put_body)
        assert put_body["message"] == "<Unauthorized '401: Unauthorized'>", (scenario_name, put_body)

    # Unknown dataset id on GET: HTTP 200 carrying application code 102.
    unknown_get_res = rest_client.get(unknown_dataset_url)
    assert unknown_get_res.status_code == 200
    unknown_get_body = unknown_get_res.json()
    assert unknown_get_body["code"] == 102, unknown_get_body
    assert permission_fragment in unknown_get_body["message"], unknown_get_body

    # The expected response carries an explicit "enum": None on every field
    # that was sent without an "enum" entry.
    requested_config = {
        "metadata": [
            {"key": "author", "type": "string", "description": "Author name"},
            {"key": "tags", "type": "list", "description": "Tag list", "enum": ["foo", "bar"]},
        ],
        "built_in_metadata": [
            {"key": "size", "type": "number", "description": "File size"},
        ],
    }
    expected_config = {
        "metadata": [
            {"key": "author", "type": "string", "description": "Author name", "enum": None},
            {"key": "tags", "type": "list", "description": "Tag list", "enum": ["foo", "bar"]},
        ],
        "built_in_metadata": [
            {"key": "size", "type": "number", "description": "File size", "enum": None},
        ],
    }
    update_res = rest_client.put(config_url, json=requested_config)
    assert update_res.status_code == 200
    update_body = update_res.json()
    assert update_body["code"] == 0, update_body
    assert update_body["data"] == expected_config, update_body

    # Reading the config back must return what the update reported.
    refetch_res = rest_client.get(config_url)
    assert refetch_res.status_code == 200
    refetch_body = refetch_res.json()
    assert refetch_body["code"] == 0, refetch_body
    assert refetch_body["data"] == expected_config, refetch_body

    # An empty PUT body is accepted and yields the empty config again.
    empty_put_res = rest_client.put(config_url, json={})
    assert empty_put_res.status_code == 200
    empty_put_body = empty_put_res.json()
    assert empty_put_body["code"] == 0, empty_put_body
    assert empty_put_body["data"] == empty_config, empty_put_body

    # Unknown dataset id on PUT mirrors the GET behaviour.
    unknown_put_res = rest_client.put(unknown_dataset_url, json={"metadata": [], "built_in_metadata": []})
    assert unknown_put_res.status_code == 200
    unknown_put_body = unknown_put_res.json()
    assert unknown_put_body["code"] == 102, unknown_put_body
    assert permission_fragment in unknown_put_body["message"], unknown_put_body
|
||||
|
||||
|
||||
def test_dataset_metadata_summary_contract(rest_client, create_dataset, tmp_path):
|
||||
dataset_id = create_dataset("dataset_metadata_summary")
|
||||
document_ids = []
|
||||
|
||||
@@ -15,6 +15,8 @@
|
||||
#
|
||||
|
||||
import pytest
|
||||
from test.testcases.configs import INVALID_API_TOKEN
|
||||
from test.testcases.restful_api.helpers.client import RestClient
|
||||
|
||||
|
||||
@pytest.mark.p2
|
||||
@@ -26,6 +28,26 @@ def test_document_image_invalid_id_contract(rest_client_noauth):
|
||||
assert payload["message"] == "Image not found.", payload
|
||||
|
||||
|
||||
@pytest.mark.p2
def test_document_download_by_id_requires_auth(create_document):
    """``GET /documents/{document_id}`` must answer 401 for a missing or invalid token."""
    _dataset_id, document_id = create_document("document_raw_download_auth.txt")
    download_url = f"/documents/{document_id}"
    unauthenticated_clients = (
        ("missing token", RestClient(token=None)),
        ("invalid token", RestClient(token=INVALID_API_TOKEN)),
    )
    for scenario_name, client in unauthenticated_clients:
        response = client.get(download_url)
        assert response.status_code == 401, (scenario_name, response.text)
        body = response.json()
        assert body["code"] == 401, (scenario_name, body)
        assert body["message"] == "<Unauthorized '401: Unauthorized'>", (scenario_name, body)
|
||||
|
||||
|
||||
@pytest.mark.p2
def test_document_download_by_id_invalid_id_contract(rest_client):
    """An unknown document id yields HTTP 200 with application code 102 and a fixed message."""
    response = rest_client.get("/documents/invalid_document_id")
    assert response.status_code == 200
    body = response.json()
    assert body["code"] == 102, body
    assert body["message"] == "The dataset not own the document invalid_document_id.", body
|
||||
|
||||
|
||||
@pytest.mark.p2
|
||||
def test_document_artifact_requires_auth(rest_client_noauth):
|
||||
res = rest_client_noauth.get("/documents/artifact/not-an-artifact.txt")
|
||||
|
||||
@@ -24,7 +24,7 @@ from openpyxl import Workbook
|
||||
import pytest
|
||||
import requests
|
||||
from requests_toolbelt import MultipartEncoder
|
||||
from test.testcases.configs import DOCUMENT_NAME_LIMIT, HOST_ADDRESS, INVALID_API_TOKEN, VERSION
|
||||
from test.testcases.configs import DEFAULT_PARSER_CONFIG, DOCUMENT_NAME_LIMIT, HOST_ADDRESS, INVALID_API_TOKEN, INVALID_ID_32, VERSION
|
||||
from test.testcases.restful_api.helpers.client import RestClient
|
||||
from test.testcases.utils import compare_by_hash
|
||||
from test.testcases.utils.file_utils import (
|
||||
@@ -81,6 +81,19 @@ def _seed_documents(rest_client, create_dataset, tmp_path, count=5):
|
||||
return dataset_id, payload["data"]
|
||||
|
||||
|
||||
def _seed_documents_for_update(rest_client, create_dataset, tmp_path):
    """Create a dataset and upload two text files into it for the update tests.

    Returns:
        A ``(dataset_id, data)`` tuple where ``data`` is the ``"data"`` entry of
        the upload response body.
    """
    dataset_id = create_dataset("dataset_update_contract")
    file_paths = [
        create_txt_file(tmp_path / file_name)
        for file_name in ("ragflow_test_upload_0.txt", "ragflow_test_upload_1.txt")
    ]
    upload_res = _upload_files(rest_client, dataset_id, file_paths)
    assert upload_res.status_code == 200
    upload_body = upload_res.json()
    assert upload_body["code"] == 0, upload_body
    return dataset_id, upload_body["data"]
|
||||
|
||||
|
||||
def _assert_docs_sorted(docs, key, reverse):
    """Assert that ``docs`` are ordered by ``key``.

    Args:
        docs: Iterable of mappings; each is read with ``.get(key)``.
        key: Field name to compare on.
        reverse: ``True`` to require descending order, ``False`` for ascending.
    """
    values = [entry.get(key) for entry in docs]
    expected_order = sorted(values, reverse=reverse)
    assert values == expected_order
|
||||
@@ -509,6 +522,315 @@ def test_documents_update_patch_and_delete(rest_client, create_document):
|
||||
assert all(doc["id"] != document_id for doc in list_payload["data"]["docs"]), list_payload
|
||||
|
||||
|
||||
@pytest.mark.p2
def test_documents_update_requires_auth(create_document):
    """``PATCH /datasets/{dataset_id}/documents/{document_id}`` must answer 401 without a valid token."""
    dataset_id, document_id = create_document("update_auth_target.txt")
    document_url = f"/datasets/{dataset_id}/documents/{document_id}"
    unauthenticated_clients = (
        ("missing token", RestClient(token=None)),
        ("invalid token", RestClient(token=INVALID_API_TOKEN)),
    )
    for scenario_name, client in unauthenticated_clients:
        response = client.patch(document_url, json={"name": "updated_auth_target.txt"})
        assert response.status_code == 401, (scenario_name, response.text)
        body = response.json()
        assert body["code"] == 401, (scenario_name, body)
        assert body["message"] == "<Unauthorized '401: Unauthorized'>", (scenario_name, body)
|
||||
|
||||
|
||||
@pytest.mark.p2
def test_documents_update_name_contract(rest_client, create_dataset, tmp_path):
    """Rename the first seeded document through a sequence of valid and invalid names.

    The cases run in order against the same document, so each accepted rename
    is the starting point for the next case.
    """
    dataset_id, uploaded_docs = _seed_documents_for_update(rest_client, create_dataset, tmp_path)
    first_document_id = uploaded_docs[0]["id"]
    document_url = f"/datasets/{dataset_id}/documents/{first_document_id}"
    listing_url = f"/datasets/{dataset_id}/documents"

    # Longest name built here: padding plus the 4-character ".txt" suffix.
    long_name = "a" * (DOCUMENT_NAME_LIMIT - 4) + ".txt"
    # Each case: (name sent, expected application code, expected error message).
    name_cases = [
        ("new_name.txt", 0, ""),
        (long_name, 0, ""),
        (0, 102, "Field: <name> - Message: <Input should be a valid string> - Value: <0>"),
        (None, 100, "AttributeError('NoneType' object has no attribute 'encode')"),
        ("", 101, "The extension of file can't be changed"),
        ("ragflow_test_upload_0", 101, "The extension of file can't be changed"),
        ("ragflow_test_upload_1.txt", 102, "Duplicated document name in the same dataset."),
        ("RAGFLOW_TEST_UPLOAD_1.TXT", 0, ""),
    ]
    for name, expected_code, expected_message in name_cases:
        patch_res = rest_client.patch(document_url, json={"name": name})
        assert patch_res.status_code == 200, (name, patch_res.text)
        patch_body = patch_res.json()
        assert patch_body["code"] == expected_code, (name, patch_body)

        if expected_code != 0:
            assert patch_body["message"] == expected_message, (name, patch_body)
            continue

        # Accepted rename: the new name must appear in the response and in the listing.
        assert patch_body["data"]["name"] == name, (name, patch_body)
        list_res = rest_client.get(listing_url, params={"id": first_document_id})
        assert list_res.status_code == 200, (name, list_res.text)
        list_body = list_res.json()
        assert list_body["code"] == 0, (name, list_body)
        assert list_body["data"]["docs"][0]["name"] == name, (name, list_body)
|
||||
|
||||
|
||||
@pytest.mark.p2
def test_documents_update_invalid_dataset_and_document_contract(rest_client, create_dataset, tmp_path):
    """Updating with an unknown dataset id or an unknown document id yields code 102."""
    dataset_id, uploaded_docs = _seed_documents_for_update(rest_client, create_dataset, tmp_path)
    first_document_id = uploaded_docs[0]["id"]
    rename_payload = {"name": "new_name.txt"}

    # Valid document id under an unknown dataset id.
    unknown_dataset_res = rest_client.patch(
        f"/datasets/{INVALID_ID_32}/documents/{first_document_id}",
        json=rename_payload,
    )
    assert unknown_dataset_res.status_code == 200
    unknown_dataset_body = unknown_dataset_res.json()
    assert unknown_dataset_body["code"] == 102, unknown_dataset_body
    assert "You don't own the dataset." in unknown_dataset_body["message"], unknown_dataset_body

    # Unknown document id under the valid dataset id.
    unknown_document_res = rest_client.patch(
        f"/datasets/{dataset_id}/documents/{INVALID_ID_32}",
        json={"name": "new_name.txt"},
    )
    assert unknown_document_res.status_code == 200
    unknown_document_body = unknown_document_res.json()
    assert unknown_document_body["code"] == 102, unknown_document_body
    assert unknown_document_body["message"] == "The dataset doesn't own the document.", unknown_document_body
|
||||
|
||||
|
||||
@pytest.mark.p2
def test_documents_update_chunk_method_contract(rest_client, create_dataset, tmp_path):
    """Set ``chunk_method`` on a document to every accepted value, then to two rejected ones."""
    dataset_id, uploaded_docs = _seed_documents_for_update(rest_client, create_dataset, tmp_path)
    first_document_id = uploaded_docs[0]["id"]
    document_url = f"/datasets/{dataset_id}/documents/{first_document_id}"
    listing_url = f"/datasets/{dataset_id}/documents"

    accepted_methods = (
        "naive",
        "manual",
        "qa",
        "table",
        "paper",
        "book",
        "laws",
        "presentation",
        "picture",
        "one",
        "knowledge_graph",
        "email",
        "tag",
    )
    rejected_cases = [
        ("", 102, "`chunk_method` (empty string) is not valid"),
        (
            "other_chunk_method",
            102,
            "Field: <chunk_method> - Message: <`chunk_method` other_chunk_method doesn't exist> - Value: <other_chunk_method>",
        ),
    ]
    # Accepted values are sent first, in the listed order, followed by the rejected ones.
    chunk_method_cases = [(method, 0, "") for method in accepted_methods] + rejected_cases

    for chunk_method, expected_code, expected_message in chunk_method_cases:
        patch_res = rest_client.patch(document_url, json={"chunk_method": chunk_method})
        assert patch_res.status_code == 200, (chunk_method, patch_res.text)
        patch_body = patch_res.json()
        assert patch_body["code"] == expected_code, (chunk_method, patch_body)

        if expected_code != 0:
            assert patch_body["message"] == expected_message, (chunk_method, patch_body)
            continue

        # Accepted value: the listing must report the new chunk method.
        list_res = rest_client.get(listing_url, params={"id": first_document_id})
        assert list_res.status_code == 200, (chunk_method, list_res.text)
        list_body = list_res.json()
        assert list_body["code"] == 0, (chunk_method, list_body)
        assert list_body["data"]["docs"][0]["chunk_method"] == chunk_method, (chunk_method, list_body)
|
||||
|
||||
|
||||
@pytest.mark.p2
|
||||
def test_documents_update_meta_fields_contract(rest_client, create_dataset, tmp_path):
|
||||
dataset_id, uploaded_docs = _seed_documents_for_update(rest_client, create_dataset, tmp_path)
|
||||
first_document_id = uploaded_docs[0]["id"]
|
||||
|
||||
meta_fields_cases = [
|
||||
({"test": "test"}, 0, ""),
|
||||
({"author": "alice", "year": 2024}, 0, ""),
|
||||
({"tags": ["tag1", "tag2"]}, 0, ""),
|
||||
({"count": 42, "price": 19.99}, 0, ""),
|
||||
("test", 102, "Field: <meta_fields> - Message: <Input should be a valid dictionary> - Value: <test>"),
|
||||
([], 102, "Field: <meta_fields> - Message: <Input should be a valid dictionary> - Value: <[]>"),
|
||||
({"tags": [{"x": {"a": "b"}}]}, 102, "Field: <meta_fields> - Message: <The type is not supported in list: [{'x': {'a': 'b'}}]> - Value: <{'tags': [{'x': {'a': 'b'}}]}>"),
|
||||
({"tags": [{"x": 1}]}, 102, "Field: <meta_fields> - Message: <The type is not supported in list: [{'x': 1}]> - Value: <{'tags': [{'x': 1}]}>"),
|
||||
({"obj": {"x": 1}}, 102, "Field: <meta_fields> - Message: <The type is not supported: {'x': 1}> - Value: <{'obj': {'x': 1}}>"),
|
||||
({"tags": [2, 1]}, 0, ""),
|
||||
]
|
||||
for meta_fields, expected_code, expected_message in meta_fields_cases:
|
||||
res = rest_client.patch(
|
||||
f"/datasets/{dataset_id}/documents/{first_document_id}",
|
||||
json={"meta_fields": meta_fields},
|
||||
)
|
||||
assert res.status_code == 200, (meta_fields, res.text)
|
||||
body = res.json()
|
||||
assert body["code"] == expected_code, (meta_fields, body)
|
||||
if expected_code == 0:
|
||||
list_res = rest_client.get(f"/datasets/{dataset_id}/documents", params={"id": first_document_id})
|
||||
assert list_res.status_code == 200, (meta_fields, list_res.text)
|
||||
list_body = list_res.json()
|
||||
assert list_body["code"] == 0, (meta_fields, list_body)
|
||||
assert list_body["data"]["docs"][0]["meta_fields"] == meta_fields, (meta_fields, list_body)
|
||||
else:
|
||||
assert expected_message in body["message"] or body["message"] == expected_message, (meta_fields, body)
|
||||
|
||||
invalid_meta_doc_res = rest_client.patch(
|
||||
f"/datasets/{dataset_id}/documents/invalid_doc_id_12345678901234567890",
|
||||
json={"meta_fields": {"author": "alice"}},
|
||||
)
|
||||
assert invalid_meta_doc_res.status_code == 200
|
||||
invalid_meta_doc_body = invalid_meta_doc_res.json()
|
||||
assert invalid_meta_doc_body["code"] == 102, invalid_meta_doc_body
|
||||
assert "The dataset doesn't own the document." in invalid_meta_doc_body["message"], invalid_meta_doc_body
|
||||
|
||||
|
||||
@pytest.mark.p2
def test_documents_update_invalid_field_and_guard_contract(rest_client, create_dataset, tmp_path):
    """Check guarded fields and legacy field names on document update.

    The first group must be rejected with code 102 and a specific message. The
    second group is checked loosely: either code 0 with a ``data`` entry, or
    code 102 with a message containing "invalid".
    """
    dataset_id, uploaded_docs = _seed_documents_for_update(rest_client, create_dataset, tmp_path)
    first_document_id = uploaded_docs[0]["id"]
    document_url = f"/datasets/{dataset_id}/documents/{first_document_id}"

    # Each case: (payload sent, expected application code, expected error message).
    strict_guard_cases = [
        ({"chunk_count": 1}, 102, "Can't change `chunk_count`."),
        ({"token_count": 1}, 102, "Can't change `token_count`."),
        ({"chunk_count": 100}, 102, "Can't change `chunk_count`."),
        ({"token_count": 100}, 102, "Can't change `token_count`."),
        ({"progress": 2.0}, 102, "Field: <progress> - Message: <Input should be less than or equal to 1> - Value: <2.0>"),
        ({"progress": 1.0}, 102, "Can't change `progress`."),
        ({"meta_fields": []}, 102, "Field: <meta_fields> - Message: <Input should be a valid dictionary> - Value: <[]>"),
    ]
    for payload, expected_code, expected_message in strict_guard_cases:
        guard_res = rest_client.patch(document_url, json=payload)
        assert guard_res.status_code == 200, (payload, guard_res.text)
        guard_body = guard_res.json()
        assert guard_body["code"] == expected_code, (payload, guard_body)
        # Containment check: an exact match satisfies it as well.
        assert expected_message in guard_body["message"], (payload, guard_body)

    legacy_invalid_field_cases = [
        {"create_date": "Fri, 14 Mar 2025 16:53:42 GMT"},
        {"create_time": 1},
        {"created_by": "ragflow_test"},
        {"dataset_id": "ragflow_test"},
        {"id": "ragflow_test"},
        {"location": "ragflow_test.txt"},
        {"process_begin_at": 1},
        {"process_duration": 1.0},
        {"progress_msg": "ragflow_test"},
        {"run": "ragflow_test"},
        {"size": 1},
        {"source_type": "ragflow_test"},
        {"thumbnail": "ragflow_test"},
        {"type": "ragflow_test"},
        {"update_date": "Fri, 14 Mar 2025 16:33:17 GMT"},
        {"update_time": 1},
    ]
    for payload in legacy_invalid_field_cases:
        legacy_res = rest_client.patch(document_url, json=payload)
        assert legacy_res.status_code == 200, (payload, legacy_res.text)
        legacy_body = legacy_res.json()
        assert legacy_body["code"] in (0, 102), (payload, legacy_body)
        if legacy_body["code"] == 102:
            assert "invalid" in legacy_body["message"].lower(), (payload, legacy_body)
        else:
            assert "data" in legacy_body, (payload, legacy_body)
|
||||
|
||||
|
||||
@pytest.mark.p2
def test_documents_update_parser_config_contract(rest_client, create_dataset, tmp_path):
    """Update ``parser_config`` (always with ``chunk_method="naive"``) across valid and invalid values.

    For accepted configs the stored config is read back: an empty config must
    equal ``DEFAULT_PARSER_CONFIG``; otherwise every submitted key, and every
    sub-key of a nested dict, must match the stored value.
    """
    dataset_id, uploaded_docs = _seed_documents_for_update(rest_client, create_dataset, tmp_path)
    first_document_id = uploaded_docs[0]["id"]
    document_url = f"/datasets/{dataset_id}/documents/{first_document_id}"
    listing_url = f"/datasets/{dataset_id}/documents"

    raptor_section = {
        "use_raptor": True,
        "prompt": "Please summarize the following paragraphs. Be careful with the numbers, do not make things up. Paragraphs as following:\n {cluster_content}\nThe above is the content you need to summarize.",
        "max_token": 256,
        "threshold": 0.1,
        "max_cluster": 64,
        "random_seed": 0,
    }
    graphrag_section = {
        "use_graphrag": True,
        "entity_types": ["organization", "person", "geo", "event", "category"],
        "method": "light",
        "batch_chunk_token_size": 4096,
    }
    default_parser_config_for_test = {
        "layout_recognize": "DeepDOC",
        "chunk_token_num": 512,
        "delimiter": "\n",
        "auto_keywords": 0,
        "auto_questions": 0,
        "html4excel": False,
        "topn_tags": 3,
        "raptor": raptor_section,
        "graphrag": graphrag_section,
    }

    # Each case: (parser_config sent, expected application code, expected error message).
    parser_cases = [
        ({}, 0, ""),
        (default_parser_config_for_test, 0, ""),
        ({"chunk_token_num": -1}, 102, "Field: <parser_config.chunk_token_num> - Message: <Input should be greater than or equal to 1> - Value: <-1>"),
        ({"chunk_token_num": 0}, 102, "Field: <parser_config.chunk_token_num> - Message: <Input should be greater than or equal to 1> - Value: <0>"),
        ({"chunk_token_num": 100000000}, 102, "Field: <parser_config.chunk_token_num> - Message: <Input should be less than or equal to 2048> - Value: <100000000>"),
        ({"chunk_token_num": 3.14}, 102, "Field: <parser_config.chunk_token_num> - Message: <Input should be a valid integer> - Value: <3.14>"),
        ({"chunk_token_num": "1024"}, 102, "Field: <parser_config.chunk_token_num> - Message: <Input should be a valid integer> - Value: <1024>"),
        ({"layout_recognize": "DeepDOC"}, 0, ""),
        ({"layout_recognize": "Naive"}, 0, ""),
        ({"html4excel": True}, 0, ""),
        ({"html4excel": False}, 0, ""),
        ({"html4excel": 1}, 102, "Field: <parser_config.html4excel> - Message: <Input should be a valid boolean> - Value: <1>"),
        ({"delimiter": ""}, 102, "Field: <parser_config.delimiter> - Message: <String should have at least 1 character> - Value: <>"),
        ({"delimiter": "`##`"}, 0, ""),
        ({"delimiter": 1}, 102, "Field: <parser_config.delimiter> - Message: <Input should be a valid string> - Value: <1>"),
        ({"task_page_size": -1}, 102, "Field: <parser_config.task_page_size> - Message: <Input should be greater than or equal to 1> - Value: <-1>"),
        ({"task_page_size": 0}, 102, "Field: <parser_config.task_page_size> - Message: <Input should be greater than or equal to 1> - Value: <0>"),
        ({"task_page_size": 100000000}, 0, ""),
        ({"task_page_size": 3.14}, 102, "Field: <parser_config.task_page_size> - Message: <Input should be a valid integer> - Value: <3.14>"),
        ({"task_page_size": "1024"}, 102, "Field: <parser_config.task_page_size> - Message: <Input should be a valid integer> - Value: <1024>"),
        ({"raptor": {"use_raptor": {"a": "b"}}}, 102, "Field: <parser_config.raptor.use_raptor> - Message: <Input should be a valid boolean> - Value: <{'a': 'b'}>"),
        ({"raptor": {"use_raptor": False}}, 0, ""),
        ({"invalid_key": "invalid_value"}, 102, "Field: <parser_config.invalid_key> - Message: <Extra inputs are not permitted> - Value: <invalid_value>"),
        ({"auto_keywords": -1}, 102, "Field: <parser_config.auto_keywords> - Message: <Input should be greater than or equal to 0> - Value: <-1>"),
        ({"auto_keywords": 32}, 0, ""),
        ({"auto_keywords": "1024"}, 102, "Field: <parser_config.auto_keywords> - Message: <Input should be a valid integer> - Value: <1024>"),
        ({"auto_keywords": 3.14}, 102, "Field: <parser_config.auto_keywords> - Message: <Input should be a valid integer> - Value: <3.14>"),
        ({"auto_questions": -1}, 102, "Field: <parser_config.auto_questions> - Message: <Input should be greater than or equal to 0> - Value: <-1>"),
        ({"auto_questions": 10}, 0, ""),
        ({"auto_questions": 3.14}, 102, "Field: <parser_config.auto_questions> - Message: <Input should be a valid integer> - Value: <3.14>"),
        ({"auto_questions": "1024"}, 102, "Field: <parser_config.auto_questions> - Message: <Input should be a valid integer> - Value: <1024>"),
        ({"topn_tags": -1}, 102, "Field: <parser_config.topn_tags> - Message: <Input should be greater than or equal to 1> - Value: <-1>"),
        ({"topn_tags": 10}, 0, ""),
        ({"topn_tags": 3.14}, 102, "Field: <parser_config.topn_tags> - Message: <Input should be a valid integer> - Value: <3.14>"),
        ({"topn_tags": "1024"}, 102, "Field: <parser_config.topn_tags> - Message: <Input should be a valid integer> - Value: <1024>"),
    ]
    for parser_config, expected_code, expected_message in parser_cases:
        patch_res = rest_client.patch(
            document_url,
            json={"chunk_method": "naive", "parser_config": parser_config},
        )
        assert patch_res.status_code == 200, (parser_config, patch_res.text)
        patch_body = patch_res.json()
        assert patch_body["code"] == expected_code, (parser_config, patch_body)

        if expected_code != 0:
            assert patch_body["message"] == expected_message, (parser_config, patch_body)
            continue

        list_res = rest_client.get(listing_url, params={"id": first_document_id})
        assert list_res.status_code == 200, (parser_config, list_res.text)
        list_body = list_res.json()
        assert list_body["code"] == 0, (parser_config, list_body)
        doc_parser_config = list_body["data"]["docs"][0]["parser_config"]

        if parser_config == {}:
            # An empty submitted config is expected to store the default config.
            assert doc_parser_config == DEFAULT_PARSER_CONFIG, (parser_config, list_body)
            continue

        # Only the submitted keys are compared; nested dicts are compared per sub-key.
        for key, value in parser_config.items():
            if not isinstance(value, dict):
                assert doc_parser_config[key] == value, (parser_config, list_body)
                continue
            for sub_key, sub_value in value.items():
                assert doc_parser_config[key][sub_key] == sub_value, (parser_config, list_body)
|
||||
|
||||
|
||||
@pytest.mark.p2
|
||||
def test_documents_parse_and_stop(rest_client, create_document):
|
||||
dataset_id, document_id = create_document("parse_target.txt")
|
||||
@@ -533,6 +855,187 @@ def test_documents_parse_and_stop(rest_client, create_document):
|
||||
assert "already completed" in stop_payload["message"], stop_payload
|
||||
|
||||
|
||||
@pytest.mark.p2
def test_documents_metadata_batch_update_contract(rest_client, create_dataset, tmp_path):
    """Exercise ``PATCH /datasets/{dataset_id}/documents/metadatas``.

    Steps, in order: reject unauthenticated calls, reject an unknown dataset id,
    check request validation messages, then run six update/delete requests
    against five seeded documents and check the reported counts.
    """
    dataset_id, uploaded_docs = _seed_documents(rest_client, create_dataset, tmp_path, count=5)
    document_ids = [doc["id"] for doc in uploaded_docs]
    metadatas_url = f"/datasets/{dataset_id}/documents/metadatas"

    unauthenticated_clients = (
        ("missing token", RestClient(token=None)),
        ("invalid token", RestClient(token=INVALID_API_TOKEN)),
    )
    for scenario_name, client in unauthenticated_clients:
        res = client.patch(
            metadatas_url,
            json={"selector": {"document_ids": document_ids[:1]}, "updates": [], "deletes": []},
        )
        assert res.status_code == 401, (scenario_name, res.text)
        payload = res.json()
        assert payload["code"] == 401, (scenario_name, payload)
        assert payload["message"] == "<Unauthorized '401: Unauthorized'>", (scenario_name, payload)

    # Unknown dataset id: HTTP 200 carrying application code 102.
    unknown_dataset_res = rest_client.patch(
        "/datasets/invalid_dataset_id/documents/metadatas",
        json={"selector": {"document_ids": []}, "updates": [], "deletes": []},
    )
    assert unknown_dataset_res.status_code == 200
    unknown_dataset_body = unknown_dataset_res.json()
    assert unknown_dataset_body["code"] == 102, unknown_dataset_body
    assert unknown_dataset_body["message"] == "You don't own the dataset invalid_dataset_id.", unknown_dataset_body

    # Each case: (label, request body, expected application code, expected message or message prefix).
    validation_cases = [
        ("selector not object", {"selector": [1], "updates": [], "deletes": []}, 102, "selector must be an object."),
        ("updates not list", {"selector": {}, "updates": {"key": "value"}, "deletes": []}, 102, "updates and deletes must be lists."),
        ("metadata condition not object", {"selector": {"metadata_condition": [1]}, "updates": [], "deletes": []}, 102, "metadata_condition must be an object."),
        ("document ids not list", {"selector": {"document_ids": "doc-1"}, "updates": [], "deletes": []}, 102, "document_ids must be a list."),
        ("update missing key", {"selector": {}, "updates": [{"key": ""}], "deletes": []}, 102, "Each update requires key and value."),
        ("delete missing key", {"selector": {}, "updates": [], "deletes": [{"x": "y"}]}, 102, "Each delete requires key."),
        (
            "document ids wrong dataset",
            {"selector": {"document_ids": ["doc-does-not-exist-1", "doc-does-not-exist-2"]}, "updates": [{"key": "author", "value": "test"}], "deletes": []},
            102,
            f"These documents do not belong to dataset {dataset_id}: ",
        ),
    ]
    for scenario_name, payload, expected_code, expected_message in validation_cases:
        res = rest_client.patch(metadatas_url, json=payload)
        assert res.status_code == 200, (scenario_name, res.text)
        body = res.json()
        assert body["code"] == expected_code, (scenario_name, body)
        if scenario_name != "document ids wrong dataset":
            assert body["message"] == expected_message, (scenario_name, body)
            continue
        # The ids follow the fixed prefix; compare them as a set so their order does not matter.
        assert body["message"].startswith(expected_message), (scenario_name, body)
        listed_ids = set(body["message"][len(expected_message) :].split(", "))
        assert listed_ids == {"doc-does-not-exist-1", "doc-does-not-exist-2"}, (scenario_name, body)

    every_doc_touched = {"updated": 5, "matched_docs": 5}
    nothing_touched = {"updated": 0, "matched_docs": 0}
    # Requests run in this order; later ones rely on metadata written by earlier ones
    # (the "status" filter matches the value set by the first request).
    update_scenarios = [
        (
            "update by ids",
            {
                "selector": {"document_ids": document_ids},
                "updates": [{"key": "author", "value": "test_author"}, {"key": "status", "value": "processed"}],
                "deletes": [],
            },
            every_doc_touched,
        ),
        (
            "filtered update",
            {
                "selector": {
                    "document_ids": document_ids,
                    "metadata_condition": {"conditions": [{"comparison_operator": "is", "name": "status", "value": "processed"}]},
                },
                "updates": [{"key": "author", "value": "filtered_author"}],
                "deletes": [],
            },
            every_doc_touched,
        ),
        (
            "delete metadata",
            {
                "selector": {"document_ids": document_ids},
                "updates": [],
                "deletes": [{"key": "author"}],
            },
            every_doc_touched,
        ),
        (
            "combined update and delete",
            {
                "selector": {"document_ids": document_ids},
                "updates": [{"key": "author", "value": "new_author"}],
                "deletes": [{"key": "status"}],
            },
            every_doc_touched,
        ),
        (
            "empty document ids",
            {"selector": {"document_ids": []}, "updates": [{"key": "author", "value": "test"}], "deletes": []},
            nothing_touched,
        ),
        (
            "condition matches nothing",
            {
                "selector": {
                    "document_ids": document_ids,
                    "metadata_condition": {"conditions": [{"comparison_operator": "is", "name": "nonexistent_key", "value": "nonexistent_value"}]},
                },
                "updates": [{"key": "author", "value": "test"}],
                "deletes": [],
            },
            nothing_touched,
        ),
    ]
    for scenario_name, payload, expected_data in update_scenarios:
        res = rest_client.patch(metadatas_url, json=payload)
        assert res.status_code == 200
        body = res.json()
        assert body["code"] == 0, body
        assert body["data"] == expected_data, body
|
||||
|
||||
|
||||
@pytest.mark.p2
def test_document_metadata_config_contract(rest_client, create_document):
    """Contract checks for ``PUT /datasets/{id}/documents/{id}/metadata/config``.

    The scenarios run in this order:

    1. A missing or invalid token is rejected with HTTP 401 and code 401.
    2. An empty JSON body returns code 101 ("metadata is required").
    3. An unknown dataset id returns code 102 ("You don't own the dataset.").
    4. An unknown document id returns code 102 (document not found in dataset).
    5. A valid body returns code 0 and the submitted metadata is echoed back
       under ``data.parser_config.metadata``.
    """
    dataset_id, document_id = create_document("document_metadata_config_contract.txt")
    config_url = f"/datasets/{dataset_id}/documents/{document_id}/metadata/config"

    # Both unauthenticated clients are created up front, before any request.
    unauthorized_clients = {
        "missing token": RestClient(token=None),
        "invalid token": RestClient(token=INVALID_API_TOKEN),
    }
    for scenario_name, client in unauthorized_clients.items():
        response = client.put(config_url, json={"metadata": {"author": "alice"}})
        assert response.status_code == 401, (scenario_name, response.text)
        body = response.json()
        assert body["code"] == 401, (scenario_name, body)
        assert body["message"] == "<Unauthorized '401: Unauthorized'>", (scenario_name, body)

    # From here on, failures are reported as HTTP 200 with a non-zero "code".
    empty_body_response = rest_client.put(config_url, json={})
    assert empty_body_response.status_code == 200
    empty_body_result = empty_body_response.json()
    assert empty_body_result["code"] == 101, empty_body_result
    assert empty_body_result["message"] == "metadata is required", empty_body_result

    unknown_dataset_url = f"/datasets/{INVALID_ID_32}/documents/{document_id}/metadata/config"
    unknown_dataset_response = rest_client.put(
        unknown_dataset_url,
        json={"metadata": {"author": "alice"}},
    )
    assert unknown_dataset_response.status_code == 200
    unknown_dataset_result = unknown_dataset_response.json()
    assert unknown_dataset_result["code"] == 102, unknown_dataset_result
    assert unknown_dataset_result["message"] == "You don't own the dataset.", unknown_dataset_result

    unknown_document_url = f"/datasets/{dataset_id}/documents/{INVALID_ID_32}/metadata/config"
    unknown_document_response = rest_client.put(
        unknown_document_url,
        json={"metadata": {"author": "alice"}},
    )
    assert unknown_document_response.status_code == 200
    unknown_document_result = unknown_document_response.json()
    assert unknown_document_result["code"] == 102, unknown_document_result
    expected_not_found = f"Document {INVALID_ID_32} not found in dataset {dataset_id}"
    assert unknown_document_result["message"] == expected_not_found, unknown_document_result

    # Happy path: the stored parser_config must carry exactly what was sent.
    expected_metadata = {"author": "alice", "tags": ["one", "two"]}
    success_response = rest_client.put(config_url, json={"metadata": expected_metadata})
    assert success_response.status_code == 200
    success_result = success_response.json()
    assert success_result["code"] == 0, success_result
    stored_config = success_result["data"]["parser_config"]
    assert stored_config["metadata"] == expected_metadata, success_result
|
||||
|
||||
|
||||
@pytest.mark.p2
|
||||
def test_documents_metadata_update_path(rest_client, create_document):
|
||||
dataset_id, document_id = create_document("metadata_target.txt")
|
||||
|
||||
@@ -275,6 +275,45 @@ def test_retrieval_vector_similarity_and_top_k_contract(rest_client, ensure_pars
|
||||
assert expected_message in body["message"], (scenario_name, body)
|
||||
|
||||
|
||||
@pytest.mark.p2
def test_retrieval_document_ids_and_metadata_condition_contract(rest_client, ensure_parsed_document):
    """Contract checks for ``document_ids`` and ``metadata_condition`` on ``POST /retrieval``.

    The scenarios run in this order:

    1. ``document_ids`` given as a plain string returns code 102
       ("`documents` should be a list").
    2. ``document_ids`` naming a document outside the dataset returns code 102.
    3. A ``metadata_condition`` requiring ``author`` to be ``"missing"`` returns
       code 0 and an empty ``chunks`` list.
    """
    dataset_id, _document_id = ensure_parsed_document()

    # Each entry pairs a rejected ``document_ids`` value with the expected message.
    rejected_document_ids = (
        ("bad", "`documents` should be a list"),
        (["not-owned"], "The datasets don't own the document not-owned"),
    )
    for document_ids, expected_message in rejected_document_ids:
        response = rest_client.post(
            "/retrieval",
            json={"question": "chunk", "dataset_ids": [dataset_id], "document_ids": document_ids},
        )
        assert response.status_code == 200
        result = response.json()
        assert result["code"] == 102, result
        assert result["message"] == expected_message, result

    # The filter below is expected to match no document, so no chunks come back.
    author_is_missing = {
        "logic": "and",
        "conditions": [{"name": "author", "comparison_operator": "is", "value": "missing"}],
    }
    filtered_response = rest_client.post(
        "/retrieval",
        json={
            "question": "chunk",
            "dataset_ids": [dataset_id],
            "metadata_condition": author_is_missing,
        },
    )
    assert filtered_response.status_code == 200
    filtered_result = filtered_response.json()
    assert filtered_result["code"] == 0, filtered_result
    assert filtered_result["data"]["chunks"] == [], filtered_result
|
||||
|
||||
|
||||
@pytest.mark.p2
|
||||
def test_retrieval_rerank_unknown_contract(rest_client, ensure_parsed_document):
|
||||
dataset_id, _ = ensure_parsed_document()
|
||||
|
||||
Reference in New Issue
Block a user