mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 23:41:12 +08:00
332 lines
10 KiB
Python
332 lines
10 KiB
Python
#
|
|
# Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
|
|
"""Unit tests for api.utils.validation_utils and api.utils.pagination_utils."""
|
|
|
|
from unittest.mock import Mock
|
|
|
|
import pytest
|
|
from pydantic import ValidationError
|
|
|
|
from api.utils.pagination_utils import REST_API_MAX_PAGE_SIZE, validate_rest_api_page_size
|
|
from api.utils.validation_utils import (
|
|
ListDatasetReq,
|
|
ListFileReq,
|
|
ParserConfig,
|
|
UpdateDocumentReq,
|
|
validate_chunk_method,
|
|
validate_document_name,
|
|
validate_immutable_fields,
|
|
)
|
|
from api.constants import FILE_NAME_LEN_LIMIT
|
|
from api.db import FileType
|
|
from common.constants import RetCode
|
|
|
|
|
|
def test_rest_api_page_size_rejects_values_above_100():
    """Check the page-size ceiling on the helper and on both list request models.

    A page size equal to ``REST_API_MAX_PAGE_SIZE`` is returned unchanged, while
    one above it raises ``ValueError`` from the helper and ``ValidationError``
    from ``ListDatasetReq`` and ``ListFileReq``.
    """
    over_limit = REST_API_MAX_PAGE_SIZE + 1
    expected_error = "page_size must be less than or equal to 100"

    # The maximum itself is still a legal value.
    assert validate_rest_api_page_size(REST_API_MAX_PAGE_SIZE) == REST_API_MAX_PAGE_SIZE

    with pytest.raises(ValueError, match=expected_error):
        validate_rest_api_page_size(over_limit)

    # Both request models must surface the same message.
    for request_model in (ListDatasetReq, ListFileReq):
        with pytest.raises(ValidationError, match=expected_error):
            request_model(page_size=over_limit)
|
|
|
|
|
|
def test_validate_immutable_fields_no_changes():
    """An update request with no fields set yields no error message or code."""
    stored_doc = Mock(chunk_num=10, token_num=100, progress=0.5)
    request = UpdateDocumentReq()

    message, code = validate_immutable_fields(request, stored_doc)

    assert message is None
    assert code is None
|
|
|
|
|
|
def test_validate_immutable_fields_chunk_count_matches():
    """A ``chunk_count`` equal to the document's ``chunk_num`` is accepted."""
    stored_doc = Mock(chunk_num=10, token_num=100, progress=0.5)
    request = UpdateDocumentReq(chunk_count=10)

    message, code = validate_immutable_fields(request, stored_doc)

    assert message is None
    assert code is None
|
|
|
|
|
|
def test_validate_immutable_fields_token_count_matches():
    """A ``token_count`` equal to the document's ``token_num`` is accepted."""
    stored_doc = Mock(chunk_num=10, token_num=100, progress=0.5)
    request = UpdateDocumentReq(token_count=100)

    message, code = validate_immutable_fields(request, stored_doc)

    assert message is None
    assert code is None
|
|
|
|
|
|
def test_validate_immutable_fields_progress_matches():
    """A ``progress`` equal to the document's ``progress`` is accepted."""
    stored_doc = Mock(chunk_num=10, token_num=100, progress=0.5)
    request = UpdateDocumentReq(progress=0.5)

    message, code = validate_immutable_fields(request, stored_doc)

    assert message is None
    assert code is None
|
|
|
|
|
|
def test_validate_immutable_fields_chunk_count_mismatch():
    """A ``chunk_count`` differing from the document's ``chunk_num`` is rejected."""
    stored_doc = Mock(chunk_num=10, token_num=100, progress=0.5)
    request = UpdateDocumentReq(chunk_count=15)  # stored value is 10

    message, code = validate_immutable_fields(request, stored_doc)

    assert message == "Can't change `chunk_count`."
    assert code == RetCode.DATA_ERROR
|
|
|
|
|
|
def test_validate_immutable_fields_token_count_mismatch():
    """A ``token_count`` differing from the document's ``token_num`` is rejected."""
    stored_doc = Mock(chunk_num=10, token_num=100, progress=0.5)
    request = UpdateDocumentReq(token_count=150)  # stored value is 100

    message, code = validate_immutable_fields(request, stored_doc)

    assert message == "Can't change `token_count`."
    assert code == RetCode.DATA_ERROR
|
|
|
|
|
|
def test_validate_immutable_fields_progress_mismatch():
    """A ``progress`` differing from the document's ``progress`` is rejected."""
    stored_doc = Mock(chunk_num=10, token_num=100, progress=0.5)
    request = UpdateDocumentReq(progress=0.75)  # stored value is 0.5

    message, code = validate_immutable_fields(request, stored_doc)

    assert message == "Can't change `progress`."
    assert code == RetCode.DATA_ERROR
|
|
|
|
|
|
def test_validate_immutable_fields_progress_boundary_values():
    """Matching ``progress`` values of 0.0 and 1.0 are both accepted."""
    # Same scenario for each boundary, lower bound first.
    for boundary in (0.0, 1.0):
        stored_doc = Mock(chunk_num=10, token_num=100, progress=boundary)
        request = UpdateDocumentReq(progress=boundary)

        message, code = validate_immutable_fields(request, stored_doc)

        assert message is None
        assert code is None
|
|
|
|
|
|
def test_validate_immutable_fields_none_values():
    """Explicit ``None`` for every immutable field yields no error."""
    stored_doc = Mock(chunk_num=10, token_num=100, progress=0.5)
    request = UpdateDocumentReq(chunk_count=None, token_count=None, progress=None)

    message, code = validate_immutable_fields(request, stored_doc)

    assert message is None
    assert code is None
|
|
|
|
|
|
def test_validate_document_name_valid():
    """Renaming to a new name with the same extension and no clash is accepted."""
    existing = Mock()
    # ``name`` is special in the Mock constructor, so set it via configure_mock.
    existing.configure_mock(name="old_document.pdf")

    message, code = validate_document_name("new_document.pdf", existing, [])

    assert message is None
    assert code is None
|
|
|
|
def test_validate_document_name_attr_error():
    """A non-string name (here the int ``0``) is reported as an exception error.

    The expected message is the repr of the ``AttributeError`` for the
    missing ``encode`` attribute on the supplied value's type.
    """
    req_doc_name = 0
    existing = Mock()
    # ``name`` is special in the Mock constructor, so set it via configure_mock.
    existing.configure_mock(name="old_document.pdf")

    message, code = validate_document_name(req_doc_name, existing, [])

    assert message == f"AttributeError('{type(req_doc_name).__name__}' object has no attribute 'encode')"
    assert code == RetCode.EXCEPTION_ERROR
|
|
|
|
|
|
def test_validate_document_name_exceeds_byte_limit():
    """A name one character longer than ``FILE_NAME_LEN_LIMIT`` is rejected."""
    oversized_name = "a" * (FILE_NAME_LEN_LIMIT + 1)
    existing = Mock()
    # ``name`` is special in the Mock constructor, so set it via configure_mock.
    existing.configure_mock(name="old_document.pdf")

    message, code = validate_document_name(oversized_name, existing, [])

    assert f"File name must be {FILE_NAME_LEN_LIMIT} bytes or less." in message
    assert code == RetCode.ARGUMENT_ERROR
|
|
|
|
|
|
def test_validate_document_name_different_extension():
    """Changing the extension from ``.pdf`` to ``.docx`` is rejected."""
    existing = Mock()
    # ``name`` is special in the Mock constructor, so set it via configure_mock.
    existing.configure_mock(name="old_document.pdf")

    message, code = validate_document_name("new_document.docx", existing, [])

    assert "The extension of file can't be changed" in message
    assert code == RetCode.ARGUMENT_ERROR
|
|
|
|
|
|
def test_validate_document_name_duplicate():
    """A requested name that already appears in ``docs_from_name`` is rejected."""
    existing = Mock()
    existing.configure_mock(name="original.pdf")

    # A second document that already carries the requested name.
    clashing = Mock()
    clashing.configure_mock(name="duplicate.pdf")

    message, code = validate_document_name("duplicate.pdf", existing, [clashing])

    assert "Duplicated document name in the same dataset." in message
    assert code == RetCode.DATA_ERROR
|
|
|
|
|
|
def test_validate_document_name_case_insensitive_extension():
    """An extension differing only in case (``.PDF`` vs ``.pdf``) is accepted."""
    existing = Mock()
    # ``name`` is special in the Mock constructor, so set it via configure_mock.
    existing.configure_mock(name="old_document.pdf")

    message, code = validate_document_name("new_document.PDF", existing, [])

    assert message is None
    assert code is None
|
|
|
|
|
|
def test_validate_chunk_method_valid():
    """A ``FileType.PDF`` document named ``document.pdf`` passes validation."""
    candidate = Mock()
    candidate.configure_mock(type=FileType.PDF, name="document.pdf")

    message, code = validate_chunk_method(candidate)

    assert message is None
    assert code is None
|
|
|
|
|
|
def test_validate_chunk_method_visual_not_supported():
    """A ``FileType.VISUAL`` document is reported as not supported."""
    candidate = Mock()
    candidate.configure_mock(type=FileType.VISUAL, name="image.jpg")

    message, code = validate_chunk_method(candidate)

    assert "Not supported yet!" in message
    assert code == RetCode.DATA_ERROR
|
|
|
|
|
|
def test_validate_chunk_method_ppt_not_supported():
    """A document whose name ends in ``.ppt`` is reported as not supported."""
    candidate = Mock()
    candidate.configure_mock(type=FileType.PDF, name="presentation.ppt")

    message, code = validate_chunk_method(candidate)

    assert "Not supported yet!" in message
    assert code == RetCode.DATA_ERROR
|
|
|
|
|
|
def test_validate_chunk_method_pptx_not_supported():
    """A document whose name ends in ``.pptx`` is reported as not supported."""
    candidate = Mock()
    candidate.configure_mock(type=FileType.PDF, name="presentation.pptx")

    message, code = validate_chunk_method(candidate)

    assert "Not supported yet!" in message
    assert code == RetCode.DATA_ERROR
|
|
|
|
|
|
def test_validate_chunk_method_pages_not_supported():
    """A document whose name ends in ``.pages`` is reported as not supported."""
    candidate = Mock()
    candidate.configure_mock(type=FileType.PDF, name="document.pages")

    message, code = validate_chunk_method(candidate)

    assert "Not supported yet!" in message
    assert code == RetCode.DATA_ERROR
|
|
|
|
|
|
def test_validate_chunk_method_other_extensions_still_valid():
    """A document named ``document.docx`` still passes validation."""
    candidate = Mock()
    candidate.configure_mock(type=FileType.PDF, name="document.docx")

    message, code = validate_chunk_method(candidate)

    assert message is None
    assert code is None
|
|
|
|
|
|
def test_parser_config_normalizes_legacy_vectorize_table_column_role():
    """``ParserConfig`` rewrites the ``"vectorize"`` role to ``"indexing"``.

    The ``"metadata"`` and ``"both"`` roles are expected to come back unchanged.
    """
    supplied_roles = {"title": "vectorize", "country": "metadata", "x": "both"}
    normalized_roles = {
        "title": "indexing",
        "country": "metadata",
        "x": "both",
    }

    config = ParserConfig(table_column_roles=supplied_roles)

    assert config.table_column_roles == normalized_roles
|