Files
ragflow/test/unit_test/api/utils/test_doc_validation.py
Wang Qi 0aff6a3f32 Feature: Allow page_size max value 100 (#15292)
Feature: Allow page_size max value 100
2026-05-28 11:13:01 +08:00

332 lines
10 KiB
Python

#
# Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Unit tests for the validation helpers in api.utils.validation_utils and api.utils.pagination_utils."""
from unittest.mock import Mock
import pytest
from pydantic import ValidationError
from api.utils.pagination_utils import REST_API_MAX_PAGE_SIZE, validate_rest_api_page_size
from api.utils.validation_utils import (
ListDatasetReq,
ListFileReq,
ParserConfig,
UpdateDocumentReq,
validate_chunk_method,
validate_document_name,
validate_immutable_fields,
)
from api.constants import FILE_NAME_LEN_LIMIT
from api.db import FileType
from common.constants import RetCode
def test_rest_api_page_size_rejects_values_above_100():
    """Accept the maximum REST page size and reject one above it.

    ``validate_rest_api_page_size`` must hand back the limit unchanged, while
    ``limit + 1`` must be rejected both by that helper (``ValueError``) and by
    the ``ListDatasetReq`` / ``ListFileReq`` models (``ValidationError``).
    """
    expected_message = "page_size must be less than or equal to 100"

    # Boundary: exactly the maximum is allowed and comes back as given.
    assert validate_rest_api_page_size(REST_API_MAX_PAGE_SIZE) == REST_API_MAX_PAGE_SIZE

    too_large = REST_API_MAX_PAGE_SIZE + 1

    with pytest.raises(ValueError, match=expected_message):
        validate_rest_api_page_size(too_large)

    # Same order as before: dataset listing first, then file listing.
    for request_model in (ListDatasetReq, ListFileReq):
        with pytest.raises(ValidationError, match=expected_message):
            request_model(page_size=too_large)
def test_validate_immutable_fields_no_changes():
    """An empty update request yields no error message and no error code."""
    request = UpdateDocumentReq()
    stored_doc = Mock(chunk_num=10, token_num=100, progress=0.5)

    message, code = validate_immutable_fields(request, stored_doc)

    assert message is None
    assert code is None
def test_validate_immutable_fields_chunk_count_matches():
    """A ``chunk_count`` equal to the document's ``chunk_num`` is accepted."""
    request = UpdateDocumentReq(chunk_count=10)
    stored_doc = Mock(chunk_num=10, token_num=100, progress=0.5)

    message, code = validate_immutable_fields(request, stored_doc)

    assert message is None
    assert code is None
def test_validate_immutable_fields_token_count_matches():
    """A ``token_count`` equal to the document's ``token_num`` is accepted."""
    request = UpdateDocumentReq(token_count=100)
    stored_doc = Mock(chunk_num=10, token_num=100, progress=0.5)

    message, code = validate_immutable_fields(request, stored_doc)

    assert message is None
    assert code is None
def test_validate_immutable_fields_progress_matches():
    """A ``progress`` equal to the document's own ``progress`` is accepted."""
    request = UpdateDocumentReq(progress=0.5)
    stored_doc = Mock(chunk_num=10, token_num=100, progress=0.5)

    message, code = validate_immutable_fields(request, stored_doc)

    assert message is None
    assert code is None
def test_validate_immutable_fields_chunk_count_mismatch():
    """A ``chunk_count`` differing from ``chunk_num`` is reported as a data error."""
    # Request asks for 15 chunks while the stored document has 10.
    request = UpdateDocumentReq(chunk_count=15)
    stored_doc = Mock(chunk_num=10, token_num=100, progress=0.5)

    message, code = validate_immutable_fields(request, stored_doc)

    assert message == "Can't change `chunk_count`."
    assert code == RetCode.DATA_ERROR
def test_validate_immutable_fields_token_count_mismatch():
    """A ``token_count`` differing from ``token_num`` is reported as a data error."""
    # Request asks for 150 tokens while the stored document has 100.
    request = UpdateDocumentReq(token_count=150)
    stored_doc = Mock(chunk_num=10, token_num=100, progress=0.5)

    message, code = validate_immutable_fields(request, stored_doc)

    assert message == "Can't change `token_count`."
    assert code == RetCode.DATA_ERROR
def test_validate_immutable_fields_progress_mismatch():
    """A ``progress`` differing from the document's value is reported as a data error."""
    # Request asks for 0.75 while the stored document is at 0.5.
    request = UpdateDocumentReq(progress=0.75)
    stored_doc = Mock(chunk_num=10, token_num=100, progress=0.5)

    message, code = validate_immutable_fields(request, stored_doc)

    assert message == "Can't change `progress`."
    assert code == RetCode.DATA_ERROR
def test_validate_immutable_fields_progress_boundary_values():
    """Matching ``progress`` is accepted at the boundary values 0.0 and 1.0."""
    # Checked in the same order as before: 0.0 first, then 1.0.
    for boundary in (0.0, 1.0):
        request = UpdateDocumentReq(progress=boundary)
        stored_doc = Mock(chunk_num=10, token_num=100, progress=boundary)

        message, code = validate_immutable_fields(request, stored_doc)

        assert message is None
        assert code is None
def test_validate_immutable_fields_none_values():
    """Immutable fields explicitly passed as ``None`` produce no error."""
    request = UpdateDocumentReq(
        chunk_count=None,
        token_count=None,
        progress=None,
    )
    stored_doc = Mock(chunk_num=10, token_num=100, progress=0.5)

    message, code = validate_immutable_fields(request, stored_doc)

    assert message is None
    assert code is None
def test_validate_document_name_valid():
    """A new ``.pdf`` name with no same-named documents yields no error."""
    current_doc = Mock()
    # Assigned after construction: Mock(name=...) would name the mock itself.
    current_doc.name = "old_document.pdf"

    message, code = validate_document_name("new_document.pdf", current_doc, [])

    assert message is None
    assert code is None
def test_validate_document_name_attr_error():
    """A non-string name (here the int ``0``) is reported as an exception error.

    The expected message embeds the offending value's type name together with
    the missing ``encode`` attribute.
    """
    req_doc_name = 0
    current_doc = Mock()
    current_doc.name = "old_document.pdf"

    message, code = validate_document_name(req_doc_name, current_doc, [])

    assert message == f"AttributeError('{type(req_doc_name).__name__}' object has no attribute 'encode')"
    assert code == RetCode.EXCEPTION_ERROR
def test_validate_document_name_exceeds_byte_limit():
    """A name one character longer than ``FILE_NAME_LEN_LIMIT`` is an argument error."""
    current_doc = Mock()
    current_doc.name = "old_document.pdf"
    # ASCII "a" repeated limit + 1 times.
    oversized_name = "a" * (FILE_NAME_LEN_LIMIT + 1)

    message, code = validate_document_name(oversized_name, current_doc, [])

    assert f"File name must be {FILE_NAME_LEN_LIMIT} bytes or less." in message
    assert code == RetCode.ARGUMENT_ERROR
def test_validate_document_name_different_extension():
    """Renaming a ``.pdf`` document to a ``.docx`` name is an argument error."""
    current_doc = Mock()
    current_doc.name = "old_document.pdf"

    message, code = validate_document_name("new_document.docx", current_doc, [])

    assert "The extension of file can't be changed" in message
    assert code == RetCode.ARGUMENT_ERROR
def test_validate_document_name_duplicate():
    """A requested name already carried by another document is a data error."""
    wanted_name = "duplicate.pdf"

    current_doc = Mock()
    current_doc.name = "original.pdf"

    # Stand-in for a document that already holds the requested name.
    clashing_doc = Mock()
    clashing_doc.name = "duplicate.pdf"

    message, code = validate_document_name(wanted_name, current_doc, [clashing_doc])

    assert "Duplicated document name in the same dataset." in message
    assert code == RetCode.DATA_ERROR
def test_validate_document_name_case_insensitive_extension():
    """An upper-case ``.PDF`` extension is accepted for a ``.pdf`` document."""
    current_doc = Mock()
    current_doc.name = "old_document.pdf"

    message, code = validate_document_name("new_document.PDF", current_doc, [])

    assert message is None
    assert code is None
def test_validate_chunk_method_valid():
    """A ``FileType.PDF`` document named ``document.pdf`` yields no error."""
    pdf_doc = Mock()
    # Assigned after construction: Mock(name=...) would name the mock itself.
    pdf_doc.name = "document.pdf"
    pdf_doc.type = FileType.PDF

    message, code = validate_chunk_method(pdf_doc)

    assert message is None
    assert code is None
def test_validate_chunk_method_visual_not_supported():
    """A ``FileType.VISUAL`` document is rejected with a data error."""
    image_doc = Mock()
    image_doc.name = "image.jpg"
    image_doc.type = FileType.VISUAL

    message, code = validate_chunk_method(image_doc)

    assert "Not supported yet!" in message
    assert code == RetCode.DATA_ERROR
def test_validate_chunk_method_ppt_not_supported():
    """A ``.ppt`` file name is rejected even when the type is ``FileType.PDF``."""
    slides_doc = Mock()
    slides_doc.name = "presentation.ppt"
    slides_doc.type = FileType.PDF

    message, code = validate_chunk_method(slides_doc)

    assert "Not supported yet!" in message
    assert code == RetCode.DATA_ERROR
def test_validate_chunk_method_pptx_not_supported():
    """A ``.pptx`` file name is rejected even when the type is ``FileType.PDF``."""
    slides_doc = Mock()
    slides_doc.name = "presentation.pptx"
    slides_doc.type = FileType.PDF

    message, code = validate_chunk_method(slides_doc)

    assert "Not supported yet!" in message
    assert code == RetCode.DATA_ERROR
def test_validate_chunk_method_pages_not_supported():
    """A ``.pages`` file name is rejected even when the type is ``FileType.PDF``."""
    pages_doc = Mock()
    pages_doc.name = "document.pages"
    pages_doc.type = FileType.PDF

    message, code = validate_chunk_method(pages_doc)

    assert "Not supported yet!" in message
    assert code == RetCode.DATA_ERROR
def test_validate_chunk_method_other_extensions_still_valid():
    """A ``.docx`` file name with type ``FileType.PDF`` yields no error."""
    word_doc = Mock()
    word_doc.name = "document.docx"
    word_doc.type = FileType.PDF

    message, code = validate_chunk_method(word_doc)

    assert message is None
    assert code is None
def test_parser_config_normalizes_legacy_vectorize_table_column_role():
    """The legacy ``vectorize`` column role is exposed as ``indexing``.

    The ``metadata`` and ``both`` roles must come through unchanged.
    """
    legacy_roles = {"title": "vectorize", "country": "metadata", "x": "both"}
    normalized_roles = {"title": "indexing", "country": "metadata", "x": "both"}

    config = ParserConfig(table_column_roles=legacy_roles)

    assert config.table_column_roles == normalized_roles