fix(connectors): enforce WebDAV numeric string size limits (#15731)

## Summary
- Normalize WebDAV file-size metadata before applying the sync size
threshold.
- Enforce the same threshold for numeric string sizes in both document
sync and slim snapshot paths.
- Add focused WebDAV unit coverage for size parsing and over-threshold
skips.

## Why
Some WebDAV servers return file sizes from PROPFIND metadata as strings.
The previous threshold check only handled integer values, so oversized
files could still be downloaded and sent into the chunking pipeline.

Closes #15724.

## Validation
- `uv run --no-project --with pytest --with pytest-asyncio pytest
test/unit_test/data_source/test_webdav_connector_unit.py -q`
- `uvx ruff check common/data_source/webdav_connector.py
test/unit_test/data_source/test_webdav_connector_unit.py`
- `python -m compileall -q common/data_source/webdav_connector.py
test/unit_test/data_source/test_webdav_connector_unit.py`
- `git diff --check`

---------

Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
oktofeesh
2026-06-11 00:47:54 -07:00
committed by GitHub
parent b978e26208
commit c15b2b3f66
2 changed files with 378 additions and 6 deletions

View File

@@ -3,6 +3,7 @@ import logging
import os
from datetime import datetime, timezone
from typing import Any, Optional
from urllib.parse import urlsplit
from webdav4.client import Client as WebDAVClient
@@ -60,6 +61,49 @@ class WebDAVConnector(LoadConnector, PollConnector, SlimConnectorWithPermSync):
file_ext = get_file_ext(file_name)
return is_accepted_file_ext(file_ext, self._build_extension_type())
@staticmethod
def _coerce_size_bytes(size_bytes: Any) -> int | None:
if isinstance(size_bytes, bool):
return None
if isinstance(size_bytes, int):
return size_bytes if size_bytes >= 0 else None
if isinstance(size_bytes, str):
size_text = size_bytes.strip()
if not size_text or len(size_text) > 20 or not size_text.isdecimal():
return None
parsed_size = int(size_text)
return parsed_size if parsed_size >= 0 else None
return None
@classmethod
def _get_size_bytes(cls, file_info: dict[str, Any]) -> int | None:
# webdav4's Client.ls(detail=True) reports the size under "content_length"
# (see webdav4.multistatus.Response.as_dict); other servers/libraries or
# webdav4's fsspec wrapper may instead use "size" or the raw
# "getcontentlength" property. Try each so the size guard isn't silently
# skipped — otherwise file_info.get("size") is always None and every file
# trips the missing-metadata warning.
for key in ("size", "content_length", "getcontentlength"):
if key not in file_info:
continue
size_bytes = cls._coerce_size_bytes(file_info[key])
if size_bytes is not None:
return size_bytes
return None
@staticmethod
def _get_log_file_identifier(file_info: dict[str, Any], fallback_path: str) -> str:
raw_identifier = str(file_info.get("name") or file_info.get("href") or fallback_path)
try:
parsed_identifier = urlsplit(raw_identifier)
identifier_path = parsed_identifier.path if parsed_identifier.scheme else raw_identifier
except ValueError:
identifier_path = fallback_path if "://" not in fallback_path else ""
identifier_path = identifier_path.split("?", 1)[0].split("#", 1)[0]
fallback_identifier = "" if "://" in fallback_path else os.path.basename(fallback_path.rstrip("/"))
identifier = os.path.basename(identifier_path.rstrip("/")) or fallback_identifier or "<unknown>"
return identifier.encode("unicode_escape").decode("ascii")
def set_allow_images(self, allow_images: bool) -> None:
"""Set whether to process images"""
logging.info(f"Setting allow_images to {allow_images}.")
@@ -228,14 +272,23 @@ class WebDAVConnector(LoadConnector, PollConnector, SlimConnectorWithPermSync):
logging.debug(f"Skipping file {file_path} due to unsupported extension.")
continue
size_bytes = file_info.get('size', 0)
size_bytes = self._get_size_bytes(file_info)
if self.size_threshold is not None and size_bytes is None:
file_identifier = self._get_log_file_identifier(file_info, file_path)
logging.warning(
f"{file_identifier}: size metadata missing from WebDAV server response, "
f"skipping to avoid processing potentially large files."
)
continue
if (
self.size_threshold is not None
and isinstance(size_bytes, int)
and size_bytes is not None
and size_bytes > self.size_threshold
):
file_identifier = self._get_log_file_identifier(file_info, file_path)
logging.warning(
f"{file_name} exceeds size threshold of {self.size_threshold}. Skipping."
f"{file_identifier} exceeds size threshold of {self.size_threshold} "
f"(size_bytes={size_bytes}). Skipping."
)
continue
@@ -289,7 +342,7 @@ class WebDAVConnector(LoadConnector, PollConnector, SlimConnectorWithPermSync):
semantic_identifier=semantic_id,
extension=get_file_ext(file_name),
doc_updated_at=modified,
size_bytes=size_bytes if size_bytes else 0
size_bytes=size_bytes if size_bytes is not None else 0
)
)
@@ -367,12 +420,24 @@ class WebDAVConnector(LoadConnector, PollConnector, SlimConnectorWithPermSync):
file_name = os.path.basename(file_path)
if not self._is_supported_file(file_name):
continue
size_bytes = file_info.get("size", 0)
size_bytes = self._get_size_bytes(file_info)
if self.size_threshold is not None and size_bytes is None:
file_identifier = self._get_log_file_identifier(file_info, file_path)
logging.warning(
f"{file_identifier}: size metadata missing from WebDAV server response, "
f"skipping to avoid processing potentially large files."
)
continue
if (
self.size_threshold is not None
and isinstance(size_bytes, int)
and size_bytes is not None
and size_bytes > self.size_threshold
):
file_identifier = self._get_log_file_identifier(file_info, file_path)
logging.warning(
f"{file_identifier} exceeds size threshold of {self.size_threshold} "
f"(size_bytes={size_bytes}). Skipping."
)
continue
batch.append(
SlimDocument(id=f"webdav:{self.base_url}:{file_path}")

View File

@@ -0,0 +1,307 @@
#
# Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import importlib.util
import logging
import sys
from dataclasses import dataclass
from datetime import datetime, timezone
from enum import IntFlag, auto
from pathlib import Path
from types import ModuleType
import pytest
@dataclass
class _Document:
id: str
blob: bytes
source: str
semantic_identifier: str
extension: str
doc_updated_at: datetime
size_bytes: int
@dataclass
class _SlimDocument:
id: str
class _DocumentSource:
WEBDAV = "webdav"
class _OnyxExtensionType(IntFlag):
Plain = auto()
Document = auto()
Multimedia = auto()
class _LoadConnector:
pass
class _PollConnector:
pass
class _SlimConnectorWithPermSync:
pass
def _install_dependency_stubs():
config_module = ModuleType("common.data_source.config")
config_module.DocumentSource = _DocumentSource
config_module.INDEX_BATCH_SIZE = 10
config_module.BLOB_STORAGE_SIZE_THRESHOLD = 20
exceptions_module = ModuleType("common.data_source.exceptions")
for name in (
"ConnectorMissingCredentialError",
"ConnectorValidationError",
"CredentialExpiredError",
"InsufficientPermissionsError",
):
setattr(exceptions_module, name, type(name, (Exception,), {}))
interfaces_module = ModuleType("common.data_source.interfaces")
interfaces_module.LoadConnector = _LoadConnector
interfaces_module.PollConnector = _PollConnector
interfaces_module.SlimConnectorWithPermSync = _SlimConnectorWithPermSync
interfaces_module.OnyxExtensionType = _OnyxExtensionType
interfaces_module.GenerateDocumentsOutput = object
interfaces_module.GenerateSlimDocumentOutput = object
interfaces_module.SecondsSinceUnixEpoch = int
models_module = ModuleType("common.data_source.models")
models_module.Document = _Document
models_module.SlimDocument = _SlimDocument
models_module.GenerateDocumentsOutput = object
models_module.GenerateSlimDocumentOutput = object
models_module.SecondsSinceUnixEpoch = int
utils_module = ModuleType("common.data_source.utils")
utils_module.get_file_ext = lambda file_name: Path(file_name).suffix.lstrip(".").lower()
utils_module.is_accepted_file_ext = lambda file_ext, extension_type: bool(file_ext)
webdav4_module = ModuleType("webdav4")
webdav4_client_module = ModuleType("webdav4.client")
webdav4_client_module.Client = object
sys.modules["common.data_source.config"] = config_module
sys.modules["common.data_source.exceptions"] = exceptions_module
sys.modules["common.data_source.interfaces"] = interfaces_module
sys.modules["common.data_source.models"] = models_module
sys.modules["common.data_source.utils"] = utils_module
sys.modules["webdav4"] = webdav4_module
sys.modules["webdav4.client"] = webdav4_client_module
def _load_webdav_connector_module():
"""Load webdav_connector.py in isolation (avoid the package __init__)."""
repo_root = Path(__file__).resolve().parents[3]
package_name = "common.data_source"
saved_modules = {name: module for name, module in sys.modules.items() if name == package_name or name.startswith(f"{package_name}.")}
saved_webdav_modules = {name: sys.modules[name] for name in ("webdav4", "webdav4.client") if name in sys.modules}
package_stub = ModuleType(package_name)
package_stub.__path__ = [str(repo_root / "common" / "data_source")]
sys.modules[package_name] = package_stub
_install_dependency_stubs()
try:
spec = importlib.util.spec_from_file_location(
"_webdav_connector_under_test",
repo_root / "common" / "data_source" / "webdav_connector.py",
)
module = importlib.util.module_from_spec(spec)
assert spec.loader is not None
spec.loader.exec_module(module)
return module
finally:
for name in list(sys.modules):
if name == package_name or name.startswith(f"{package_name}."):
if name in saved_modules:
sys.modules[name] = saved_modules[name]
else:
sys.modules.pop(name, None)
for name in ("webdav4", "webdav4.client"):
if name in saved_webdav_modules:
sys.modules[name] = saved_webdav_modules[name]
else:
sys.modules.pop(name, None)
webdav_connector = _load_webdav_connector_module()
WebDAVConnector = webdav_connector.WebDAVConnector
class _FakeClient:
def __init__(self):
self.downloaded_paths = []
def download_fileobj(self, path, buffer):
self.downloaded_paths.append(path)
buffer.write(b"hello webdav")
def _stub_files(files):
def _list_files_recursive(*args, **kwargs):
return files
return _list_files_recursive
@pytest.mark.p2
def test_get_size_bytes_accepts_integer_strings():
assert WebDAVConnector._get_size_bytes({"size": 128}) == 128
assert WebDAVConnector._get_size_bytes({"size": "128"}) == 128
assert WebDAVConnector._get_size_bytes({"size": " 128 "}) == 128
assert WebDAVConnector._get_size_bytes({"size": 0}) == 0
@pytest.mark.p2
def test_get_size_bytes_ignores_unknown_values():
assert WebDAVConnector._get_size_bytes({}) is None
assert WebDAVConnector._get_size_bytes({"size": ""}) is None
assert WebDAVConnector._get_size_bytes({"size": "unknown"}) is None
assert WebDAVConnector._get_size_bytes({"size": "-1"}) is None
assert WebDAVConnector._get_size_bytes({"size": -1}) is None
assert WebDAVConnector._get_size_bytes({"size": True}) is None
assert WebDAVConnector._get_size_bytes({"size": "1" * 21}) is None
@pytest.mark.p2
def test_get_size_bytes_reads_webdav4_content_length():
# webdav4's Client.ls(detail=True) exposes size as "content_length", not "size".
assert WebDAVConnector._get_size_bytes({"content_length": 128}) == 128
assert WebDAVConnector._get_size_bytes({"content_length": "128"}) == 128
assert WebDAVConnector._get_size_bytes({"content_length": 0}) == 0
assert WebDAVConnector._get_size_bytes({"getcontentlength": "128"}) == 128
@pytest.mark.p2
def test_get_size_bytes_falls_back_across_keys():
# When the preferred key is absent/invalid, fall back to the next valid key.
assert WebDAVConnector._get_size_bytes({"content_length": "256"}) == 256
assert WebDAVConnector._get_size_bytes({"size": None, "content_length": 256}) == 256
assert WebDAVConnector._get_size_bytes({"size": "bad", "content_length": 256}) == 256
assert WebDAVConnector._get_size_bytes({"content_length": None, "getcontentlength": "256"}) == 256
@pytest.mark.p1
def test_yield_webdav_documents_skips_numeric_string_sizes_over_threshold(caplog):
connector = WebDAVConnector("https://webdav.example", batch_size=10)
connector.client = _FakeClient()
connector.size_threshold = 20
modified = datetime(2026, 1, 1, tzinfo=timezone.utc)
caplog.set_level(logging.WARNING)
connector._list_files_recursive = _stub_files(
[
("/large.txt", {"size": "21", "modified": modified}),
("/small.txt", {"size": "20", "modified": modified}),
]
)
batches = list(
connector._yield_webdav_documents(
datetime(2025, 1, 1, tzinfo=timezone.utc),
datetime(2026, 2, 1, tzinfo=timezone.utc),
)
)
assert [doc.id for batch in batches for doc in batch] == [
"webdav:https://webdav.example:/small.txt",
]
assert connector.client.downloaded_paths == ["/small.txt"]
assert batches[0][0].size_bytes == 20
assert "large.txt exceeds size threshold of 20 (size_bytes=21). Skipping." in caplog.text
@pytest.mark.p1
def test_yield_webdav_documents_skips_missing_size_metadata(caplog):
connector = WebDAVConnector("https://webdav.example", batch_size=10)
connector.client = _FakeClient()
connector.size_threshold = 20
modified = datetime(2026, 1, 1, tzinfo=timezone.utc)
caplog.set_level(logging.WARNING)
connector._list_files_recursive = _stub_files(
[
("/missing.txt", {"modified": modified}),
("/small.txt", {"size": "20", "modified": modified}),
]
)
batches = list(
connector._yield_webdav_documents(
datetime(2025, 1, 1, tzinfo=timezone.utc),
datetime(2026, 2, 1, tzinfo=timezone.utc),
)
)
assert [doc.id for batch in batches for doc in batch] == [
"webdav:https://webdav.example:/small.txt",
]
assert connector.client.downloaded_paths == ["/small.txt"]
assert (
"missing.txt: size metadata missing from WebDAV server response, "
"skipping to avoid processing potentially large files."
) in caplog.text
@pytest.mark.p1
def test_retrieve_all_slim_docs_skips_numeric_string_sizes_over_threshold(caplog):
connector = WebDAVConnector("https://webdav.example", batch_size=10)
connector.client = object()
connector.size_threshold = 20
caplog.set_level(logging.WARNING)
connector._list_files_recursive = _stub_files(
[
("/large.txt", {"size": "21"}),
("/small.txt", {"size": "20"}),
]
)
batches = list(connector.retrieve_all_slim_docs_perm_sync())
assert [doc.id for batch in batches for doc in batch] == [
"webdav:https://webdav.example:/small.txt",
]
assert "large.txt exceeds size threshold of 20 (size_bytes=21). Skipping." in caplog.text
@pytest.mark.p1
def test_retrieve_all_slim_docs_skips_missing_size_metadata(caplog):
connector = WebDAVConnector("https://webdav.example", batch_size=10)
connector.client = object()
connector.size_threshold = 20
caplog.set_level(logging.WARNING)
connector._list_files_recursive = _stub_files(
[
("/missing.txt", {}),
("/small.txt", {"size": "20"}),
]
)
batches = list(connector.retrieve_all_slim_docs_perm_sync())
assert [doc.id for batch in batches for doc in batch] == [
"webdav:https://webdav.example:/small.txt",
]
assert (
"missing.txt: size metadata missing from WebDAV server response, "
"skipping to avoid processing potentially large files."
) in caplog.text