Files
ragflow/common/data_source/webdav_connector.py
oktofeesh c15b2b3f66 fix(connectors): enforce WebDAV numeric string size limits (#15731)
## Summary
- Normalize WebDAV file-size metadata before applying the sync size
threshold.
- Enforce the same threshold for numeric string sizes in both document
sync and slim snapshot paths.
- Add focused WebDAV unit coverage for size parsing and over-threshold
skips.

## Why
Some WebDAV servers return file sizes from PROPFIND metadata as strings.
The previous threshold check only handled integer values, so oversized
files could still be downloaded and sent into the chunking pipeline.

Closes #15724.

## Validation
- `uv run --no-project --with pytest --with pytest-asyncio pytest
test/unit_test/data_source/test_webdav_connector_unit.py -q`
- `uvx ruff check common/data_source/webdav_connector.py
test/unit_test/data_source/test_webdav_connector_unit.py`
- `python -m compileall -q common/data_source/webdav_connector.py
test/unit_test/data_source/test_webdav_connector_unit.py`
- `git diff --check`

---------

Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-11 15:47:54 +08:00

554 lines
22 KiB
Python

"""WebDAV connector"""
import logging
import os
from datetime import datetime, timezone
from typing import Any, Optional
from urllib.parse import urlsplit
from webdav4.client import Client as WebDAVClient
from common.data_source.utils import (
get_file_ext,
is_accepted_file_ext,
)
from common.data_source.config import DocumentSource, INDEX_BATCH_SIZE, BLOB_STORAGE_SIZE_THRESHOLD
from common.data_source.exceptions import (
ConnectorMissingCredentialError,
ConnectorValidationError,
CredentialExpiredError,
InsufficientPermissionsError
)
from common.data_source.interfaces import LoadConnector, OnyxExtensionType, PollConnector, SlimConnectorWithPermSync
from common.data_source.models import Document, GenerateDocumentsOutput, GenerateSlimDocumentOutput, SecondsSinceUnixEpoch, SlimDocument
class WebDAVConnector(LoadConnector, PollConnector, SlimConnectorWithPermSync):
    """WebDAV connector for syncing files from WebDAV servers.

    Files are discovered by recursively listing ``remote_path`` and are
    filtered by extension, by modification time (for document sync) and by
    ``size_threshold`` before being downloaded.
    """

    def __init__(
        self,
        base_url: str,
        remote_path: str = "/",
        batch_size: int = INDEX_BATCH_SIZE,
    ) -> None:
        """Initialize WebDAV connector

        Args:
            base_url: Base URL of the WebDAV server (e.g., "https://webdav.example.com")
            remote_path: Remote path to sync from (default: "/")
            batch_size: Number of documents per batch
        """
        self.base_url = base_url.rstrip("/")

        # Normalize to a path with a leading slash and no trailing slash.
        if not remote_path:
            remote_path = "/"
        if not remote_path.startswith("/"):
            remote_path = f"/{remote_path}"
        if remote_path.endswith("/") and remote_path != "/":
            # A path made only of slashes ("//") would strip down to "";
            # fall back to the root instead of storing an empty path.
            remote_path = remote_path.rstrip("/") or "/"

        self.remote_path = remote_path
        self.batch_size = batch_size
        # Created by load_credentials(); None until then.
        self.client: Optional[WebDAVClient] = None
        self._allow_images: bool | None = None
        # Files larger than this many bytes are skipped; None disables the guard.
        self.size_threshold: int | None = BLOB_STORAGE_SIZE_THRESHOLD

    def _build_extension_type(self) -> OnyxExtensionType:
        """Return the extension classes accepted by this connector.

        Plain and Document types are always accepted; Multimedia is added
        only when image processing has been enabled via set_allow_images().
        """
        extension_type = OnyxExtensionType.Plain | OnyxExtensionType.Document
        if bool(self._allow_images):
            extension_type |= OnyxExtensionType.Multimedia
        return extension_type

    def _is_supported_file(self, file_name: str) -> bool:
        """Return True when the extension of ``file_name`` is accepted."""
        file_ext = get_file_ext(file_name)
        return is_accepted_file_ext(file_ext, self._build_extension_type())

    @staticmethod
    def _coerce_size_bytes(size_bytes: Any) -> int | None:
        """Normalize a raw size value to a non-negative int.

        Args:
            size_bytes: Size as reported in listing metadata (int or numeric string).

        Returns:
            The size as an int, or None when the value is a bool, negative,
            empty, longer than 20 characters, non-decimal, or of another type.
        """
        # bool is a subclass of int; never treat True/False as a size.
        if isinstance(size_bytes, bool):
            return None
        if isinstance(size_bytes, int):
            return size_bytes if size_bytes >= 0 else None
        if isinstance(size_bytes, str):
            size_text = size_bytes.strip()
            # isdecimal() rejects signs, exponents and separators, so a string
            # that passes always parses to a non-negative integer.
            if not size_text or len(size_text) > 20 or not size_text.isdecimal():
                return None
            return int(size_text)
        return None

    @classmethod
    def _get_size_bytes(cls, file_info: dict[str, Any]) -> int | None:
        """Extract the file size from a listing entry.

        Returns:
            The first usable size found under "size", "content_length" or
            "getcontentlength", or None when none of them holds a valid size.
        """
        # webdav4's Client.ls(detail=True) reports the size under "content_length"
        # (see webdav4.multistatus.Response.as_dict); other servers/libraries or
        # webdav4's fsspec wrapper may instead use "size" or the raw
        # "getcontentlength" property. Try each so the size guard isn't silently
        # skipped.
        for key in ("size", "content_length", "getcontentlength"):
            if key not in file_info:
                continue
            size_bytes = cls._coerce_size_bytes(file_info[key])
            if size_bytes is not None:
                return size_bytes
        return None

    @staticmethod
    def _get_log_file_identifier(file_info: dict[str, Any], fallback_path: str) -> str:
        """Build a short, log-safe identifier (base name only) for a file.

        Args:
            file_info: Listing entry; its "name" or "href" is preferred.
            fallback_path: Path used when the entry has neither field.

        Returns:
            The ASCII-escaped base name, or "<unknown>" when none can be derived.
        """
        raw_identifier = str(file_info.get("name") or file_info.get("href") or fallback_path)
        try:
            parsed_identifier = urlsplit(raw_identifier)
            # For full URLs keep only the path so host/credentials never reach the log.
            identifier_path = parsed_identifier.path if parsed_identifier.scheme else raw_identifier
        except ValueError:
            identifier_path = fallback_path if "://" not in fallback_path else ""
        # Drop any query string or fragment.
        identifier_path = identifier_path.split("?", 1)[0].split("#", 1)[0]
        fallback_identifier = "" if "://" in fallback_path else os.path.basename(fallback_path.rstrip("/"))
        identifier = os.path.basename(identifier_path.rstrip("/")) or fallback_identifier or "<unknown>"
        # Escape non-ASCII and control characters so the name cannot break log lines.
        return identifier.encode("unicode_escape").decode("ascii")

    @staticmethod
    def _parse_modified_time(modified_time: Any, file_path: str) -> datetime:
        """Convert a listing entry's modification time to an aware datetime.

        Args:
            modified_time: A datetime, an HTTP-date string
                ('%a, %d %b %Y %H:%M:%S %Z'), an ISO-8601 string, or anything else.
            file_path: Path of the file, used only in the warning message.

        Returns:
            A timezone-aware datetime. Values without timezone information are
            taken as UTC; missing or unparseable values yield the current UTC time.
        """
        if isinstance(modified_time, datetime):
            modified = modified_time
        elif modified_time and isinstance(modified_time, str):
            try:
                modified = datetime.strptime(modified_time, '%a, %d %b %Y %H:%M:%S %Z')
            except (ValueError, TypeError):
                try:
                    modified = datetime.fromisoformat(modified_time.replace('Z', '+00:00'))
                except (ValueError, TypeError):
                    logging.warning(f"Could not parse modified time for {file_path}: {modified_time}")
                    return datetime.now(timezone.utc)
        else:
            return datetime.now(timezone.utc)

        # A naive value (strptime result, ISO string without offset, naive
        # datetime) cannot be compared with the aware sync window, so pin it to UTC.
        if modified.tzinfo is None:
            modified = modified.replace(tzinfo=timezone.utc)
        return modified

    def _should_skip_for_size(
        self,
        file_info: dict[str, Any],
        file_path: str,
        size_bytes: int | None,
    ) -> bool:
        """Decide whether a file must be skipped because of the size guard.

        Logs a warning and returns True when a threshold is configured and the
        size is either unknown or above the threshold; returns False otherwise.
        """
        if self.size_threshold is None:
            return False

        if size_bytes is None:
            file_identifier = self._get_log_file_identifier(file_info, file_path)
            logging.warning(
                f"{file_identifier}: size metadata missing from WebDAV server response, "
                f"skipping to avoid processing potentially large files."
            )
            return True

        if size_bytes > self.size_threshold:
            file_identifier = self._get_log_file_identifier(file_info, file_path)
            logging.warning(
                f"{file_identifier} exceeds size threshold of {self.size_threshold} "
                f"(size_bytes={size_bytes}). Skipping."
            )
            return True

        return False

    def set_allow_images(self, allow_images: bool) -> None:
        """Set whether to process images"""
        logging.info(f"Setting allow_images to {allow_images}.")
        self._allow_images = allow_images

    def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
        """Load credentials and initialize WebDAV client

        Args:
            credentials: Dictionary containing 'username' and 'password'

        Returns:
            None

        Raises:
            ConnectorMissingCredentialError: If required credentials are missing
                or the client cannot be created
        """
        logging.debug(f"Loading credentials for WebDAV server {self.base_url}")
        username = credentials.get("username")
        password = credentials.get("password")
        if not username or not password:
            raise ConnectorMissingCredentialError(
                "WebDAV requires 'username' and 'password' credentials"
            )
        try:
            # Initialize WebDAV client
            self.client = WebDAVClient(
                base_url=self.base_url,
                auth=(username, password)
            )
        except Exception as e:
            logging.error(f"Failed to connect to WebDAV server: {e}")
            raise ConnectorMissingCredentialError(
                f"Failed to authenticate with WebDAV server: {e}"
            ) from e
        return None

    def _list_files_recursive(
        self,
        path: str,
        start: datetime,
        end: datetime,
        *,
        filter_by_mtime: bool = True,
    ) -> list[tuple[str, dict]]:
        """Recursively list all files in the given path

        Listing errors are logged and swallowed, so a failing directory or
        entry does not abort the rest of the walk.

        Args:
            path: Path to list files from
            start: Start datetime for filtering (ignored when ``filter_by_mtime`` is False)
            end: End datetime for filtering (ignored when ``filter_by_mtime`` is False)
            filter_by_mtime: When False, include every supported extension without mtime window

        Returns:
            List of tuples containing (file_path, file_info)

        Raises:
            ConnectorMissingCredentialError: If the client has not been initialized
        """
        if self.client is None:
            raise ConnectorMissingCredentialError("WebDAV client not initialized")

        files: list[tuple[str, dict]] = []
        try:
            logging.debug(f"Listing directory: {path}")
            for item in self.client.ls(path, detail=True):
                item_path = item['name']
                # Skip the entry that describes the listed directory itself.
                if item_path == path or item_path == path + '/':
                    continue
                logging.debug(f"Found item: {item_path}, type: {item.get('type')}")

                if item.get('type') == 'directory':
                    try:
                        files.extend(
                            self._list_files_recursive(
                                item_path,
                                start,
                                end,
                                filter_by_mtime=filter_by_mtime,
                            )
                        )
                    except Exception as e:
                        logging.error(f"Error recursing into directory {item_path}: {e}")
                    continue

                try:
                    file_name = os.path.basename(item_path)
                    if not self._is_supported_file(file_name):
                        logging.debug(f"Skipping file {item_path} due to unsupported extension.")
                        continue

                    modified = self._parse_modified_time(item.get('modified'), item_path)
                    in_window = start < modified <= end
                    logging.debug(f"File {item_path}: modified={modified}, start={start}, end={end}, include={in_window}")

                    if not filter_by_mtime or in_window:
                        files.append((item_path, item))
                    else:
                        logging.debug(f"File {item_path} filtered out by time range")
                except Exception as e:
                    logging.error(f"Error processing file {item_path}: {e}")
                    continue
        except Exception as e:
            logging.error(f"Error listing directory {path}: {e}")
        return files

    def _yield_webdav_documents(
        self,
        start: datetime,
        end: datetime,
    ) -> GenerateDocumentsOutput:
        """Generate documents from WebDAV server

        Args:
            start: Start datetime for filtering
            end: End datetime for filtering

        Yields:
            Batches of documents

        Raises:
            ConnectorMissingCredentialError: If the client has not been initialized
        """
        from io import BytesIO

        if self.client is None:
            raise ConnectorMissingCredentialError("WebDAV client not initialized")

        logging.info(f"Searching for files in {self.remote_path} between {start} and {end}")
        files = self._list_files_recursive(self.remote_path, start, end)
        logging.info(f"Found {len(files)} files matching time criteria")

        # Count base names so files sharing a name get a path-based identifier.
        filename_counts: dict[str, int] = {}
        for file_path, _ in files:
            file_name = os.path.basename(file_path)
            filename_counts[file_name] = filename_counts.get(file_name, 0) + 1

        batch: list[Document] = []
        for file_path, file_info in files:
            file_name = os.path.basename(file_path)
            if not self._is_supported_file(file_name):
                logging.debug(f"Skipping file {file_path} due to unsupported extension.")
                continue

            size_bytes = self._get_size_bytes(file_info)
            if self._should_skip_for_size(file_info, file_path, size_bytes):
                continue

            try:
                logging.debug(f"Downloading file: {file_path}")
                buffer = BytesIO()
                self.client.download_fileobj(file_path, buffer)
                blob = buffer.getvalue()
                if blob is None or len(blob) == 0:
                    logging.warning(f"Downloaded content is empty for {file_path}")
                    continue

                modified = self._parse_modified_time(file_info.get('modified'), file_path)

                if filename_counts.get(file_name, 0) > 1:
                    # Disambiguate duplicates with the path relative to remote_path.
                    relative_path = file_path
                    if file_path.startswith(self.remote_path):
                        relative_path = file_path[len(self.remote_path):]
                    if relative_path.startswith('/'):
                        relative_path = relative_path[1:]
                    semantic_id = relative_path.replace('/', ' / ') if relative_path else file_name
                else:
                    semantic_id = file_name

                batch.append(
                    Document(
                        id=f"webdav:{self.base_url}:{file_path}",
                        blob=blob,
                        source=DocumentSource.WEBDAV,
                        semantic_identifier=semantic_id,
                        extension=get_file_ext(file_name),
                        doc_updated_at=modified,
                        size_bytes=size_bytes if size_bytes is not None else 0
                    )
                )
                # ">=" matches the slim snapshot path and still flushes if
                # batch_size is smaller than one.
                if len(batch) >= self.batch_size:
                    yield batch
                    batch = []
            except Exception as e:
                # One failing file must not abort the whole sync.
                logging.exception(f"Error downloading file {file_path}: {e}")

        if batch:
            yield batch

    def load_from_state(self) -> GenerateDocumentsOutput:
        """Load all documents from WebDAV server

        Yields:
            Batches of documents
        """
        logging.debug(f"Loading documents from WebDAV server {self.base_url}")
        return self._yield_webdav_documents(
            start=datetime(1970, 1, 1, tzinfo=timezone.utc),
            end=datetime.now(timezone.utc),
        )

    def poll_source(
        self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
    ) -> GenerateDocumentsOutput:
        """Poll WebDAV server for updated documents

        Args:
            start: Start timestamp (seconds since Unix epoch)
            end: End timestamp (seconds since Unix epoch)

        Yields:
            Batches of documents

        Raises:
            ConnectorMissingCredentialError: If the client has not been initialized
        """
        if self.client is None:
            raise ConnectorMissingCredentialError("WebDAV client not initialized")
        start_datetime = datetime.fromtimestamp(start, tz=timezone.utc)
        end_datetime = datetime.fromtimestamp(end, tz=timezone.utc)
        yield from self._yield_webdav_documents(start_datetime, end_datetime)

    def retrieve_all_slim_docs_perm_sync(
        self,
        callback: Any = None,
    ) -> GenerateSlimDocumentOutput:
        """Full-tree snapshot of indexed paths for stale-document reconciliation.

        Uses the same ``webdav:{base_url}:{file_path}`` ids as :meth:`_yield_webdav_documents`,
        without downloading file contents. The same extension and size filters
        are applied; the modification-time window is not.

        Args:
            callback: Accepted for interface compatibility; not used.

        Yields:
            Batches of slim documents

        Raises:
            ConnectorMissingCredentialError: If the client has not been initialized
        """
        del callback
        if self.client is None:
            raise ConnectorMissingCredentialError("WebDAV client not initialized")

        logging.info(
            "Starting WebDAV slim snapshot: base_url=%s path=%s",
            self.base_url,
            self.remote_path,
        )
        files = self._list_files_recursive(
            self.remote_path,
            datetime(1970, 1, 1, tzinfo=timezone.utc),
            datetime.now(timezone.utc),
            filter_by_mtime=False,
        )

        batch: list[SlimDocument] = []
        total = 0
        for file_path, file_info in files:
            file_name = os.path.basename(file_path)
            if not self._is_supported_file(file_name):
                continue

            size_bytes = self._get_size_bytes(file_info)
            if self._should_skip_for_size(file_info, file_path, size_bytes):
                continue

            batch.append(
                SlimDocument(id=f"webdav:{self.base_url}:{file_path}")
            )
            total += 1
            if len(batch) >= self.batch_size:
                yield batch
                batch = []

        if batch:
            yield batch

        logging.info(
            "Completed WebDAV slim snapshot: %d documents (listed_paths=%d)",
            total,
            len(files),
        )

    def validate_connector_settings(self) -> None:
        """Validate WebDAV connector settings.

        Validation should exercise the same code-paths used by the connector
        (directory listing / PROPFIND), avoiding exists() which may probe with
        methods that differ across servers.

        Raises:
            ConnectorMissingCredentialError: If credentials have not been loaded
            CredentialExpiredError: If the server answers with status 401
            InsufficientPermissionsError: If the server answers with status 403
            ConnectorValidationError: If the base URL is empty, the path is
                missing (404), or listing fails for any other reason
        """
        if self.client is None:
            raise ConnectorMissingCredentialError("WebDAV credentials not loaded.")
        if not self.base_url:
            raise ConnectorValidationError("No base URL was provided in connector settings.")

        # Normalize directory path: for collections, many servers behave better with trailing '/'
        test_path = self.remote_path or "/"
        if not test_path.startswith("/"):
            test_path = f"/{test_path}"
        if test_path != "/" and not test_path.endswith("/"):
            test_path = f"{test_path}/"

        try:
            # Use the same behavior as real sync: list directory with details (PROPFIND)
            self.client.ls(test_path, detail=True)
        except Exception as e:
            # Prefer structured status codes if present on the exception/response
            status = None
            for attr in ("status_code", "code"):
                v = getattr(e, attr, None)
                if isinstance(v, int):
                    status = v
                    break
            if status is None:
                resp = getattr(e, "response", None)
                v = getattr(resp, "status_code", None)
                if isinstance(v, int):
                    status = v

            # If we can classify by status code, do it
            if status == 401:
                raise CredentialExpiredError("WebDAV credentials appear invalid or expired.") from e
            if status == 403:
                raise InsufficientPermissionsError(
                    f"Insufficient permissions to access path '{self.remote_path}' on WebDAV server."
                ) from e
            if status == 404:
                raise ConnectorValidationError(
                    f"Remote path '{self.remote_path}' does not exist on WebDAV server."
                ) from e

            # Fallback: avoid brittle substring matching that caused false positives.
            # Provide the original exception for diagnosis.
            raise ConnectorValidationError(
                f"WebDAV validation failed for path '{test_path}': {repr(e)}"
            ) from e
if __name__ == "__main__":
    # Manual smoke test: connect to a local WebDAV server, validate the
    # settings and print the first batch of documents.
    # Credentials come from WEBDAV_USERNAME / WEBDAV_PASSWORD when set; the
    # literals are only fallbacks for a local test server.
    credentials_dict = {
        "username": os.environ.get("WEBDAV_USERNAME") or "user",
        "password": os.environ.get("WEBDAV_PASSWORD") or "pass",
    }
    connector = WebDAVConnector(
        base_url="http://172.17.0.1:8080/",
        remote_path="/",
    )
    try:
        connector.load_credentials(credentials_dict)
        connector.validate_connector_settings()
        document_batch_generator = connector.load_from_state()
        for document_batch in document_batch_generator:
            print("First batch of documents:")
            for doc in document_batch:
                print(f"Document ID: {doc.id}")
                print(f"Semantic Identifier: {doc.semantic_identifier}")
                print(f"Source: {doc.source}")
                print(f"Updated At: {doc.doc_updated_at}")
                print("---")
            # Only the first batch is needed for a smoke test.
            break
    except ConnectorMissingCredentialError as e:
        print(f"Error: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")