# ragflow/common/data_source/webdav_connector.py

"""WebDAV connector"""
import logging
import os
from datetime import datetime, timezone
from typing import Any, Optional
from urllib.parse import urlsplit

from webdav4.client import Client as WebDAVClient

from common.data_source.utils import (
    get_file_ext,
    is_accepted_file_ext,
)
from common.data_source.config import DocumentSource, INDEX_BATCH_SIZE, BLOB_STORAGE_SIZE_THRESHOLD
from common.data_source.exceptions import (
    ConnectorMissingCredentialError,
    ConnectorValidationError,
    CredentialExpiredError,
    InsufficientPermissionsError
)
from common.data_source.interfaces import LoadConnector, OnyxExtensionType, PollConnector, SlimConnectorWithPermSync
from common.data_source.models import Document, GenerateDocumentsOutput, GenerateSlimDocumentOutput, SecondsSinceUnixEpoch, SlimDocument


class WebDAVConnector(LoadConnector, PollConnector, SlimConnectorWithPermSync):
    """WebDAV connector for syncing files from WebDAV servers"""

    def __init__(
        self,
        base_url: str,
        remote_path: str = "/",
        batch_size: int = INDEX_BATCH_SIZE,
    ) -> None:
        """Initialize WebDAV connector

        Args:
            base_url: Base URL of the WebDAV server (e.g., "https://webdav.example.com").
                Trailing slashes are removed.
            remote_path: Remote path to sync from (default: "/"). It is normalized to
                start with "/" and to carry no trailing slash; empty or slash-only
                values (e.g. "", "//") fall back to "/".
            batch_size: Number of documents per batch
        """
        self.base_url = base_url.rstrip("/")

        normalized_path = remote_path or "/"
        if not normalized_path.startswith("/"):
            normalized_path = f"/{normalized_path}"
        # rstrip("/") turns slash-only input such as "//" into an empty string,
        # so fall back to the root instead of storing "".
        self.remote_path = normalized_path.rstrip("/") or "/"

        self.batch_size = batch_size
        # Stays None until load_credentials() builds the client.
        self.client: Optional[WebDAVClient] = None
        # None is treated like False by _build_extension_type().
        self._allow_images: bool | None = None
        # Files larger than this are skipped; None disables the size guard.
        self.size_threshold: int | None = BLOB_STORAGE_SIZE_THRESHOLD

    def _build_extension_type(self) -> OnyxExtensionType:
        """Return the extension-type flags this connector currently accepts.

        Plain and Document types are always included; Multimedia is added only
        when ``_allow_images`` is truthy.
        """
        accepted = OnyxExtensionType.Plain | OnyxExtensionType.Document
        if not self._allow_images:
            return accepted
        return accepted | OnyxExtensionType.Multimedia

    def _is_supported_file(self, file_name: str) -> bool:
        """Tell whether ``file_name``'s extension is accepted by the current extension types."""
        extension = get_file_ext(file_name)
        accepted_types = self._build_extension_type()
        return is_accepted_file_ext(extension, accepted_types)

    @staticmethod
    def _coerce_size_bytes(size_bytes: Any) -> int | None:
        if isinstance(size_bytes, bool):
            return None
        if isinstance(size_bytes, int):
            return size_bytes if size_bytes >= 0 else None
        if isinstance(size_bytes, str):
            size_text = size_bytes.strip()
            if not size_text or len(size_text) > 20 or not size_text.isdecimal():
                return None
            parsed_size = int(size_text)
            return parsed_size if parsed_size >= 0 else None
        return None

    @classmethod
    def _get_size_bytes(cls, file_info: dict[str, Any]) -> int | None:
        # webdav4's Client.ls(detail=True) reports the size under "content_length"
        # (see webdav4.multistatus.Response.as_dict); other servers/libraries or
        # webdav4's fsspec wrapper may instead use "size" or the raw
        # "getcontentlength" property. Try each so the size guard isn't silently
        # skipped — otherwise file_info.get("size") is always None and every file
        # trips the missing-metadata warning.
        for key in ("size", "content_length", "getcontentlength"):
            if key not in file_info:
                continue
            size_bytes = cls._coerce_size_bytes(file_info[key])
            if size_bytes is not None:
                return size_bytes
        return None

    @staticmethod
    def _get_log_file_identifier(file_info: dict[str, Any], fallback_path: str) -> str:
        raw_identifier = str(file_info.get("name") or file_info.get("href") or fallback_path)
        try:
            parsed_identifier = urlsplit(raw_identifier)
            identifier_path = parsed_identifier.path if parsed_identifier.scheme else raw_identifier
        except ValueError:
            identifier_path = fallback_path if "://" not in fallback_path else ""
        identifier_path = identifier_path.split("?", 1)[0].split("#", 1)[0]
        fallback_identifier = "" if "://" in fallback_path else os.path.basename(fallback_path.rstrip("/"))
        identifier = os.path.basename(identifier_path.rstrip("/")) or fallback_identifier or "<unknown>"
        return identifier.encode("unicode_escape").decode("ascii")

    def set_allow_images(self, allow_images: bool) -> None:
        """Set whether to process images"""
        logging.info(f"Setting allow_images to {allow_images}.")
        self._allow_images = allow_images

    def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
        """Load credentials and initialize WebDAV client

        Args:
            credentials: Dictionary containing 'username' and 'password'

        Returns:
            None

        Raises:
            ConnectorMissingCredentialError: If 'username' or 'password' is
                missing/empty, or if building the client raises. In the latter
                case the original error is attached as ``__cause__``.
        """
        logging.debug(f"Loading credentials for WebDAV server {self.base_url}")

        username = credentials.get("username")
        password = credentials.get("password")

        if not username or not password:
            raise ConnectorMissingCredentialError(
                "WebDAV requires 'username' and 'password' credentials"
            )

        try:
            # NOTE(review): this only constructs the client object; the credentials
            # are presumably not checked against the server until the first request
            # (validate_connector_settings() performs a real listing) — confirm.
            self.client = WebDAVClient(
                base_url=self.base_url,
                auth=(username, password)
            )
        except Exception as e:
            logging.error(f"Failed to connect to WebDAV server: {e}")
            # Chain explicitly so the root cause stays visible in tracebacks.
            raise ConnectorMissingCredentialError(
                f"Failed to authenticate with WebDAV server: {e}"
            ) from e

        return None

    @staticmethod
    def _parse_modified_time(modified_time: Any, item_path: str) -> datetime:
        """Convert a listing's ``modified`` value into a timezone-aware datetime.

        Args:
            modified_time: A datetime, an HTTP-date string
                ('%a, %d %b %Y %H:%M:%S %Z'), an ISO-8601 string, or anything else.
            item_path: Path used only in the warning logged for unparseable strings.

        Returns:
            The parsed time; naive values are taken to be UTC. When the value is
            missing, of an unsupported type, or unparseable, the current UTC time.
        """
        parsed: datetime | None = None
        if isinstance(modified_time, datetime):
            parsed = modified_time
        elif isinstance(modified_time, str) and modified_time:
            try:
                parsed = datetime.strptime(modified_time, '%a, %d %b %Y %H:%M:%S %Z')
            except (ValueError, TypeError):
                try:
                    parsed = datetime.fromisoformat(modified_time.replace('Z', '+00:00'))
                except (ValueError, TypeError):
                    logging.warning(f"Could not parse modified time for {item_path}: {modified_time}")

        if parsed is None:
            # NOTE(review): "now" is later than any `end` computed before the listing
            # started, so such files fail the `modified <= end` window check —
            # confirm whether files without a usable mtime should be included instead.
            return datetime.now(timezone.utc)
        if parsed.tzinfo is None:
            # Naive values (including ISO strings without an offset) cannot be
            # compared with the aware start/end bounds; treat them as UTC.
            parsed = parsed.replace(tzinfo=timezone.utc)
        return parsed

    def _list_files_recursive(
        self,
        path: str,
        start: datetime,
        end: datetime,
        *,
        filter_by_mtime: bool = True,
    ) -> list[tuple[str, dict]]:
        """Recursively list all files in the given path

        Listing is best-effort: an error while listing a directory or while
        processing a single entry is logged and that directory/entry is skipped.

        Args:
            path: Path to list files from
            start: Start datetime for filtering (ignored when ``filter_by_mtime`` is False)
            end: End datetime for filtering (ignored when ``filter_by_mtime`` is False)
            filter_by_mtime: When False, include every supported extension without mtime window

        Returns:
            List of tuples containing (file_path, file_info). With filtering on,
            only files whose modified time satisfies ``start < modified <= end``.

        Raises:
            ConnectorMissingCredentialError: If the client has not been initialized
        """
        if self.client is None:
            raise ConnectorMissingCredentialError("WebDAV client not initialized")

        files: list[tuple[str, dict]] = []
        # The listed directory may be reported with or without surrounding slashes,
        # so compare slash-stripped forms to avoid recursing into it again.
        listed_dir = path.strip("/")

        try:
            logging.debug(f"Listing directory: {path}")
            for item in self.client.ls(path, detail=True):
                item_path = item['name']

                if item_path.strip("/") == listed_dir:
                    continue

                logging.debug(f"Found item: {item_path}, type: {item.get('type')}")

                if item.get('type') == 'directory':
                    try:
                        files.extend(
                            self._list_files_recursive(
                                item_path,
                                start,
                                end,
                                filter_by_mtime=filter_by_mtime,
                            )
                        )
                    except Exception as e:
                        logging.error(f"Error recursing into directory {item_path}: {e}")
                    continue

                try:
                    file_name = os.path.basename(item_path)
                    if not self._is_supported_file(file_name):
                        logging.debug(f"Skipping file {item_path} due to unsupported extension.")
                        continue

                    if not filter_by_mtime:
                        # No window to apply, so the modified time is irrelevant here.
                        files.append((item_path, item))
                        continue

                    modified = self._parse_modified_time(item.get('modified'), item_path)
                    in_window = start < modified <= end
                    logging.debug(f"File {item_path}: modified={modified}, start={start}, end={end}, include={in_window}")
                    if in_window:
                        files.append((item_path, item))
                    else:
                        logging.debug(f"File {item_path} filtered out by time range")
                except Exception as e:
                    logging.error(f"Error processing file {item_path}: {e}")

        except Exception as e:
            logging.error(f"Error listing directory {path}: {e}")

        return files

    @staticmethod
    def _resolve_doc_updated_at(modified_time: Any, file_path: str) -> datetime:
        """Convert a file's ``modified`` metadata into a timezone-aware datetime.

        Accepts a datetime, an HTTP-date string ('%a, %d %b %Y %H:%M:%S %Z') or
        an ISO-8601 string. Naive results are taken to be UTC. Missing,
        unsupported or unparseable values resolve to the current UTC time
        (unparseable strings are logged with ``file_path``).
        """
        resolved: datetime | None = None
        if isinstance(modified_time, datetime):
            resolved = modified_time
        elif isinstance(modified_time, str) and modified_time:
            try:
                resolved = datetime.strptime(modified_time, '%a, %d %b %Y %H:%M:%S %Z')
            except (ValueError, TypeError):
                try:
                    resolved = datetime.fromisoformat(modified_time.replace('Z', '+00:00'))
                except (ValueError, TypeError):
                    logging.warning(f"Could not parse modified time for {file_path}: {modified_time}")

        if resolved is None:
            return datetime.now(timezone.utc)
        if resolved.tzinfo is None:
            # Keep doc_updated_at consistently timezone-aware.
            resolved = resolved.replace(tzinfo=timezone.utc)
        return resolved

    def _semantic_id_for(self, file_path: str, file_name: str, name_is_unique: bool) -> str:
        """Return the display name for a file.

        Unique file names are used as-is. Duplicated names are disambiguated
        with their path relative to ``remote_path``, components joined by " / ".
        """
        if name_is_unique:
            return file_name

        # Strip the sync root only on a path-component boundary ("/docs" must not
        # match "/docs2/..."), and tolerate listings that report names without a
        # leading slash.
        root = self.remote_path.strip("/")
        relative_path = file_path.lstrip("/")
        if root:
            if relative_path == root:
                relative_path = ""
            elif relative_path.startswith(f"{root}/"):
                relative_path = relative_path[len(root) + 1:]
        return relative_path.replace('/', ' / ') if relative_path else file_name

    def _download_document(
        self,
        file_path: str,
        file_info: dict[str, Any],
        size_bytes: int | None,
        semantic_id: str,
    ) -> Document | None:
        """Download one file and wrap it in a Document; None when the content is empty."""
        from io import BytesIO

        logging.debug(f"Downloading file: {file_path}")
        buffer = BytesIO()
        self.client.download_fileobj(file_path, buffer)
        blob = buffer.getvalue()

        if not blob:
            logging.warning(f"Downloaded content is empty for {file_path}")
            return None

        return Document(
            id=f"webdav:{self.base_url}:{file_path}",
            blob=blob,
            source=DocumentSource.WEBDAV,
            semantic_identifier=semantic_id,
            extension=get_file_ext(os.path.basename(file_path)),
            doc_updated_at=self._resolve_doc_updated_at(file_info.get('modified'), file_path),
            size_bytes=size_bytes if size_bytes is not None else 0
        )

    def _yield_webdav_documents(
        self,
        start: datetime,
        end: datetime,
    ) -> GenerateDocumentsOutput:
        """Generate documents from WebDAV server

        Lists supported files under ``remote_path`` modified within
        ``start < modified <= end``, skips files whose size is unknown or above
        ``size_threshold`` (when a threshold is set), downloads the rest and
        yields them in batches. A failure on one file is logged and the file is
        skipped; it does not abort the run.

        Args:
            start: Start datetime for filtering
            end: End datetime for filtering

        Yields:
            Batches of documents (at most ``batch_size`` each)

        Raises:
            ConnectorMissingCredentialError: If the client has not been initialized
        """
        if self.client is None:
            raise ConnectorMissingCredentialError("WebDAV client not initialized")

        logging.info(f"Searching for files in {self.remote_path} between {start} and {end}")
        files = self._list_files_recursive(self.remote_path, start, end)
        logging.info(f"Found {len(files)} files matching time criteria")

        # Count base names so duplicated names can be disambiguated by path.
        filename_counts: dict[str, int] = {}
        for file_path, _ in files:
            file_name = os.path.basename(file_path)
            filename_counts[file_name] = filename_counts.get(file_name, 0) + 1

        batch: list[Document] = []
        for file_path, file_info in files:
            file_name = os.path.basename(file_path)

            if not self._is_supported_file(file_name):
                logging.debug(f"Skipping file {file_path} due to unsupported extension.")
                continue

            size_bytes = self._get_size_bytes(file_info)
            if self.size_threshold is not None:
                if size_bytes is None:
                    file_identifier = self._get_log_file_identifier(file_info, file_path)
                    logging.warning(
                        f"{file_identifier}: size metadata missing from WebDAV server response, "
                        f"skipping to avoid processing potentially large files."
                    )
                    continue
                if size_bytes > self.size_threshold:
                    file_identifier = self._get_log_file_identifier(file_info, file_path)
                    logging.warning(
                        f"{file_identifier} exceeds size threshold of {self.size_threshold} "
                        f"(size_bytes={size_bytes}). Skipping."
                    )
                    continue

            # Only the per-file work sits inside the try; the yield stays outside so
            # errors raised by the consumer are not mistaken for download failures.
            try:
                semantic_id = self._semantic_id_for(
                    file_path,
                    file_name,
                    filename_counts.get(file_name, 0) <= 1,
                )
                document = self._download_document(file_path, file_info, size_bytes, semantic_id)
            except Exception as e:
                logging.exception(f"Error downloading file {file_path}: {e}")
                continue

            if document is None:
                continue

            batch.append(document)
            if len(batch) >= self.batch_size:
                yield batch
                batch = []

        if batch:
            yield batch

    def load_from_state(self) -> GenerateDocumentsOutput:
        """Load all documents from WebDAV server

        Yields:
            Batches of documents
        """
        logging.debug(f"Loading documents from WebDAV server {self.base_url}")
        return self._yield_webdav_documents(
            start=datetime(1970, 1, 1, tzinfo=timezone.utc),
            end=datetime.now(timezone.utc),
        )

    def poll_source(
        self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
    ) -> GenerateDocumentsOutput:
        """Poll WebDAV server for updated documents

        Both timestamps are converted to UTC datetimes and handed to the
        document generator.

        Args:
            start: Start timestamp (seconds since Unix epoch)
            end: End timestamp (seconds since Unix epoch)

        Yields:
            Batches of documents

        Raises:
            ConnectorMissingCredentialError: If the client has not been initialized
        """
        if self.client is None:
            raise ConnectorMissingCredentialError("WebDAV client not initialized")

        window_start, window_end = (
            datetime.fromtimestamp(timestamp, tz=timezone.utc)
            for timestamp in (start, end)
        )
        yield from self._yield_webdav_documents(window_start, window_end)

    def retrieve_all_slim_docs_perm_sync(
        self,
        callback: Any = None,
    ) -> GenerateSlimDocumentOutput:
        """Full-tree snapshot of indexed paths for stale-document reconciliation.

        Lists every supported file under ``remote_path`` without an mtime window
        and without downloading contents, applies the same size guard as
        :meth:`_yield_webdav_documents`, and yields slim documents carrying the
        same ``webdav:{base_url}:{file_path}`` ids. ``callback`` is accepted but
        not used.

        Yields:
            Batches of slim documents

        Raises:
            ConnectorMissingCredentialError: If the client has not been initialized
        """
        del callback
        if self.client is None:
            raise ConnectorMissingCredentialError("WebDAV client not initialized")

        logging.info(
            "Starting WebDAV slim snapshot: base_url=%s path=%s",
            self.base_url,
            self.remote_path,
        )

        # The time bounds are placeholders; filter_by_mtime=False disables the window.
        unix_epoch = datetime(1970, 1, 1, tzinfo=timezone.utc)
        files = self._list_files_recursive(
            self.remote_path,
            unix_epoch,
            datetime.now(timezone.utc),
            filter_by_mtime=False,
        )

        pending: list[SlimDocument] = []
        emitted = 0
        for file_path, file_info in files:
            if not self._is_supported_file(os.path.basename(file_path)):
                continue

            size_bytes = self._get_size_bytes(file_info)
            if self.size_threshold is not None:
                if size_bytes is None:
                    file_identifier = self._get_log_file_identifier(file_info, file_path)
                    logging.warning(
                        f"{file_identifier}: size metadata missing from WebDAV server response, "
                        f"skipping to avoid processing potentially large files."
                    )
                    continue
                if size_bytes > self.size_threshold:
                    file_identifier = self._get_log_file_identifier(file_info, file_path)
                    logging.warning(
                        f"{file_identifier} exceeds size threshold of {self.size_threshold} "
                        f"(size_bytes={size_bytes}). Skipping."
                    )
                    continue

            pending.append(SlimDocument(id=f"webdav:{self.base_url}:{file_path}"))
            emitted += 1
            if len(pending) >= self.batch_size:
                yield pending
                pending = []

        if pending:
            yield pending

        logging.info(
            "Completed WebDAV slim snapshot: %d documents (listed_paths=%d)",
            emitted,
            len(files),
        )

    def validate_connector_settings(self) -> None:
        """Validate WebDAV connector settings.

        Validation should exercise the same code-paths used by the connector
        (directory listing / PROPFIND), avoiding exists() which may probe with
        methods that differ across servers.

        Raises:
            ConnectorMissingCredentialError: If credentials have not been loaded.
            CredentialExpiredError: If the listing fails with HTTP status 401.
            InsufficientPermissionsError: If the listing fails with HTTP status 403.
            ConnectorValidationError: If no base URL is configured, the listing
                fails with HTTP status 404, or it fails in any other way.

        Errors raised for a failed listing carry the original exception as
        ``__cause__``.
        """
        if self.client is None:
            raise ConnectorMissingCredentialError("WebDAV credentials not loaded.")

        if not self.base_url:
            raise ConnectorValidationError("No base URL was provided in connector settings.")

        # Normalize directory path: for collections, many servers behave better with trailing '/'
        test_path = self.remote_path or "/"
        if not test_path.startswith("/"):
            test_path = f"/{test_path}"
        if test_path != "/" and not test_path.endswith("/"):
            test_path = f"{test_path}/"

        try:
            # Use the same behavior as real sync: list directory with details (PROPFIND)
            self.client.ls(test_path, detail=True)

        except Exception as e:
            # Prefer structured status codes if present on the exception/response
            status = None
            for attr in ("status_code", "code"):
                v = getattr(e, attr, None)
                if isinstance(v, int):
                    status = v
                    break
            if status is None:
                resp = getattr(e, "response", None)
                v = getattr(resp, "status_code", None)
                if isinstance(v, int):
                    status = v

            # If we can classify by status code, do it. Each raise chains the
            # original exception so the underlying failure stays in the traceback.
            if status == 401:
                raise CredentialExpiredError("WebDAV credentials appear invalid or expired.") from e
            if status == 403:
                raise InsufficientPermissionsError(
                    f"Insufficient permissions to access path '{self.remote_path}' on WebDAV server."
                ) from e
            if status == 404:
                raise ConnectorValidationError(
                    f"Remote path '{self.remote_path}' does not exist on WebDAV server."
                ) from e

            # Fallback: avoid brittle substring matching that caused false positives.
            # Provide the original exception for diagnosis.
            raise ConnectorValidationError(
                f"WebDAV validation failed for path '{test_path}': {repr(e)}"
            ) from e


if __name__ == "__main__":
    # Manual smoke test: connect, validate the settings and print the first batch.
    # Settings come from the environment, falling back to the local test
    # defaults when a variable is unset or empty.
    credentials_dict = {
        "username": os.environ.get("WEBDAV_USERNAME") or "user",
        "password": os.environ.get("WEBDAV_PASSWORD") or "pass",
    }

    connector = WebDAVConnector(
        base_url=os.environ.get("WEBDAV_BASE_URL") or "http://172.17.0.1:8080/",
        remote_path="/",
    )

    try:
        connector.load_credentials(credentials_dict)
        connector.validate_connector_settings()

        document_batch_generator = connector.load_from_state()
        for document_batch in document_batch_generator:
            print("First batch of documents:")
            for doc in document_batch:
                print(f"Document ID: {doc.id}")
                print(f"Semantic Identifier: {doc.semantic_identifier}")
                print(f"Source: {doc.source}")
                print(f"Updated At: {doc.doc_updated_at}")
                print("---")
            # Only the first batch is of interest for a smoke test.
            break

    except ConnectorMissingCredentialError as e:
        print(f"Error: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")