mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 15:31:05 +08:00
feat(webdav): support deleted-file sync via slim snapshot (#14491)
## What problem does this PR solve?

Incremental WebDAV sync only ingested files whose modification time fell inside the poll window; documents removed on the WebDAV server were never removed from the knowledge base. This aligns with [#14362](https://github.com/infiniflow/ragflow/issues/14362) (coordinated datasource “sync deleted files” work).

This PR adds a **full-tree slim snapshot** (`retrieve_all_slim_docs_perm_sync`) that enumerates current remote paths **without downloading file contents**, using the same logical document IDs as full ingest (`webdav:{base_url}:{file_path}`). When **`sync_deleted_files`** is enabled on incremental runs, sync returns **`(document_generator, file_list)`** so **`SyncBase`** runs **`cleanup_stale_documents_for_task`** and removes KB rows no longer present remotely.

Design notes:

- **`_list_files_recursive`** gains **`filter_by_mtime`**: snapshot passes **`filter_by_mtime=False`** (full tree under **`remote_path`**); **`poll_source`** keeps mtime-window filtering as before.
- Slim snapshot applies the same **extension** and **`size_threshold`** rules as **`_yield_webdav_documents`** so retain IDs match what would be indexed.
- **`end_ts`** is captured before building **`file_list`**, then **`poll_source`** uses the same upper bound (consistent with Dropbox-style connectors).

## Type of change

- [x] New Feature (non-breaking change which adds functionality)

## Files changed

| Area | Change |
|------|--------|
| `common/data_source/webdav_connector.py` | `SlimConnectorWithPermSync`, `retrieve_all_slim_docs_perm_sync`, `filter_by_mtime` on `_list_files_recursive` |
| `rag/svr/sync_data_source.py` | WebDAV `_generate`: `file_list` + tuple return; pass **`batch_size`** from connector config |
| `web/src/pages/user-setting/data-source/constant/index.tsx` | `syncDeletedFiles` for WebDAV in `DataSourceFeatureVisibilityMap` |
This commit is contained in:
@@ -17,11 +17,11 @@ from common.data_source.exceptions import (
|
||||
CredentialExpiredError,
|
||||
InsufficientPermissionsError
|
||||
)
|
||||
from common.data_source.interfaces import LoadConnector, OnyxExtensionType, PollConnector
|
||||
from common.data_source.models import Document, SecondsSinceUnixEpoch, GenerateDocumentsOutput
|
||||
from common.data_source.interfaces import LoadConnector, OnyxExtensionType, PollConnector, SlimConnectorWithPermSync
|
||||
from common.data_source.models import Document, GenerateDocumentsOutput, GenerateSlimDocumentOutput, SecondsSinceUnixEpoch, SlimDocument
|
||||
|
||||
|
||||
class WebDAVConnector(LoadConnector, PollConnector):
|
||||
class WebDAVConnector(LoadConnector, PollConnector, SlimConnectorWithPermSync):
|
||||
"""WebDAV connector for syncing files from WebDAV servers"""
|
||||
|
||||
def __init__(
|
||||
@@ -102,17 +102,20 @@ class WebDAVConnector(LoadConnector, PollConnector):
|
||||
return None
|
||||
|
||||
def _list_files_recursive(
|
||||
self,
|
||||
self,
|
||||
path: str,
|
||||
start: datetime,
|
||||
end: datetime,
|
||||
*,
|
||||
filter_by_mtime: bool = True,
|
||||
) -> list[tuple[str, dict]]:
|
||||
"""Recursively list all files in the given path
|
||||
|
||||
Args:
|
||||
path: Path to list files from
|
||||
start: Start datetime for filtering
|
||||
end: End datetime for filtering
|
||||
start: Start datetime for filtering (ignored when ``filter_by_mtime`` is False)
|
||||
end: End datetime for filtering (ignored when ``filter_by_mtime`` is False)
|
||||
filter_by_mtime: When False, include every supported extension without mtime window
|
||||
|
||||
Returns:
|
||||
List of tuples containing (file_path, file_info)
|
||||
@@ -134,7 +137,14 @@ class WebDAVConnector(LoadConnector, PollConnector):
|
||||
|
||||
if item.get('type') == 'directory':
|
||||
try:
|
||||
files.extend(self._list_files_recursive(item_path, start, end))
|
||||
files.extend(
|
||||
self._list_files_recursive(
|
||||
item_path,
|
||||
start,
|
||||
end,
|
||||
filter_by_mtime=filter_by_mtime,
|
||||
)
|
||||
)
|
||||
except Exception as e:
|
||||
logging.error(f"Error recursing into directory {item_path}: {e}")
|
||||
continue
|
||||
@@ -168,10 +178,13 @@ class WebDAVConnector(LoadConnector, PollConnector):
|
||||
|
||||
|
||||
logging.debug(f"File {item_path}: modified={modified}, start={start}, end={end}, include={start < modified <= end}")
|
||||
if start < modified <= end:
|
||||
files.append((item_path, item))
|
||||
if filter_by_mtime:
|
||||
if start < modified <= end:
|
||||
files.append((item_path, item))
|
||||
else:
|
||||
logging.debug(f"File {item_path} filtered out by time range")
|
||||
else:
|
||||
logging.debug(f"File {item_path} filtered out by time range")
|
||||
files.append((item_path, item))
|
||||
except Exception as e:
|
||||
logging.error(f"Error processing file {item_path}: {e}")
|
||||
continue
|
||||
@@ -323,6 +336,61 @@ class WebDAVConnector(LoadConnector, PollConnector):
|
||||
for batch in self._yield_webdav_documents(start_datetime, end_datetime):
|
||||
yield batch
|
||||
|
||||
def retrieve_all_slim_docs_perm_sync(
    self,
    callback: Any = None,
) -> GenerateSlimDocumentOutput:
    """Yield id-only batches for every remote file that passes the index filters.

    The whole tree under ``self.remote_path`` is listed with the
    modification-time window disabled, so the result is a snapshot of what
    currently exists remotely rather than what changed recently. Ids follow
    the ``webdav:{base_url}:{file_path}`` scheme, the same one
    :meth:`_yield_webdav_documents` is documented to use, and no file
    contents are fetched by this method.

    Args:
        callback: Accepted for interface compatibility and discarded.

    Yields:
        Lists of ``SlimDocument`` holding at most ``self.batch_size`` entries
        each; a final shorter list is yielded when entries remain.

    Raises:
        ConnectorMissingCredentialError: If ``self.client`` is ``None``.
    """
    del callback
    if self.client is None:
        raise ConnectorMissingCredentialError("WebDAV client not initialized")

    logging.info(
        "Starting WebDAV slim snapshot: base_url=%s path=%s",
        self.base_url,
        self.remote_path,
    )

    # The window bounds are still required positionally by the lister, but
    # filter_by_mtime=False asks it not to apply them.
    window_start = datetime(1970, 1, 1, tzinfo=timezone.utc)
    window_end = datetime.now(timezone.utc)
    listed = self._list_files_recursive(
        self.remote_path,
        window_start,
        window_end,
        filter_by_mtime=False,
    )

    pending: list[SlimDocument] = []
    emitted = 0
    for remote_file, info in listed:
        if not self._is_supported_file(os.path.basename(remote_file)):
            continue

        # Only an integer size can trip the threshold; any other reported
        # value lets the file through.
        reported_size = info.get("size", 0)
        over_threshold = (
            self.size_threshold is not None
            and isinstance(reported_size, int)
            and reported_size > self.size_threshold
        )
        if over_threshold:
            continue

        pending.append(SlimDocument(id=f"webdav:{self.base_url}:{remote_file}"))
        emitted += 1
        if len(pending) >= self.batch_size:
            yield pending
            pending = []

    # Flush whatever did not fill a complete batch.
    if pending:
        yield pending

    logging.info(
        "Completed WebDAV slim snapshot: %d documents (listed_paths=%d)",
        emitted,
        len(listed),
    )
|
||||
|
||||
def validate_connector_settings(self) -> None:
|
||||
"""Validate WebDAV connector settings.
|
||||
|
||||
|
||||
Reference in New Issue
Block a user