mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 23:41:12 +08:00
Fix blob sync: skip unsupported files before download (#14357)
### What problem does this PR solve? Blob storage sync was downloading unsupported files first and rejecting them later, which wasted bandwidth and made sync slower. This PR skips unsupported extensions before download and applies `allow_images` in blob sync. fixes #14338 ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
@@ -10,6 +10,7 @@ from common.data_source.utils import (
|
||||
download_object,
|
||||
extract_size_bytes,
|
||||
get_file_ext,
|
||||
is_accepted_file_ext,
|
||||
)
|
||||
from common.data_source.config import BlobType, DocumentSource, BLOB_STORAGE_SIZE_THRESHOLD, INDEX_BATCH_SIZE
|
||||
from common.data_source.exceptions import (
|
||||
@@ -18,7 +19,7 @@ from common.data_source.exceptions import (
|
||||
CredentialExpiredError,
|
||||
InsufficientPermissionsError
|
||||
)
|
||||
from common.data_source.interfaces import LoadConnector, PollConnector
|
||||
from common.data_source.interfaces import LoadConnector, OnyxExtensionType, PollConnector
|
||||
from common.data_source.models import Document, SecondsSinceUnixEpoch, GenerateDocumentsOutput
|
||||
|
||||
|
||||
@@ -130,15 +131,23 @@ class BlobStorageConnector(LoadConnector, PollConnector):
|
||||
|
||||
# Collect all objects first to count filename occurrences
|
||||
all_objects = []
|
||||
extension_type = OnyxExtensionType.Plain | OnyxExtensionType.Document
|
||||
if bool(self._allow_images):
|
||||
extension_type |= OnyxExtensionType.Multimedia
|
||||
for page in pages:
|
||||
if "Contents" not in page:
|
||||
continue
|
||||
for obj in page["Contents"]:
|
||||
if obj["Key"].endswith("/"):
|
||||
key = obj["Key"]
|
||||
if key.endswith("/"):
|
||||
continue
|
||||
last_modified = obj["LastModified"].replace(tzinfo=timezone.utc)
|
||||
if start < last_modified <= end:
|
||||
all_objects.append(obj)
|
||||
if not (start < last_modified <= end):
|
||||
continue
|
||||
file_name = os.path.basename(key)
|
||||
if not is_accepted_file_ext(get_file_ext(file_name), extension_type):
|
||||
continue
|
||||
all_objects.append(obj)
|
||||
|
||||
# Count filename occurrences to determine which need full paths
|
||||
filename_counts: dict[str, int] = {}
|
||||
|
||||
Reference in New Issue
Block a user