Fix blob sync: skip unsupported files before download (#14357)

### What problem does this PR solve?

Blob storage sync was downloading unsupported files first and rejecting
them later, which wasted bandwidth and made sync slower. This PR skips
unsupported extensions before download and applies `allow_images` in
blob sync. fixes #14338

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
Idriss Sbaaoui
2026-04-24 19:22:32 +08:00
committed by GitHub
parent 620088be2f
commit ca01c7a745
2 changed files with 14 additions and 4 deletions

View File

@@ -10,6 +10,7 @@ from common.data_source.utils import (
download_object,
extract_size_bytes,
get_file_ext,
is_accepted_file_ext,
)
from common.data_source.config import BlobType, DocumentSource, BLOB_STORAGE_SIZE_THRESHOLD, INDEX_BATCH_SIZE
from common.data_source.exceptions import (
@@ -18,7 +19,7 @@ from common.data_source.exceptions import (
CredentialExpiredError,
InsufficientPermissionsError
)
from common.data_source.interfaces import LoadConnector, PollConnector
from common.data_source.interfaces import LoadConnector, OnyxExtensionType, PollConnector
from common.data_source.models import Document, SecondsSinceUnixEpoch, GenerateDocumentsOutput
@@ -130,15 +131,23 @@ class BlobStorageConnector(LoadConnector, PollConnector):
# Collect all objects first to count filename occurrences
all_objects = []
extension_type = OnyxExtensionType.Plain | OnyxExtensionType.Document
if bool(self._allow_images):
extension_type |= OnyxExtensionType.Multimedia
for page in pages:
if "Contents" not in page:
continue
for obj in page["Contents"]:
if obj["Key"].endswith("/"):
key = obj["Key"]
if key.endswith("/"):
continue
last_modified = obj["LastModified"].replace(tzinfo=timezone.utc)
if start < last_modified <= end:
all_objects.append(obj)
if not (start < last_modified <= end):
continue
file_name = os.path.basename(key)
if not is_accepted_file_ext(get_file_ext(file_name), extension_type):
continue
all_objects.append(obj)
# Count filename occurrences to determine which need full paths
filename_counts: dict[str, int] = {}