Fix blob sync: skip unsupported files before download (#14357)

### What problem does this PR solve?

Blob storage sync was downloading unsupported files first and rejecting them later, which wasted bandwidth and made sync slower. This PR skips files with unsupported extensions before download and applies the `allow_images` setting in blob sync.

Fixes #14338.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
2026-06-29 23:41:12 +08:00 · 2026-04-24 19:22:32 +08:00
parent 620088be2f
commit ca01c7a745
2 changed files with 14 additions and 4 deletions
--- a/common/data_source/blob_connector.py
+++ b/common/data_source/blob_connector.py
@@ -10,6 +10,7 @@ from common.data_source.utils import (
    download_object,
    extract_size_bytes,
    get_file_ext,
+    is_accepted_file_ext,
 )
 from common.data_source.config import BlobType, DocumentSource, BLOB_STORAGE_SIZE_THRESHOLD, INDEX_BATCH_SIZE
 from common.data_source.exceptions import (
@@ -18,7 +19,7 @@ from common.data_source.exceptions import (
    CredentialExpiredError,
    InsufficientPermissionsError
 )
-from common.data_source.interfaces import LoadConnector, PollConnector
+from common.data_source.interfaces import LoadConnector, OnyxExtensionType, PollConnector
 from common.data_source.models import Document, SecondsSinceUnixEpoch, GenerateDocumentsOutput


@@ -130,15 +131,23 @@ class BlobStorageConnector(LoadConnector, PollConnector):

        # Collect all objects first to count filename occurrences
        all_objects = []
+        extension_type = OnyxExtensionType.Plain | OnyxExtensionType.Document
+        if bool(self._allow_images):
+            extension_type |= OnyxExtensionType.Multimedia
        for page in pages:
            if "Contents" not in page:
                continue
            for obj in page["Contents"]:
-                if obj["Key"].endswith("/"):
+                key = obj["Key"]
+                if key.endswith("/"):
                    continue
                last_modified = obj["LastModified"].replace(tzinfo=timezone.utc)
-                if start < last_modified <= end:
-                    all_objects.append(obj)
+                if not (start < last_modified <= end):
+                    continue
+                file_name = os.path.basename(key)
+                if not is_accepted_file_ext(get_file_ext(file_name), extension_type):
+                    continue
+                all_objects.append(obj)
        
        # Count filename occurrences to determine which need full paths
        filename_counts: dict[str, int] = {}