# common/data_source/airtable_connector.py

from datetime import datetime, timezone
import logging
from typing import Any, Generator

import requests

from pyairtable import Api as AirtableApi

from common.data_source.config import AIRTABLE_CONNECTOR_SIZE_THRESHOLD, INDEX_BATCH_SIZE, DocumentSource
from common.data_source.exceptions import ConnectorMissingCredentialError
from common.data_source.interfaces import LoadConnector, PollConnector, SlimConnectorWithPermSync
from common.data_source.models import (
    Document,
    GenerateDocumentsOutput,
    GenerateSlimDocumentOutput,
    SecondsSinceUnixEpoch,
    SlimDocument,
)
from common.data_source.utils import extract_size_bytes, get_file_ext

class AirtableClientNotSetUpError(PermissionError):
    """Error signalling that no Airtable client has been set up yet.

    Takes no arguments; the message is fixed and points the caller at
    ``load_credentials()``.
    """

    def __init__(self) -> None:
        message = "Airtable client is not set up. Did you forget to call load_credentials()?"
        super().__init__(message)


class AirtableConnector(LoadConnector, PollConnector, SlimConnectorWithPermSync):
    """
    Lightweight Airtable connector.

    This connector ingests Airtable attachments as raw blobs without
    parsing file content or generating text/image sections.

    Every attachment object found in a list-valued field of a record becomes
    one document whose id is ``airtable:<record_id>:<attachment_id>``.
    """

    def __init__(
        self,
        base_id: str,
        table_name_or_id: str,
        batch_size: int = INDEX_BATCH_SIZE,
    ) -> None:
        """
        Store the target table coordinates; no network call is made here.

        Args:
            base_id: Airtable base identifier passed to the API client.
            table_name_or_id: Name or id of the table to scan.
            batch_size: Maximum number of documents per yielded batch.
        """
        self.base_id = base_id
        self.table_name_or_id = table_name_or_id
        self.batch_size = batch_size
        # Created by load_credentials(); None until then.
        self._airtable_client: AirtableApi | None = None
        # Attachments whose reported size exceeds this are skipped.
        self.size_threshold = AIRTABLE_CONNECTOR_SIZE_THRESHOLD

    def _iter_attachment_entries(self) -> Generator[tuple[str, str, str, str, str | None, dict[str, Any]], None, None]:
        """
        Walk every record of the table and yield one tuple per attachment.

        Yields:
            ``(record_id, attachment_id, filename, doc_id, created_time,
            attachment)`` where ``doc_id`` is
            ``airtable:<record_id>:<attachment_id>``, ``created_time`` is the
            record's ``createdTime`` value (may be None) and ``attachment`` is
            the raw attachment dict.

        Raises:
            ConnectorMissingCredentialError: if credentials were not loaded.
        """
        if not self._airtable_client:
            raise ConnectorMissingCredentialError("Airtable credentials not loaded")

        table = self.airtable_client.table(self.base_id, self.table_name_or_id)
        records = table.all()

        logging.info(
            f"Starting Airtable attachment scan for table {self.table_name_or_id}, "
            f"{len(records)} records found."
        )

        for record in records:
            record_id = record.get("id")
            fields = record.get("fields", {})
            created_time = record.get("createdTime")

            for field_value in fields.values():
                if not isinstance(field_value, list):
                    continue

                for attachment in field_value:
                    # Other Airtable field types (multi-select, linked records,
                    # lookups) are also lists, but of plain strings/numbers.
                    # Only dict entries can be attachments; calling .get() on
                    # anything else would raise AttributeError and abort the scan.
                    if not isinstance(attachment, dict):
                        continue

                    filename = attachment.get("filename")
                    attachment_id = attachment.get("id")

                    if not record_id or not filename or not attachment_id:
                        continue

                    yield (
                        record_id,
                        attachment_id,
                        filename,
                        f"airtable:{record_id}:{attachment_id}",
                        created_time,
                        attachment,
                    )

    @staticmethod
    def _parse_created_time(created_time: Any) -> datetime | None:
        """
        Parse a record ``createdTime`` string into a UTC-aware datetime.

        Accepts the ``...SS.fffZ`` form as well as the same form without
        fractional seconds. Returns None when the value cannot be parsed.
        """
        for fmt in ("%Y-%m-%dT%H:%M:%S.%fZ", "%Y-%m-%dT%H:%M:%SZ"):
            try:
                return datetime.strptime(created_time, fmt).replace(tzinfo=timezone.utc)
            except (TypeError, ValueError):
                continue
        return None

    # -------------------------
    # Credentials
    # -------------------------
    def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
        """
        Create the Airtable API client from ``credentials["airtable_access_token"]``.

        Returns:
            Always None.

        Raises:
            KeyError: if ``airtable_access_token`` is missing from ``credentials``.
        """
        self._airtable_client = AirtableApi(credentials["airtable_access_token"])
        return None

    @property
    def airtable_client(self) -> AirtableApi:
        """
        The API client created by ``load_credentials()``.

        Raises:
            AirtableClientNotSetUpError: if no client has been created yet.
        """
        if not self._airtable_client:
            raise AirtableClientNotSetUpError()
        return self._airtable_client

    # -------------------------
    # Core logic
    # -------------------------
    def load_from_state(self) -> GenerateDocumentsOutput:
        """
        Fetch all Airtable records and ingest attachments as raw blobs.

        Each attachment is converted into a single Document(blob=...).
        Attachments are skipped (never raised on) when they have no URL, the
        record has no usable ``createdTime``, the reported size exceeds
        ``size_threshold``, or the download fails.

        Yields:
            Lists of at most ``batch_size`` documents.

        Raises:
            ConnectorMissingCredentialError: if credentials were not loaded.
        """
        if not self._airtable_client:
            raise ConnectorMissingCredentialError("Airtable credentials not loaded")

        batch: list[Document] = []

        for record_id, _attachment_id, filename, doc_id, created_time, attachment in self._iter_attachment_entries():
            url = attachment.get("url")
            if not url or not created_time:
                continue

            # Apply the cheap metadata checks BEFORE downloading so that
            # oversized or otherwise unusable attachments cost no bandwidth
            # or memory.
            size_bytes = extract_size_bytes(attachment)
            if (
                self.size_threshold is not None
                and isinstance(size_bytes, int)
                and size_bytes > self.size_threshold
            ):
                logging.warning(
                    f"{filename} exceeds size threshold of {self.size_threshold}. Skipping."
                )
                continue

            updated_at = self._parse_created_time(created_time)
            if updated_at is None:
                # One malformed timestamp must not abort the whole sync.
                logging.warning(
                    f"Unrecognized createdTime {created_time!r} for attachment {filename} "
                    f"(record={record_id}). Skipping."
                )
                continue

            try:
                resp = requests.get(url, timeout=30)
                resp.raise_for_status()
                content = resp.content
            except Exception:
                # Best effort: a single failed download is logged and skipped.
                logging.exception(
                    f"Failed to download attachment {filename} "
                    f"(record={record_id})"
                )
                continue

            batch.append(
                Document(
                    id=doc_id,
                    blob=content,
                    source=DocumentSource.AIRTABLE,
                    semantic_identifier=filename,
                    extension=get_file_ext(filename),
                    size_bytes=size_bytes if size_bytes else 0,
                    doc_updated_at=updated_at,
                )
            )

            if len(batch) >= self.batch_size:
                yield batch
                batch = []

        if batch:
            yield batch

    def retrieve_all_slim_docs_perm_sync(
        self,
        callback: Any = None,
    ) -> GenerateSlimDocumentOutput:
        """
        Yield the ids of all attachments in the table without downloading them.

        Args:
            callback: Accepted for interface compatibility; not used.

        Yields:
            Lists of at most ``batch_size`` ``SlimDocument`` objects.
        """
        del callback

        batch: list[SlimDocument] = []

        for _, _, _, doc_id, _, _ in self._iter_attachment_entries():
            batch.append(SlimDocument(id=doc_id))
            if len(batch) >= self.batch_size:
                yield batch
                batch = []

        if batch:
            yield batch

    def poll_source(self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch) -> Generator[list[Document], None, None]:
        """
        Yield documents whose ``doc_updated_at`` lies in ``[start, end)``.

        Runs a full ``load_from_state()`` pass and filters each batch, so
        yielded batches may be smaller than ``batch_size``; batches that end
        up empty are not yielded.

        Args:
            start: Inclusive lower bound, seconds since the Unix epoch.
            end: Exclusive upper bound, seconds since the Unix epoch.
        """
        start_dt = datetime.fromtimestamp(start, tz=timezone.utc)
        end_dt = datetime.fromtimestamp(end, tz=timezone.utc)

        for batch in self.load_from_state():
            filtered: list[Document] = []

            for doc in batch:
                if not doc.doc_updated_at:
                    continue

                doc_dt = doc.doc_updated_at.astimezone(timezone.utc)

                if start_dt <= doc_dt < end_dt:
                    filtered.append(doc)

            if filtered:
                yield filtered

if __name__ == "__main__":
    # Manual smoke test: reads the token from AIRTABLE_ACCESS_TOKEN and prints
    # the first batch of attachments. Replace the "xxx" placeholders with a
    # real base id and table name/id before running.
    import os

    logging.basicConfig(level=logging.DEBUG)
    connector = AirtableConnector("xxx", "xxx")
    connector.load_credentials({"airtable_access_token": os.environ.get("AIRTABLE_ACCESS_TOKEN")})
    connector.validate_connector_settings()
    document_batches = connector.load_from_state()

    # Batches are lists, so None unambiguously means the generator was empty.
    first_batch = next(document_batches, None)
    if first_batch is None:
        print("No documents available in Airtable.")
    else:
        print(f"Loaded {len(first_batch)} documents in first batch.")
        for doc in first_batch:
            print(f"- {doc.semantic_identifier} ({doc.size_bytes} bytes)")
Feat: add Airtable connector and integration for data synchronization (#12211) ### What problem does this PR solve? change: add Airtable connector and integration for data synchronization ### Type of change - [x] New Feature (non-breaking change which adds functionality) 2025-12-25 17:50:41 +08:00			`from datetime import datetime, timezone`
			`import logging`
Feat: add IMAP data source integration with configuration and sync capabilities (#12316) ### What problem does this PR solve? issue: #12217 [#12313](https://github.com/infiniflow/ragflow/issues/12313) change: add IMAP data source integration with configuration and sync capabilities ### Type of change - [x] New Feature (non-breaking change which adds functionality) 2025-12-30 17:09:13 +08:00			`from typing import Any, Generator`
Feat: add Airtable connector and integration for data synchronization (#12211) ### What problem does this PR solve? change: add Airtable connector and integration for data synchronization ### Type of change - [x] New Feature (non-breaking change which adds functionality) 2025-12-25 17:50:41 +08:00
			`import requests`

			`from pyairtable import Api as AirtableApi`

			`from common.data_source.config import AIRTABLE_CONNECTOR_SIZE_THRESHOLD, INDEX_BATCH_SIZE, DocumentSource`
			`from common.data_source.exceptions import ConnectorMissingCredentialError`
Fix: enable sync deleted file in airtable (#14438) ### What problem does this PR solve? Fix: enable sync deleted file in airtable ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) 2026-04-28 20:09:08 +08:00			`from common.data_source.interfaces import LoadConnector, PollConnector, SlimConnectorWithPermSync`
			`from common.data_source.models import (`
			`Document,`
			`GenerateDocumentsOutput,`
			`GenerateSlimDocumentOutput,`
			`SecondsSinceUnixEpoch,`
			`SlimDocument,`
			`)`
Feat: add Airtable connector and integration for data synchronization (#12211) ### What problem does this PR solve? change: add Airtable connector and integration for data synchronization ### Type of change - [x] New Feature (non-breaking change which adds functionality) 2025-12-25 17:50:41 +08:00			`from common.data_source.utils import extract_size_bytes, get_file_ext`

			`class AirtableClientNotSetUpError(PermissionError):`
			`def __init__(self) -> None:`
			`super().__init__(`
			`"Airtable client is not set up. Did you forget to call load_credentials()?"`
			`)`


Fix: enable sync deleted file in airtable (#14438) ### What problem does this PR solve? Fix: enable sync deleted file in airtable ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) 2026-04-28 20:09:08 +08:00			`class AirtableConnector(LoadConnector, PollConnector, SlimConnectorWithPermSync):`
Feat: add Airtable connector and integration for data synchronization (#12211) ### What problem does this PR solve? change: add Airtable connector and integration for data synchronization ### Type of change - [x] New Feature (non-breaking change which adds functionality) 2025-12-25 17:50:41 +08:00			`"""`
			`Lightweight Airtable connector.`

			`This connector ingests Airtable attachments as raw blobs without`
			`parsing file content or generating text/image sections.`
			`"""`

			`def __init__(`
			`self,`
			`base_id: str,`
			`table_name_or_id: str,`
			`batch_size: int = INDEX_BATCH_SIZE,`
			`) -> None:`
			`self.base_id = base_id`
			`self.table_name_or_id = table_name_or_id`
			`self.batch_size = batch_size`
			`self._airtable_client: AirtableApi \| None = None`
			`self.size_threshold = AIRTABLE_CONNECTOR_SIZE_THRESHOLD`

Fix: enable sync deleted file in airtable (#14438) ### What problem does this PR solve? Fix: enable sync deleted file in airtable ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) 2026-04-28 20:09:08 +08:00			`def _iter_attachment_entries(self) -> Generator[tuple[str, str, str, str, str \| None, dict[str, Any]], None, None]:`
			`if not self._airtable_client:`
			`raise ConnectorMissingCredentialError("Airtable credentials not loaded")`

			`table = self.airtable_client.table(self.base_id, self.table_name_or_id)`
			`records = table.all()`

			`logging.info(`
			`f"Starting Airtable attachment scan for table {self.table_name_or_id}, "`
			`f"{len(records)} records found."`
			`)`

			`for record in records:`
			`record_id = record.get("id")`
			`fields = record.get("fields", {})`
			`created_time = record.get("createdTime")`

			`for field_value in fields.values():`
			`if not isinstance(field_value, list):`
			`continue`

			`for attachment in field_value:`
			`filename = attachment.get("filename")`
			`attachment_id = attachment.get("id")`

			`if not record_id or not filename or not attachment_id:`
			`continue`

			`yield (`
			`record_id,`
			`attachment_id,`
			`filename,`
			`f"airtable:{record_id}:{attachment_id}",`
			`created_time,`
			`attachment,`
			`)`

Feat: add Airtable connector and integration for data synchronization (#12211) ### What problem does this PR solve? change: add Airtable connector and integration for data synchronization ### Type of change - [x] New Feature (non-breaking change which adds functionality) 2025-12-25 17:50:41 +08:00			`# -------------------------`
			`# Credentials`
			`# -------------------------`
			`def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] \| None:`
			`self._airtable_client = AirtableApi(credentials["airtable_access_token"])`
			`return None`

			`@property`
			`def airtable_client(self) -> AirtableApi:`
			`if not self._airtable_client:`
			`raise AirtableClientNotSetUpError()`
			`return self._airtable_client`

			`# -------------------------`
			`# Core logic`
			`# -------------------------`
			`def load_from_state(self) -> GenerateDocumentsOutput:`
			`"""`
			`Fetch all Airtable records and ingest attachments as raw blobs.`

			`Each attachment is converted into a single Document(blob=...).`
			`"""`
			`if not self._airtable_client:`
			`raise ConnectorMissingCredentialError("Airtable credentials not loaded")`

			`batch: list[Document] = []`

Fix: enable sync deleted file in airtable (#14438) ### What problem does this PR solve? Fix: enable sync deleted file in airtable ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) 2026-04-28 20:09:08 +08:00			`for record_id, attachment_id, filename, doc_id, created_time, attachment in self._iter_attachment_entries():`
			`url = attachment.get("url")`
			`if not url or not created_time:`
			`continue`

			`try:`
			`resp = requests.get(url, timeout=30)`
			`resp.raise_for_status()`
			`content = resp.content`
			`except Exception:`
			`logging.exception(`
			`f"Failed to download attachment {filename} "`
			`f"(record={record_id})"`
			`)`
			`continue`
			`size_bytes = extract_size_bytes(attachment)`
			`if (`
			`self.size_threshold is not None`
			`and isinstance(size_bytes, int)`
			`and size_bytes > self.size_threshold`
			`):`
			`logging.warning(`
			`f"{filename} exceeds size threshold of {self.size_threshold}. Skipping."`
			`)`
			`continue`
			`batch.append(`
			`Document(`
			`id=doc_id,`
			`blob=content,`
			`source=DocumentSource.AIRTABLE,`
			`semantic_identifier=filename,`
			`extension=get_file_ext(filename),`
			`size_bytes=size_bytes if size_bytes else 0,`
			`doc_updated_at=datetime.strptime(created_time, "%Y-%m-%dT%H:%M:%S.%fZ").replace(tzinfo=timezone.utc)`
			`)`
			`)`

			`if len(batch) >= self.batch_size:`
			`yield batch`
			`batch = []`
Feat: add Airtable connector and integration for data synchronization (#12211) ### What problem does this PR solve? change: add Airtable connector and integration for data synchronization ### Type of change - [x] New Feature (non-breaking change which adds functionality) 2025-12-25 17:50:41 +08:00
Fix: enable sync deleted file in airtable (#14438) ### What problem does this PR solve? Fix: enable sync deleted file in airtable ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) 2026-04-28 20:09:08 +08:00			`if batch:`
			`yield batch`
Feat: add Airtable connector and integration for data synchronization (#12211) ### What problem does this PR solve? change: add Airtable connector and integration for data synchronization ### Type of change - [x] New Feature (non-breaking change which adds functionality) 2025-12-25 17:50:41 +08:00
Fix: enable sync deleted file in airtable (#14438) ### What problem does this PR solve? Fix: enable sync deleted file in airtable ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) 2026-04-28 20:09:08 +08:00			`def retrieve_all_slim_docs_perm_sync(`
			`self,`
			`callback: Any = None,`
			`) -> GenerateSlimDocumentOutput:`
			`del callback`
Feat: add Airtable connector and integration for data synchronization (#12211) ### What problem does this PR solve? change: add Airtable connector and integration for data synchronization ### Type of change - [x] New Feature (non-breaking change which adds functionality) 2025-12-25 17:50:41 +08:00
Fix: enable sync deleted file in airtable (#14438) ### What problem does this PR solve? Fix: enable sync deleted file in airtable ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) 2026-04-28 20:09:08 +08:00			`batch: list[SlimDocument] = []`
Feat: add Airtable connector and integration for data synchronization (#12211) ### What problem does this PR solve? change: add Airtable connector and integration for data synchronization ### Type of change - [x] New Feature (non-breaking change which adds functionality) 2025-12-25 17:50:41 +08:00
Fix: enable sync deleted file in airtable (#14438) ### What problem does this PR solve? Fix: enable sync deleted file in airtable ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) 2026-04-28 20:09:08 +08:00			`for _, _, _, doc_id, _, _ in self._iter_attachment_entries():`
			`batch.append(SlimDocument(id=doc_id))`
			`if len(batch) >= self.batch_size:`
			`yield batch`
			`batch = []`
Feat: add Airtable connector and integration for data synchronization (#12211) ### What problem does this PR solve? change: add Airtable connector and integration for data synchronization ### Type of change - [x] New Feature (non-breaking change which adds functionality) 2025-12-25 17:50:41 +08:00
			`if batch:`
			`yield batch`

Feat: add IMAP data source integration with configuration and sync capabilities (#12316) ### What problem does this PR solve? issue: #12217 [#12313](https://github.com/infiniflow/ragflow/issues/12313) change: add IMAP data source integration with configuration and sync capabilities ### Type of change - [x] New Feature (non-breaking change which adds functionality) 2025-12-30 17:09:13 +08:00			`def poll_source(self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch) -> Generator[list[Document], None, None]:`
			`"""Poll source to get documents"""`
			`start_dt = datetime.fromtimestamp(start, tz=timezone.utc)`
			`end_dt = datetime.fromtimestamp(end, tz=timezone.utc)`

			`for batch in self.load_from_state():`
			`filtered: list[Document] = []`

			`for doc in batch:`
			`if not doc.doc_updated_at:`
			`continue`

			`doc_dt = doc.doc_updated_at.astimezone(timezone.utc)`

			`if start_dt <= doc_dt < end_dt:`
			`filtered.append(doc)`

			`if filtered:`
			`yield filtered`

Feat: add Airtable connector and integration for data synchronization (#12211) ### What problem does this PR solve? change: add Airtable connector and integration for data synchronization ### Type of change - [x] New Feature (non-breaking change which adds functionality) 2025-12-25 17:50:41 +08:00			`if __name__ == "__main__":`
			`import os`

			`logging.basicConfig(level=logging.DEBUG)`
			`connector = AirtableConnector("xxx","xxx")`
			`connector.load_credentials({"airtable_access_token": os.environ.get("AIRTABLE_ACCESS_TOKEN")})`
			`connector.validate_connector_settings()`
			`document_batches = connector.load_from_state()`
			`try:`
			`first_batch = next(document_batches)`
			`print(f"Loaded {len(first_batch)} documents in first batch.")`
			`for doc in first_batch:`
			`print(f"- {doc.semantic_identifier} ({doc.size_bytes} bytes)")`
			`except StopIteration:`
Fix: enable sync deleted file in airtable (#14438) ### What problem does this PR solve? Fix: enable sync deleted file in airtable ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) 2026-04-28 20:09:08 +08:00			`print("No documents available in Dropbox.")`