common/data_source/models.py

"""Data model definitions for all connectors"""
from dataclasses import dataclass
from datetime import datetime
from typing import Any, Optional, List, Sequence, NamedTuple
from typing_extensions import TypedDict, NotRequired
from pydantic import BaseModel
from enum import Enum


@dataclass(frozen=True)
class ExternalAccess:
    """Immutable description of who can reach a document in its external source.

    Stores the user emails and group identifiers that were granted access,
    together with a flag marking the document as public.
    """

    # Arbitrary ceiling meant to guard against oversized permission sets.
    # Nothing in this class enforces it; a caller can compare `num_entries`
    # against it before using the instance.
    MAX_NUM_ENTRIES = 5000

    # Emails of external users with access to the doc externally
    external_user_emails: set[str]
    # Names or external IDs of groups with access to the doc
    external_user_group_ids: set[str]
    # Whether the document is public in the external system or Onyx
    is_public: bool

    def __str__(self) -> str:
        """Render the instance with each set clipped so log lines stay short."""

        def _clip(values: set[str], limit: int = 100) -> str:
            # Cut the set's textual form at `limit` characters and append the
            # real item count so the reader still knows how large it was.
            rendered = str(values)
            if len(rendered) <= limit:
                return rendered
            return f"{rendered[:limit]}... ({len(values)} items)"

        emails = _clip(self.external_user_emails)
        groups = _clip(self.external_user_group_ids)
        return (
            f"ExternalAccess(external_user_emails={emails}, "
            f"external_user_group_ids={groups}, "
            f"is_public={self.is_public})"
        )

    @property
    def num_entries(self) -> int:
        """Total number of user emails plus group ids held by this instance."""
        return sum(
            len(entries)
            for entries in (self.external_user_emails, self.external_user_group_ids)
        )

    @classmethod
    def public(cls) -> "ExternalAccess":
        """Build an instance with no users or groups and `is_public` set to `True`."""
        return cls(external_user_emails=set(), external_user_group_ids=set(), is_public=True)

    @classmethod
    def empty(cls) -> "ExternalAccess":
        """
        Build an instance with no user emails, no group ids and `is_public` set to `False`.

        The result grants access to nobody, i.e. the document is effectively "private".
        This is a reasonable fallback during permission syncing when a document's
        permissions cannot be determined.
        """
        return cls(external_user_emails=set(), external_user_group_ids=set(), is_public=False)


class ExtractionResult(NamedTuple):
    """Structured result from text and image extraction from various file types."""

    # The extracted text.
    text_content: str
    # One (bytes, str) pair per embedded image; the str is presumably the
    # image's name or identifier — TODO confirm against the producers.
    embedded_images: Sequence[tuple[bytes, str]]
    # Free-form key/value metadata accompanying the extraction.
    metadata: dict[str, Any]


class TextSection(BaseModel):
    """Text section model: a piece of text paired with a link."""
    # Presumably a URL/reference to where the text came from — TODO confirm.
    link: str
    # The section's text content.
    text: str


class ImageSection(BaseModel):
    """Image section model: an image reference paired with a link."""
    # Presumably a URL/reference to where the image came from — TODO confirm.
    link: str
    # Identifier of the image file; which store it refers to is not visible here.
    image_file_id: str


class Document(BaseModel):
    """Document model

    The first seven fields are required; access info, owners and both
    metadata dictionaries are optional and default to None.
    """
    id: str
    source: str
    semantic_identifier: str
    extension: str
    blob: bytes
    doc_updated_at: datetime
    size_bytes: int
    # NOTE(review): "externale_access" looks like a typo of "external_access"
    # (the spelling SlimDocument uses). It is left as-is because renaming the
    # field would break callers that pass or read it under the current name.
    externale_access: Optional[ExternalAccess] = None
    # Untyped list; element type is not constrained here.
    primary_owners: Optional[list] = None
    metadata: Optional[dict[str, Any]] = None
    doc_metadata: Optional[dict[str, Any]] = None


class BasicExpertInfo(BaseModel):
    """Expert information model; every field is optional and defaults to None."""
    display_name: Optional[str] = None
    first_name: Optional[str] = None
    last_name: Optional[str] = None
    email: Optional[str] = None

    def get_semantic_name(self) -> str:
        """Return the best available name for display.

        Preference order: `display_name`; otherwise the first and last name
        (whichever of them are set) joined by a single space; otherwise the
        literal "Unknown". Empty strings count as "not set".
        """
        if self.display_name:
            return self.display_name

        # Keep only the name parts that are actually populated.
        name_parts = [part for part in (self.first_name, self.last_name) if part]
        if not name_parts:
            return "Unknown"
        return " ".join(name_parts)


class SlimDocument(BaseModel):
    """Simplified document model (contains only ID and permission info)"""
    id: str
    # Typed as Any, so no validation is applied to the access object here.
    external_access: Optional[Any] = None


class ConnectorCheckpoint(BaseModel):
    """Connector checkpoint model"""
    # Defaults to True, i.e. a freshly created checkpoint reports more work.
    has_more: bool = True


class DocumentFailure(BaseModel):
    """Document processing failure information: which document failed."""
    # Identifier of the document that failed.
    document_id: str
    # Link to that document; both fields are required.
    document_link: str


class EntityFailure(BaseModel):
    """Entity processing failure information"""
    entity_id: str
    # Pair of datetimes; presumably (start, end) of the window that was
    # missed — TODO confirm the ordering with the code that builds it.
    missed_time_range: tuple[datetime, datetime]


class ConnectorFailure(BaseModel):
    """Connector failure information

    Only `failure_message` is required. The document, entity and exception
    fields are optional; this model does not check that any of them is set.
    """
    failed_document: Optional[DocumentFailure] = None
    failed_entity: Optional[EntityFailure] = None
    failure_message: str
    exception: Optional[Exception] = None

    # Needed so pydantic accepts the non-pydantic `Exception` type used above.
    model_config = {"arbitrary_types_allowed": True}


# Gmail Models
class GmailCredentials(BaseModel):
    """Gmail authentication credentials model"""
    primary_admin_email: str
    # Free-form credential payload; its keys are not constrained by this model.
    credentials: dict[str, Any]


class GmailThread(BaseModel):
    """Gmail thread data model"""
    id: str
    # Messages are kept as plain dictionaries (not GmailMessage instances).
    messages: list[dict[str, Any]]


class GmailMessage(BaseModel):
    """Gmail message data model"""
    id: str
    # Message payload as a plain dictionary; structure is not validated here.
    payload: dict[str, Any]
    # Optional list of label identifiers; None when not provided.
    label_ids: Optional[list[str]] = None


# Notion Models
class NotionPage(BaseModel):
    """Represents a Notion Page object"""
    id: str
    # Both timestamps are kept as strings; no datetime parsing happens here.
    created_time: str
    last_edited_time: str
    archived: bool
    properties: dict[str, Any]
    url: str
    parent: Optional[dict[str, Any]] = None  # Parent reference for path reconstruction
    database_name: Optional[str] = None  # Only applicable to database type pages


class NotionBlock(BaseModel):
    """Represents a Notion Block object; all three fields are required."""
    id: str  # Used for the URL
    text: str
    prefix: str  # How this block should be joined with existing text


class NotionSearchResponse(BaseModel):
    """Represents the response from the Notion Search API"""
    # Result objects are kept as plain dictionaries.
    results: list[dict[str, Any]]
    # NOTE(review): no default is declared, so under pydantic v2 this key must
    # be present in the input even though None is an accepted value — confirm
    # that is intended.
    next_cursor: Optional[str]
    has_more: bool = False


class NotionCredentials(BaseModel):
    """Notion authentication credentials model"""
    # The only credential carried by this model; required.
    integration_token: str


# Slack Models
class ChannelTopicPurposeType(TypedDict):
    """Slack channel topic or purpose (used for both `topic` and `purpose` in ChannelType)"""
    value: str
    creator: str
    # Integer; presumably a Unix timestamp of the last change — TODO confirm.
    last_set: int


class ChannelType(TypedDict):
    """Slack channel

    All keys are required (none is marked NotRequired). The key names appear
    to follow Slack's channel/conversation payload — verify against the Slack
    API reference before relying on any individual field.
    """
    id: str
    name: str
    is_channel: bool
    is_group: bool
    is_im: bool
    created: int
    creator: str
    is_archived: bool
    is_general: bool
    unlinked: int
    name_normalized: str
    is_shared: bool
    is_ext_shared: bool
    is_org_shared: bool
    pending_shared: List[str]
    is_pending_ext_shared: bool
    is_member: bool
    is_private: bool
    is_mpim: bool
    updated: int
    topic: ChannelTopicPurposeType
    purpose: ChannelTopicPurposeType
    previous_names: List[str]
    num_members: int


class AttachmentType(TypedDict):
    """Slack message attachment

    Every key is NotRequired, so an empty dict is a valid AttachmentType.
    """
    service_name: NotRequired[str]
    text: NotRequired[str]
    fallback: NotRequired[str]
    thumb_url: NotRequired[str]
    thumb_width: NotRequired[int]
    thumb_height: NotRequired[int]
    id: NotRequired[int]


class BotProfileType(TypedDict):
    """Slack bot profile

    Every key is NotRequired, so an empty dict is a valid BotProfileType.
    """
    id: NotRequired[str]
    deleted: NotRequired[bool]
    name: NotRequired[str]
    updated: NotRequired[int]
    app_id: NotRequired[str]
    team_id: NotRequired[str]


class MessageType(TypedDict):
    """Slack message

    `type`, `user`, `text` and `ts` are required; the remaining keys may be
    absent.
    """
    type: str
    user: str
    text: str
    # Kept as a string, not a number.
    ts: str
    attachments: NotRequired[List[AttachmentType]]
    bot_id: NotRequired[str]
    app_id: NotRequired[str]
    bot_profile: NotRequired[BotProfileType]
    # Presumably the `ts` of the thread's parent message — TODO confirm.
    thread_ts: NotRequired[str]
    subtype: NotRequired[str]


# Thread message list: a thread is represented as a list of MessageType dicts.
ThreadType = List[MessageType]


class SlackCheckpoint(TypedDict):
    """Slack checkpoint

    All keys are required; `channel_ids`, `current_channel` and
    `current_channel_access` may hold None.
    """
    channel_ids: List[str] | None
    # str -> str mapping; presumably channel id -> a timestamp marking how far
    # that channel has been processed — TODO confirm with the Slack connector.
    channel_completion_map: dict[str, str]
    current_channel: ChannelType | None
    # Untyped (Any); the concrete access type is not visible in this file.
    current_channel_access: Any | None
    seen_thread_ts: List[str]
    has_more: bool


class SlackMessageFilterReason(str):
    """Slack message filter reason

    NOTE(review): this subclasses plain `str`, not `Enum` (unlike
    SeafileSyncScope in this module), so BOT and DISALLOWED are ordinary
    string class attributes rather than enum members. Confirm whether
    `(str, Enum)` was intended before changing it, since that would alter
    `str()` output and constructor behaviour.
    """
    BOT = "bot"
    DISALLOWED = "disallowed"


class ProcessedSlackMessage:
    """Processed Slack message

    A plain attribute holder: each constructor argument defaults to None and
    is stored unchanged on the attribute of the same name.
    """

    def __init__(
        self,
        doc=None,
        thread_or_message_ts=None,
        filter_reason=None,
        failure=None,
    ):
        # No validation or conversion is performed on any of the values.
        self.doc = doc
        self.thread_or_message_ts = thread_or_message_ts
        self.filter_reason = filter_reason
        self.failure = failure


class SeafileSyncScope(str, Enum):
    """Defines how much of SeaFile to synchronise.

    Members are also `str` instances, so they compare equal to their plain
    string values (e.g. "account").
    """
    ACCOUNT = "account"      # All libraries the token can see
    LIBRARY = "library"      # A single library (repo)
    DIRECTORY = "directory"  # A single directory inside a library
# Type aliases for type hints
# Seconds since the Unix epoch, as a float.
SecondsSinceUnixEpoch = float
# The three output aliases below are currently bound to Any, so they document
# intent at call sites but carry no static type information.
GenerateDocumentsOutput = Any
GenerateSlimDocumentOutput = Any
CheckpointOutput = Any
Feat: Support multiple data sources synchronizations (#10954) ### What problem does this PR solve? #10953 ### Type of change - [x] New Feature (non-breaking change which adds functionality) 2025-11-03 19:59:18 +08:00			`"""Data model definitions for all connectors"""`
			`from dataclasses import dataclass`
			`from datetime import datetime`
Feat: refine Confluence connector (#10994) ### What problem does this PR solve? Refine Confluence connector. #10953 ### Type of change - [x] New Feature (non-breaking change which adds functionality) - [x] Refactoring 2025-11-04 17:29:11 +08:00			`from typing import Any, Optional, List, Sequence, NamedTuple`
			`from typing_extensions import TypedDict, NotRequired`
Feat: Support multiple data sources synchronizations (#10954) ### What problem does this PR solve? #10953 ### Type of change - [x] New Feature (non-breaking change which adds functionality) 2025-11-03 19:59:18 +08:00			`from pydantic import BaseModel`
feat(seafile): add library and directory sync scope support (#13153) ### What problem does this PR solve? The SeaFile connector currently synchronises the entire account — every library visible to the authenticated user. This is impractical for users who only need a subset of their data indexed, especially on large SeaFile instances with many shared libraries. This PR introduces granular sync scope support, allowing users to choose between syncing their entire account, a single library, or a specific directory within a library. It also adds support for SeaFile library-scoped API tokens (`/api/v2.1/via-repo-token/` endpoints), enabling tighter access control without exposing account-level credentials. ### Type of change - [ ] Bug Fix (non-breaking change which fixes an issue) - [x] New Feature (non-breaking change which adds functionality) - [ ] Documentation Update - [ ] Refactoring - [ ] Performance Improvement - [ ] Other (please describe): ### Test ``` from seafile_connector import SeaFileConnector import logging import os logging.basicConfig(level=logging.DEBUG) URL = os.environ.get("SEAFILE_URL", "https://seafile.example.com") TOKEN = os.environ.get("SEAFILE_TOKEN", "") REPO_ID = os.environ.get("SEAFILE_REPO_ID", "") SYNC_PATH = os.environ.get("SEAFILE_SYNC_PATH", "/Documents") REPO_TOKEN = os.environ.get("SEAFILE_REPO_TOKEN", "") def _test_scope(scope, repo_id=None, sync_path=None): print(f"\n{'='50}") print(f"Testing scope: {scope}") print(f"{'='50}") creds = {"seafile_token": TOKEN} if TOKEN else {} if REPO_TOKEN and scope in ("library", "directory"): creds["repo_token"] = REPO_TOKEN connector = SeaFileConnector( seafile_url=URL, batch_size=5, sync_scope=scope, include_shared = False, repo_id=repo_id, sync_path=sync_path, ) connector.load_credentials(creds) connector.validate_connector_settings() count = 0 for batch in connector.load_from_state(): for doc in batch: count += 1 print(f" [{count}] {doc.semantic_identifier} " f"({doc.size_bytes} bytes, 
{doc.extension})") print(f"\n-> {scope} scope: {count} document(s) found.\n") # 1. Account scope if TOKEN: _test_scope("account") else: print("\nSkipping account scope (set SEAFILE_TOKEN)") # 2. Library scope if REPO_ID and (TOKEN or REPO_TOKEN): _test_scope("library", repo_id=REPO_ID) else: print("\nSkipping library scope (set SEAFILE_REPO_ID + token)") # 3. Directory scope if REPO_ID and SYNC_PATH and (TOKEN or REPO_TOKEN): _test_scope("directory", repo_id=REPO_ID, sync_path=SYNC_PATH) else: print("\nSkipping directory scope (set SEAFILE_REPO_ID + SEAFILE_SYNC_PATH + token)") ``` 2026-02-28 03:24:28 +01:00			`from enum import Enum`
Feat: Support multiple data sources synchronizations (#10954) ### What problem does this PR solve? #10953 ### Type of change - [x] New Feature (non-breaking change which adds functionality) 2025-11-03 19:59:18 +08:00

			`@dataclass(frozen=True)`
			`class ExternalAccess:`

			`# arbitrary limit to prevent excessively large permissions sets`
			`# not internally enforced ... the caller can check this before using the instance`
			`MAX_NUM_ENTRIES = 5000`

			`# Emails of external users with access to the doc externally`
			`external_user_emails: set[str]`
			`# Names or external IDs of groups with access to the doc`
			`external_user_group_ids: set[str]`
			`# Whether the document is public in the external system or Onyx`
			`is_public: bool`

			`def __str__(self) -> str:`
			`"""Prevent extremely long logs"""`

			`def truncate_set(s: set[str], max_len: int = 100) -> str:`
			`s_str = str(s)`
			`if len(s_str) > max_len:`
			`return f"{s_str[:max_len]}... ({len(s)} items)"`
			`return s_str`

			`return (`
			`f"ExternalAccess("`
			`f"external_user_emails={truncate_set(self.external_user_emails)}, "`
			`f"external_user_group_ids={truncate_set(self.external_user_group_ids)}, "`
			`f"is_public={self.is_public})"`
			`)`

			`@property`
			`def num_entries(self) -> int:`
			`return len(self.external_user_emails) + len(self.external_user_group_ids)`

			`@classmethod`
			`def public(cls) -> "ExternalAccess":`
			`return cls(`
			`external_user_emails=set(),`
			`external_user_group_ids=set(),`
			`is_public=True,`
			`)`

			`@classmethod`
			`def empty(cls) -> "ExternalAccess":`
			`"""`
			A helper function that returns an empty set of external user-emails and group-ids, and sets `is_public` to `False`.
			`This effectively makes the document in question "private" or inaccessible to anyone else.`

Fix errors (#11804) ### What problem does this PR solve? 1. typos 2. grammar errors. ### Type of change - [x] Refactoring Signed-off-by: Jin Hai <haijin.chn@gmail.com> 2025-12-08 12:21:18 +08:00			`This is especially helpful to use when you are performing permission-syncing, and some document's permissions can't`
			be determined (for whatever reason). Setting its `ExternalAccess` to "private" is a feasible fallback.
Feat: Support multiple data sources synchronizations (#10954) ### What problem does this PR solve? #10953 ### Type of change - [x] New Feature (non-breaking change which adds functionality) 2025-11-03 19:59:18 +08:00			`"""`

			`return cls(`
			`external_user_emails=set(),`
			`external_user_group_ids=set(),`
			`is_public=False,`
			`)`


			`class ExtractionResult(NamedTuple):`
			`"""Structured result from text and image extraction from various file types."""`

			`text_content: str`
			`embedded_images: Sequence[tuple[bytes, str]]`
			`metadata: dict[str, Any]`


			`class TextSection(BaseModel):`
			`"""Text section model"""`
			`link: str`
			`text: str`


			`class ImageSection(BaseModel):`
			`"""Image section model"""`
			`link: str`
			`image_file_id: str`


			`class Document(BaseModel):`
			`"""Document model"""`
			`id: str`
			`source: str`
			`semantic_identifier: str`
			`extension: str`
			`blob: bytes`
			`doc_updated_at: datetime`
			`size_bytes: int`
Feat: GitHub connector (#12314) ### What problem does this PR solve? Feat: GitHub connector ### Type of change - [x] New Feature (non-breaking change which adds functionality) 2025-12-30 15:09:52 +08:00			`externale_access: Optional[ExternalAccess] = None`
Feat: add Airtable connector and integration for data synchronization (#12211) ### What problem does this PR solve? change: add Airtable connector and integration for data synchronization ### Type of change - [x] New Feature (non-breaking change which adds functionality) 2025-12-25 17:50:41 +08:00			`primary_owners: Optional[list] = None`
feat: improve metadata handling in connector service (#11421) ### What problem does this PR solve? - Update sync data source to handle metadata properly ### Type of change - [x] New Feature (non-breaking change which adds functionality) --------- Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com> 2025-11-26 12:55:48 +01:00			`metadata: Optional[dict[str, Any]] = None`
Feat: GitHub connector (#12314) ### What problem does this PR solve? Feat: GitHub connector ### Type of change - [x] New Feature (non-breaking change which adds functionality) 2025-12-30 15:09:52 +08:00			`doc_metadata: Optional[dict[str, Any]] = None`
Feat: Support multiple data sources synchronizations (#10954) ### What problem does this PR solve? #10953 ### Type of change - [x] New Feature (non-breaking change which adds functionality) 2025-11-03 19:59:18 +08:00

			`class BasicExpertInfo(BaseModel):`
			`"""Expert information model"""`
			`display_name: Optional[str] = None`
			`first_name: Optional[str] = None`
			`last_name: Optional[str] = None`
			`email: Optional[str] = None`

			`def get_semantic_name(self) -> str:`
			`"""Get semantic name for display"""`
			`if self.display_name:`
			`return self.display_name`
			`elif self.first_name and self.last_name:`
			`return f"{self.first_name} {self.last_name}"`
			`elif self.first_name:`
			`return self.first_name`
			`elif self.last_name:`
			`return self.last_name`
			`else:`
			`return "Unknown"`


			`class SlimDocument(BaseModel):`
			`"""Simplified document model (contains only ID and permission info)"""`
			`id: str`
			`external_access: Optional[Any] = None`


			`class ConnectorCheckpoint(BaseModel):`
			`"""Connector checkpoint model"""`
			`has_more: bool = True`


			`class DocumentFailure(BaseModel):`
			`"""Document processing failure information"""`
			`document_id: str`
			`document_link: str`


			`class EntityFailure(BaseModel):`
			`"""Entity processing failure information"""`
			`entity_id: str`
			`missed_time_range: tuple[datetime, datetime]`


			`class ConnectorFailure(BaseModel):`
			`"""Connector failure information"""`
			`failed_document: Optional[DocumentFailure] = None`
			`failed_entity: Optional[EntityFailure] = None`
			`failure_message: str`
			`exception: Optional[Exception] = None`

			`model_config = {"arbitrary_types_allowed": True}`


			`# Gmail Models`
			`class GmailCredentials(BaseModel):`
			`"""Gmail authentication credentials model"""`
			`primary_admin_email: str`
			`credentials: dict[str, Any]`


			`class GmailThread(BaseModel):`
			`"""Gmail thread data model"""`
			`id: str`
			`messages: list[dict[str, Any]]`


			`class GmailMessage(BaseModel):`
			`"""Gmail message data model"""`
			`id: str`
			`payload: dict[str, Any]`
			`label_ids: Optional[list[str]] = None`


			`# Notion Models`
			`class NotionPage(BaseModel):`
			`"""Represents a Notion Page object"""`
			`id: str`
			`created_time: str`
			`last_edited_time: str`
			`archived: bool`
			`properties: dict[str, Any]`
			`url: str`
Feat: use filepath for files with the same name for all data source types (#11819) ### What problem does this PR solve? When there are multiple files with the same name the file would just duplicate, making it hard to distinguish between the different files. Now if there are multiple files with the same name, they will be named after their folder path in the storage unit. This was done for the webdav connector and with this PR also for Notion, Confluence and S3 Storage. ### Type of change - [x] New Feature (non-breaking change which adds functionality) Contribution by RAGcon GmbH, visit us [here](https://www.ragcon.ai/) 2025-12-18 10:42:43 +01:00			`parent: Optional[dict[str, Any]] = None # Parent reference for path reconstruction`
Feat: Support multiple data sources synchronizations (#10954) ### What problem does this PR solve? #10953 ### Type of change - [x] New Feature (non-breaking change which adds functionality) 2025-11-03 19:59:18 +08:00			`database_name: Optional[str] = None # Only applicable to database type pages`


			`class NotionBlock(BaseModel):`
			`"""Represents a Notion Block object"""`
			`id: str # Used for the URL`
			`text: str`
			`prefix: str # How this block should be joined with existing text`


			`class NotionSearchResponse(BaseModel):`
			`"""Represents the response from the Notion Search API"""`
			`results: list[dict[str, Any]]`
			`next_cursor: Optional[str]`
			`has_more: bool = False`


			`class NotionCredentials(BaseModel):`
			`"""Notion authentication credentials model"""`
			`integration_token: str`


			`# Slack Models`
			`class ChannelTopicPurposeType(TypedDict):`
			`"""Slack channel topic or purpose"""`
			`value: str`
			`creator: str`
			`last_set: int`


			`class ChannelType(TypedDict):`
			`"""Slack channel"""`
			`id: str`
			`name: str`
			`is_channel: bool`
			`is_group: bool`
			`is_im: bool`
			`created: int`
			`creator: str`
			`is_archived: bool`
			`is_general: bool`
			`unlinked: int`
			`name_normalized: str`
			`is_shared: bool`
			`is_ext_shared: bool`
			`is_org_shared: bool`
			`pending_shared: List[str]`
			`is_pending_ext_shared: bool`
			`is_member: bool`
			`is_private: bool`
			`is_mpim: bool`
			`updated: int`
			`topic: ChannelTopicPurposeType`
			`purpose: ChannelTopicPurposeType`
			`previous_names: List[str]`
			`num_members: int`


			`class AttachmentType(TypedDict):`
			`"""Slack message attachment"""`
			`service_name: NotRequired[str]`
			`text: NotRequired[str]`
			`fallback: NotRequired[str]`
			`thumb_url: NotRequired[str]`
			`thumb_width: NotRequired[int]`
			`thumb_height: NotRequired[int]`
			`id: NotRequired[int]`


			`class BotProfileType(TypedDict):`
			`"""Slack bot profile"""`
			`id: NotRequired[str]`
			`deleted: NotRequired[bool]`
			`name: NotRequired[str]`
			`updated: NotRequired[int]`
			`app_id: NotRequired[str]`
			`team_id: NotRequired[str]`


			`class MessageType(TypedDict):`
			`"""Slack message"""`
			`type: str`
			`user: str`
			`text: str`
			`ts: str`
			`attachments: NotRequired[List[AttachmentType]]`
			`bot_id: NotRequired[str]`
			`app_id: NotRequired[str]`
			`bot_profile: NotRequired[BotProfileType]`
			`thread_ts: NotRequired[str]`
			`subtype: NotRequired[str]`


			`# Thread message list`
			`ThreadType = List[MessageType]`


			`class SlackCheckpoint(TypedDict):`
			`"""Slack checkpoint"""`
			`channel_ids: List[str] \| None`
			`channel_completion_map: dict[str, str]`
			`current_channel: ChannelType \| None`
			`current_channel_access: Any \| None`
			`seen_thread_ts: List[str]`
			`has_more: bool`


			`class SlackMessageFilterReason(str):`
			`"""Slack message filter reason"""`
			`BOT = "bot"`
			`DISALLOWED = "disallowed"`


			`class ProcessedSlackMessage:`
			`"""Processed Slack message"""`
			`def __init__(self, doc=None, thread_or_message_ts=None, filter_reason=None, failure=None):`
			`self.doc = doc`
			`self.thread_or_message_ts = thread_or_message_ts`
			`self.filter_reason = filter_reason`
			`self.failure = failure`


feat(seafile): add library and directory sync scope support (#13153) ### What problem does this PR solve? The SeaFile connector currently synchronises the entire account — every library visible to the authenticated user. This is impractical for users who only need a subset of their data indexed, especially on large SeaFile instances with many shared libraries. This PR introduces granular sync scope support, allowing users to choose between syncing their entire account, a single library, or a specific directory within a library. It also adds support for SeaFile library-scoped API tokens (`/api/v2.1/via-repo-token/` endpoints), enabling tighter access control without exposing account-level credentials. ### Type of change - [ ] Bug Fix (non-breaking change which fixes an issue) - [x] New Feature (non-breaking change which adds functionality) - [ ] Documentation Update - [ ] Refactoring - [ ] Performance Improvement - [ ] Other (please describe): ### Test ``` from seafile_connector import SeaFileConnector import logging import os logging.basicConfig(level=logging.DEBUG) URL = os.environ.get("SEAFILE_URL", "https://seafile.example.com") TOKEN = os.environ.get("SEAFILE_TOKEN", "") REPO_ID = os.environ.get("SEAFILE_REPO_ID", "") SYNC_PATH = os.environ.get("SEAFILE_SYNC_PATH", "/Documents") REPO_TOKEN = os.environ.get("SEAFILE_REPO_TOKEN", "") def _test_scope(scope, repo_id=None, sync_path=None): print(f"\n{'='50}") print(f"Testing scope: {scope}") print(f"{'='50}") creds = {"seafile_token": TOKEN} if TOKEN else {} if REPO_TOKEN and scope in ("library", "directory"): creds["repo_token"] = REPO_TOKEN connector = SeaFileConnector( seafile_url=URL, batch_size=5, sync_scope=scope, include_shared = False, repo_id=repo_id, sync_path=sync_path, ) connector.load_credentials(creds) connector.validate_connector_settings() count = 0 for batch in connector.load_from_state(): for doc in batch: count += 1 print(f" [{count}] {doc.semantic_identifier} " f"({doc.size_bytes} bytes, 
{doc.extension})") print(f"\n-> {scope} scope: {count} document(s) found.\n") # 1. Account scope if TOKEN: _test_scope("account") else: print("\nSkipping account scope (set SEAFILE_TOKEN)") # 2. Library scope if REPO_ID and (TOKEN or REPO_TOKEN): _test_scope("library", repo_id=REPO_ID) else: print("\nSkipping library scope (set SEAFILE_REPO_ID + token)") # 3. Directory scope if REPO_ID and SYNC_PATH and (TOKEN or REPO_TOKEN): _test_scope("directory", repo_id=REPO_ID, sync_path=SYNC_PATH) else: print("\nSkipping directory scope (set SEAFILE_REPO_ID + SEAFILE_SYNC_PATH + token)") ``` 2026-02-28 03:24:28 +01:00			`class SeafileSyncScope(str, Enum):`
			`"""Defines how much of SeaFile to synchronise."""`
			`ACCOUNT = "account" # All libraries the token can see`
			`LIBRARY = "library" # A single library (repo)`
			`DIRECTORY = "directory" # A single directory inside a library`
Feat: Support multiple data sources synchronizations (#10954) ### What problem does this PR solve? #10953 ### Type of change - [x] New Feature (non-breaking change which adds functionality) 2025-11-03 19:59:18 +08:00			`# Type aliases for type hints`
			`SecondsSinceUnixEpoch = float`
			`GenerateDocumentsOutput = Any`
			`GenerateSlimDocumentOutput = Any`
Feat: add initial Google Drive connector support (#11147) ### What problem does this PR solve? This feature is primarily ported from the [Onyx](https://github.com/onyx-dot-app/onyx) project with necessary modifications. Thanks for such a brilliant project. Minor: consistently use `google_drive` rather than `google_driver`. <img width="566" height="731" alt="image" src="https://github.com/user-attachments/assets/6f64e70e-881e-42c7-b45f-809d3e0024a4" /> <img width="904" height="830" alt="image" src="https://github.com/user-attachments/assets/dfa7d1ef-819a-4a82-8c52-0999f48ed4a6" /> <img width="911" height="869" alt="image" src="https://github.com/user-attachments/assets/39e792fb-9fbe-4f3d-9b3c-b2265186bc22" /> <img width="947" height="323" alt="image" src="https://github.com/user-attachments/assets/27d70e96-d9c0-42d9-8c89-276919b6d61d" /> ### Type of change - [x] New Feature (non-breaking change which adds functionality) 2025-11-10 19:15:02 +08:00			`CheckpointOutput = Any`