Files
ragflow/common/data_source/models.py

320 lines
8.5 KiB
Python
Raw Normal View History

"""Data model definitions for all connectors"""
from dataclasses import dataclass
from datetime import datetime
from typing import Any, Optional, List, Sequence, NamedTuple
from typing_extensions import TypedDict, NotRequired
from pydantic import BaseModel
feat(seafile): add library and directory sync scope support (#13153) ### What problem does this PR solve? The SeaFile connector currently synchronises the entire account — every library visible to the authenticated user. This is impractical for users who only need a subset of their data indexed, especially on large SeaFile instances with many shared libraries. This PR introduces granular sync scope support, allowing users to choose between syncing their entire account, a single library, or a specific directory within a library. It also adds support for SeaFile library-scoped API tokens (`/api/v2.1/via-repo-token/` endpoints), enabling tighter access control without exposing account-level credentials. ### Type of change - [ ] Bug Fix (non-breaking change which fixes an issue) - [x] New Feature (non-breaking change which adds functionality) - [ ] Documentation Update - [ ] Refactoring - [ ] Performance Improvement - [ ] Other (please describe): ### Test ``` from seafile_connector import SeaFileConnector import logging import os logging.basicConfig(level=logging.DEBUG) URL = os.environ.get("SEAFILE_URL", "https://seafile.example.com") TOKEN = os.environ.get("SEAFILE_TOKEN", "") REPO_ID = os.environ.get("SEAFILE_REPO_ID", "") SYNC_PATH = os.environ.get("SEAFILE_SYNC_PATH", "/Documents") REPO_TOKEN = os.environ.get("SEAFILE_REPO_TOKEN", "") def _test_scope(scope, repo_id=None, sync_path=None): print(f"\n{'='*50}") print(f"Testing scope: {scope}") print(f"{'='*50}") creds = {"seafile_token": TOKEN} if TOKEN else {} if REPO_TOKEN and scope in ("library", "directory"): creds["repo_token"] = REPO_TOKEN connector = SeaFileConnector( seafile_url=URL, batch_size=5, sync_scope=scope, include_shared = False, repo_id=repo_id, sync_path=sync_path, ) connector.load_credentials(creds) connector.validate_connector_settings() count = 0 for batch in connector.load_from_state(): for doc in batch: count += 1 print(f" [{count}] {doc.semantic_identifier} " f"({doc.size_bytes} bytes, {doc.extension})") print(f"\n-> {scope} scope: {count} document(s) found.\n") # 1. Account scope if TOKEN: _test_scope("account") else: print("\nSkipping account scope (set SEAFILE_TOKEN)") # 2. Library scope if REPO_ID and (TOKEN or REPO_TOKEN): _test_scope("library", repo_id=REPO_ID) else: print("\nSkipping library scope (set SEAFILE_REPO_ID + token)") # 3. Directory scope if REPO_ID and SYNC_PATH and (TOKEN or REPO_TOKEN): _test_scope("directory", repo_id=REPO_ID, sync_path=SYNC_PATH) else: print("\nSkipping directory scope (set SEAFILE_REPO_ID + SEAFILE_SYNC_PATH + token)") ```
2026-02-28 03:24:28 +01:00
from enum import Enum
@dataclass(frozen=True)
class ExternalAccess:
# arbitrary limit to prevent excessively large permissions sets
# not internally enforced ... the caller can check this before using the instance
MAX_NUM_ENTRIES = 5000
# Emails of external users with access to the doc externally
external_user_emails: set[str]
# Names or external IDs of groups with access to the doc
external_user_group_ids: set[str]
# Whether the document is public in the external system or Onyx
is_public: bool
def __str__(self) -> str:
"""Prevent extremely long logs"""
def truncate_set(s: set[str], max_len: int = 100) -> str:
s_str = str(s)
if len(s_str) > max_len:
return f"{s_str[:max_len]}... ({len(s)} items)"
return s_str
return (
f"ExternalAccess("
f"external_user_emails={truncate_set(self.external_user_emails)}, "
f"external_user_group_ids={truncate_set(self.external_user_group_ids)}, "
f"is_public={self.is_public})"
)
@property
def num_entries(self) -> int:
return len(self.external_user_emails) + len(self.external_user_group_ids)
@classmethod
def public(cls) -> "ExternalAccess":
return cls(
external_user_emails=set(),
external_user_group_ids=set(),
is_public=True,
)
@classmethod
def empty(cls) -> "ExternalAccess":
"""
A helper function that returns an *empty* set of external user-emails and group-ids, and sets `is_public` to `False`.
This effectively makes the document in question "private" or inaccessible to anyone else.
This is especially helpful to use when you are performing permission-syncing, and some document's permissions can't
be determined (for whatever reason). Setting its `ExternalAccess` to "private" is a feasible fallback.
"""
return cls(
external_user_emails=set(),
external_user_group_ids=set(),
is_public=False,
)
class ExtractionResult(NamedTuple):
"""Structured result from text and image extraction from various file types."""
text_content: str
embedded_images: Sequence[tuple[bytes, str]]
metadata: dict[str, Any]
class TextSection(BaseModel):
"""Text section model"""
link: str
text: str
class ImageSection(BaseModel):
"""Image section model"""
link: str
image_file_id: str
class Document(BaseModel):
"""Document model"""
id: str
source: str
semantic_identifier: str
extension: str
blob: bytes
doc_updated_at: datetime
size_bytes: int
externale_access: Optional[ExternalAccess] = None
primary_owners: Optional[list] = None
metadata: Optional[dict[str, Any]] = None
doc_metadata: Optional[dict[str, Any]] = None
class BasicExpertInfo(BaseModel):
"""Expert information model"""
display_name: Optional[str] = None
first_name: Optional[str] = None
last_name: Optional[str] = None
email: Optional[str] = None
def get_semantic_name(self) -> str:
"""Get semantic name for display"""
if self.display_name:
return self.display_name
elif self.first_name and self.last_name:
return f"{self.first_name} {self.last_name}"
elif self.first_name:
return self.first_name
elif self.last_name:
return self.last_name
else:
return "Unknown"
class SlimDocument(BaseModel):
"""Simplified document model (contains only ID and permission info)"""
id: str
external_access: Optional[Any] = None
class ConnectorCheckpoint(BaseModel):
"""Connector checkpoint model"""
has_more: bool = True
class DocumentFailure(BaseModel):
"""Document processing failure information"""
document_id: str
document_link: str
class EntityFailure(BaseModel):
"""Entity processing failure information"""
entity_id: str
missed_time_range: tuple[datetime, datetime]
class ConnectorFailure(BaseModel):
"""Connector failure information"""
failed_document: Optional[DocumentFailure] = None
failed_entity: Optional[EntityFailure] = None
failure_message: str
exception: Optional[Exception] = None
model_config = {"arbitrary_types_allowed": True}
# Gmail Models
class GmailCredentials(BaseModel):
"""Gmail authentication credentials model"""
primary_admin_email: str
credentials: dict[str, Any]
class GmailThread(BaseModel):
"""Gmail thread data model"""
id: str
messages: list[dict[str, Any]]
class GmailMessage(BaseModel):
"""Gmail message data model"""
id: str
payload: dict[str, Any]
label_ids: Optional[list[str]] = None
# Notion Models
class NotionPage(BaseModel):
"""Represents a Notion Page object"""
id: str
created_time: str
last_edited_time: str
archived: bool
properties: dict[str, Any]
url: str
parent: Optional[dict[str, Any]] = None # Parent reference for path reconstruction
database_name: Optional[str] = None # Only applicable to database type pages
class NotionBlock(BaseModel):
"""Represents a Notion Block object"""
id: str # Used for the URL
text: str
prefix: str # How this block should be joined with existing text
class NotionSearchResponse(BaseModel):
"""Represents the response from the Notion Search API"""
results: list[dict[str, Any]]
next_cursor: Optional[str]
has_more: bool = False
class NotionCredentials(BaseModel):
"""Notion authentication credentials model"""
integration_token: str
# Slack Models
class ChannelTopicPurposeType(TypedDict):
"""Slack channel topic or purpose"""
value: str
creator: str
last_set: int
class ChannelType(TypedDict):
"""Slack channel"""
id: str
name: str
is_channel: bool
is_group: bool
is_im: bool
created: int
creator: str
is_archived: bool
is_general: bool
unlinked: int
name_normalized: str
is_shared: bool
is_ext_shared: bool
is_org_shared: bool
pending_shared: List[str]
is_pending_ext_shared: bool
is_member: bool
is_private: bool
is_mpim: bool
updated: int
topic: ChannelTopicPurposeType
purpose: ChannelTopicPurposeType
previous_names: List[str]
num_members: int
class AttachmentType(TypedDict):
"""Slack message attachment"""
service_name: NotRequired[str]
text: NotRequired[str]
fallback: NotRequired[str]
thumb_url: NotRequired[str]
thumb_width: NotRequired[int]
thumb_height: NotRequired[int]
id: NotRequired[int]
class BotProfileType(TypedDict):
"""Slack bot profile"""
id: NotRequired[str]
deleted: NotRequired[bool]
name: NotRequired[str]
updated: NotRequired[int]
app_id: NotRequired[str]
team_id: NotRequired[str]
class MessageType(TypedDict):
"""Slack message"""
type: str
user: str
text: str
ts: str
attachments: NotRequired[List[AttachmentType]]
bot_id: NotRequired[str]
app_id: NotRequired[str]
bot_profile: NotRequired[BotProfileType]
thread_ts: NotRequired[str]
subtype: NotRequired[str]
# Thread message list
ThreadType = List[MessageType]
class SlackCheckpoint(TypedDict):
"""Slack checkpoint"""
channel_ids: List[str] | None
channel_completion_map: dict[str, str]
current_channel: ChannelType | None
current_channel_access: Any | None
seen_thread_ts: List[str]
has_more: bool
class SlackMessageFilterReason(str):
"""Slack message filter reason"""
BOT = "bot"
DISALLOWED = "disallowed"
class ProcessedSlackMessage:
"""Processed Slack message"""
def __init__(self, doc=None, thread_or_message_ts=None, filter_reason=None, failure=None):
self.doc = doc
self.thread_or_message_ts = thread_or_message_ts
self.filter_reason = filter_reason
self.failure = failure
feat(seafile): add library and directory sync scope support (#13153) ### What problem does this PR solve? The SeaFile connector currently synchronises the entire account — every library visible to the authenticated user. This is impractical for users who only need a subset of their data indexed, especially on large SeaFile instances with many shared libraries. This PR introduces granular sync scope support, allowing users to choose between syncing their entire account, a single library, or a specific directory within a library. It also adds support for SeaFile library-scoped API tokens (`/api/v2.1/via-repo-token/` endpoints), enabling tighter access control without exposing account-level credentials. ### Type of change - [ ] Bug Fix (non-breaking change which fixes an issue) - [x] New Feature (non-breaking change which adds functionality) - [ ] Documentation Update - [ ] Refactoring - [ ] Performance Improvement - [ ] Other (please describe): ### Test ``` from seafile_connector import SeaFileConnector import logging import os logging.basicConfig(level=logging.DEBUG) URL = os.environ.get("SEAFILE_URL", "https://seafile.example.com") TOKEN = os.environ.get("SEAFILE_TOKEN", "") REPO_ID = os.environ.get("SEAFILE_REPO_ID", "") SYNC_PATH = os.environ.get("SEAFILE_SYNC_PATH", "/Documents") REPO_TOKEN = os.environ.get("SEAFILE_REPO_TOKEN", "") def _test_scope(scope, repo_id=None, sync_path=None): print(f"\n{'='*50}") print(f"Testing scope: {scope}") print(f"{'='*50}") creds = {"seafile_token": TOKEN} if TOKEN else {} if REPO_TOKEN and scope in ("library", "directory"): creds["repo_token"] = REPO_TOKEN connector = SeaFileConnector( seafile_url=URL, batch_size=5, sync_scope=scope, include_shared = False, repo_id=repo_id, sync_path=sync_path, ) connector.load_credentials(creds) connector.validate_connector_settings() count = 0 for batch in connector.load_from_state(): for doc in batch: count += 1 print(f" [{count}] {doc.semantic_identifier} " f"({doc.size_bytes} bytes, {doc.extension})") print(f"\n-> {scope} scope: {count} document(s) found.\n") # 1. Account scope if TOKEN: _test_scope("account") else: print("\nSkipping account scope (set SEAFILE_TOKEN)") # 2. Library scope if REPO_ID and (TOKEN or REPO_TOKEN): _test_scope("library", repo_id=REPO_ID) else: print("\nSkipping library scope (set SEAFILE_REPO_ID + token)") # 3. Directory scope if REPO_ID and SYNC_PATH and (TOKEN or REPO_TOKEN): _test_scope("directory", repo_id=REPO_ID, sync_path=SYNC_PATH) else: print("\nSkipping directory scope (set SEAFILE_REPO_ID + SEAFILE_SYNC_PATH + token)") ```
2026-02-28 03:24:28 +01:00
class SeafileSyncScope(str, Enum):
"""Defines how much of SeaFile to synchronise."""
ACCOUNT = "account" # All libraries the token can see
LIBRARY = "library" # A single library (repo)
DIRECTORY = "directory" # A single directory inside a library
# Type aliases for type hints
SecondsSinceUnixEpoch = float
GenerateDocumentsOutput = Any
GenerateSlimDocumentOutput = Any
CheckpointOutput = Any