2025-11-03 19:59:18 +08:00
|
|
|
"""Data model definitions for all connectors"""
|
|
|
|
|
from dataclasses import dataclass
|
|
|
|
|
from datetime import datetime
|
2025-11-04 17:29:11 +08:00
|
|
|
from typing import Any, Optional, List, Sequence, NamedTuple
|
|
|
|
|
from typing_extensions import TypedDict, NotRequired
|
2025-11-03 19:59:18 +08:00
|
|
|
from pydantic import BaseModel
|
feat(seafile): add library and directory sync scope support (#13153)
### What problem does this PR solve?
The SeaFile connector currently synchronises the entire account — every
library
visible to the authenticated user. This is impractical for users who
only need
a subset of their data indexed, especially on large SeaFile instances
with many
shared libraries.
This PR introduces granular sync scope support, allowing users to choose
between
syncing their entire account, a single library, or a specific directory
within a
library. It also adds support for SeaFile library-scoped API tokens
(`/api/v2.1/via-repo-token/` endpoints), enabling tighter access control
without
exposing account-level credentials.
### Type of change
- [ ] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [ ] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):
### Test
```
from seafile_connector import SeaFileConnector
import logging
import os
logging.basicConfig(level=logging.DEBUG)
URL = os.environ.get("SEAFILE_URL", "https://seafile.example.com")
TOKEN = os.environ.get("SEAFILE_TOKEN", "")
REPO_ID = os.environ.get("SEAFILE_REPO_ID", "")
SYNC_PATH = os.environ.get("SEAFILE_SYNC_PATH", "/Documents")
REPO_TOKEN = os.environ.get("SEAFILE_REPO_TOKEN", "")
def _test_scope(scope, repo_id=None, sync_path=None):
print(f"\n{'='*50}")
print(f"Testing scope: {scope}")
print(f"{'='*50}")
creds = {"seafile_token": TOKEN} if TOKEN else {}
if REPO_TOKEN and scope in ("library", "directory"):
creds["repo_token"] = REPO_TOKEN
connector = SeaFileConnector(
seafile_url=URL,
batch_size=5,
sync_scope=scope,
include_shared = False,
repo_id=repo_id,
sync_path=sync_path,
)
connector.load_credentials(creds)
connector.validate_connector_settings()
count = 0
for batch in connector.load_from_state():
for doc in batch:
count += 1
print(f" [{count}] {doc.semantic_identifier} "
f"({doc.size_bytes} bytes, {doc.extension})")
print(f"\n-> {scope} scope: {count} document(s) found.\n")
# 1. Account scope
if TOKEN:
_test_scope("account")
else:
print("\nSkipping account scope (set SEAFILE_TOKEN)")
# 2. Library scope
if REPO_ID and (TOKEN or REPO_TOKEN):
_test_scope("library", repo_id=REPO_ID)
else:
print("\nSkipping library scope (set SEAFILE_REPO_ID + token)")
# 3. Directory scope
if REPO_ID and SYNC_PATH and (TOKEN or REPO_TOKEN):
_test_scope("directory", repo_id=REPO_ID, sync_path=SYNC_PATH)
else:
print("\nSkipping directory scope (set SEAFILE_REPO_ID + SEAFILE_SYNC_PATH + token)")
```
2026-02-28 03:24:28 +01:00
|
|
|
from enum import Enum
|
2025-11-03 19:59:18 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass(frozen=True)
class ExternalAccess:
    """Immutable record of who can access a document in its external source.

    Stores the user emails and group identifiers that have access, plus a
    flag telling whether the document is public.
    """

    # arbitrary limit to prevent excessively large permissions sets
    # not internally enforced ... the caller can check this before using the instance
    # (it carries no annotation, so the dataclass machinery treats it as a
    # plain class attribute rather than a field)
    MAX_NUM_ENTRIES = 5000

    # Emails of external users with access to the doc externally
    external_user_emails: set[str]
    # Names or external IDs of groups with access to the doc
    external_user_group_ids: set[str]
    # Whether the document is public in the external system or Onyx
    is_public: bool

    def __str__(self) -> str:
        """Prevent extremely long logs.

        Each set is rendered with ``str()``; a rendering longer than 100
        characters is cut and suffixed with the number of items.
        """

        def truncate_set(s: set[str], max_len: int = 100) -> str:
            # Truncation is applied to the rendered text, not to the number
            # of elements, so the output length stays bounded.
            s_str = str(s)
            if len(s_str) > max_len:
                return f"{s_str[:max_len]}... ({len(s)} items)"
            return s_str

        return (
            f"ExternalAccess("
            f"external_user_emails={truncate_set(self.external_user_emails)}, "
            f"external_user_group_ids={truncate_set(self.external_user_group_ids)}, "
            f"is_public={self.is_public})"
        )

    @property
    def num_entries(self) -> int:
        """Return the number of user emails plus the number of group ids."""
        return len(self.external_user_emails) + len(self.external_user_group_ids)

    @classmethod
    def public(cls) -> "ExternalAccess":
        """Return an instance with no users, no groups and `is_public=True`."""
        return cls(
            external_user_emails=set(),
            external_user_group_ids=set(),
            is_public=True,
        )

    @classmethod
    def empty(cls) -> "ExternalAccess":
        """
        A helper function that returns an *empty* set of external user-emails and group-ids, and sets `is_public` to `False`.
        This effectively makes the document in question "private" or inaccessible to anyone else.

        This is especially helpful to use when you are performing permission-syncing, and some document's permissions can't
        be determined (for whatever reason). Setting its `ExternalAccess` to "private" is a feasible fallback.
        """
        return cls(
            external_user_emails=set(),
            external_user_group_ids=set(),
            is_public=False,
        )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class ExtractionResult(NamedTuple):
    """Structured result from text and image extraction from various file types."""

    # Extracted text.
    text_content: str
    # Sequence of (bytes, str) pairs, one per embedded image.
    # NOTE(review): the str is presumably the image's name — confirm against the producer.
    embedded_images: Sequence[tuple[bytes, str]]
    # Free-form metadata with string keys.
    metadata: dict[str, Any]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TextSection(BaseModel):
    """Text section model: a link paired with a piece of text."""

    link: str
    text: str
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class ImageSection(BaseModel):
    """Image section model: a link paired with the id of an image file."""

    link: str
    image_file_id: str
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Document(BaseModel):
    """Document model"""

    id: str
    source: str
    semantic_identifier: str
    extension: str
    blob: bytes
    doc_updated_at: datetime
    size_bytes: int
    # NOTE(review): the field name is misspelled ("externale_access"), while
    # SlimDocument spells its counterpart "external_access". The name is kept
    # unchanged here because renaming it would be a breaking change for any
    # caller that sets or reads it.
    externale_access: Optional[ExternalAccess] = None
    primary_owners: Optional[list] = None
    metadata: Optional[dict[str, Any]] = None
    doc_metadata: Optional[dict[str, Any]] = None
    # Opaque, connector-supplied fingerprint stored in Document.content_hash for
    # change-detection. 32-char hex string; format is per-source (xxhash128 of
    # bytes for local uploads, xxhash128(ETag) for blob storage, etc.). When set
    # on a yielded Document, the orchestrator persists it as content_hash and
    # skips the post-download xxhash128(blob) recomputation.
    fingerprint: Optional[str] = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class KeyRecord(BaseModel):
    """One entry returned by a FingerprintConnector.list_keys() call.

    A KeyRecord is the cheap-listing primitive: connector enumerates all keys
    it has, attaches a fingerprint when the source exposes one, and the
    orchestrator only fetches content when the fingerprint differs from what's
    persisted.
    """

    key: str
    # None when the source exposes no fingerprint for this key.
    fingerprint: Optional[str] = None
    # NOTE(review): presumably marks a key that no longer exists in the
    # source — confirm against the code that consumes KeyRecord.
    deleted: bool = False
|
2025-11-03 19:59:18 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
class BasicExpertInfo(BaseModel):
    """Expert information model"""

    display_name: Optional[str] = None
    first_name: Optional[str] = None
    last_name: Optional[str] = None
    email: Optional[str] = None

    def get_semantic_name(self) -> str:
        """Get semantic name for display.

        Preference order: `display_name`, then "first last" when both parts
        are set, then whichever single name part is set, then "Unknown".
        Empty strings count as unset. `email` is never used.
        """
        if self.display_name:
            return self.display_name
        if self.first_name and self.last_name:
            return f"{self.first_name} {self.last_name}"
        # At most one of the two name parts is set at this point.
        return self.first_name or self.last_name or "Unknown"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class SlimDocument(BaseModel):
    """Simplified document model (contains only ID and permission info)"""

    id: str
    # Typed as Any, so no structure is enforced on the permission info here.
    external_access: Optional[Any] = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class ConnectorCheckpoint(BaseModel):
    """Connector checkpoint model"""

    # Defaults to True.
    # NOTE(review): presumably means "more data remains to be fetched" — confirm with the connectors.
    has_more: bool = True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class DocumentFailure(BaseModel):
    """Document processing failure information"""

    document_id: str
    document_link: str
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class EntityFailure(BaseModel):
    """Entity processing failure information"""

    entity_id: str
    # Pair of datetimes.
    # NOTE(review): presumably (start, end) of the window that was not processed — confirm.
    missed_time_range: tuple[datetime, datetime]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class ConnectorFailure(BaseModel):
    """Connector failure information"""

    failed_document: Optional[DocumentFailure] = None
    failed_entity: Optional[EntityFailure] = None
    # The only field without a default.
    failure_message: str
    exception: Optional[Exception] = None

    # Exception is not a pydantic-native type; this setting lets the model
    # accept it as a field type.
    model_config = {"arbitrary_types_allowed": True}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Gmail Models
|
|
|
|
|
class GmailCredentials(BaseModel):
    """Gmail authentication credentials model"""

    primary_admin_email: str
    # Credential payload with string keys; its schema is not defined here.
    credentials: dict[str, Any]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class GmailThread(BaseModel):
    """Gmail thread data model"""

    id: str
    # Messages kept as plain dicts; their schema is not defined here.
    messages: list[dict[str, Any]]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class GmailMessage(BaseModel):
    """Gmail message data model"""

    id: str
    # Message payload kept as a plain dict; its schema is not defined here.
    payload: dict[str, Any]
    label_ids: Optional[list[str]] = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Notion Models
|
|
|
|
|
class NotionPage(BaseModel):
    """Represents a Notion Page object"""

    id: str
    # Timestamps are stored as strings, not as datetime objects.
    created_time: str
    last_edited_time: str
    archived: bool
    properties: dict[str, Any]
    url: str
    parent: Optional[dict[str, Any]] = None  # Parent reference for path reconstruction
    database_name: Optional[str] = None  # Only applicable to database type pages
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class NotionBlock(BaseModel):
    """Represents a Notion Block object"""

    id: str  # Used for the URL
    text: str
    prefix: str  # How this block should be joined with existing text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class NotionSearchResponse(BaseModel):
    """Represents the response from the Notion Search API"""

    results: list[dict[str, Any]]
    # Declared without a default: under pydantic v2 semantics the value must
    # be supplied explicitly, although None is an accepted value.
    next_cursor: Optional[str]
    has_more: bool = False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class NotionCredentials(BaseModel):
    """Notion authentication credentials model"""

    integration_token: str
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Slack Models
|
|
|
|
|
class ChannelTopicPurposeType(TypedDict):
    """Slack channel topic or purpose (all keys required)"""

    value: str
    creator: str
    last_set: int
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class ChannelType(TypedDict):
    """Slack channel (all keys required)"""

    id: str
    name: str
    is_channel: bool
    is_group: bool
    is_im: bool
    created: int
    creator: str
    is_archived: bool
    is_general: bool
    unlinked: int
    name_normalized: str
    is_shared: bool
    is_ext_shared: bool
    is_org_shared: bool
    pending_shared: List[str]
    is_pending_ext_shared: bool
    is_member: bool
    is_private: bool
    is_mpim: bool
    updated: int
    # topic and purpose share the same value/creator/last_set shape.
    topic: ChannelTopicPurposeType
    purpose: ChannelTopicPurposeType
    previous_names: List[str]
    num_members: int
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class AttachmentType(TypedDict):
|
|
|
|
|
"""Slack message attachment"""
|
|
|
|
|
service_name: NotRequired[str]
|
|
|
|
|
text: NotRequired[str]
|
|
|
|
|
fallback: NotRequired[str]
|
|
|
|
|
thumb_url: NotRequired[str]
|
|
|
|
|
thumb_width: NotRequired[int]
|
|
|
|
|
thumb_height: NotRequired[int]
|
|
|
|
|
id: NotRequired[int]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class BotProfileType(TypedDict):
|
|
|
|
|
"""Slack bot profile"""
|
|
|
|
|
id: NotRequired[str]
|
|
|
|
|
deleted: NotRequired[bool]
|
|
|
|
|
name: NotRequired[str]
|
|
|
|
|
updated: NotRequired[int]
|
|
|
|
|
app_id: NotRequired[str]
|
|
|
|
|
team_id: NotRequired[str]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class MessageType(TypedDict):
    """Slack message"""

    # Required keys.
    type: str
    user: str
    text: str
    ts: str
    # Optional keys.
    attachments: NotRequired[List[AttachmentType]]
    bot_id: NotRequired[str]
    app_id: NotRequired[str]
    bot_profile: NotRequired[BotProfileType]
    thread_ts: NotRequired[str]
    subtype: NotRequired[str]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Thread message list
ThreadType = List[MessageType]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class SlackCheckpoint(TypedDict):
    """Slack checkpoint (all keys required; some values may be None)"""

    channel_ids: List[str] | None
    # NOTE(review): presumably maps a channel id to a completion marker — confirm with the Slack connector.
    channel_completion_map: dict[str, str]
    current_channel: ChannelType | None
    current_channel_access: Any | None
    seen_thread_ts: List[str]
    has_more: bool
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class SlackMessageFilterReason(str):
    """Slack message filter reason

    This is a plain ``str`` subclass used as a namespace of constants, not an
    ``Enum``: ``BOT`` and ``DISALLOWED`` are ordinary strings, not instances
    of this class.
    """

    BOT = "bot"
    DISALLOWED = "disallowed"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class ProcessedSlackMessage:
    """Processed Slack message

    Plain attribute container; every constructor argument is optional and is
    stored unchanged under the attribute of the same name.
    """

    def __init__(self, doc=None, thread_or_message_ts=None, filter_reason=None, failure=None):
        # NOTE(review): the argument types are not declared here. Presumably
        # these are a Document, a timestamp string, a SlackMessageFilterReason
        # value and a ConnectorFailure — confirm against the Slack connector.
        self.doc = doc
        self.thread_or_message_ts = thread_or_message_ts
        self.filter_reason = filter_reason
        self.failure = failure
|
|
|
|
|
|
|
|
|
|
|
feat(seafile): add library and directory sync scope support (#13153)
### What problem does this PR solve?
The SeaFile connector currently synchronises the entire account — every
library
visible to the authenticated user. This is impractical for users who
only need
a subset of their data indexed, especially on large SeaFile instances
with many
shared libraries.
This PR introduces granular sync scope support, allowing users to choose
between
syncing their entire account, a single library, or a specific directory
within a
library. It also adds support for SeaFile library-scoped API tokens
(`/api/v2.1/via-repo-token/` endpoints), enabling tighter access control
without
exposing account-level credentials.
### Type of change
- [ ] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [ ] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):
### Test
```
from seafile_connector import SeaFileConnector
import logging
import os
logging.basicConfig(level=logging.DEBUG)
URL = os.environ.get("SEAFILE_URL", "https://seafile.example.com")
TOKEN = os.environ.get("SEAFILE_TOKEN", "")
REPO_ID = os.environ.get("SEAFILE_REPO_ID", "")
SYNC_PATH = os.environ.get("SEAFILE_SYNC_PATH", "/Documents")
REPO_TOKEN = os.environ.get("SEAFILE_REPO_TOKEN", "")
def _test_scope(scope, repo_id=None, sync_path=None):
print(f"\n{'='*50}")
print(f"Testing scope: {scope}")
print(f"{'='*50}")
creds = {"seafile_token": TOKEN} if TOKEN else {}
if REPO_TOKEN and scope in ("library", "directory"):
creds["repo_token"] = REPO_TOKEN
connector = SeaFileConnector(
seafile_url=URL,
batch_size=5,
sync_scope=scope,
include_shared = False,
repo_id=repo_id,
sync_path=sync_path,
)
connector.load_credentials(creds)
connector.validate_connector_settings()
count = 0
for batch in connector.load_from_state():
for doc in batch:
count += 1
print(f" [{count}] {doc.semantic_identifier} "
f"({doc.size_bytes} bytes, {doc.extension})")
print(f"\n-> {scope} scope: {count} document(s) found.\n")
# 1. Account scope
if TOKEN:
_test_scope("account")
else:
print("\nSkipping account scope (set SEAFILE_TOKEN)")
# 2. Library scope
if REPO_ID and (TOKEN or REPO_TOKEN):
_test_scope("library", repo_id=REPO_ID)
else:
print("\nSkipping library scope (set SEAFILE_REPO_ID + token)")
# 3. Directory scope
if REPO_ID and SYNC_PATH and (TOKEN or REPO_TOKEN):
_test_scope("directory", repo_id=REPO_ID, sync_path=SYNC_PATH)
else:
print("\nSkipping directory scope (set SEAFILE_REPO_ID + SEAFILE_SYNC_PATH + token)")
```
2026-02-28 03:24:28 +01:00
|
|
|
class SeafileSyncScope(str, Enum):
    """Defines how much of SeaFile to synchronise."""

    ACCOUNT = "account"  # All libraries the token can see
    LIBRARY = "library"  # A single library (repo)
    DIRECTORY = "directory"  # A single directory inside a library
|
2025-11-03 19:59:18 +08:00
|
|
|
# Type aliases for type hints
SecondsSinceUnixEpoch = float
# The three output aliases below are deliberately loose: they resolve to Any,
# so type checkers enforce nothing about the values they annotate.
GenerateDocumentsOutput = Any
GenerateSlimDocumentOutput = Any
CheckpointOutput = Any
|