From d1afcc9e71d2bc063cbb84951337cc172981598b Mon Sep 17 00:00:00 2001 From: Yesid Cano Castro <46203884+yesidc@users.noreply.github.com> Date: Sat, 28 Feb 2026 03:24:28 +0100 Subject: [PATCH] feat(seafile): add library and directory sync scope support (#13153) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What problem does this PR solve? The SeaFile connector currently synchronises the entire account — every library visible to the authenticated user. This is impractical for users who only need a subset of their data indexed, especially on large SeaFile instances with many shared libraries. This PR introduces granular sync scope support, allowing users to choose between syncing their entire account, a single library, or a specific directory within a library. It also adds support for SeaFile library-scoped API tokens (`/api/v2.1/via-repo-token/` endpoints), enabling tighter access control without exposing account-level credentials. ### Type of change - [ ] Bug Fix (non-breaking change which fixes an issue) - [x] New Feature (non-breaking change which adds functionality) - [ ] Documentation Update - [ ] Refactoring - [ ] Performance Improvement - [ ] Other (please describe): ### Test ``` from seafile_connector import SeaFileConnector import logging import os logging.basicConfig(level=logging.DEBUG) URL = os.environ.get("SEAFILE_URL", "https://seafile.example.com") TOKEN = os.environ.get("SEAFILE_TOKEN", "") REPO_ID = os.environ.get("SEAFILE_REPO_ID", "") SYNC_PATH = os.environ.get("SEAFILE_SYNC_PATH", "/Documents") REPO_TOKEN = os.environ.get("SEAFILE_REPO_TOKEN", "") def _test_scope(scope, repo_id=None, sync_path=None): print(f"\n{'='*50}") print(f"Testing scope: {scope}") print(f"{'='*50}") creds = {"seafile_token": TOKEN} if TOKEN else {} if REPO_TOKEN and scope in ("library", "directory"): creds["repo_token"] = REPO_TOKEN connector = SeaFileConnector( seafile_url=URL, batch_size=5, sync_scope=scope, include_shared = 
False, repo_id=repo_id, sync_path=sync_path, ) connector.load_credentials(creds) connector.validate_connector_settings() count = 0 for batch in connector.load_from_state(): for doc in batch: count += 1 print(f" [{count}] {doc.semantic_identifier} " f"({doc.size_bytes} bytes, {doc.extension})") print(f"\n-> {scope} scope: {count} document(s) found.\n") # 1. Account scope if TOKEN: _test_scope("account") else: print("\nSkipping account scope (set SEAFILE_TOKEN)") # 2. Library scope if REPO_ID and (TOKEN or REPO_TOKEN): _test_scope("library", repo_id=REPO_ID) else: print("\nSkipping library scope (set SEAFILE_REPO_ID + token)") # 3. Directory scope if REPO_ID and SYNC_PATH and (TOKEN or REPO_TOKEN): _test_scope("directory", repo_id=REPO_ID, sync_path=SYNC_PATH) else: print("\nSkipping directory scope (set SEAFILE_REPO_ID + SEAFILE_SYNC_PATH + token)") ``` --- common/data_source/models.py | 6 + common/data_source/seafile_connector.py | 568 +++++++++++------- rag/svr/sync_data_source.py | 32 +- web/src/locales/de.ts | 52 +- web/src/locales/en.ts | 54 +- .../data-source/constant/index.tsx | 43 +- .../data-source/constant/seafile-constant.tsx | 210 +++++++ 7 files changed, 700 insertions(+), 265 deletions(-) create mode 100644 web/src/pages/user-setting/data-source/constant/seafile-constant.tsx diff --git a/common/data_source/models.py b/common/data_source/models.py index 5ddafb0172..71f8c27242 100644 --- a/common/data_source/models.py +++ b/common/data_source/models.py @@ -4,6 +4,7 @@ from datetime import datetime from typing import Any, Optional, List, Sequence, NamedTuple from typing_extensions import TypedDict, NotRequired from pydantic import BaseModel +from enum import Enum @dataclass(frozen=True) @@ -306,6 +307,11 @@ class ProcessedSlackMessage: self.failure = failure +class SeafileSyncScope(str, Enum): + """Defines how much of SeaFile to synchronise.""" + ACCOUNT = "account" # All libraries the token can see + LIBRARY = "library" # A single library (repo) + 
DIRECTORY = "directory" # A single directory inside a library # Type aliases for type hints SecondsSinceUnixEpoch = float GenerateDocumentsOutput = Any diff --git a/common/data_source/seafile_connector.py b/common/data_source/seafile_connector.py index 0181269e85..ef7afeecf4 100644 --- a/common/data_source/seafile_connector.py +++ b/common/data_source/seafile_connector.py @@ -1,4 +1,4 @@ -"""SeaFile connector""" +"""SeaFile connector with granular sync support""" import logging from datetime import datetime, timezone from typing import Any, Optional @@ -25,71 +25,151 @@ from common.data_source.models import ( Document, SecondsSinceUnixEpoch, GenerateDocumentsOutput, + SeafileSyncScope, ) logger = logging.getLogger(__name__) - class SeaFileConnector(LoadConnector, PollConnector): - """SeaFile connector for syncing files from SeaFile servers""" + """SeaFile connector supporting account-, library- and directory-level sync. + + API endpoints used: + Account token (api2): + GET /api2/account/info/ + GET /api2/repos/ + GET /api2/repos/{repo_id}/ + GET /api2/repos/{repo_id}/dir/?p=... + GET /api2/repos/{repo_id}/file/?p=...&reuse=1 + + Repo token (api/v2.1/via-repo-token): + GET /api/v2.1/via-repo-token/repo-info/ + GET /api/v2.1/via-repo-token/dir/?path=... + GET /api/v2.1/via-repo-token/download-link/?path=... + """ def __init__( self, seafile_url: str, batch_size: int = INDEX_BATCH_SIZE, include_shared: bool = True, + sync_scope: str = SeafileSyncScope.ACCOUNT, + repo_id: Optional[str] = None, + sync_path: Optional[str] = None, ) -> None: - """Initialize SeaFile connector. 
- - Args: - seafile_url: Base URL of the SeaFile server (e.g., https://seafile.example.com) - batch_size: Number of documents to yield per batch - include_shared: Whether to include shared libraries - """ - self.seafile_url = seafile_url.rstrip("/") - self.api_url = f"{self.seafile_url}/api2" self.batch_size = batch_size self.include_shared = include_shared - self.token: Optional[str] = None + self.sync_scope = SeafileSyncScope(sync_scope) + self.repo_id = repo_id + self.sync_path = self._normalise_path(sync_path) + + self.token: Optional[str] = None # account-level + self.repo_token: Optional[str] = None # library-scoped self.current_user_email: Optional[str] = None self.size_threshold: int = BLOB_STORAGE_SIZE_THRESHOLD - def _get_headers(self) -> dict[str, str]: - """Get authorization headers for API requests""" + self._validate_scope_params() + + + @staticmethod + def _normalise_path(path: Optional[str]) -> str: + if not path: + return "/" + path = path.strip() + if not path.startswith("/"): + path = f"/{path}" + return path.rstrip("/") or "/" + + @staticmethod + def _parse_mtime(raw_mtime) -> datetime: + """Parse mtime from SeaFile API response. 
+ + Handles: + - Unix timestamp as int: 1575514722 + - Unix timestamp as str: "1575514722" + - ISO 8601 datetime str: "2026-02-15T17:26:53+01:00" + - None / missing + """ + if not raw_mtime: + return datetime.now(timezone.utc) + + # Try as unix timestamp (int or numeric string) + if isinstance(raw_mtime, (int, float)): + return datetime.fromtimestamp(raw_mtime, tz=timezone.utc) + + if isinstance(raw_mtime, str): + # Try numeric string first + try: + return datetime.fromtimestamp(int(raw_mtime), tz=timezone.utc) + except ValueError: + pass + + # Try ISO 8601 + try: + return datetime.fromisoformat(raw_mtime) + except ValueError: + pass + + logger.warning("Unparseable mtime %r, using current time", raw_mtime) + return datetime.now(timezone.utc) + + def _validate_scope_params(self) -> None: + if self.sync_scope in (SeafileSyncScope.LIBRARY, SeafileSyncScope.DIRECTORY): + if not self.repo_id: + raise ConnectorValidationError( + f"sync_scope={self.sync_scope.value!r} requires 'repo_id'." + ) + if self.sync_scope == SeafileSyncScope.DIRECTORY: + if self.sync_path == "/": + raise ConnectorValidationError( + "sync_scope='directory' requires a non-root 'sync_path'. " + "Use sync_scope='library' to sync an entire library." 
+ ) + + @property + def _use_repo_token(self) -> bool: + """Whether we should use repo-token endpoints.""" + return self.repo_token is not None + + + def _account_headers(self) -> dict[str, str]: if not self.token: - raise ConnectorMissingCredentialError("SeaFile token not set") + raise ConnectorMissingCredentialError("Account token not set") return { "Authorization": f"Token {self.token}", "Accept": "application/json", } - def _make_get_request(self, endpoint: str, params: Optional[dict] = None): - """Make authenticated GET request""" - url = f"{self.api_url}/{endpoint.lstrip('/')}" - response = rl_requests.get( - url, - headers=self._get_headers(), - params=params, - timeout=60, + def _repo_token_headers(self) -> dict[str, str]: + if not self.repo_token: + raise ConnectorMissingCredentialError("Repo token not set") + return { + "Authorization": f"Bearer {self.repo_token}", # <-- Bearer, not Token + "Accept": "application/json", + } + + def _account_get(self, endpoint: str, params: Optional[dict] = None): + """GET against /api2/... using the account token.""" + url = f"{self.seafile_url}/api2/{endpoint.lstrip('/')}" + resp = rl_requests.get( + url, headers=self._account_headers(), params=params, timeout=60, ) - return response + return resp + + def _repo_token_get(self, endpoint: str, params: Optional[dict] = None): + """GET against /api/v2.1/via-repo-token/... using the repo token.""" + url = f"{self.seafile_url}/api/v2.1/via-repo-token/{endpoint.lstrip('/')}" + resp = rl_requests.get( + url, headers=self._repo_token_headers(), params=params, timeout=60, + ) + return resp + def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None: - """Load and validate SeaFile credentials. 
- - Args: - credentials: Dictionary containing 'seafile_token' or 'username'/'password' - - Returns: - None - - Raises: - ConnectorMissingCredentialError: If required credentials are missing - """ - logger.debug(f"Loading credentials for SeaFile server {self.seafile_url}") + logger.debug("Loading credentials for SeaFile server %s", self.seafile_url) token = credentials.get("seafile_token") + repo_token = credentials.get("repo_token") username = credentials.get("username") password = credentials.get("password") @@ -97,130 +177,234 @@ class SeaFileConnector(LoadConnector, PollConnector): self.token = token elif username and password: self.token = self._authenticate_with_password(username, password) - else: - raise ConnectorMissingCredentialError( - "SeaFile requires 'seafile_token' or 'username'/'password' credentials" + + if repo_token and self.sync_scope in (SeafileSyncScope.LIBRARY, SeafileSyncScope.DIRECTORY): + self.repo_token = repo_token + elif repo_token: + logger.debug( + "repo_token supplied but scope=%s; ignoring.", + self.sync_scope.value, + ) + + if not self.token and not self.repo_token: + raise ConnectorMissingCredentialError( + "SeaFile requires 'seafile_token', 'repo_token', " + "or 'username'/'password'." 
) - # Validate token and get current user info try: - self._validate_token() + self._validate_credentials() + except ConnectorMissingCredentialError: + raise except Exception as e: - raise CredentialExpiredError(f"SeaFile token validation failed: {e}") + raise CredentialExpiredError( + f"SeaFile credential validation failed: {e}" + ) return None def _authenticate_with_password(self, username: str, password: str) -> str: - """Authenticate with username/password and return API token""" try: - response = rl_requests.post( - f"{self.api_url}/auth-token/", + resp = rl_requests.post( + f"{self.seafile_url}/api2/auth-token/", data={"username": username, "password": password}, timeout=30, ) - response.raise_for_status() - data = response.json() - token = data.get("token") + resp.raise_for_status() + token = resp.json().get("token") if not token: - raise CredentialExpiredError("No token returned from SeaFile") + raise CredentialExpiredError("No token returned") return token except Exception as e: raise ConnectorMissingCredentialError( f"Failed to authenticate with SeaFile: {e}" ) - def _validate_token(self) -> dict: - """Validate token by fetching account info""" - response = self._make_get_request("/account/info/") - response.raise_for_status() - account_info = response.json() - self.current_user_email = account_info.get("email") - logger.info(f"SeaFile authenticated as: {self.current_user_email}") - return account_info + def _validate_credentials(self) -> None: + if self.token: + self._validate_account_token() + + if self.repo_token: + self._validate_repo_token() + elif self.sync_scope in (SeafileSyncScope.LIBRARY, SeafileSyncScope.DIRECTORY): + self._validate_repo_access_via_account() + + def _validate_account_token(self) -> dict: + resp = self._account_get("/account/info/") + resp.raise_for_status() + info = resp.json() + self.current_user_email = info.get("email") + logger.info("SeaFile authenticated as: %s", self.current_user_email) + return info + + def 
_validate_repo_token(self) -> None: + """Validate repo token using /api/v2.1/via-repo-token/repo-info/""" + try: + resp = self._repo_token_get("repo-info/") + resp.raise_for_status() + info = resp.json() + logger.info( + "Repo token validated — library: %s (id: %s)", + info.get("repo_name", "?"), info.get("repo_id", self.repo_id), + ) + # Update repo_id from response if not set + if not self.repo_id and info.get("repo_id"): + self.repo_id = info["repo_id"] + except Exception as e: + raise CredentialExpiredError( + f"Repo token validation failed: {e}" + ) + + def _validate_repo_access_via_account(self) -> None: + repo_info = self._get_repo_info_via_account(self.repo_id) + if not repo_info: + raise ConnectorValidationError( + f"Library {self.repo_id} not accessible with account token." + ) + if self.sync_scope == SeafileSyncScope.DIRECTORY: + entries = self._get_directory_entries(self.repo_id, self.sync_path) + if entries is None: + raise ConnectorValidationError( + f"Directory {self.sync_path!r} does not exist " + f"in library {self.repo_id}." + ) + def validate_connector_settings(self) -> None: - """Validate SeaFile connector settings""" - if self.token is None: + if not self.token and not self.repo_token: raise ConnectorMissingCredentialError("SeaFile credentials not loaded.") - if not self.seafile_url: raise ConnectorValidationError("No SeaFile URL was provided.") try: - account_info = self._validate_token() - if not account_info.get("email"): - raise InsufficientPermissionsError("Invalid SeaFile API response") - - # Check if we can list libraries - libraries = self._get_libraries() - logger.info(f"SeaFile connection validated. 
Found {len(libraries)} libraries.") - - except Exception as e: - status = None - resp = getattr(e, "response", None) - if resp is not None: - status = getattr(resp, "status_code", None) - - if status == 401: - raise CredentialExpiredError("SeaFile token is invalid or expired.") - if status == 403: - raise InsufficientPermissionsError( - "Insufficient permissions to access SeaFile API." + if self.sync_scope == SeafileSyncScope.ACCOUNT: + libs = self._get_libraries() + logger.info("Validated (account scope). %d libraries.", len(libs)) + elif self.sync_scope == SeafileSyncScope.LIBRARY: + info = self._get_repo_info() + logger.info( + "Validated (library scope): %s", info.get("name", self.repo_id) ) - raise ConnectorValidationError(f"SeaFile validation failed: {repr(e)}") + elif self.sync_scope == SeafileSyncScope.DIRECTORY: + entries = self._get_directory_entries(self.repo_id, self.sync_path) + logger.info( + "Validated (directory scope): %s:%s (%d entries)", + self.repo_id, self.sync_path, len(entries), + ) + except ( + ConnectorValidationError, ConnectorMissingCredentialError, + CredentialExpiredError, InsufficientPermissionsError, + ): + raise + except Exception as e: + status = getattr(getattr(e, "response", None), "status_code", None) + if status == 401: + raise CredentialExpiredError("Token invalid or expired.") + if status == 403: + raise InsufficientPermissionsError("Insufficient permissions.") + raise ConnectorValidationError(f"Validation failed: {repr(e)}") + @retry(tries=3, delay=1, backoff=2) def _get_libraries(self) -> list[dict]: - """Fetch all accessible libraries (repos)""" - response = self._make_get_request("/repos/") - response.raise_for_status() - libraries = response.json() - - logger.debug(f"Found {len(libraries)} total libraries") + """List all libraries (account token only).""" + resp = self._account_get("/repos/") + resp.raise_for_status() + libraries = resp.json() if not self.include_shared and self.current_user_email: - # Filter to only 
owned libraries - owned_libraries = [ + libraries = [ lib for lib in libraries if lib.get("owner") == self.current_user_email or lib.get("owner_email") == self.current_user_email ] - logger.debug( - f"Filtered to {len(owned_libraries)} owned libraries " - f"(excluded {len(libraries) - len(owned_libraries)} shared)" - ) - return owned_libraries return libraries @retry(tries=3, delay=1, backoff=2) - def _get_directory_entries(self, repo_id: str, path: str = "/") -> list[dict]: - """Fetch directory entries for a given path""" + def _get_repo_info_via_account(self, repo_id: str) -> Optional[dict]: + """GET /api2/repos/{repo_id}/ — account token.""" try: - response = self._make_get_request( - f"/repos/{repo_id}/dir/", - params={"p": path}, - ) - response.raise_for_status() - return response.json() + resp = self._account_get(f"/repos/{repo_id}/") + resp.raise_for_status() + return resp.json() except Exception as e: - logger.warning(f"Error fetching directory {path} in repo {repo_id}: {e}") + logger.warning("Error fetching repo info for %s: %s", repo_id, e) + return None + + @retry(tries=3, delay=1, backoff=2) + def _get_repo_info_via_repo_token(self) -> Optional[dict]: + """GET /api/v2.1/via-repo-token/repo-info/ — repo token.""" + try: + resp = self._repo_token_get("repo-info/") + resp.raise_for_status() + return resp.json() + except Exception as e: + logger.warning("Error fetching repo info via repo token: %s", e) + return None + + def _get_repo_info(self) -> Optional[dict]: + """Get repo info using whichever token is available.""" + if self._use_repo_token: + info = self._get_repo_info_via_repo_token() + if info: + # Normalise keys to match account-token response shape + return { + "id": info.get("repo_id", self.repo_id), + "name": info.get("repo_name", self.repo_id), + } + return None + return self._get_repo_info_via_account(self.repo_id) + + @retry(tries=3, delay=1, backoff=2) + def _get_directory_entries(self, repo_id: str, path: str = "/") -> list[dict]: + """List 
directory contents using the appropriate endpoint.""" + try: + if self._use_repo_token: + # GET /api/v2.1/via-repo-token/dir/?path=/foo + resp = self._repo_token_get("dir/", params={"path": path}) + else: + # GET /api2/repos/{repo_id}/dir/?p=/foo + resp = self._account_get( + f"/repos/{repo_id}/dir/", params={"p": path}, + ) + resp.raise_for_status() + data = resp.json() + + # v2.1 wraps entries in {"dirent_list": [...]} + if isinstance(data, dict) and "dirent_list" in data: + return data["dirent_list"] + return data + + except Exception as e: + logger.warning( + "Error fetching directory %s in repo %s: %s", path, repo_id, e, + ) return [] @retry(tries=3, delay=1, backoff=2) - def _get_file_download_link(self, repo_id: str, path: str) -> Optional[str]: - """Get download link for a file""" + def _get_file_download_link( + self, repo_id: str, path: str + ) -> Optional[str]: + """Get a temporary download URL for a file.""" try: - response = self._make_get_request( - f"/repos/{repo_id}/file/", - params={"p": path, "reuse": 1}, - ) - response.raise_for_status() - return response.text.strip('"') + if self._use_repo_token: + # GET /api/v2.1/via-repo-token/download-link/?path=/foo.pdf + resp = self._repo_token_get( + "download-link/", params={"path": path}, + ) + else: + # GET /api2/repos/{repo_id}/file/?p=/foo.pdf&reuse=1 + resp = self._account_get( + f"/repos/{repo_id}/file/", params={"p": path, "reuse": 1}, + ) + resp.raise_for_status() + return resp.text.strip('"') except Exception as e: - logger.warning(f"Error getting download link for {path}: {e}") + logger.warning("Error getting download link for %s: %s", path, e) return None + def _list_files_recursive( self, repo_id: str, @@ -229,11 +413,6 @@ class SeaFileConnector(LoadConnector, PollConnector): start: datetime, end: datetime, ) -> list[tuple[str, dict, dict]]: - """Recursively list all files in the given path within time range. 
- - Returns: - List of tuples: (file_path, file_entry, library_info) - """ files = [] entries = self._get_directory_entries(repo_id, path) @@ -243,148 +422,121 @@ class SeaFileConnector(LoadConnector, PollConnector): entry_path = f"{path.rstrip('/')}/{entry_name}" if entry_type == "dir": - # Recursively process subdirectories files.extend( - self._list_files_recursive(repo_id, repo_name, entry_path, start, end) + self._list_files_recursive( + repo_id, repo_name, entry_path, start, end, + ) ) elif entry_type == "file": - # Check modification time - mtime = entry.get("mtime", 0) - if mtime: - modified = datetime.fromtimestamp(mtime, tz=timezone.utc) - if start < modified <= end: - files.append((entry_path, entry, {"id": repo_id, "name": repo_name})) + modified = self._parse_mtime(entry.get("mtime")) + if start < modified <= end: + files.append( + (entry_path, entry, + {"id": repo_id, "name": repo_name}) + ) return files + def _resolve_libraries_to_scan(self) -> list[dict]: + if self.sync_scope == SeafileSyncScope.ACCOUNT: + return [ + {"id": lib["id"], "name": lib.get("name", "Unknown")} + for lib in self._get_libraries() if lib.get("id") + ] + + info = self._get_repo_info() + if info: + return [{"id": info.get("id", self.repo_id), + "name": info.get("name", self.repo_id)}] + return [{"id": self.repo_id, "name": self.repo_id}] + + def _root_path_for_repo(self, repo_id: str) -> str: + if (self.sync_scope == SeafileSyncScope.DIRECTORY + and repo_id == self.repo_id): + return self.sync_path + return "/" + + def _yield_seafile_documents( - self, - start: datetime, - end: datetime, + self, start: datetime, end: datetime, ) -> GenerateDocumentsOutput: - """Generate documents from SeaFile server. 
+ libraries = self._resolve_libraries_to_scan() + logger.info( + "Processing %d library(ies) [scope=%s]", + len(libraries), self.sync_scope.value, + ) - Args: - start: Start datetime for filtering - end: End datetime for filtering - - Yields: - Batches of documents - """ - logger.info(f"Searching for files between {start} and {end}") - - libraries = self._get_libraries() - logger.info(f"Processing {len(libraries)} libraries") - - all_files = [] + all_files: list[tuple[str, dict, dict]] = [] for lib in libraries: - repo_id = lib.get("id") - repo_name = lib.get("name", "Unknown") - - if not repo_id: - continue - - logger.debug(f"Scanning library: {repo_name}") + root = self._root_path_for_repo(lib["id"]) + logger.debug("Scanning %s starting at %s", lib["name"], root) try: - files = self._list_files_recursive(repo_id, repo_name, "/", start, end) + files = self._list_files_recursive( + lib["id"], lib["name"], root, start, end, + ) all_files.extend(files) - logger.debug(f"Found {len(files)} files in {repo_name}") except Exception as e: - logger.error(f"Error processing library {repo_name}: {e}") + logger.error("Error in library %s: %s", lib["name"], e) - logger.info(f"Found {len(all_files)} total files matching time criteria") + logger.info("Found %d file(s) matching criteria", len(all_files)) batch: list[Document] = [] for file_path, file_entry, library in all_files: file_name = file_entry.get("name", "") file_size = file_entry.get("size", 0) file_id = file_entry.get("id", "") - mtime = file_entry.get("mtime", 0) repo_id = library["id"] repo_name = library["name"] - # Skip files that are too large + modified = self._parse_mtime(file_entry.get("mtime")) + if file_size > self.size_threshold: - logger.warning( - f"Skipping large file: {file_path} ({file_size} bytes)" - ) + logger.warning("Skipping large file: %s (%d B)", file_path, file_size) continue try: - # Get download link download_link = self._get_file_download_link(repo_id, file_path) if not download_link: - 
logger.warning(f"Could not get download link for {file_path}") continue - # Download file content - logger.debug(f"Downloading: {file_path}") - response = rl_requests.get(download_link, timeout=120) - response.raise_for_status() - blob = response.content - + resp = rl_requests.get(download_link, timeout=120) + resp.raise_for_status() + blob = resp.content if not blob: - logger.warning(f"Downloaded content is empty for {file_path}") continue - # Build semantic identifier - semantic_id = f"{repo_name}{file_path}" - - # Get modification time - modified = datetime.fromtimestamp(mtime, tz=timezone.utc) if mtime else datetime.now(timezone.utc) - - batch.append( - Document( - id=f"seafile:{repo_id}:{file_id}", - blob=blob, - source=DocumentSource.SEAFILE, - semantic_identifier=semantic_id, - extension=get_file_ext(file_name), - doc_updated_at=modified, - size_bytes=len(blob), - ) - ) + batch.append(Document( + id=f"seafile:{repo_id}:{file_id}", + blob=blob, + source=DocumentSource.SEAFILE, + semantic_identifier=f"{repo_name}{file_path}", + extension=get_file_ext(file_name), + doc_updated_at=modified, # <-- already parsed + size_bytes=len(blob), + )) if len(batch) >= self.batch_size: yield batch batch = [] except Exception as e: - logger.error(f"Error downloading file {file_path}: {e}") + logger.error("Error downloading %s: %s", file_path, e) if batch: yield batch def load_from_state(self) -> GenerateDocumentsOutput: - """Load all documents from SeaFile server. - - Yields: - Batches of documents - """ - logger.info(f"Loading all documents from SeaFile server {self.seafile_url}") return self._yield_seafile_documents( start=datetime(1970, 1, 1, tzinfo=timezone.utc), end=datetime.now(timezone.utc), ) def poll_source( - self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch + self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch, ) -> GenerateDocumentsOutput: - """Poll SeaFile server for updated documents. 
- - Args: - start: Start timestamp (seconds since Unix epoch) - end: End timestamp (seconds since Unix epoch) - - Yields: - Batches of documents - """ - start_datetime = datetime.fromtimestamp(start, tz=timezone.utc) - end_datetime = datetime.fromtimestamp(end, tz=timezone.utc) - - logger.info(f"Polling SeaFile for updates from {start_datetime} to {end_datetime}") - - for batch in self._yield_seafile_documents(start_datetime, end_datetime): + start_dt = datetime.fromtimestamp(start, tz=timezone.utc) + end_dt = datetime.fromtimestamp(end, tz=timezone.utc) + for batch in self._yield_seafile_documents(start_dt, end_dt): yield batch - + \ No newline at end of file diff --git a/rag/svr/sync_data_source.py b/rag/svr/sync_data_source.py index e2e9319a48..ac317d418e 100644 --- a/rag/svr/sync_data_source.py +++ b/rag/svr/sync_data_source.py @@ -58,7 +58,7 @@ from common.data_source import ( ) from common.constants import FileSource, TaskStatus from common.data_source.config import INDEX_BATCH_SIZE -from common.data_source.models import ConnectorFailure +from common.data_source.models import ConnectorFailure, SeafileSyncScope from common.data_source.webdav_connector import WebDAVConnector from common.data_source.confluence_connector import ConfluenceConnector from common.data_source.gmail_connector import GmailConnector @@ -1180,21 +1180,23 @@ class Bitbucket(SyncBase): return wrapper() + class SeaFile(SyncBase): SOURCE_NAME: str = FileSource.SEAFILE async def _generate(self, task: dict): + conf = self.conf self.connector = SeaFileConnector( - seafile_url=self.conf["seafile_url"], - batch_size=self.conf.get("batch_size", INDEX_BATCH_SIZE), - include_shared=self.conf.get("include_shared", True) + seafile_url=conf["seafile_url"], + batch_size=conf.get("batch_size", INDEX_BATCH_SIZE), + include_shared=conf.get("include_shared", True), + sync_scope=conf.get("sync_scope", SeafileSyncScope.ACCOUNT), + repo_id=conf.get("repo_id") or None, + sync_path=conf.get("sync_path") or None, 
) + self.connector.load_credentials(conf["credentials"]) - self.connector.load_credentials(self.conf["credentials"]) - - # Determine the time range for synchronization based on reindex or poll_range_start poll_start = task.get("poll_range_start") - if task["reindex"] == "1" or poll_start is None: document_generator = self.connector.load_from_state() begin_info = "totally" @@ -1205,12 +1207,16 @@ class SeaFile(SyncBase): ) begin_info = f"from {poll_start}" + scope = conf.get("sync_scope", "account") + extra = "" + if scope in ("library", "directory"): + extra = f" repo_id={conf.get('repo_id')}" + if scope == "directory": + extra += f" path={conf.get('sync_path')}" + logging.info( - "Connect to SeaFile: {} (include_shared: {}) {}".format( - self.conf["seafile_url"], - self.conf.get("include_shared", True), - begin_info - ) + "Connect to SeaFile: %s (scope=%s%s) %s", + conf["seafile_url"], scope, extra, begin_info, ) return document_generator diff --git a/web/src/locales/de.ts b/web/src/locales/de.ts index 498c475d99..dd34c499f8 100644 --- a/web/src/locales/de.ts +++ b/web/src/locales/de.ts @@ -1031,13 +1031,57 @@ Beispiel: Virtual Hosted Style`, seafileDescription: 'Verbinden Sie sich mit Ihrem SeaFile-Server, um Dateien und Dokumente aus Ihren Bibliotheken zu synchronisieren.', seafileUrlTip: - 'Die Basis-URL Ihres SeaFile-Servers (z.B. https://seafile.example.com). Fügen Sie kein /api2 oder andere Pfade hinzu.', + 'Die vollstaendige URL Ihres SeaFile-Servers inklusive Protokoll. 
Beispiel: https://seafile.example.com - Kein abschliessender Schraegstrich und kein Pfad nach der Domain.', + seafileAccountScopeTip: + 'Synchronisiert alle Bibliotheken, die für den unten angegebenen Konto-API-Token sichtbar sind.', + seafileTokenPanelHeading: + 'Wählen Sie eine der folgenden Authentifizierungsmethoden:', + seafileTokenPanelAccountBullet: + '- gewährt Zugriff auf alle Ihre Bibliotheken.', + seafileTokenPanelLibraryBullet: + '- auf eine einzelne Bibliothek beschränkt (sicherer).', + seafileValidationAccountTokenRequired: + 'Konto-API-Token ist erforderlich für den Umfang „Gesamtes Konto"', + seafileValidationTokenRequired: + 'Geben Sie entweder einen Konto-API-Token oder einen Bibliotheks-Token an', + seafileValidationLibraryIdRequired: + 'Bibliotheks-ID ist erforderlich', + seafileValidationDirectoryPathRequired: + 'Verzeichnispfad ist erforderlich', + seafileSyncScopeTip: + 'Legt fest, was synchronisiert wird: ' + + '(1) Gesamtes Konto - Synchronisiert alle Bibliotheken, auf die Ihr Token Zugriff hat. Erfordert einen Konto-API-Token. ' + + '(2) Einzelne Bibliothek - Synchronisiert alle Dateien innerhalb einer bestimmten Bibliothek. Erfordert die Bibliotheks-ID und entweder einen Konto-API-Token oder einen Bibliotheks-API-Token. ' + + '(3) Bestimmtes Verzeichnis - Synchronisiert nur Dateien in einem bestimmten Ordner innerhalb einer Bibliothek. Erfordert die Bibliotheks-ID, den Ordnerpfad innerhalb dieser Bibliothek und entweder einen Konto-API-Token oder einen Bibliotheks-API-Token.', seafileTokenTip: - 'Erstellen Sie ein API-Token in SeaFile: Gehen Sie zu Einstellungen → API-Token → Token erstellen. Das Token ermöglicht den Zugriff auf alle für Ihr Konto sichtbaren Bibliotheken.', + 'Ihr kontoweiter SeaFile-API-Token. ' + + 'Gewährt Zugriff auf alle fuer Ihr Konto sichtbaren Bibliotheken. ' + + 'Erforderlich bei Synchronisierungsumfang "Gesamtes Konto". 
' + + 'Für "Einzelne Bibliothek" oder "Bestimmtes Verzeichnis" können Sie alternativ einen Bibliotheks-API-Token verwenden.', + seafileRepoTokenTip: + 'Ein bibliotheksbezogener API-Token, der nur Zugriff auf eine bestimmte Bibliothek gewährt. ' + + 'Kann anstelle des Konto-API-Tokens für "Einzelne Bibliothek" und "Bestimmtes Verzeichnis" verwendet werden.', + seafileRepoIdTip: + 'Die eindeutige Kennung (UUID) der SeaFile-Bibliothek. ' + + 'Sie finden diese in der Adressleiste Ihres Browsers, wenn Sie die Bibliothek in der SeaFile-Weboberflaeche öffnen. ' + + 'Beispiel: 7a9e1b3c-4d5f-6a7b-8c9d-0e1f2a3b4c5d. ' + + 'Erforderlich bei Synchronisierungsumfang "Einzelne Bibliothek" oder "Bestimmtes Verzeichnis".', + seafileSyncPathTip: + 'Der absolute Pfad des zu synchronisierenden Ordners innerhalb der oben angegebenen Bibliothek. ' + + 'Muss mit einem Schraegstrich beginnen. ' + + 'Alle Dateien und Unterordner unter diesem Pfad werden rekursiv einbezogen. ' + + 'Beispiel: /Dokumente/Berichte. ' + + 'Wichtig: Der Ordner muss innerhalb der angegebenen Bibliothek existieren. ' + + 'Pfade ausserhalb der Bibliothek werden nicht unterstuetzt. ' + + 'Wird nur verwendet bei Synchronisierungsumfang "Bestimmtes Verzeichnis".', seafileIncludeSharedTip: - 'Wenn aktiviert, werden auch Bibliotheken synchronisiert, die andere Benutzer mit Ihnen geteilt haben.', + 'Wenn aktiviert, werden auch Bibliotheken synchronisiert, die andere Benutzer mit Ihnen geteilt haben. ' + + 'Wenn deaktiviert, werden nur Bibliotheken synchronisiert, die Ihrem Konto gehoeren. ' + + 'Gilt nur bei Synchronisierungsumfang "Gesamtes Konto".', seafileBatchSizeTip: - 'Anzahl der Dokumente, die pro Batch verarbeitet werden. Höhere Werte können die Leistung verbessern, benötigen aber mehr Arbeitsspeicher. Standard: 100.', + 'Anzahl der Dokumente, die pro Durchlauf verarbeitet und zurueckgegeben werden. ' + + 'Ein kleinerer Wert verbraucht weniger Arbeitsspeicher, kann aber insgesamt langsamer sein. 
' + + 'Standardwert: 100.', jiraDescription: 'Verbinden Sie Ihren Jira-Arbeitsbereich, um Vorgänge, Kommentare und Anhänge zu synchronisieren.', jiraBaseUrlTip: diff --git a/web/src/locales/en.ts b/web/src/locales/en.ts index b46fb9f954..f3fb3db126 100644 --- a/web/src/locales/en.ts +++ b/web/src/locales/en.ts @@ -1007,15 +1007,59 @@ Example: Virtual Hosted Style`, moodleTokenTip: 'Generate a web service token in Moodle: Go to Site administration → Server → Web services → Manage tokens. The user must be enrolled in the courses you want to sync.', seafileDescription: - 'Connect to your SeaFile server to sync files and documents from your libraries.', + 'Connect to your SeaFile server to synchronise files and documents from your libraries.', seafileUrlTip: - 'The base URL of your SeaFile server (e.g., https://seafile.example.com). Do not include /api2 or other paths.', + 'The full URL of your SeaFile server including the protocol. Example: https://seafile.example.com - Do not include a trailing slash or any path after the domain.', + seafileAccountScopeTip: + 'Syncs all libraries visible to the Account API Token below.', + seafileTokenPanelHeading: + 'Provide one of these authentication methods:', + seafileTokenPanelAccountBullet: + '- grants access to all your libraries.', + seafileTokenPanelLibraryBullet: + '- scoped to a single library only (more secure).', + seafileValidationAccountTokenRequired: + 'Account API Token is required for Entire Account scope', + seafileValidationTokenRequired: + 'Provide either an Account API Token or a Library Token', + seafileValidationLibraryIdRequired: + 'Library ID is required', + seafileValidationDirectoryPathRequired: + 'Directory Path is required', + seafileSyncScopeTip: + 'Controls what gets synchronised: ' + + '(1) Entire Account - Syncs all libraries your token has access to. Requires an Account API Token. ' + + '(2) Single Library - Syncs all files within one specific library. 
Requires the Library ID and either an Account API Token or a Library API Token. ' + + '(3) Specific Directory - Syncs only files within a specific folder inside a library. Requires the Library ID, the folder path within that library, and either an Account API Token or a Library API Token.', seafileTokenTip: - 'Generate an API token in SeaFile: Go to Settings → API Token → Generate Token. The token provides access to all libraries visible to your account.', + 'Your account-level SeaFile API token. ' + + 'Grants access to all libraries visible to your account. ' + + 'Required when sync scope is "Entire Account". ' + + 'For "Single Library" or "Specific Directory" you can use this token or a Library API Token instead.', + seafileRepoTokenTip: + 'A library-scoped API token that only grants access to one specific library. ' + + 'Can be used instead of the Account API Token for "Single Library" and "Specific Directory" sync scopes.', + seafileRepoIdTip: + 'The unique identifier (UUID) of the SeaFile library you want to synchronise. ' + + 'You can find it in your browser address bar when you open the library in the SeaFile web interface. ' + + 'Example: 7a9e1b3c-4d5f-6a7b-8c9d-0e1f2a3b4c5d. ' + + 'Required when sync scope is "Single Library" or "Specific Directory".', + seafileSyncPathTip: + 'The absolute path of the folder to synchronise within the library specified by the Library ID above. ' + + 'Must start with a forward slash. ' + + 'All files and subfolders under this path will be included recursively. ' + + 'Example: /Documents/Reports. ' + + 'Important: The folder must exist inside the specified library. ' + + 'Paths outside the library are not supported. ' + + 'Only used when sync scope is "Specific Directory".', seafileIncludeSharedTip: - 'When enabled, libraries shared with you by other users will also be synced.', + 'When enabled, libraries that other users have shared with you are included in the synchronisation. 
' + + 'When disabled, only libraries owned by your account are synchronised. ' + + 'Only applies when sync scope is "Entire Account".', seafileBatchSizeTip: - 'Number of documents to process per batch. Higher values may improve performance but use more memory. Default: 100.', + 'The number of documents processed and returned per batch during synchronisation. ' + + 'A smaller value uses less memory but may be slower overall. ' + + 'Default: 100.', jiraDescription: 'Connect your Jira workspace to sync issues, comments, and attachments.', jiraBaseUrlTip: diff --git a/web/src/pages/user-setting/data-source/constant/index.tsx b/web/src/pages/user-setting/data-source/constant/index.tsx index 4b4f8cb757..15c29a2ddf 100644 --- a/web/src/pages/user-setting/data-source/constant/index.tsx +++ b/web/src/pages/user-setting/data-source/constant/index.tsx @@ -12,6 +12,7 @@ import { IDataSourceInfoMap } from '../interface'; import { bitbucketConstant } from './bitbucket-constant'; import { confluenceConstant } from './confluence-constant'; import { S3Constant } from './s3-constant'; +import { seafileConstant } from './seafile-constant'; export enum DataSourceKey { CONFLUENCE = 'confluence', @@ -834,39 +835,7 @@ export const DataSourceFormFields = { ], }, ], - [DataSourceKey.SEAFILE]: [ - { - label: 'SeaFile Server URL', - name: 'config.seafile_url', - type: FormFieldType.Text, - required: true, - placeholder: 'https://seafile.example.com', - tooltip: t('setting.seafileUrlTip'), - }, - { - label: 'API Token', - name: 'config.credentials.seafile_token', - type: FormFieldType.Password, - required: true, - tooltip: t('setting.seafileTokenTip'), - }, - { - label: 'Include Shared Libraries', - name: 'config.include_shared', - type: FormFieldType.Checkbox, - required: false, - defaultValue: true, - tooltip: t('setting.seafileIncludeSharedTip'), - }, - { - label: 'Batch Size', - name: 'config.batch_size', - type: FormFieldType.Number, - required: false, - placeholder: '100', - tooltip: 
t('setting.seafileBatchSizeTip'), - }, - ], + [DataSourceKey.SEAFILE]: seafileConstant(t), [DataSourceKey.MYSQL]: [ { label: 'Host', @@ -1253,10 +1222,14 @@ export const DataSourceFormDefaultValues = { source: DataSourceKey.SEAFILE, config: { seafile_url: '', - include_shared: true, + sync_scope: 'account', + repo_id: '', + sync_path: '', + include_shared: true, batch_size: 100, credentials: { - seafile_token: '', + seafile_token: '', + repo_token: '', }, }, }, diff --git a/web/src/pages/user-setting/data-source/constant/seafile-constant.tsx b/web/src/pages/user-setting/data-source/constant/seafile-constant.tsx new file mode 100644 index 0000000000..cc434c598d --- /dev/null +++ b/web/src/pages/user-setting/data-source/constant/seafile-constant.tsx @@ -0,0 +1,210 @@ +import { FilterFormField, FormFieldType } from '@/components/dynamic-form'; +import { TFunction } from 'i18next'; + +export const seafileConstant = (t: TFunction) => [ + { + label: 'SeaFile Server URL', + name: 'config.seafile_url', + type: FormFieldType.Text, + required: true, + placeholder: 'https://seafile.example.com', + tooltip: t('setting.seafileUrlTip'), + }, + { + label: 'Sync Scope', + name: 'config.sync_scope', + type: FormFieldType.Segmented, + options: [ + { label: 'Entire Account', value: 'account' }, + { label: 'Single Library', value: 'library' }, + { label: 'Specific Directory', value: 'directory' }, + ], + tooltip: t('setting.seafileSyncScopeTip'), + }, + + { + name: FilterFormField + '.account-tip', + label: ' ', + type: FormFieldType.Custom, + shouldRender: (formValues: any) => { + const scope = formValues?.config?.sync_scope ?? 'account'; + return scope === 'account'; + }, + render: () => ( +
{t('setting.seafileTokenPanelHeading')}
+