2025-11-03 19:59:18 +08:00
|
|
|
"""Configuration constants and enum definitions"""
|
|
|
|
|
import json
|
|
|
|
|
import os
|
|
|
|
|
from datetime import datetime, timezone
|
|
|
|
|
from enum import Enum
|
|
|
|
|
from typing import cast
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_current_tz_offset() -> int:
    """Return the local timezone's offset from UTC, rounded to whole hours.

    The offset is measured by reading the clock twice -- once as naive local
    time and once as UTC with its tzinfo stripped -- and subtracting the two.
    """
    local_now = datetime.now()
    # Strip tzinfo so the UTC reading can be subtracted from the naive local one.
    utc_now_naive = datetime.now(timezone.utc).replace(tzinfo=None)
    offset_hours = (local_now - utc_now_naive).total_seconds() / 3600
    # NOTE(review): rounding collapses fractional-hour zones (e.g. UTC+05:30)
    # to a whole hour -- confirm callers are fine with that.
    return round(offset_hours)
|
|
|
|
|
|
|
|
|
|
|
2025-12-31 17:18:30 +08:00
|
|
|
# Default timeout, in seconds, for outbound requests (mostly used by connectors).
# An unset or empty REQUEST_TIMEOUT_SECONDS falls back to 60.
REQUEST_TIMEOUT_SECONDS = int(os.getenv("REQUEST_TIMEOUT_SECONDS") or 60)
|
|
|
|
|
|
2025-11-17 09:38:04 +08:00
|
|
|
# Common durations, expressed in seconds.
ONE_MINUTE = 60
ONE_HOUR = 60 * ONE_MINUTE
ONE_DAY = 24 * ONE_HOUR
|
|
|
|
|
|
|
|
|
|
# Slack API limits
_SLACK_LIMIT = 900

# Redis lock configuration
# NOTE(review): the values look like seconds (30 min / 1 min / 1 h) -- confirm
# the unit against the code that acquires the lock.
ONYX_SLACK_LOCK_TTL = 1800
ONYX_SLACK_LOCK_BLOCKING_TIMEOUT = 60
ONYX_SLACK_LOCK_TOTAL_BLOCKING_TIMEOUT = 3600
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class BlobType(str, Enum):
    """Blob storage backends; each member is also a plain ``str`` equal to its value."""

    S3 = "s3"
    R2 = "r2"
    GOOGLE_CLOUD_STORAGE = "google_cloud_storage"
    OCI_STORAGE = "oci_storage"
    S3_COMPATIBLE = "s3_compatible"
|
2025-11-03 19:59:18 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
class DocumentSource(str, Enum):
    """Identifiers of the data sources documents can be ingested from.

    Members subclass ``str``, so each one compares equal to its string value.
    Declaration order is preserved because it determines iteration order.
    """

    RSS = "rss"
    S3 = "s3"
    NOTION = "notion"
    # Generic, configuration-driven REST API connector.
    REST_API = "rest_api"
    R2 = "r2"
    GOOGLE_CLOUD_STORAGE = "google_cloud_storage"
    OCI_STORAGE = "oci_storage"
    SLACK = "slack"
    CONFLUENCE = "confluence"
    JIRA = "jira"
    GOOGLE_DRIVE = "google_drive"
    GMAIL = "gmail"
    DISCORD = "discord"
    WEBDAV = "webdav"
    MOODLE = "moodle"
    S3_COMPATIBLE = "s3_compatible"
    DROPBOX = "dropbox"
    BOX = "box"
    AIRTABLE = "airtable"
    ASANA = "asana"
    GITHUB = "github"
    GITLAB = "gitlab"
    IMAP = "imap"
    BITBUCKET = "bitbucket"
    ZENDESK = "zendesk"
    SEAFILE = "seafile"
    MYSQL = "mysql"
    POSTGRESQL = "postgresql"
    # Google BigQuery (table mode or custom GoogleSQL query).
    BIGQUERY = "bigquery"
    DINGTALK_AI_TABLE = "dingtalk_ai_table"
    # OneDrive / OneDrive for Business via Microsoft Graph.
    ONEDRIVE = "onedrive"
    # Outlook / Microsoft 365 mail via Microsoft Graph.
    OUTLOOK = "outlook"
    # Salesforce CRM objects queried via SOQL.
    SALESFORCE = "salesforce"
    # Azure Blob Storage containers used as a data source.
    AZURE_BLOB = "azure_blob"
|
2025-12-31 14:40:49 +08:00
|
|
|
|
2025-11-03 19:59:18 +08:00
|
|
|
|
|
|
|
|
class FileOrigin(str, Enum):
    """Where a file came from; currently only connector-produced files are modelled."""

    CONNECTOR = "connector"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Standard image MIME types supported by most vision LLMs
IMAGE_MIME_TYPES = ["image/png", "image/jpeg", "image/jpg", "image/webp"]

# Image types that should be excluded from processing
EXCLUDED_IMAGE_TYPES = [
    "image/bmp",
    "image/tiff",
    "image/gif",
    "image/svg+xml",
    "image/avif",
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# NOTE(review): presumably the fields requested via "expand" when fetching
# Confluence pages -- confirm against the call sites.
_PAGE_EXPANSION_FIELDS = [
    "body.storage.value",
    "version",
    "space",
    "metadata.labels",
    "history.lastUpdated",
    "ancestors",
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Configuration constants
BLOB_STORAGE_SIZE_THRESHOLD = 20 * 1024 * 1024  # 20MB
INDEX_BATCH_SIZE = 2
SLACK_NUM_THREADS = 4
ENABLE_EXPENSIVE_EXPERT_CALLS = False

# Slack related constants
# NOTE: _SLACK_LIMIT is also assigned the same value earlier in this module
# (under "Slack API limits"); this assignment is kept so behaviour is unchanged.
_SLACK_LIMIT = 900
FAST_TIMEOUT = 1
MAX_RETRIES = 7
MAX_CHANNELS_TO_LOG = 50
BOT_CHANNEL_MIN_BATCH_SIZE = 256
BOT_CHANNEL_PERCENTAGE_THRESHOLD = 0.95

# Download configuration
DOWNLOAD_CHUNK_SIZE = 1024 * 1024  # 1MB
SIZE_THRESHOLD_BUFFER = 64
|
|
|
|
|
|
|
|
|
|
# True only when the env var is "true" (case-insensitive); anything else is False.
NOTION_CONNECTOR_DISABLE_RECURSIVE_PAGE_LOOKUP = (
    os.getenv("NOTION_CONNECTOR_DISABLE_RECURSIVE_PAGE_LOOKUP", "").lower() == "true"
)

SLIM_BATCH_SIZE = 100

# Notion API constants
_NOTION_PAGE_SIZE = 100
_NOTION_CALL_TIMEOUT = 30  # 30 seconds

_ITERATION_LIMIT = 100_000
|
|
|
|
|
|
|
|
|
|
#####
# Indexing Configs
#####
# NOTE: Currently only supported in the Confluence and Google Drive connectors +
# only handles some failures (Confluence = handles API call failures, Google
# Drive = handles failures pulling files / parsing them)
# Enabled by any value other than "false" or the empty string (case-insensitive).
CONTINUE_ON_CONNECTOR_FAILURE = os.getenv(
    "CONTINUE_ON_CONNECTOR_FAILURE", ""
).lower() not in ("false", "")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#####
# Confluence Connector Configs
#####

# Comma-separated label names; empty entries are dropped, so an unset variable
# yields an empty list.
CONFLUENCE_CONNECTOR_LABELS_TO_SKIP = list(
    filter(None, os.environ.get("CONFLUENCE_CONNECTOR_LABELS_TO_SKIP", "").split(","))
)
|
|
|
|
|
|
|
|
|
|
# Archived pages are skipped unless this env var is "true" (case-insensitive).
CONFLUENCE_CONNECTOR_INDEX_ARCHIVED_PAGES = (
    os.getenv("CONFLUENCE_CONNECTOR_INDEX_ARCHIVED_PAGES", "").lower() == "true"
)
|
|
|
|
|
|
|
|
|
|
# Attachments exceeding this size will not be retrieved (in bytes).
# `or` is used instead of a .get() default so that a variable that is set but
# empty falls back to the default rather than crashing on int("") at import.
CONFLUENCE_CONNECTOR_ATTACHMENT_SIZE_THRESHOLD = int(
    os.environ.get("CONFLUENCE_CONNECTOR_ATTACHMENT_SIZE_THRESHOLD") or 10 * 1024 * 1024
)

# Attachments with more chars than this will not be indexed. This is to prevent extremely
# large files from freezing indexing. 200,000 is ~100 google doc pages.
CONFLUENCE_CONNECTOR_ATTACHMENT_CHAR_COUNT_THRESHOLD = int(
    os.environ.get("CONFLUENCE_CONNECTOR_ATTACHMENT_CHAR_COUNT_THRESHOLD") or 200_000
)
|
|
|
|
|
|
|
|
|
|
_RAW_CONFLUENCE_CONNECTOR_USER_PROFILES_OVERRIDE = os.environ.get(
|
|
|
|
|
"CONFLUENCE_CONNECTOR_USER_PROFILES_OVERRIDE", ""
|
|
|
|
|
)
|
|
|
|
|
CONFLUENCE_CONNECTOR_USER_PROFILES_OVERRIDE = cast(
|
|
|
|
|
list[dict[str, str]] | None,
|
|
|
|
|
(
|
|
|
|
|
json.loads(_RAW_CONFLUENCE_CONNECTOR_USER_PROFILES_OVERRIDE)
|
|
|
|
|
if _RAW_CONFLUENCE_CONNECTOR_USER_PROFILES_OVERRIDE
|
|
|
|
|
else None
|
|
|
|
|
),
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
# Enter as a floating point offset from UTC in hours (-24 < val < 24).
# This is applied globally, so it probably makes sense to transition this to
# per-connector at some point.
# For the default value, we assume that the user's local timezone is more likely to be
# correct (i.e. the configured user's timezone or the default server one) than UTC.
# https://developer.atlassian.com/cloud/confluence/cql-fields/#created
# `or` is used instead of a .get() default so that a variable that is set but
# empty falls back to the local offset rather than crashing on float("").
CONFLUENCE_TIMEZONE_OFFSET = float(
    os.environ.get("CONFLUENCE_TIMEZONE_OFFSET") or get_current_tz_offset()
)
|
|
|
|
|
|
2025-11-12 09:37:32 +08:00
|
|
|
# Sync time buffers, in seconds; both default to one day.
# `or` is used instead of a .get() default so that a variable that is set but
# empty falls back to the default rather than crashing on int("") at import.
CONFLUENCE_SYNC_TIME_BUFFER_SECONDS = int(
    os.environ.get("CONFLUENCE_SYNC_TIME_BUFFER_SECONDS") or ONE_DAY
)

GOOGLE_DRIVE_SYNC_TIME_BUFFER_SECONDS = int(
    os.environ.get("GOOGLE_DRIVE_SYNC_TIME_BUFFER_SECONDS") or ONE_DAY
)
|
|
|
|
|
|
2025-11-10 19:15:02 +08:00
|
|
|
# Size threshold for the Google Drive connector (default 10 * 1024 * 1024).
# `or` is used instead of a .get() default so that a variable that is set but
# empty falls back to the default rather than crashing on int("") at import.
GOOGLE_DRIVE_CONNECTOR_SIZE_THRESHOLD = int(
    os.environ.get("GOOGLE_DRIVE_CONNECTOR_SIZE_THRESHOLD") or 10 * 1024 * 1024
)
|
|
|
|
|
|
2025-11-17 09:38:04 +08:00
|
|
|
# Comma-separated label names; empty entries are dropped, so an unset variable
# yields an empty list.
JIRA_CONNECTOR_LABELS_TO_SKIP = [
    ignored_tag
    for ignored_tag in os.environ.get("JIRA_CONNECTOR_LABELS_TO_SKIP", "").split(",")
    if ignored_tag
]

# Maximum ticket size (default 100 * 1024).
# `or` is used instead of a .get() default so that a variable that is set but
# empty falls back to the default rather than crashing on int("") at import.
JIRA_CONNECTOR_MAX_TICKET_SIZE = int(
    os.environ.get("JIRA_CONNECTOR_MAX_TICKET_SIZE") or 100 * 1024
)
|
|
|
|
|
# Jira sync buffer in seconds (defaults to one minute) and UTC offset in hours
# (defaults to the local offset of the running process).
# `or` is used instead of a .get() default so that a variable that is set but
# empty falls back to the default rather than crashing on int("") / float("").
JIRA_SYNC_TIME_BUFFER_SECONDS = int(
    os.environ.get("JIRA_SYNC_TIME_BUFFER_SECONDS") or ONE_MINUTE
)

JIRA_TIMEZONE_OFFSET = float(
    os.environ.get("JIRA_TIMEZONE_OFFSET") or get_current_tz_offset()
)
|
|
|
|
|
|
2025-11-03 19:59:18 +08:00
|
|
|
# OAuth client credentials; each defaults to "" when its variable is unset.
OAUTH_SLACK_CLIENT_ID = os.getenv("OAUTH_SLACK_CLIENT_ID", "")
OAUTH_SLACK_CLIENT_SECRET = os.getenv("OAUTH_SLACK_CLIENT_SECRET", "")

OAUTH_CONFLUENCE_CLOUD_CLIENT_ID = os.getenv("OAUTH_CONFLUENCE_CLOUD_CLIENT_ID", "")
OAUTH_CONFLUENCE_CLOUD_CLIENT_SECRET = os.getenv("OAUTH_CONFLUENCE_CLOUD_CLIENT_SECRET", "")

OAUTH_JIRA_CLOUD_CLIENT_ID = os.getenv("OAUTH_JIRA_CLOUD_CLIENT_ID", "")
OAUTH_JIRA_CLOUD_CLIENT_SECRET = os.getenv("OAUTH_JIRA_CLOUD_CLIENT_SECRET", "")

OAUTH_GOOGLE_DRIVE_CLIENT_ID = os.getenv("OAUTH_GOOGLE_DRIVE_CLIENT_ID", "")
OAUTH_GOOGLE_DRIVE_CLIENT_SECRET = os.getenv("OAUTH_GOOGLE_DRIVE_CLIENT_SECRET", "")

# Web OAuth callback URLs; the defaults point at a server on localhost:9380.
GOOGLE_DRIVE_WEB_OAUTH_REDIRECT_URI = os.getenv(
    "GOOGLE_DRIVE_WEB_OAUTH_REDIRECT_URI",
    "http://localhost:9380/v1/connector/google-drive/oauth/web/callback",
)
GMAIL_WEB_OAUTH_REDIRECT_URI = os.getenv(
    "GMAIL_WEB_OAUTH_REDIRECT_URI",
    "http://localhost:9380/v1/connector/gmail/oauth/web/callback",
)
|
2025-11-03 19:59:18 +08:00
|
|
|
|
|
|
|
|
CONFLUENCE_OAUTH_TOKEN_URL = "https://auth.atlassian.com/oauth/token"
# Lower-cased once here; the name suggests it is matched against lower-cased text.
RATE_LIMIT_MESSAGE_LOWERCASE = "Rate limit exceeded".lower()

_DEFAULT_PAGINATION_LIMIT = 1000

# NOTE(review): presumably _PROBLEMATIC_EXPANSIONS is swapped for
# _REPLACEMENT_EXPANSIONS when a request fails -- confirm at the call sites.
_PROBLEMATIC_EXPANSIONS = "body.storage.value"
_REPLACEMENT_EXPANSIONS = "body.view.value"

BOX_WEB_OAUTH_REDIRECT_URI = os.getenv(
    "BOX_WEB_OAUTH_REDIRECT_URI",
    "http://localhost:9380/v1/connector/box/oauth/web/callback",
)

# None when the variable is unset or empty.
GITHUB_CONNECTOR_BASE_URL = os.getenv("GITHUB_CONNECTOR_BASE_URL") or None
|
|
|
|
|
|
2025-11-03 19:59:18 +08:00
|
|
|
class HtmlBasedConnectorTransformLinksStrategy(str, Enum):
    """How HTML-based connectors treat links found in the source HTML."""

    STRIP = "strip"  # remove links entirely
    MARKDOWN = "markdown"  # turn HTML links into markdown links
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Link-handling strategy for HTML-based connectors. An env override arrives as a
# plain str while the fallback is the enum member; because the enum subclasses
# str, both compare equal to the same string values.
HTML_BASED_CONNECTOR_TRANSFORM_LINKS_STRATEGY = os.getenv(
    "HTML_BASED_CONNECTOR_TRANSFORM_LINKS_STRATEGY",
    HtmlBasedConnectorTransformLinksStrategy.STRIP,
)
|
|
|
|
|
|
|
|
|
|
# True only when the env var is "true" (case-insensitive).
PARSE_WITH_TRAFILATURA = os.getenv("PARSE_WITH_TRAFILATURA", "").lower() == "true"

# Comma-separated lists; entries are used as-is (no trimming or filtering).
WEB_CONNECTOR_IGNORED_CLASSES = os.getenv(
    "WEB_CONNECTOR_IGNORED_CLASSES", "sidebar,footer"
).split(",")
WEB_CONNECTOR_IGNORED_ELEMENTS = os.getenv(
    "WEB_CONNECTOR_IGNORED_ELEMENTS", "nav,footer,meta,script,style,symbol,aside"
).split(",")
|
|
|
|
|
|
2025-12-25 17:50:41 +08:00
|
|
|
# Per-connector size thresholds (each defaults to 10 * 1024 * 1024).
# `or` is used instead of a .get() default so that a variable that is set but
# empty falls back to the default rather than crashing on int("") at import.
AIRTABLE_CONNECTOR_SIZE_THRESHOLD = int(
    os.environ.get("AIRTABLE_CONNECTOR_SIZE_THRESHOLD") or 10 * 1024 * 1024
)

ASANA_CONNECTOR_SIZE_THRESHOLD = int(
    os.environ.get("ASANA_CONNECTOR_SIZE_THRESHOLD") or 10 * 1024 * 1024
)

IMAP_CONNECTOR_SIZE_THRESHOLD = int(
    os.environ.get("IMAP_CONNECTOR_SIZE_THRESHOLD") or 10 * 1024 * 1024
)
|
|
|
|
|
|
2025-12-31 14:40:49 +08:00
|
|
|
# Comma-separated article labels to skip. Empty entries are dropped so that an
# unset variable yields [] instead of [""] (which is truthy and contains a bogus
# empty label), matching how the Confluence and Jira label lists are built.
ZENDESK_CONNECTOR_SKIP_ARTICLE_LABELS = [
    label
    for label in os.environ.get("ZENDESK_CONNECTOR_SKIP_ARTICLE_LABELS", "").split(",")
    if label
]
|
|
|
|
|
|
2025-11-03 19:59:18 +08:00
|
|
|
# NOTE(review): presumably the display name used when a Confluence user cannot
# be resolved -- confirm at the call sites.
_USER_NOT_FOUND = "Unknown Confluence User"

# Expansion-field lists for comments, attachments and restrictions.
_COMMENT_EXPANSION_FIELDS = ["body.storage.value"]

_ATTACHMENT_EXPANSION_FIELDS = ["version", "space", "metadata.labels"]

_RESTRICTIONS_EXPANSION_FIELDS = [
    "space",
    "restrictions.read.restrictions.user",
    "restrictions.read.restrictions.group",
    "ancestors.restrictions.read.restrictions.user",
    "ancestors.restrictions.read.restrictions.group",
]

_SLIM_DOC_BATCH_SIZE = 5000
|