mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 23:41:12 +08:00
### What problem does this PR solve? Closes #15332. RAGFlow can index Gmail and generic IMAP mailboxes but had no native connector for Outlook / Microsoft 365 mail. Organisations on Microsoft 365 had no way to bring mailbox content into a knowledge base through Microsoft Graph. This PR adds a net-new Outlook data source that: - Authenticates against Microsoft Graph with the same MSAL client-credentials flow already used by the SharePoint and Teams connectors (no new auth primitives). - Pages over `/users/{id}/mailFolders/{folder}/messages/delta` per mailbox and persists `@odata.deltaLink` values in `OutlookCheckpoint.delta_links`, so incremental syncs only fetch changed messages. - Supports two scoping modes: - **Tenant-wide** (default): enumerates every user in the tenant via `/users` and syncs each mailbox. Requires `User.Read.All`. - **Targeted**: when `user_ids` is provided (comma-separated UPNs or object IDs), only those mailboxes are synced. `User.Read.All` is not needed in this mode. - Lets the caller pick the mail folder (`inbox`, `sentitems`, `archive`, ...). Defaults to `inbox`. - Maps each message to a `Document` shaped after the Gmail connector: one `TextSection` carrying `From/To/Cc/Subject` headers + body, with HTML bodies stripped to text inline (no extra dependency). - Surfaces typed errors on the validation probe: 401 → `ConnectorMissingCredentialError`, 403 → `InsufficientPermissionsError` (with `Mail.Read` / `User.Read.All` hint), 404 on a configured mailbox → `ConnectorValidationError`, 5xx → `UnexpectedValidationError`. - Skips messages flagged `@removed` by the delta semantics and messages whose `receivedDateTime` is older than `poll_range_start`. #### Files | File | Change | |------|--------| | `common/data_source/outlook_connector.py` | **New** — `OutlookConnector` (`CheckpointedConnectorWithPermSync` + `SlimConnectorWithPermSync`) + `OutlookCheckpoint` + tiny `_strip_html` helper. 
| | `common/data_source/config.py` | `DocumentSource.OUTLOOK = "outlook"`. | | `common/constants.py` | `FileSource.OUTLOOK = "outlook"`. | | `common/data_source/__init__.py` | Export `OutlookConnector`. | | `rag/svr/sync_data_source.py` | `Outlook(SyncBase)` with `batch_size` normalisation, CSV/list parsing of `user_ids`; registered in `func_factory`. | | `web/src/pages/user-setting/data-source/constant/index.tsx` | `DataSourceKey.OUTLOOK`, visibility map (`syncDeletedFiles: true`), info entry, form fields (tenant_id, client_id, client_secret, folder, user_ids, batch_size), default values. | | `web/src/locales/en.ts`, `web/src/locales/zh.ts` | `outlookDescription` + 5 tooltip keys (EN + ZH). | | `test/unit_test/data_source/test_outlook_connector_unit.py` | **New** — 19 unit tests (`p1`/`p2`/`p3`) covering auth, validation (tenant-wide vs specific user vs error paths), checkpoint helpers, user enumeration pagination, message filtering, HTML body stripping. | #### Required Azure AD permissions - `Mail.Read` (Application, admin-granted) — always. - `User.Read.All` (Application, admin-granted) — only when `user_ids` is left blank so the connector can enumerate mailboxes. #### Out of scope - **Attachment indexing.** The current connector emits message body + headers; binary attachments are flagged via `metadata.has_attachments` but not pulled. Adding attachment hydration is straightforward but scoped out per the issue's "decide whether attachments are indexed in the first version" note. - **Delegated (per-user) OAuth.** The connector uses app-only credentials, consistent with the SharePoint / Teams precedent in this codebase. ### Type of change - [x] New Feature (non-breaking change which adds functionality)
287 lines
7.8 KiB
Python
287 lines
7.8 KiB
Python
#
|
|
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
|
|
import os
|
|
from enum import Enum, IntEnum
|
|
from enum import StrEnum
|
|
|
|
# Fixed names used by the service.
SERVICE_CONF = "service_conf.yaml"
RAG_FLOW_SERVICE_NAME = "ragflow"

# Sandbox artifact settings; each one can be overridden through the
# environment variable of the same name. The expiry is parsed with int(),
# so a non-numeric override raises ValueError when this module is imported.
SANDBOX_ARTIFACT_BUCKET = os.getenv("SANDBOX_ARTIFACT_BUCKET", "sandbox-artifacts")
SANDBOX_ARTIFACT_EXPIRE_DAYS = int(os.getenv("SANDBOX_ARTIFACT_EXPIRE_DAYS", "7"))
|
|
|
|
|
|
class CustomEnum(Enum):
    """Enum base class that adds membership and listing helpers."""

    @classmethod
    def valid(cls, value) -> bool:
        """Return True when ``value`` can be converted to a member of this enum.

        The check is a by-value lookup (``cls(value)``). Any ordinary
        exception raised by that lookup means "not valid" and yields False.
        ``KeyboardInterrupt`` and ``SystemExit`` are deliberately NOT
        swallowed, so an interrupt or shutdown request that arrives during
        validation still propagates to the caller.
        """
        try:
            cls(value)
        except Exception:
            return False
        return True

    @classmethod
    def values(cls) -> list:
        """Return the value of every entry in ``__members__``, in definition order.

        ``__members__`` also lists aliases, so a value shared by an alias
        appears once per name.
        """
        return [member.value for member in cls.__members__.values()]

    @classmethod
    def names(cls) -> list:
        """Return ``member.name`` for every entry in ``__members__``, in definition order.

        An alias entry contributes the name of the member it points to.
        """
        return [member.name for member in cls.__members__.values()]
|
|
|
|
|
|
class RetCode(IntEnum, CustomEnum):
    """Integer return codes.

    Members keep their declaration order, which is also the iteration order;
    note that SERVER_ERROR (500) is declared before FORBIDDEN (403).
    """

    # Codes in the 0-109 range.
    SUCCESS = 0
    NOT_EFFECTIVE = 10
    EXCEPTION_ERROR = 100
    ARGUMENT_ERROR = 101
    DATA_ERROR = 102
    OPERATING_ERROR = 103
    CONNECTION_ERROR = 105
    RUNNING = 106
    PERMISSION_ERROR = 108
    AUTHENTICATION_ERROR = 109

    # Codes whose numbers coincide with the HTTP status codes of the same name.
    BAD_REQUEST = 400
    UNAUTHORIZED = 401
    SERVER_ERROR = 500
    FORBIDDEN = 403
    NOT_FOUND = 404
    CONFLICT = 409
|
|
|
|
|
|
class StatusEnum(Enum):
    """Validity flag expressed as the strings "1" (valid) and "0" (invalid)."""

    VALID = "1"
    INVALID = "0"
|
|
|
|
|
|
class ActiveStatusEnum(Enum):
    """Activity flag expressed as the words "active" and "inactive"."""

    ACTIVE = "active"
    INACTIVE = "inactive"
|
|
|
|
|
|
class ActiveEnum(Enum):
    """Activity flag expressed as the strings "1" (active) and "0" (inactive)."""

    ACTIVE = "1"
    INACTIVE = "0"
|
|
|
|
|
|
class LLMType(StrEnum):
|
|
CHAT = "chat"
|
|
EMBEDDING = "embedding"
|
|
SPEECH2TEXT = "speech2text"
|
|
IMAGE2TEXT = "image2text"
|
|
RERANK = "rerank"
|
|
TTS = "tts"
|
|
OCR = "ocr"
|
|
|
|
|
|
class TaskStatus(StrEnum):
|
|
UNSTART = "0"
|
|
RUNNING = "1"
|
|
CANCEL = "2"
|
|
DONE = "3"
|
|
FAIL = "4"
|
|
SCHEDULE = "5"
|
|
|
|
|
|
# Set of accepted task states; it lists every TaskStatus member explicitly.
VALID_TASK_STATUS = {
    TaskStatus.UNSTART,
    TaskStatus.RUNNING,
    TaskStatus.CANCEL,
    TaskStatus.DONE,
    TaskStatus.FAIL,
    TaskStatus.SCHEDULE,
}
|
|
|
|
|
|
class ConnectorTaskType(StrEnum):
|
|
SYNC = "sync"
|
|
PRUNE = "prune"
|
|
|
|
|
|
class ParserType(StrEnum):
|
|
PRESENTATION = "presentation"
|
|
LAWS = "laws"
|
|
MANUAL = "manual"
|
|
PAPER = "paper"
|
|
RESUME = "resume"
|
|
BOOK = "book"
|
|
QA = "qa"
|
|
TABLE = "table"
|
|
NAIVE = "naive"
|
|
PICTURE = "picture"
|
|
ONE = "one"
|
|
AUDIO = "audio"
|
|
EMAIL = "email"
|
|
KG = "knowledge_graph"
|
|
TAG = "tag"
|
|
|
|
|
|
class FileSource(StrEnum):
|
|
LOCAL = ""
|
|
KNOWLEDGEBASE = "knowledgebase"
|
|
RSS = "rss"
|
|
S3 = "s3"
|
|
NOTION = "notion"
|
|
REST_API = "rest_api"
|
|
DISCORD = "discord"
|
|
CONFLUENCE = "confluence"
|
|
GMAIL = "gmail"
|
|
GOOGLE_DRIVE = "google_drive"
|
|
JIRA = "jira"
|
|
SHAREPOINT = "sharepoint"
|
|
SLACK = "slack"
|
|
TEAMS = "teams"
|
|
WEBDAV = "webdav"
|
|
MOODLE = "moodle"
|
|
DROPBOX = "dropbox"
|
|
BOX = "box"
|
|
R2 = "r2"
|
|
OCI_STORAGE = "oci_storage"
|
|
GOOGLE_CLOUD_STORAGE = "google_cloud_storage"
|
|
AIRTABLE = "airtable"
|
|
ASANA = "asana"
|
|
GITHUB = "github"
|
|
GITLAB = "gitlab"
|
|
IMAP = "imap"
|
|
BITBUCKET = "bitbucket"
|
|
ZENDESK = "zendesk"
|
|
SEAFILE = "seafile"
|
|
MYSQL = "mysql"
|
|
POSTGRESQL = "postgresql"
|
|
DINGTALK_AI_TABLE = "dingtalk_ai_table"
|
|
ONEDRIVE = "onedrive"
|
|
OUTLOOK = "outlook"
|
|
|
|
|
|
class PipelineTaskType(StrEnum):
|
|
PARSE = "Parse"
|
|
DOWNLOAD = "Download"
|
|
RAPTOR = "RAPTOR"
|
|
GRAPH_RAG = "GraphRAG"
|
|
MINDMAP = "Mindmap"
|
|
MEMORY = "Memory"
|
|
|
|
|
|
# Set of accepted pipeline task types.
# NOTE(review): PipelineTaskType.MEMORY is not part of this set — confirm
# whether that omission is intentional.
VALID_PIPELINE_TASK_TYPES = {
    PipelineTaskType.PARSE,
    PipelineTaskType.DOWNLOAD,
    PipelineTaskType.RAPTOR,
    PipelineTaskType.GRAPH_RAG,
    PipelineTaskType.MINDMAP,
}
|
|
|
|
|
|
class MCPServerType(StrEnum):
|
|
SSE = "sse"
|
|
STREAMABLE_HTTP = "streamable-http"
|
|
|
|
|
|
# Set of accepted MCP server types; it lists every MCPServerType member explicitly.
VALID_MCP_SERVER_TYPES = {
    MCPServerType.SSE,
    MCPServerType.STREAMABLE_HTTP,
}
|
|
|
|
|
|
class Storage(Enum):
    """Storage backend identifiers, numbered consecutively from 1 to 7."""

    MINIO = 1
    AZURE_SPN = 2
    AZURE_SAS = 3
    AWS_S3 = 4
    OSS = 5
    OPENDAL = 6
    GCS = 7
|
|
|
|
|
|
class MemoryType(Enum):
    """Memory kinds; each member's value is a distinct single bit (1, 2, 4, 8).

    This is a plain Enum, not a Flag, so members themselves do not support
    bitwise operators; combine the ``.value`` integers if a mask is needed.
    """

    RAW = 1 << 0  # 0b0001 = 1
    SEMANTIC = 1 << 1  # 0b0010 = 2
    EPISODIC = 1 << 2  # 0b0100 = 4
    PROCEDURAL = 1 << 3  # 0b1000 = 8
|
|
|
|
|
|
class MemoryStorageType(StrEnum):
|
|
TABLE = "table"
|
|
GRAPH = "graph"
|
|
|
|
|
|
class ForgettingPolicy(StrEnum):
|
|
FIFO = "FIFO"
|
|
|
|
|
|
# environment
|
|
# ENV_STRONG_TEST_COUNT = "STRONG_TEST_COUNT"
|
|
# ENV_RAGFLOW_SECRET_KEY = "RAGFLOW_SECRET_KEY"
|
|
# ENV_REGISTER_ENABLED = "REGISTER_ENABLED"
|
|
# ENV_DOC_ENGINE = "DOC_ENGINE"
|
|
# ENV_SANDBOX_ENABLED = "SANDBOX_ENABLED"
|
|
# ENV_SANDBOX_HOST = "SANDBOX_HOST"
|
|
# ENV_MAX_CONTENT_LENGTH = "MAX_CONTENT_LENGTH"
|
|
# ENV_COMPONENT_EXEC_TIMEOUT = "COMPONENT_EXEC_TIMEOUT"
|
|
# ENV_TRINO_USE_TLS = "TRINO_USE_TLS"
|
|
# ENV_MAX_FILE_NUM_PER_USER = "MAX_FILE_NUM_PER_USER"
|
|
# ENV_MACOS = "MACOS"
|
|
# ENV_RAGFLOW_DEBUGPY_LISTEN = "RAGFLOW_DEBUGPY_LISTEN"
|
|
# ENV_WERKZEUG_RUN_MAIN = "WERKZEUG_RUN_MAIN"
|
|
# ENV_DISABLE_SDK = "DISABLE_SDK"
|
|
# ENV_ENABLE_TIMEOUT_ASSERTION = "ENABLE_TIMEOUT_ASSERTION"
|
|
# ENV_LOG_LEVELS = "LOG_LEVELS"
|
|
# ENV_TENSORRT_DLA_SVR = "TENSORRT_DLA_SVR"
|
|
# ENV_OCR_GPU_MEM_LIMIT_MB = "OCR_GPU_MEM_LIMIT_MB"
|
|
# ENV_OCR_ARENA_EXTEND_STRATEGY = "OCR_ARENA_EXTEND_STRATEGY"
|
|
# ENV_MAX_CONCURRENT_PROCESS_AND_EXTRACT_CHUNK = "MAX_CONCURRENT_PROCESS_AND_EXTRACT_CHUNK"
|
|
# ENV_MAX_MAX_CONCURRENT_CHATS = "MAX_CONCURRENT_CHATS"
|
|
# ENV_RAGFLOW_MCP_BASE_URL = "RAGFLOW_MCP_BASE_URL"
|
|
# ENV_RAGFLOW_MCP_HOST = "RAGFLOW_MCP_HOST"
|
|
# ENV_RAGFLOW_MCP_PORT = "RAGFLOW_MCP_PORT"
|
|
# ENV_RAGFLOW_MCP_LAUNCH_MODE = "RAGFLOW_MCP_LAUNCH_MODE"
|
|
# ENV_RAGFLOW_MCP_HOST_API_KEY = "RAGFLOW_MCP_HOST_API_KEY"
|
|
# ENV_MINERU_EXECUTABLE = "MINERU_EXECUTABLE"
|
|
# ENV_MINERU_APISERVER = "MINERU_APISERVER"
|
|
# ENV_MINERU_OUTPUT_DIR = "MINERU_OUTPUT_DIR"
|
|
# ENV_MINERU_BACKEND = "MINERU_BACKEND"
|
|
# ENV_MINERU_DELETE_OUTPUT = "MINERU_DELETE_OUTPUT"
|
|
# ENV_DOCLING_SERVER_URL = "DOCLING_SERVER_URL"
|
|
# ENV_DOCLING_OUTPUT_DIR = "DOCLING_OUTPUT_DIR"
|
|
# ENV_DOCLING_DELETE_OUTPUT = "DOCLING_DELETE_OUTPUT"
|
|
# ENV_TCADP_OUTPUT_DIR = "TCADP_OUTPUT_DIR"
|
|
# ENV_LM_TIMEOUT_SECONDS = "LM_TIMEOUT_SECONDS"
|
|
# ENV_LLM_MAX_RETRIES = "LLM_MAX_RETRIES"
|
|
# ENV_LLM_BASE_DELAY = "LLM_BASE_DELAY"
|
|
# ENV_OLLAMA_KEEP_ALIVE = "OLLAMA_KEEP_ALIVE"
|
|
# ENV_DOC_BULK_SIZE = "DOC_BULK_SIZE"
|
|
# ENV_EMBEDDING_BATCH_SIZE = "EMBEDDING_BATCH_SIZE"
|
|
# ENV_MAX_CONCURRENT_TASKS = "MAX_CONCURRENT_TASKS"
|
|
# ENV_MAX_CONCURRENT_CHUNK_BUILDERS = "MAX_CONCURRENT_CHUNK_BUILDERS"
|
|
# ENV_MAX_CONCURRENT_MINIO = "MAX_CONCURRENT_MINIO"
|
|
# ENV_WORKER_HEARTBEAT_TIMEOUT = "WORKER_HEARTBEAT_TIMEOUT"
|
|
# ENV_TRACE_MALLOC_ENABLED = "TRACE_MALLOC_ENABLED"
|
|
|
|
# Field names.
PAGERANK_FLD = "pagerank_fea"
TAG_FLD = "tag_feas"

# Task queue name and the consumer group name used with it.
SVR_QUEUE_NAME = "te"
SVR_CONSUMER_GROUP_NAME = "rag_flow_svr_task_broker"
|
|
|
|
# Sentinel page numbers meaning "unlimited".
# - Parsing layer (chunk/Pdf.__call__) uses MAXIMUM_PAGE_NUMBER.
# - Task/DB layer (Task model) uses MAXIMUM_TASK_PAGE_NUMBER, which is
#   MAXIMUM_PAGE_NUMBER * 1000, to avoid collision with user-specified page
#   ranges.
MAXIMUM_PAGE_NUMBER = 100_000
MAXIMUM_TASK_PAGE_NUMBER = 1000 * MAXIMUM_PAGE_NUMBER
|
|
|
|
|
|
# MinerU setting names, and the default value for each of them.
MINERU_ENV_KEYS = [
    "MINERU_APISERVER",
    "MINERU_OUTPUT_DIR",
    "MINERU_BACKEND",
    "MINERU_SERVER_URL",
    "MINERU_DELETE_OUTPUT",
]
MINERU_DEFAULT_CONFIG = {
    "MINERU_APISERVER": "",
    "MINERU_OUTPUT_DIR": "",
    "MINERU_BACKEND": "pipeline",
    "MINERU_SERVER_URL": "",
    "MINERU_DELETE_OUTPUT": 1,  # an int, unlike the string-valued defaults above
}
|
|
|
|
# PaddleOCR setting names, and the default value for each of them.
PADDLEOCR_ENV_KEYS = [
    "PADDLEOCR_API_URL",
    "PADDLEOCR_ACCESS_TOKEN",
    "PADDLEOCR_ALGORITHM",
]
PADDLEOCR_DEFAULT_CONFIG = {
    "PADDLEOCR_API_URL": "",
    "PADDLEOCR_ACCESS_TOKEN": None,  # None rather than an empty string
    "PADDLEOCR_ALGORITHM": "PaddleOCR-VL",
}
|
|
|
|
# OpenDataLoader setting names, and the default value for each of them.
OPENDATALOADER_ENV_KEYS = [
    "OPENDATALOADER_APISERVER",
]
OPENDATALOADER_DEFAULT_CONFIG = {
    "OPENDATALOADER_APISERVER": "",
}
|