2025-11-03 16:32:37 +08:00
|
|
|
#
|
|
|
|
|
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
|
|
|
|
#
|
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
|
#
|
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
#
|
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
|
# limitations under the License.
|
|
|
|
|
#
|
|
|
|
|
|
2026-03-20 20:32:00 +08:00
|
|
|
import os
|
2025-11-04 15:12:53 +08:00
|
|
|
from enum import Enum, IntEnum
|
2026-05-15 08:40:53 +02:00
|
|
|
from enum import StrEnum
|
2025-11-04 15:12:53 +08:00
|
|
|
|
2025-11-03 16:32:37 +08:00
|
|
|
# File name of the service configuration file.
SERVICE_CONF = "service_conf.yaml"
# Name under which the RAGFlow service is identified.
RAG_FLOW_SERVICE_NAME = "ragflow"
# Sandbox artifact settings; both can be overridden through environment
# variables of the same name.
SANDBOX_ARTIFACT_BUCKET = os.environ.get("SANDBOX_ARTIFACT_BUCKET", "sandbox-artifacts")
# Parsed with int() at import time: a non-numeric override raises ValueError.
SANDBOX_ARTIFACT_EXPIRE_DAYS = int(os.environ.get("SANDBOX_ARTIFACT_EXPIRE_DAYS", "7"))
|
2025-11-04 15:12:53 +08:00
|
|
|
|
2026-01-09 17:48:45 +08:00
|
|
|
|
2025-11-04 15:12:53 +08:00
|
|
|
class CustomEnum(Enum):
    """Base ``Enum`` adding helpers to validate and list members."""

    @classmethod
    def valid(cls, value) -> bool:
        """Return True when ``value`` maps to a member of this enum.

        Only lookup failures are treated as "not valid": ``ValueError`` is
        what ``Enum`` raises for an unknown value, and ``TypeError`` covers a
        misbehaving ``_missing_`` hook. Other exceptions (notably
        ``KeyboardInterrupt`` and ``SystemExit``) are no longer swallowed.
        """
        try:
            cls(value)
        except (ValueError, TypeError):
            return False
        return True

    @classmethod
    def values(cls) -> list:
        """Return the value of every entry in ``__members__``, in definition order."""
        return [member.value for member in cls.__members__.values()]

    @classmethod
    def names(cls) -> list:
        """Return the member name of every entry in ``__members__``, in definition order."""
        return [member.name for member in cls.__members__.values()]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class RetCode(IntEnum, CustomEnum):
    """Integer return codes.

    Members compare equal to plain ints (``IntEnum``) and inherit the
    ``valid``/``values``/``names`` helpers from ``CustomEnum``.
    """

    SUCCESS = 0
    NOT_EFFECTIVE = 10
    EXCEPTION_ERROR = 100
    ARGUMENT_ERROR = 101
    DATA_ERROR = 102
    OPERATING_ERROR = 103
    CONNECTION_ERROR = 105
    RUNNING = 106
    PERMISSION_ERROR = 108
    AUTHENTICATION_ERROR = 109
    # The values below coincide with HTTP status codes of the same meaning.
    BAD_REQUEST = 400
    UNAUTHORIZED = 401
    SERVER_ERROR = 500
    FORBIDDEN = 403
    NOT_FOUND = 404
    CONFLICT = 409
|
2025-11-04 15:12:53 +08:00
|
|
|
|
2025-11-04 19:25:25 +08:00
|
|
|
|
|
|
|
|
class StatusEnum(Enum):
    """Validity flag stored as the strings ``"1"`` (valid) and ``"0"`` (invalid)."""

    VALID = "1"
    INVALID = "0"
|
|
|
|
|
|
|
|
|
|
|
2026-05-29 17:39:41 +08:00
|
|
|
class ActiveStatusEnum(Enum):
    """Activity state expressed as a descriptive string."""

    ACTIVE = "active"
    INACTIVE = "inactive"
    UNSUPPORTED = "unsupported"
|
2026-05-29 17:39:41 +08:00
|
|
|
|
|
|
|
|
|
2025-11-04 19:25:25 +08:00
|
|
|
class ActiveEnum(Enum):
    """Activity flag stored as the strings ``"1"`` (active) and ``"0"`` (inactive)."""

    ACTIVE = "1"
    INACTIVE = "0"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class LLMType(StrEnum):
|
2026-01-09 17:48:45 +08:00
|
|
|
CHAT = "chat"
|
|
|
|
|
EMBEDDING = "embedding"
|
|
|
|
|
SPEECH2TEXT = "speech2text"
|
|
|
|
|
IMAGE2TEXT = "image2text"
|
|
|
|
|
RERANK = "rerank"
|
|
|
|
|
TTS = "tts"
|
|
|
|
|
OCR = "ocr"
|
2025-11-04 19:25:25 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
class TaskStatus(StrEnum):
|
|
|
|
|
UNSTART = "0"
|
|
|
|
|
RUNNING = "1"
|
|
|
|
|
CANCEL = "2"
|
|
|
|
|
DONE = "3"
|
|
|
|
|
FAIL = "4"
|
|
|
|
|
SCHEDULE = "5"
|
|
|
|
|
|
|
|
|
|
|
2026-01-09 17:48:45 +08:00
|
|
|
VALID_TASK_STATUS = {TaskStatus.UNSTART, TaskStatus.RUNNING, TaskStatus.CANCEL, TaskStatus.DONE, TaskStatus.FAIL, TaskStatus.SCHEDULE}
|
2025-11-04 19:25:25 +08:00
|
|
|
|
|
|
|
|
|
2026-05-19 10:07:11 +08:00
|
|
|
class ConnectorTaskType(StrEnum):
|
|
|
|
|
SYNC = "sync"
|
|
|
|
|
PRUNE = "prune"
|
|
|
|
|
|
|
|
|
|
|
2025-11-04 19:25:25 +08:00
|
|
|
class ParserType(StrEnum):
|
|
|
|
|
PRESENTATION = "presentation"
|
|
|
|
|
LAWS = "laws"
|
|
|
|
|
MANUAL = "manual"
|
|
|
|
|
PAPER = "paper"
|
|
|
|
|
RESUME = "resume"
|
|
|
|
|
BOOK = "book"
|
|
|
|
|
QA = "qa"
|
|
|
|
|
TABLE = "table"
|
|
|
|
|
NAIVE = "naive"
|
|
|
|
|
PICTURE = "picture"
|
|
|
|
|
ONE = "one"
|
|
|
|
|
AUDIO = "audio"
|
|
|
|
|
EMAIL = "email"
|
|
|
|
|
KG = "knowledge_graph"
|
|
|
|
|
TAG = "tag"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class FileSource(StrEnum):
|
|
|
|
|
LOCAL = ""
|
|
|
|
|
KNOWLEDGEBASE = "knowledgebase"
|
2026-03-27 22:58:44 +08:00
|
|
|
RSS = "rss"
|
2025-11-04 19:25:25 +08:00
|
|
|
S3 = "s3"
|
|
|
|
|
NOTION = "notion"
|
Feature/generic api connector (#13545)
# feat: Add Generic REST API Connector
## What problem does this PR solve?
RAGFlow supports many specific data source connectors (MySQL, Slack,
Google Drive, etc.), but there was no way to connect an arbitrary REST
API as a data source. Users with custom or third-party APIs had to write
a new connector class for each one.
This PR adds a **generic, configuration-driven REST API connector** that
lets users connect any REST API as a data source entirely through the UI
— no code changes needed per API.
---
## Features
### Core Connector (`common/data_source/rest_api_connector.py`)
- Implements `LoadConnector` and `PollConnector` interfaces for full and
incremental sync
- **Configurable authentication:** None, API Key (custom header), Bearer
Token, Basic Auth
- **Pluggable pagination:** Page-based, Offset-based, Cursor-based, or
None
- Smart page-size inference from user's query parameters to avoid
duplicate/conflicting params
- Configurable request delay between pages to prevent API rate limiting
- Auto-detection of the items array in JSON responses (`items`,
`results`, `data`, `records`, or first list found)
- **Advanced field mapping** with dot-notation (`country.name`), array
wildcards (`newsType[*].name`), type hints, and default values
- Optional content template rendering (`"Title: {title}\nBody: {body}"`)
- HTML stripping for content fields
- Stable document IDs via `hash128` from a configurable ID field or
auto-generated from item content
- Pydantic configuration schema with automatic coercion of UI string
inputs to dicts/lists
### Backend Registration (`rag/svr/sync_data_source.py`,
`common/constants.py`, `common/data_source/config.py`)
- `REST_API` sync class wired into RAGFlow's `func_factory`
- Full sync (`load_from_state`) and incremental polling (`poll_source`)
support
- Credentials and config passed from task to connector following
existing patterns (MySQL, SeaFile, etc.)
### Test Connection Endpoint (`api/apps/connector_app.py`)
- `POST /v1/connector/<id>/test` validates config schema,
authentication, and API connectivity without triggering a sync
- Clear error messages for auth failures vs. config issues
### Frontend UI (`web/src/pages/user-setting/data-source/constant/`)
- **Postman-style configuration:** Base URL, Query Parameters (key=value
per line), Auth, Content Fields, Metadata Fields, Pagination Type
- Auth-type-aware form: fields for API key header/value, Bearer token,
or Basic username/password appear only when relevant
- **Advanced Settings** toggle for: Custom Headers, Max Pages, Request
Delay, Poll Timestamp Field, Request Body (POST)
- Connector icon (SVG) and i18n strings (English)
- **"Test Connection"** button to validate before syncing
---
## Controls & Safety
- Configurable max pages safety cap (default: 1000, adjustable in UI)
- Configurable request delay between pages (default: 0.5s, adjustable in
UI)
- Auth errors (401/403) fail immediately without retries; transient
errors retry with exponential backoff
- Diagnostic logging: auth setup confirmation, request details on
failure, content field extraction status
---
## Type of change
- [x] New Feature (non-breaking change which adds functionality)
##Visual Screenshots of Features
<img width="482" height="510" alt="Screenshot 2026-03-11 at 5 19 52 PM"
src="https://github.com/user-attachments/assets/dcb7ab4a-1622-44f3-bb02-d6f0527314c4"
/>
(Connector can be configured within the external data sources tab)
Configuration Parameters:
<img width="661" height="682" alt="Screenshot 2026-03-11 at 5 20 46 PM"
src="https://github.com/user-attachments/assets/5e154e71-4ab5-4872-bfb2-04f02b73c18a"
/>
<img width="661" height="682" alt="Screenshot 2026-03-11 at 5 20 54 PM"
src="https://github.com/user-attachments/assets/00cb14b7-0bcf-4b94-9d71-34e93369ecb2"
/>
Connection can be tested before attaching to dataset:
<img width="981" height="681" alt="Screenshot 2026-03-11 at 5 21 40 PM"
src="https://github.com/user-attachments/assets/aaa6eeeb-89a7-4349-bc34-2423bf8be9ee"
/>
Ingestion tested with API connector (works perfectly fine):
<img width="1062" height="705" alt="Screenshot 2026-03-11 at 5 22 30 PM"
src="https://github.com/user-attachments/assets/afcd0d58-cadd-4152-badc-d2f14d96fbec"
/>
Search & Retrieval works as well with metadata flow:
<img width="1062" height="705" alt="Screenshot 2026-03-11 at 5 23 05 PM"
src="https://github.com/user-attachments/assets/d41ee935-dcf7-4456-b317-22a76ca032c0"
/>
---------
Co-authored-by: Ahmad Intisar <ahmadintisar@Ahmads-MacBook-M4-Pro.local>
Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
2026-05-13 17:35:01 +05:00
|
|
|
REST_API = "rest_api"
|
2025-11-04 19:25:25 +08:00
|
|
|
DISCORD = "discord"
|
|
|
|
|
CONFLUENCE = "confluence"
|
|
|
|
|
GMAIL = "gmail"
|
2025-11-10 19:15:02 +08:00
|
|
|
GOOGLE_DRIVE = "google_drive"
|
2025-11-04 19:25:25 +08:00
|
|
|
JIRA = "jira"
|
|
|
|
|
SHAREPOINT = "sharepoint"
|
|
|
|
|
SLACK = "slack"
|
|
|
|
|
TEAMS = "teams"
|
2025-11-26 07:14:42 +01:00
|
|
|
WEBDAV = "webdav"
|
2025-11-21 12:58:49 +01:00
|
|
|
MOODLE = "moodle"
|
2025-11-25 09:40:03 +08:00
|
|
|
DROPBOX = "dropbox"
|
2025-12-12 10:23:40 +08:00
|
|
|
BOX = "box"
|
2025-12-22 09:36:16 +08:00
|
|
|
R2 = "r2"
|
|
|
|
|
OCI_STORAGE = "oci_storage"
|
|
|
|
|
GOOGLE_CLOUD_STORAGE = "google_cloud_storage"
|
2025-12-25 17:50:41 +08:00
|
|
|
AIRTABLE = "airtable"
|
2025-12-29 13:28:37 +08:00
|
|
|
ASANA = "asana"
|
2025-12-30 15:09:52 +08:00
|
|
|
GITHUB = "github"
|
2025-12-29 17:05:20 +08:00
|
|
|
GITLAB = "gitlab"
|
2025-12-30 17:09:13 +08:00
|
|
|
IMAP = "imap"
|
2025-12-31 17:18:30 +08:00
|
|
|
BITBUCKET = "bitbucket"
|
2025-12-31 14:40:49 +08:00
|
|
|
ZENDESK = "zendesk"
|
2026-02-03 06:42:05 +01:00
|
|
|
SEAFILE = "seafile"
|
2026-02-03 23:14:32 -03:00
|
|
|
MYSQL = "mysql"
|
|
|
|
|
POSTGRESQL = "postgresql"
|
2026-03-06 21:13:23 +08:00
|
|
|
DINGTALK_AI_TABLE = "dingtalk_ai_table"
|
feat(connector): implement OneDrive data source connector (issue #15330) (#15331)
### What problem does this PR solve?
Closes #15330.
RAGFlow had no connector for OneDrive / OneDrive for Business. Users who
store working documents in OneDrive could not index them into a
knowledge base without manually downloading and re-uploading files.
This PR adds a net-new OneDrive data source that:
- Authenticates against Microsoft Graph with the same MSAL
client-credentials flow already used by the SharePoint and Teams
connectors (no new auth primitives).
- Enumerates every drive visible to the service principal and pages
through `/drives/{id}/root/delta`, persisting `@odata.deltaLink` values
per drive so subsequent syncs only fetch changed items.
- Optionally narrows ingestion to a sub-folder (`folder_path`) without
needing a separate code path.
- Surfaces typed errors on the validation probe (`GET /drives?$top=1`):
401 → `ConnectorMissingCredentialError`, 403 →
`InsufficientPermissionsError` (with a `Files.Read.All` hint), 5xx →
`UnexpectedValidationError`.
- Filters folders, soft-deleted items, and unsupported extensions (`.pdf
.docx .doc .xlsx .xls .pptx .ppt .txt .md .csv`).
#### Files
| File | Change |
|------|--------|
| `common/data_source/onedrive_connector.py` | **New** —
`OneDriveConnector` + `OneDriveCheckpoint`. |
| `common/data_source/config.py` | `DocumentSource.ONEDRIVE =
"onedrive"`. |
| `common/constants.py` | `FileSource.ONEDRIVE = "onedrive"`. |
| `common/data_source/__init__.py` | Export `OneDriveConnector`. |
| `rag/svr/sync_data_source.py` | `OneDrive(SyncBase)` with `batch_size`
normalisation; registered in `func_factory`. |
| `web/src/pages/user-setting/data-source/constant/index.tsx` |
`DataSourceKey.ONEDRIVE`, visibility map (`syncDeletedFiles: true`),
info entry, form fields (tenant_id, client_id, client_secret,
folder_path, batch_size), default values. |
| `web/src/locales/en.ts`, `web/src/locales/zh.ts` |
`onedriveDescription` + 4 tooltip keys (EN + ZH). |
| `test/unit_test/data_source/test_onedrive_connector_unit.py` | **New**
— 13 unit tests (`p1`/`p2`) covering auth, validation, checkpoint
helpers, and document filtering. |
#### Required Azure AD permission
`Files.Read.All` (Application, admin-granted).
#### Out of scope
- Interactive end-user OAuth (delegated permissions) — the connector
uses app-only credentials, consistent with the SharePoint / Teams
precedent.
- Binary download of file contents — the sync layer emits `Document`s
carrying `webUrl` + metadata; bytes are hydrated downstream by the parse
pipeline.
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
2026-05-29 05:26:06 -06:00
|
|
|
ONEDRIVE = "onedrive"
|
feat(connector): implement Outlook data source connector (issue #15332) (#15333)
### What problem does this PR solve?
Closes #15332.
RAGFlow can index Gmail and generic IMAP mailboxes but had no native
connector for Outlook / Microsoft 365 mail. Organisations on Microsoft
365 had no way to bring mailbox content into a knowledge base through
Microsoft Graph.
This PR adds a net-new Outlook data source that:
- Authenticates against Microsoft Graph with the same MSAL
client-credentials flow already used by the SharePoint and Teams
connectors (no new auth primitives).
- Pages over `/users/{id}/mailFolders/{folder}/messages/delta` per
mailbox and persists `@odata.deltaLink` values in
`OutlookCheckpoint.delta_links`, so incremental syncs only fetch changed
messages.
- Supports two scoping modes:
- **Tenant-wide** (default): enumerates every user in the tenant via
`/users` and syncs each mailbox. Requires `User.Read.All`.
- **Targeted**: when `user_ids` is provided (comma-separated UPNs or
object IDs), only those mailboxes are synced. `User.Read.All` is not
needed in this mode.
- Lets the caller pick the mail folder (`inbox`, `sentitems`, `archive`,
...). Defaults to `inbox`.
- Maps each message to a `Document` shaped after the Gmail connector:
one `TextSection` carrying `From/To/Cc/Subject` headers + body, with
HTML bodies stripped to text inline (no extra dependency).
- Surfaces typed errors on the validation probe:
401 → `ConnectorMissingCredentialError`, 403 →
`InsufficientPermissionsError` (with `Mail.Read` / `User.Read.All`
hint), 404 on a configured mailbox → `ConnectorValidationError`, 5xx →
`UnexpectedValidationError`.
- Skips messages flagged `@removed` by the delta semantics and messages
whose `receivedDateTime` is older than `poll_range_start`.
#### Files
| File | Change |
|------|--------|
| `common/data_source/outlook_connector.py` | **New** —
`OutlookConnector` (`CheckpointedConnectorWithPermSync` +
`SlimConnectorWithPermSync`) + `OutlookCheckpoint` + tiny `_strip_html`
helper. |
| `common/data_source/config.py` | `DocumentSource.OUTLOOK = "outlook"`.
|
| `common/constants.py` | `FileSource.OUTLOOK = "outlook"`. |
| `common/data_source/__init__.py` | Export `OutlookConnector`. |
| `rag/svr/sync_data_source.py` | `Outlook(SyncBase)` with `batch_size`
normalisation, CSV/list parsing of `user_ids`; registered in
`func_factory`. |
| `web/src/pages/user-setting/data-source/constant/index.tsx` |
`DataSourceKey.OUTLOOK`, visibility map (`syncDeletedFiles: true`), info
entry, form fields (tenant_id, client_id, client_secret, folder,
user_ids, batch_size), default values. |
| `web/src/locales/en.ts`, `web/src/locales/zh.ts` |
`outlookDescription` + 5 tooltip keys (EN + ZH). |
| `test/unit_test/data_source/test_outlook_connector_unit.py` | **New**
— 19 unit tests (`p1`/`p2`/`p3`) covering auth, validation (tenant-wide
vs specific user vs error paths), checkpoint helpers, user enumeration
pagination, message filtering, HTML body stripping. |
#### Required Azure AD permissions
- `Mail.Read` (Application, admin-granted) — always.
- `User.Read.All` (Application, admin-granted) — only when `user_ids` is
left blank so the connector can enumerate mailboxes.
#### Out of scope
- **Attachment indexing.** The current connector emits message body +
headers; binary attachments are flagged via `metadata.has_attachments`
but not pulled. Adding attachment hydration is straightforward but
scoped out per the issue's "decide whether attachments are indexed in
the first version" note.
- **Delegated (per-user) OAuth.** The connector uses app-only
credentials, consistent with the SharePoint / Teams precedent in this
codebase.
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
2026-05-29 07:52:29 -06:00
|
|
|
OUTLOOK = "outlook"
|
feat(connectors): add Salesforce CRM data source connector (#15462)
### What problem does this PR solve?
Closes #15461.
RAGFlow had no way to ingest Salesforce CRM data, so support / sales
teams couldn't ground responses on live Accounts, Contacts,
Opportunities, Cases, or Knowledge articles. This adds a first-class
Salesforce data source connector that authenticates against a Connected
App via OAuth 2.0 client-credentials, queries selected SObjects via
SOQL, and turns each record into an indexable document with incremental
sync.
**Highlights**
- `common/data_source/salesforce_connector.py`: new
`SalesforceConnector` (`CheckpointedConnectorWithPermSync` +
`SlimConnectorWithPermSync`).
- OAuth 2.0 client-credentials flow; canonical `instance_url` from the
token response so multi-pod orgs route correctly.
- Per-object `SystemModstamp` cursor stored in
`SalesforceCheckpoint.cursors` — a failure mid-object doesn't rewind
sibling objects, and re-syncs only fetch changed rows.
- Deterministic record-to-text formatter (sorted keys) so SOQL field
reordering on the server doesn't mark every row "changed" on each poll.
- `_get_json` raises on non-2xx so 429 / 5xx never silently advance the
checkpoint past missing data.
- `Knowledge__kav` is in the default object set but is skipped silently
when the org doesn't have Salesforce Knowledge enabled (404 on
describe).
- Slim-doc IDs are scoped as `<Object>/<Id>` so prune deletes can't
collide across object types.
- `common/constants.py`, `common/data_source/config.py`,
`common/data_source/__init__.py`: register `salesforce` in `FileSource`
/ `DocumentSource` and export `SalesforceConnector`.
- `rag/svr/sync_data_source.py`: new `Salesforce(SyncBase)` class routed
through `load_from_checkpoint` (poll_source would re-walk every object
each run) and added to `func_factory`.
- Frontend:
- `web/src/pages/user-setting/data-source/constant/index.tsx`: new
`DataSourceKey.SALESFORCE`, form fields (instance URL, client ID/secret,
objects, api_version, batch size), `syncDeletedFiles` capability,
default form values, and tile entry with the new icon.
- `web/src/locales/{en,zh}.ts`: description + per-field tooltips.
- `web/src/assets/svg/data-source/salesforce.svg`: 48x48 brand-style
icon to match the other Microsoft / cloud tiles.
**Verification**
- `npm run build` (vite + esbuild) passes (1m 26s).
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
2026-06-04 23:24:36 -06:00
|
|
|
SALESFORCE = "salesforce"
|
feat(connectors): add Azure Blob Storage data source connector (#15466)
### What problem does this PR solve?
Closes #15465.
RAGFlow supports S3, Google Cloud Storage, R2, and OCI as data sources
but not Azure Blob Storage, leaving Azure users without a way to index
container objects into a knowledge base. This adds a first-class Azure
Blob Storage data-source connector — distinct from RAGFlow's existing
Azure storage *backends* (`rag/utils/azure_sas_conn.py`,
`rag/utils/azure_spn_conn.py`) which store RAGFlow's own files.
**Highlights**
- `common/data_source/azure_blob_connector.py`: new `AzureBlobConnector`
(`CheckpointedConnectorWithPermSync` + `SlimConnectorWithPermSync`).
- Uses the existing `azure-storage-blob` dependency (already in
`pyproject.toml`).
- Three auth modes, tried in order of precedence:
1. **Account key** — `account_name` + `account_key` + `container_name`.
2. **Connection string** — `connection_string` + `container_name`.
3. **SAS token** — `container_url` + `sas_token` (same shape as
`RAGFlowAzureSasBlob`).
- ETag fingerprint stored per blob in `AzureBlobCheckpoint.etags` —
unchanged blobs (same ETag as last run) are skipped without a download.
Only new/modified blobs are fetched.
- Optional `prefix` scopes indexing to a virtual folder.
- `validate_connector_settings()` probes `get_container_properties()`
and maps `AuthenticationFailed / 403 / ContainerNotFound` to typed
connector exceptions.
- Slim-doc IDs are blob names so prune reconciles correctly.
- `common/constants.py`, `common/data_source/config.py`,
`common/data_source/__init__.py`: register `azure_blob` in `FileSource`
/ `DocumentSource` and export `AzureBlobConnector`.
- `rag/svr/sync_data_source.py`: new `AzureBlob(SyncBase)` class routed
through `load_from_checkpoint` (ETag fingerprint owns change-detection)
and added to `func_factory`.
- Frontend:
- `web/src/pages/user-setting/data-source/constant/index.tsx`: new
`DataSourceKey.AZURE_BLOB`, auth-mode selector (account key / connection
string / SAS token), all credential fields, prefix + batch-size,
`syncDeletedFiles` capability, default form values, tile entry with
icon.
- `web/src/locales/{en,zh}.ts`: description + per-field tooltips for all
9 new keys.
- `web/src/assets/svg/data-source/azure-blob.svg`: Azure-branded
stacked-cylinders icon.
**Verification**
- `npm run build` (vite + esbuild) passes (37 s).
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
2026-06-04 07:06:01 -06:00
|
|
|
AZURE_BLOB = "azure_blob"
|
2025-12-30 17:09:13 +08:00
|
|
|
|
2026-01-09 17:48:45 +08:00
|
|
|
|
2025-11-04 19:25:25 +08:00
|
|
|
class PipelineTaskType(StrEnum):
|
|
|
|
|
PARSE = "Parse"
|
|
|
|
|
DOWNLOAD = "Download"
|
|
|
|
|
RAPTOR = "RAPTOR"
|
|
|
|
|
GRAPH_RAG = "GraphRAG"
|
|
|
|
|
MINDMAP = "Mindmap"
|
2025-12-30 11:41:38 +08:00
|
|
|
MEMORY = "Memory"
|
2025-11-04 19:25:25 +08:00
|
|
|
|
|
|
|
|
|
2026-01-09 17:48:45 +08:00
|
|
|
VALID_PIPELINE_TASK_TYPES = {PipelineTaskType.PARSE, PipelineTaskType.DOWNLOAD, PipelineTaskType.RAPTOR, PipelineTaskType.GRAPH_RAG, PipelineTaskType.MINDMAP}
|
|
|
|
|
|
2025-11-04 19:25:25 +08:00
|
|
|
|
2025-11-05 08:01:39 +08:00
|
|
|
class MCPServerType(StrEnum):
|
|
|
|
|
SSE = "sse"
|
|
|
|
|
STREAMABLE_HTTP = "streamable-http"
|
|
|
|
|
|
2026-01-09 17:48:45 +08:00
|
|
|
|
2025-11-05 08:01:39 +08:00
|
|
|
VALID_MCP_SERVER_TYPES = {MCPServerType.SSE, MCPServerType.STREAMABLE_HTTP}
|
|
|
|
|
|
2026-01-09 17:48:45 +08:00
|
|
|
|
2025-11-06 09:36:38 +08:00
|
|
|
class Storage(Enum):
    """Storage backends, numbered consecutively from 1."""

    MINIO = 1
    AZURE_SPN = 2
    AZURE_SAS = 3
    AWS_S3 = 4
    OSS = 5
    OPENDAL = 6
    GCS = 7
|
2025-11-06 09:36:38 +08:00
|
|
|
|
2025-12-10 13:34:08 +08:00
|
|
|
|
|
|
|
|
class MemoryType(Enum):
    """Memory categories, each assigned a distinct single-bit integer value.

    NOTE(review): the one-bit-per-member layout suggests the values are meant
    to be OR-ed into a mask; this is a plain ``Enum`` (not ``Flag``), so any
    combining must happen on ``.value`` - confirm against callers.
    """

    RAW = 0b0001  # 1 << 0 = 1 (0b00000001)
    SEMANTIC = 0b0010  # 1 << 1 = 2 (0b00000010)
    EPISODIC = 0b0100  # 1 << 2 = 4 (0b00000100)
    PROCEDURAL = 0b1000  # 1 << 3 = 8 (0b00001000)
|
2025-12-10 13:34:08 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
class MemoryStorageType(StrEnum):
|
|
|
|
|
TABLE = "table"
|
|
|
|
|
GRAPH = "graph"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class ForgettingPolicy(StrEnum):
|
2025-12-26 11:18:08 +08:00
|
|
|
FIFO = "FIFO"
|
2025-12-10 13:34:08 +08:00
|
|
|
|
|
|
|
|
|
2025-11-03 16:32:37 +08:00
|
|
|
# environment
|
|
|
|
|
# ENV_STRONG_TEST_COUNT = "STRONG_TEST_COUNT"
|
|
|
|
|
# ENV_RAGFLOW_SECRET_KEY = "RAGFLOW_SECRET_KEY"
|
|
|
|
|
# ENV_REGISTER_ENABLED = "REGISTER_ENABLED"
|
|
|
|
|
# ENV_DOC_ENGINE = "DOC_ENGINE"
|
|
|
|
|
# ENV_SANDBOX_ENABLED = "SANDBOX_ENABLED"
|
|
|
|
|
# ENV_SANDBOX_HOST = "SANDBOX_HOST"
|
|
|
|
|
# ENV_MAX_CONTENT_LENGTH = "MAX_CONTENT_LENGTH"
|
|
|
|
|
# ENV_COMPONENT_EXEC_TIMEOUT = "COMPONENT_EXEC_TIMEOUT"
|
|
|
|
|
# ENV_TRINO_USE_TLS = "TRINO_USE_TLS"
|
|
|
|
|
# ENV_MAX_FILE_NUM_PER_USER = "MAX_FILE_NUM_PER_USER"
|
|
|
|
|
# ENV_MACOS = "MACOS"
|
|
|
|
|
# ENV_RAGFLOW_DEBUGPY_LISTEN = "RAGFLOW_DEBUGPY_LISTEN"
|
|
|
|
|
# ENV_WERKZEUG_RUN_MAIN = "WERKZEUG_RUN_MAIN"
|
|
|
|
|
# ENV_DISABLE_SDK = "DISABLE_SDK"
|
|
|
|
|
# ENV_ENABLE_TIMEOUT_ASSERTION = "ENABLE_TIMEOUT_ASSERTION"
|
|
|
|
|
# ENV_LOG_LEVELS = "LOG_LEVELS"
|
|
|
|
|
# ENV_TENSORRT_DLA_SVR = "TENSORRT_DLA_SVR"
|
|
|
|
|
# ENV_OCR_GPU_MEM_LIMIT_MB = "OCR_GPU_MEM_LIMIT_MB"
|
|
|
|
|
# ENV_OCR_ARENA_EXTEND_STRATEGY = "OCR_ARENA_EXTEND_STRATEGY"
|
|
|
|
|
# ENV_MAX_CONCURRENT_PROCESS_AND_EXTRACT_CHUNK = "MAX_CONCURRENT_PROCESS_AND_EXTRACT_CHUNK"
|
|
|
|
|
# ENV_MAX_MAX_CONCURRENT_CHATS = "MAX_CONCURRENT_CHATS"
|
|
|
|
|
# ENV_RAGFLOW_MCP_BASE_URL = "RAGFLOW_MCP_BASE_URL"
|
|
|
|
|
# ENV_RAGFLOW_MCP_HOST = "RAGFLOW_MCP_HOST"
|
|
|
|
|
# ENV_RAGFLOW_MCP_PORT = "RAGFLOW_MCP_PORT"
|
|
|
|
|
# ENV_RAGFLOW_MCP_LAUNCH_MODE = "RAGFLOW_MCP_LAUNCH_MODE"
|
|
|
|
|
# ENV_RAGFLOW_MCP_HOST_API_KEY = "RAGFLOW_MCP_HOST_API_KEY"
|
|
|
|
|
# ENV_MINERU_EXECUTABLE = "MINERU_EXECUTABLE"
|
|
|
|
|
# ENV_MINERU_APISERVER = "MINERU_APISERVER"
|
|
|
|
|
# ENV_MINERU_OUTPUT_DIR = "MINERU_OUTPUT_DIR"
|
|
|
|
|
# ENV_MINERU_BACKEND = "MINERU_BACKEND"
|
|
|
|
|
# ENV_MINERU_DELETE_OUTPUT = "MINERU_DELETE_OUTPUT"
|
2026-03-12 18:09:03 +09:00
|
|
|
# ENV_DOCLING_SERVER_URL = "DOCLING_SERVER_URL"
|
|
|
|
|
# ENV_DOCLING_OUTPUT_DIR = "DOCLING_OUTPUT_DIR"
|
|
|
|
|
# ENV_DOCLING_DELETE_OUTPUT = "DOCLING_DELETE_OUTPUT"
|
2025-11-03 16:32:37 +08:00
|
|
|
# ENV_TCADP_OUTPUT_DIR = "TCADP_OUTPUT_DIR"
|
|
|
|
|
# ENV_LM_TIMEOUT_SECONDS = "LM_TIMEOUT_SECONDS"
|
|
|
|
|
# ENV_LLM_MAX_RETRIES = "LLM_MAX_RETRIES"
|
|
|
|
|
# ENV_LLM_BASE_DELAY = "LLM_BASE_DELAY"
|
|
|
|
|
# ENV_OLLAMA_KEEP_ALIVE = "OLLAMA_KEEP_ALIVE"
|
|
|
|
|
# ENV_DOC_BULK_SIZE = "DOC_BULK_SIZE"
|
|
|
|
|
# ENV_EMBEDDING_BATCH_SIZE = "EMBEDDING_BATCH_SIZE"
|
|
|
|
|
# ENV_MAX_CONCURRENT_TASKS = "MAX_CONCURRENT_TASKS"
|
|
|
|
|
# ENV_MAX_CONCURRENT_CHUNK_BUILDERS = "MAX_CONCURRENT_CHUNK_BUILDERS"
|
|
|
|
|
# ENV_MAX_CONCURRENT_MINIO = "MAX_CONCURRENT_MINIO"
|
|
|
|
|
# ENV_WORKER_HEARTBEAT_TIMEOUT = "WORKER_HEARTBEAT_TIMEOUT"
|
|
|
|
|
# ENV_TRACE_MALLOC_ENABLED = "TRACE_MALLOC_ENABLED"
|
2025-11-06 09:36:38 +08:00
|
|
|
|
|
|
|
|
# Field name constant for page rank (presumably a document-store field - confirm).
PAGERANK_FLD = "pagerank_fea"
# Queue and consumer-group names for the task broker.
# NOTE(review): "te" is unusually short for a queue name - confirm it is intended.
SVR_QUEUE_NAME = "te"
SVR_CONSUMER_GROUP_NAME = "rag_flow_svr_task_broker"
# Field name constant for tags (presumably a document-store field - confirm).
TAG_FLD = "tag_feas"
|
2025-12-09 18:54:14 +08:00
|
|
|
|
Fix: Remove hardcoded page limits causing parsing failures on large PDFs (>300 pages) (#14382)
### What problem does this PR solve?
Fixes #14196
## Problem
When using DeepDOC to parse large PDFs (over 1000 pages), the parser
silently truncated processing at 300 pages due to a hardcoded default
`page_to=299` in `RAGFlowPdfParser.__images__()`. This caused:
- **Errors** on pages beyond the limit
- **Poor image quality** as the parser attempted to compensate with
missing page data
- **Inconsistent chunk splitting** between full PDF imports and partial
imports
Additionally, the codebase scattered magic numbers (`299`, `600`,
`10000`, `100000`, `100000000`, `10000000000`, `10**9`) across 22 files
as sentinel values for "parse all pages", making future maintenance
error-prone.
## Root Cause
```python
# deepdoc/parser/pdf_parser.py (before)
def __images__(self, fnm, zoomin=3, page_from=0, page_to=299, callback=None):
# Only the first 300 pages were rendered; everything beyond was silently dropped
```
While most callers in `rag/app/*.py` correctly passed `to_page=100000`,
the base class `RAGFlowPdfParser.__call__()` and `parse_into_bboxes()`
invoked `__images__` **without** forwarding `page_from`/`page_to`,
falling back to the restrictive default of 299.
## Solution
### 1. Define constants in `common/constants.py`
```python
MAXIMUM_PAGE_NUMBER = 100000 # Used by the parsing layer
MAXIMUM_TASK_PAGE_NUMBER = MAXIMUM_PAGE_NUMBER * 1000 # Used by the task/DB layer
```
### 2. Replace all hardcoded sentinel values
| Layer | Files Changed | Old Values | New Value |
|---|---|---|---|
| **Deepdoc parsers** | `pdf_parser.py`, `mineru_parser.py`,
`docling_parser.py`, `opendataloader_parser.py`, `paddleocr_parser.py`,
`docx_parser.py` | `299`, `600`, `10**9`, `100000000` |
`MAXIMUM_PAGE_NUMBER` |
| **Chunk parsers** | `naive.py`, `book.py`, `qa.py`, `one.py`,
`manual.py`, `paper.py`, `presentation.py`, `laws.py`, `resume.py`,
`email.py`, `table.py` | `100000`, `10000`, `10000000000` |
`MAXIMUM_PAGE_NUMBER` |
| **Task/DB layer** | `db_models.py`, `task_service.py`,
`document_service.py`, `file_service.py` | `100000000` |
`MAXIMUM_TASK_PAGE_NUMBER` |
### 3. Fix `parse_into_bboxes()` missing parameters
Added `from_page`/`to_page` parameters to `parse_into_bboxes()` so that
the `rag/flow/parser/parser.py` DeepDOC path no longer falls back to the
restrictive default.
## Files Changed (22)
- `common/constants.py`
- `deepdoc/parser/pdf_parser.py`
- `deepdoc/parser/mineru_parser.py`
- `deepdoc/parser/docling_parser.py`
- `deepdoc/parser/opendataloader_parser.py`
- `deepdoc/parser/paddleocr_parser.py`
- `deepdoc/parser/docx_parser.py`
- `rag/app/naive.py`
- `rag/app/book.py`
- `rag/app/qa.py`
- `rag/app/one.py`
- `rag/app/manual.py`
- `rag/app/paper.py`
- `rag/app/presentation.py`
- `rag/app/laws.py`
- `rag/app/resume.py`
- `rag/app/email.py`
- `rag/app/table.py`
- `api/db/db_models.py`
- `api/db/services/task_service.py`
- `api/db/services/document_service.py`
- `api/db/services/file_service.py`
### Type of change
- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
---------
Signed-off-by: noob <yixiao121314@outlook.com>
2026-04-27 06:57:20 +00:00
|
|
|
# Maximum page number used as "unlimited" sentinel value.
# Parsing layer (chunk/Pdf.__call__) uses MAXIMUM_PAGE_NUMBER.
# Task/DB layer (Task model) uses MAXIMUM_PAGE_NUMBER * 1000 to avoid collision with user-specified page ranges.
MAXIMUM_PAGE_NUMBER = 100000
MAXIMUM_TASK_PAGE_NUMBER = MAXIMUM_PAGE_NUMBER * 1000
|
|
|
|
|
|
2025-12-09 18:54:14 +08:00
|
|
|
|
|
|
|
|
# Names of the MinerU configuration keys.
MINERU_ENV_KEYS = ["MINERU_APISERVER", "MINERU_OUTPUT_DIR", "MINERU_BACKEND", "MINERU_SERVER_URL", "MINERU_DELETE_OUTPUT"]
# Default value for each key in MINERU_ENV_KEYS. All defaults are strings
# except MINERU_DELETE_OUTPUT, which is the integer 1.
MINERU_DEFAULT_CONFIG = {
    "MINERU_APISERVER": "",
    "MINERU_OUTPUT_DIR": "",
    "MINERU_BACKEND": "pipeline",
    "MINERU_SERVER_URL": "",
    "MINERU_DELETE_OUTPUT": 1,
}
|
2026-01-09 17:48:45 +08:00
|
|
|
|
refactor(paddleocr): migrate from sync API to async Job API (#15967)
## Summary
Migrate PaddleOCR integration from the deprecated synchronous HTTP API
to the new asynchronous Job API (`submit → poll → fetch`), aligning with
PaddleOCR 3.6.0+ architecture.
## Changes
### Python (`deepdoc/parser/paddleocr_parser.py`)
- Replace synchronous `requests.post()` with async Job API flow (submit
→ poll → fetch)
- Authentication: `token {token}` → `Bearer {token}`
- File transfer: base64 JSON body → multipart file upload
- Polling: exponential backoff (initial 3s, ×1.5, max 15s, timeout
controlled by `request_timeout`)
- Result: fetch full JSONL from result URL, preserving `prunedResult`
with bbox info for crop functionality
- Rename `api_url` → `base_url` (backward compatible: `api_url` still
accepted as fallback)
### Python (`rag/llm/ocr_model.py`)
- Prefer `paddleocr_base_url` / `PADDLEOCR_BASE_URL`, fallback to
`paddleocr_api_url` / `PADDLEOCR_API_URL`
### Go (`internal/entity/models/paddleocr.go`)
- Add `Client-Platform: ragflow` header to submit and poll requests
- Change polling from fixed 3s to exponential backoff (initial 3s, ×1.5,
max 15s)
### Python (`common/constants.py`)
- Add `PADDLEOCR_BASE_URL` to env keys and default config
## Backward Compatibility
- Old env var `PADDLEOCR_API_URL` still works (used as fallback)
- Frontend field `paddleocr_api_url` still works (backend reads it as
fallback)
- No user-facing configuration changes required for existing setups
## Why not use the `paddleocr` SDK package directly?
RAGFlow's `_transfer_to_sections()` relies on `prunedResult` (containing
`block_bbox`, `block_label`, `parsing_res_list`) from the raw API
response for PDF crop functionality. The SDK's public `parse_document()`
API only returns `DocParsingResult` with `markdown_text`, discarding the
bbox data. Therefore we implement the async Job API flow directly via
HTTP, following the same logic as the SDK internally.
2026-06-16 19:34:21 +08:00
|
|
|
# PaddleOCR environment variable names. PADDLEOCR_API_URL is the older name,
# still accepted as a fallback for PADDLEOCR_BASE_URL.
PADDLEOCR_ENV_KEYS = [
    "PADDLEOCR_BASE_URL",
    "PADDLEOCR_API_URL",
    "PADDLEOCR_ACCESS_TOKEN",
    "PADDLEOCR_ALGORITHM",
]
|
2026-01-09 17:48:45 +08:00
|
|
|
# Default value for every key listed in PADDLEOCR_ENV_KEYS.
# PADDLEOCR_BASE_URL is the preferred endpoint setting; PADDLEOCR_API_URL is
# kept so that existing configurations keep working as a fallback.
# The access token defaults to None (no token) rather than an empty string.
PADDLEOCR_DEFAULT_CONFIG = {
    "PADDLEOCR_BASE_URL": "",
    "PADDLEOCR_API_URL": "",
    "PADDLEOCR_ACCESS_TOKEN": None,
    "PADDLEOCR_ALGORITHM": "PaddleOCR-VL",
}
|
Feat: add OpenDataLoader PDF parser backend (#14058) (#14097)
### What problem does this PR solve?
Closes #14058.
RAGFlow supports multiple PDF parsing backends (DeepDOC, MinerU,
Docling, TCADP, PaddleOCR). This PR adds **OpenDataLoader**
([opendataloader-project/opendataloader-pdf](https://github.com/opendataloader-project/opendataloader-pdf))
as a new optional backend, giving users a deterministic, local-first
alternative with competitive table extraction accuracy.
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
- [x] Documentation Update
---
### Changes
#### Backend
- `deepdoc/parser/opendataloader_parser.py` — new `OpenDataLoaderParser`
class inheriting `RAGFlowPdfParser`. Implements `check_installation()`
(guards Python package + Java 11+ runtime), `parse_pdf()` with
JSON-first extraction (heading/paragraph/table/list/image/formula) and
Markdown fallback, position-tag generation compatible with the shared
`@@page\tx0\tx1\ty0\ty1##` format, and temp-dir lifecycle with cleanup.
- `rag/app/naive.py` — new `by_opendataloader()` wrapper, registered in
`PARSERS` dict, added to `chunk_token_num=0` override list.
- `rag/flow/parser/parser.py` — `"opendataloader"` branch in the
pipeline PDF handler + check validation list.
#### Infrastructure
- `docker/entrypoint.sh` — `ensure_opendataloader()` function: opt-in
via `USE_OPENDATALOADER=true`, skips gracefully if Java is not on PATH.
#### Frontend
- `web/src/components/layout-recognize-form-field.tsx` —
`OpenDataLoader` added to `ParseDocumentType` enum and parser dropdown.
Cascades automatically to the pipeline editor's Parser component.
#### Docs
- `docs/guides/dataset/select_pdf_parser.md` — added OpenDataLoader
entry and full env-var reference.
---
### Environment variables
| Variable | Default | Description |
|---|---|---|
| `USE_OPENDATALOADER` | `false` | Set `true` to install
`opendataloader-pdf` on container startup |
| `OPENDATALOADER_VERSION` | latest | Pin the PyPI release (e.g.
`==2.2.1`) |
| `OPENDATALOADER_HYBRID` | _(unset)_ | Enable hybrid AI mode (e.g.
`docling-fast`) |
| `OPENDATALOADER_IMAGE_OUTPUT` | _(unset)_ | `off` / `embedded` /
`external` |
| `OPENDATALOADER_OUTPUT_DIR` | _(tmp)_ | Persistent output dir; temp
dir used + cleaned if unset |
| `OPENDATALOADER_DELETE_OUTPUT` | `1` | `0` to retain intermediate
files for debugging |
| `OPENDATALOADER_SANITIZE` | _(unset)_ | `1` to filter prompt-injection
patterns from output |
---
### Dependencies
- **Runtime**: `opendataloader-pdf` (PyPI, Apache 2.0) — opt-in, not
added to `pyproject.toml` core deps. Installed by
`ensure_opendataloader()` at container startup when
`USE_OPENDATALOADER=true`.
- **System**: Java 11+ on PATH (JVM is the underlying engine). The
installer skips with a warning if `java` is not found.
---
### How to test
**Standalone parser:**
```bash
source .venv/bin/activate
uv pip install opendataloader-pdf
python3 -c "
import sys; sys.path.insert(0, '.')
from deepdoc.parser.opendataloader_parser import OpenDataLoaderParser
p = OpenDataLoaderParser()
print('available:', p.check_installation())
s, t = p.parse_pdf('path/to/test.pdf', parse_method='pipeline')
print(f'sections={len(s)} tables={len(t)}')
"
```
### Benchmark vs Docling
```
file parser secs sections tables
----------------------------------------------------------------------
text-heavy.pdf docling 45.29 148 10
text-heavy.pdf opendataloader 3.14 559 0
table-heavy.pdf docling 7.05 76 3
table-heavy.pdf opendataloader 3.71 90 0
complex.pdf docling 42.67 114 8
complex.pdf opendataloader 3.51 180 0
```
2026-04-24 18:33:02 +02:00
|
|
|
|
|
|
|
|
# OpenDataLoader environment variable names; each one has an entry in
# OPENDATALOADER_DEFAULT_CONFIG.
OPENDATALOADER_ENV_KEYS = [
    "OPENDATALOADER_APISERVER",
]
|
|
|
|
|
# Default value for every key listed in OPENDATALOADER_ENV_KEYS.
OPENDATALOADER_DEFAULT_CONFIG = {
    "OPENDATALOADER_APISERVER": "",
}
|