@staticmethod
def _is_http_endpoint_valid(url: str, timeout: int = 5) -> bool:
    """Probe *url* and report whether it answers with an acceptable status.

    The probe issues a ``HEAD`` request first and, if that does not yield an
    acceptable answer, retries with ``GET``. Falling back on *any*
    unsuccessful ``HEAD`` (not only on an exception) matters because some
    servers reject ``HEAD`` with 405/501 even though the resource exists.

    Args:
        url: Absolute URL to probe.
        timeout: Per-request timeout in seconds, forwarded to ``requests``.

    Returns:
        ``True`` if either request completes with status 200, 301, 302, 307
        or 308; ``False`` otherwise. The function never raises.
    """
    acceptable_statuses = (200, 301, 302, 307, 308)
    for send in (requests.head, requests.get):
        try:
            response = send(url, timeout=timeout, allow_redirects=True)
        except Exception:
            # Deliberately broad: this is a best-effort reachability probe
            # whose contract is "return a bool", so connection errors,
            # timeouts and malformed URLs all mean "try the next verb".
            continue
        if response.status_code in acceptable_statuses:
            return True
    return False
def _parse_pdf_remote(
    self,
    filepath: str | PathLike[str],
    binary: BytesIO | bytes | None = None,
    callback: Optional[Callable] = None,
    *,
    parse_method: str = "raw",
    docling_server_url: Optional[str] = None,
    request_timeout: Optional[int] = None,
):
    """Convert a PDF through an external Docling server and return its text.

    The PDF is base64-encoded and POSTed to ``/v1/convert/source``; if that
    request fails or returns a status >= 300, the same content is POSTed to
    ``/v1alpha/convert/source`` using the ``file_sources`` payload shape.

    Args:
        filepath: Path of the PDF. Read from disk only when *binary* is
            ``None``; otherwise used just to derive the uploaded filename.
        binary: In-memory PDF content (``bytes``/``bytearray`` or a
            ``BytesIO``). Takes precedence over *filepath*.
        callback: Optional progress callback invoked as
            ``callback(progress, message)``.
        parse_method: Forwarded to ``_sections_from_remote_text`` to select
            the tuple shape of each returned section.
        docling_server_url: Server base URL; resolved through
            ``_effective_server_url``.
        request_timeout: Per-request timeout in seconds. Falsy values fall
            back to ``self.request_timeout``.

    Returns:
        ``(sections, tables)``. ``tables`` is always an empty list on this
        code path.

    Raises:
        RuntimeError: No server URL is configured, every endpoint failed, or
            the response contains no document entries.
        FileNotFoundError: *binary* is ``None`` and *filepath* does not exist.
    """
    server_url = self._effective_server_url(docling_server_url)
    if not server_url:
        raise RuntimeError("[Docling] DOCLING_SERVER_URL is not configured.")

    timeout = request_timeout or self.request_timeout

    if binary is None:
        src_path = Path(filepath)
        if not src_path.exists():
            raise FileNotFoundError(f"PDF not found: {src_path}")
        pdf_bytes = src_path.read_bytes()
    elif isinstance(binary, (bytes, bytearray)):
        pdf_bytes = bytes(binary)
    else:
        pdf_bytes = bytes(binary.getbuffer())

    if callback:
        callback(0.2, f"[Docling] Requesting external server: {server_url}")

    # When the content arrives via *binary*, the path is only a naming hint
    # and may be empty/None; do not let Path() raise on it.
    filename = (Path(filepath).name if filepath else "") or "input.pdf"
    b64 = base64.b64encode(pdf_bytes).decode("ascii")
    options = {
        "from_formats": ["pdf"],
        "to_formats": ["json", "md", "text"],
    }
    # Each entry pairs an endpoint with the request-body shape sent to it;
    # endpoints are tried in order and the first success wins.
    attempts = (
        (
            "/v1/convert/source",
            {
                "options": dict(options),
                "sources": [
                    {
                        "kind": "file",
                        "filename": filename,
                        "base64_string": b64,
                    }
                ],
            },
        ),
        (
            "/v1alpha/convert/source",
            {
                "options": dict(options),
                "file_sources": [
                    {
                        "filename": filename,
                        "base64_string": b64,
                    }
                ],
            },
        ),
    )

    errors = []
    response_json = None
    for endpoint, payload in attempts:
        try:
            resp = requests.post(
                f"{server_url}{endpoint}",
                json=payload,
                timeout=timeout,
            )
            if resp.status_code < 300:
                response_json = resp.json()
                break
            errors.append(f"{endpoint}: HTTP {resp.status_code} {resp.text[:300]}")
        except Exception as exc:
            # Collect the failure (transport error or undecodable JSON) and
            # move on to the next endpoint; all failures are reported together.
            errors.append(f"{endpoint}: {exc}")

    if response_json is None:
        raise RuntimeError("[Docling] remote convert failed: " + " | ".join(errors))

    docs = self._extract_remote_document_entries(response_json)
    if not docs:
        raise RuntimeError("[Docling] remote response does not contain parsed documents.")

    sections: list[tuple[str, ...]] = []
    tables = []
    for doc in docs:
        # Preference order per document: markdown, plain text, then markdown
        # nested under json_content. The choice is made per document so that
        # one document's content never suppresses another's fallback.
        candidates = [doc.get("md_content"), doc.get("text_content")]
        json_content = doc.get("json_content")
        if isinstance(json_content, dict):
            candidates.append(json_content.get("md_content"))
        text = next(
            (c for c in candidates if isinstance(c, str) and c.strip()),
            None,
        )
        if text is not None:
            sections.extend(self._sections_from_remote_text(text, parse_method=parse_method))

    if callback:
        callback(0.95, f"[Docling] Remote sections: {len(sections)}")
    return sections, tables
+567,24 @@ RAGFlow supports MinerU's `vlm-http-client` backend, enabling you to delegate do When using the `vlm-http-client` backend, the RAGFlow server requires no GPU, only network connectivity. This enables cost-effective distributed deployment with multiple RAGFlow instances sharing one remote vLLM server. ::: +### How to use an external Docling Serve server for document parsing? + +RAGFlow supports Docling in two modes: + +1. **Local Docling** (existing mode): install Docling in the RAGFlow runtime (`USE_DOCLING=true`) and parse in-process. +2. **External Docling Serve** (remote mode): point RAGFlow to a Docling Serve endpoint. + +To enable remote mode, set: + +```bash +DOCLING_SERVER_URL=http://your-docling-serve-host:5001 +``` + +Behavior: + +- When `DOCLING_SERVER_URL` is set, RAGFlow sends PDFs to Docling Serve using `/v1/convert/source` (and falls back to `/v1alpha/convert/source` for older servers). +- When `DOCLING_SERVER_URL` is not set, RAGFlow uses local in-process Docling. + ### How to use PaddleOCR for document parsing? From v0.24.0 onwards, RAGFlow includes PaddleOCR as an optional PDF parser. Please note that RAGFlow acts only as a *remote client* for PaddleOCR, calling the PaddleOCR API to parse PDFs and reading the returned files. diff --git a/docs/guides/agent/agent_component_reference/parser.md b/docs/guides/agent/agent_component_reference/parser.md index cdc0a9e175..75b6341cb2 100644 --- a/docs/guides/agent/agent_component_reference/parser.md +++ b/docs/guides/agent/agent_component_reference/parser.md @@ -65,6 +65,12 @@ Starting from v0.22.0, RAGFlow includes MinerU (≥ 2.6.3) as an optional PDF p - If you decide to use a chunking method from the **Built-in** dropdown, ensure it supports PDF parsing, then select **MinerU** from the **PDF parser** dropdown. - If you use a custom ingestion pipeline instead, select **MinerU** in the **PDF parser** section of the **Parser** component. 
+To use an external Docling Serve instance (instead of local in-process Docling), set: + +- `DOCLING_SERVER_URL`: The Docling Serve API endpoint (for example, `http://docling-host:5001`). + +When `DOCLING_SERVER_URL` is set, RAGFlow sends PDF content to Docling Serve (`/v1/convert/source`, with fallback to `/v1alpha/convert/source`) and ingests the returned markdown/text. If the variable is not set, RAGFlow keeps using local Docling (`USE_DOCLING=true` + installed package) behavior. + :::note All MinerU environment variables are optional. When set, these values are used to auto-provision a MinerU OCR model for the tenant on first use. To avoid auto-provisioning, skip the environment variable settings and only configure MinerU from the **Model providers** page in the UI. ::: diff --git a/docs/guides/dataset/select_pdf_parser.md b/docs/guides/dataset/select_pdf_parser.md index fa2d068cb4..d96992f5af 100644 --- a/docs/guides/dataset/select_pdf_parser.md +++ b/docs/guides/dataset/select_pdf_parser.md @@ -65,6 +65,12 @@ Starting from v0.22.0, RAGFlow includes MinerU (≥ 2.6.3) as an optional PDF p - If you decide to use a chunking method from the **Built-in** dropdown, ensure it supports PDF parsing, then select **MinerU** from the **PDF parser** dropdown. - If you use a custom ingestion pipeline instead, select **MinerU** in the **PDF parser** section of the **Parser** component. +To use an external Docling Serve instance (instead of local in-process Docling), set: + +- `DOCLING_SERVER_URL`: The Docling Serve API endpoint (for example, `http://docling-host:5001`). + +When `DOCLING_SERVER_URL` is set, RAGFlow sends PDF content to Docling Serve (`/v1/convert/source`, with fallback to `/v1alpha/convert/source`) and ingests the returned markdown/text. If the variable is not set, RAGFlow keeps using local Docling (`USE_DOCLING=true` + installed package) behavior. + :::note All MinerU environment variables are optional. 
When set, these values are used to auto-provision a MinerU OCR model for the tenant on first use. To avoid auto-provisioning, skip the environment variable settings and only configure MinerU from the **Model providers** page in the UI. ::: diff --git a/rag/app/naive.py b/rag/app/naive.py index 1d2d0ebbf7..3eec55df03 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -153,15 +153,17 @@ def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese parse_method = kwargs.get("parse_method", "raw") if not pdf_parser.check_installation(): - callback(-1, "Docling not found.") + if callback: + callback(-1, "Docling not found.") return None, None, pdf_parser sections, tables = pdf_parser.parse_pdf( filepath=filename, binary=binary, callback=callback, - output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""), - delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))), + output_dir=os.environ.get("DOCLING_OUTPUT_DIR", ""), + delete_output=bool(int(os.environ.get("DOCLING_DELETE_OUTPUT", 1))), + docling_server_url=os.environ.get("DOCLING_SERVER_URL", ""), parse_method=parse_method, ) return sections, tables, pdf_parser diff --git a/rag/flow/parser/parser.py b/rag/flow/parser/parser.py index 9ee6994841..3f779e252c 100644 --- a/rag/flow/parser/parser.py +++ b/rag/flow/parser/parser.py @@ -32,6 +32,7 @@ from common import settings from common.constants import LLMType from common.misc_utils import get_uuid from deepdoc.parser import ExcelParser +from deepdoc.parser.docling_parser import DoclingParser from deepdoc.parser.pdf_parser import PlainParser, RAGFlowPdfParser, VisionParser from deepdoc.parser.tcadp_parser import TCADPParser from rag.app.naive import Docx @@ -173,7 +174,7 @@ class ParserParam(ProcessParamBase): pdf_parse_method = pdf_config.get("parse_method", "") self.check_empty(pdf_parse_method, "Parse method abnormal.") - if pdf_parse_method.lower() not in ["deepdoc", "plain_text", "mineru", "tcadp parser", "paddleocr"]: + if 
pdf_parse_method.lower() not in ["deepdoc", "plain_text", "mineru", "docling", "tcadp parser", "paddleocr"]: self.check_empty(pdf_config.get("lang", ""), "PDF VLM language") pdf_output_format = pdf_config.get("output_format", "") @@ -371,6 +372,29 @@ class Parser(ProcessBase): "text": t, } bboxes.append(box) + elif parse_method.lower() == "docling": + pdf_parser = DoclingParser(docling_server_url=os.environ.get("DOCLING_SERVER_URL", "")) + lines, _ = pdf_parser.parse_pdf( + filepath=name, + binary=blob, + callback=self.callback, + parse_method=conf.get("docling_parse_method", "raw"), + docling_server_url=os.environ.get("DOCLING_SERVER_URL", ""), + ) + bboxes = [] + for item in lines: + if not isinstance(item, tuple) or not item: + continue + text = item[0] + poss = item[-1] if len(item) >= 2 else "" + box = { + "text": text, + "image": pdf_parser.crop(poss, 1) if isinstance(poss, str) and poss else None, + "positions": [[pos[0][-1], *pos[1:]] for pos in pdf_parser.extract_positions(poss)] + if isinstance(poss, str) and poss + else [], + } + bboxes.append(box) elif parse_method.lower() == "tcadp parser": # ADP is a document parsing tool using Tencent Cloud API table_result_type = conf.get("table_result_type", "1")