ragflow/deepdoc/server/docker_stubs.py

#!/usr/bin/env python3
"""Generate minimal stub packages for the OSS DeepDoc Docker image.

The deepdoc vision modules (ocr.py, recognizer.py, etc.) import from
``common``, ``rag``, and ``deepdoc`` at module level.  In the full
RAGFlow environment these packages pull in heavy dependencies (torch,
pdfplumber, database connectors, beartype) that are not needed by the
ONNX-only inference server.

This script writes lightweight replacement modules under /app (or the
directory named by the STUB_TARGET environment variable) so the import
chain succeeds without pulling in the full dependency tree.

Why stubs instead of conditionally lazy imports in the vision code?
The vision modules are shared between the full Python backend and the
Docker server.  Keeping the stubs here avoids adding Docker-specific
guards to the shared code.
"""

import os

# Root directory that every stub path is joined onto.  Defaults to /app;
# set the STUB_TARGET environment variable to write somewhere else.
TARGET = os.environ.get("STUB_TARGET", "/app")


def write(path: str, content: str) -> None:
    """Write one stub module to ``TARGET/path``, creating parent directories.

    Args:
        path: Destination relative to ``TARGET``, e.g. ``"rag/nlp/__init__.py"``.
        content: Source text of the stub.  Leading newlines are stripped so
            callers can start a triple-quoted literal on its own line.

    Raises:
        OSError: If a directory cannot be created or the file cannot be
            written.
    """
    full = os.path.join(TARGET, path)
    parent = os.path.dirname(full)
    # os.makedirs("") raises FileNotFoundError; a destination with no
    # directory component has no parent that needs creating.
    if parent:
        os.makedirs(parent, exist_ok=True)
    # The stub sources contain non-ASCII characters and Python reads source
    # files as UTF-8, so pin the encoding instead of inheriting whatever
    # locale the build environment happens to have.
    with open(full, "w", encoding="utf-8") as f:
        f.write(content.lstrip("\n"))


# ── deepdoc ────────────────────────────────────────────────────────────
# Real deepdoc/__init__.py calls beartype_this_package() which requires
# the beartype library.  The replacement below contains nothing but a
# comment, so importing ``deepdoc`` executes no code.

write("deepdoc/__init__.py", """
# Minimal deepdoc __init__ for Docker — avoids beartype dependency.
""")

# Real deepdoc/vision/__init__.py imports pdfplumber and
# AscendLayoutRecognizer (requires ais_bench).  The Docker server only
# needs the four ONNX-based classes below.
#
# The stub re-exports LayoutRecognizer4YOLOv10 under the name
# LayoutRecognizer, so ``from deepdoc.vision import LayoutRecognizer``
# resolves to the YOLOv10 class.  __all__ lists exactly the four
# re-exported names.

write("deepdoc/vision/__init__.py", """
# Minimal deepdoc.vision __init__ for Docker — avoids pdfplumber and Ascend imports.
from .ocr import OCR
from .recognizer import Recognizer
from .layout_recognizer import LayoutRecognizer4YOLOv10 as LayoutRecognizer
from .table_structure_recognizer import TableStructureRecognizer

__all__ = ["OCR", "Recognizer", "LayoutRecognizer", "TableStructureRecognizer"]
""")

# ── common ─────────────────────────────────────────────────────────────
# Real common.settings imports rag.utils.es_conn and other database/storage
# connectors.  The server only needs PARALLEL_DEVICES for OCR.
#
# The stub exposes ``settings`` as an attribute of the ``common`` package
# (an instance of a small class), not as a ``common.settings`` submodule.
# PARALLEL_DEVICES is read from the environment once, when the stub is
# first imported, and defaults to 0.
# NOTE(review): an empty or non-integer PARALLEL_DEVICES value makes int()
# raise ValueError at import time — confirm that failing fast is intended.

write("common/__init__.py", """
# Stub common.__init__ for Docker deepdoc service.
import os


class _Settings:
    PARALLEL_DEVICES = int(os.environ.get("PARALLEL_DEVICES", "0"))


settings = _Settings()
""")

# Real common.file_utils derives the project base from __file__.  In
# Docker the project root is always /app.
#
# The stub resolves the base on the first call from the
# RAGFLOW_PROJECT_BASE environment variable (default /app) and caches it in
# a module global, so later changes to the variable are not picked up.
# Positional arguments, if any, are joined onto the base with os.path.join.

write("common/file_utils.py", """
# Stub common.file_utils for Docker deepdoc service.
import os

_PROJECT_BASE = None


def get_project_base_directory(*args):
    global _PROJECT_BASE
    if _PROJECT_BASE is None:
        _PROJECT_BASE = os.environ.get("RAGFLOW_PROJECT_BASE", "/app")
    if args:
        return os.path.join(_PROJECT_BASE, *args)
    return _PROJECT_BASE
""")

# Real common.misc_utils imports 15+ modules.  The server only calls
# pip_install_torch() inside load_model()'s cuda_is_available() guard.
# On CPU-only images torch is not installed, so the try/except silently
# returns False and onnxruntime falls back to CPUExecutionProvider.
#
# The stub never installs anything: it accepts and ignores any arguments,
# attempts ``import torch``, swallows ImportError, and returns None in
# both cases.
# NOTE(review): the "returns False" above is presumably the caller's
# guard, not this stub — confirm against load_model().

write("common/misc_utils.py", """
# Stub common.misc_utils for Docker deepdoc service.


def pip_install_torch(*args, **kwargs):
    try:
        import torch  # noqa: F401
    except ImportError:
        pass
""")

# ── rag ────────────────────────────────────────────────────────────────
# The stub rag/__init__.py contains only a comment, so importing ``rag``
# executes no code.

write("rag/__init__.py", """
# Stub rag package for Docker deepdoc service.
""")

# table_structure_recognizer.py imports rag_tokenizer at module level.
# Its tokenize/tag methods are only called from blockType() /
# construct_table(), which are NOT invoked by the TSR adapter's
# __call__() path.  The stub exists solely to satisfy the module-level
# import; its methods are never called at server runtime.
#
# Should they be called anyway, the stub's tokenize() returns its input
# unchanged and tag() returns an empty string.  ``rag_tokenizer`` is a
# module-level instance defined in the rag.nlp package __init__.

write("rag/nlp/__init__.py", """
# Stub rag.nlp module for Docker deepdoc service.
# Provides minimal rag_tokenizer to satisfy table_structure_recognizer import.


class _StubTokenizer:
    def tokenize(self, text):
        return text

    def tag(self, word):
        return ""


rag_tokenizer = _StubTokenizer()
""")

# operators.py imports ensure_pil_image at module level and calls it in
# NormalizeImage.__call__ / ToCHWImage.__call__ (OCR text detection path).
# The real rag.utils.lazy_image imports concat_img from rag.nlp, pulling
# in the entire NLP stack.
#
# The stub depends only on Pillow: it returns PIL images unchanged and
# returns None for any other input.
# NOTE(review): this script writes no rag/utils/__init__.py, so
# ``rag.utils`` is presumably imported as an implicit namespace package —
# confirm that is intended.

write("rag/utils/lazy_image.py", """
# Stub rag.utils.lazy_image for Docker.
from PIL import Image


def ensure_pil_image(img):
    if isinstance(img, Image.Image):
        return img
    return None
""")


# The stub files are produced by the module-level write() calls, which run
# whenever this file is executed or imported.  This guard only controls
# whether the confirmation message is printed.
if __name__ == "__main__":
    print(f"Docker stubs written to {TARGET}")