mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 23:41:12 +08:00
### What problem does this PR solve? Http API based on onnx model. pdf_parser.py to golang ### Type of change - [x] Refactoring
151 lines
4.9 KiB
Python
151 lines
4.9 KiB
Python
#!/usr/bin/env python3
|
|
"""Generate minimal stub packages for the OSS DeepDoc Docker image.
|
|
|
|
The deepdoc vision modules (ocr.py, recognizer.py, etc.) import from
``common``, ``rag``, and ``deepdoc`` at module level. In the full
RAGFlow environment these packages pull in heavy dependencies (torch,
pdfplumber, database connectors, beartype) that are not needed by the
ONNX-only inference server.

This script writes lightweight replacement modules under the directory
named by the ``STUB_TARGET`` environment variable (default: /app) so the
import chain succeeds without pulling in the full dependency tree.

Why stubs instead of conditional or lazy imports in the vision code?
The vision modules are shared between the full Python backend and the
Docker server. Keeping the stubs here avoids adding Docker-specific
guards to the shared code.
|
|
"""
|
|
|
|
import os
|
|
|
|
# Root directory the stub packages are written under; the STUB_TARGET
# environment variable overrides the default.
TARGET = os.getenv("STUB_TARGET", "/app")
|
|
|
|
|
|
def write(path: str, content: str) -> None:
    """Write one stub module below ``TARGET``.

    Args:
        path: Destination relative to ``TARGET``, e.g. ``"rag/__init__.py"``.
            Missing parent directories are created.
        content: Source text of the module. Leading newlines are stripped so
            callers can start a triple-quoted literal on its own line.

    Raises:
        OSError: If the directory or the file cannot be created.
    """
    full = os.path.join(TARGET, path)
    parent = os.path.dirname(full)
    # dirname() is empty when neither TARGET nor path has a directory part;
    # os.makedirs("") would raise, and there is nothing to create anyway.
    if parent:
        os.makedirs(parent, exist_ok=True)
    # The stub texts contain non-ASCII characters and Python decodes source
    # files as UTF-8 by default, so never fall back to the locale encoding.
    with open(full, "w", encoding="utf-8") as f:
        f.write(content.lstrip("\n"))
|
|
|
|
|
|
# ── deepdoc ────────────────────────────────────────────────────────────
# The real deepdoc/__init__.py calls beartype_this_package(), which needs
# the beartype library. The replacement below is a comment-only module.

write(
    "deepdoc/__init__.py",
    """
# Minimal deepdoc __init__ for Docker — avoids beartype dependency.
""",
)
|
|
|
|
# The real deepdoc/vision/__init__.py imports pdfplumber and
# AscendLayoutRecognizer (which requires ais_bench). The Docker server
# only needs the four ONNX-based classes re-exported by this stub.

write(
    "deepdoc/vision/__init__.py",
    """
# Minimal deepdoc.vision __init__ for Docker — avoids pdfplumber and Ascend imports.
from .ocr import OCR
from .recognizer import Recognizer
from .layout_recognizer import LayoutRecognizer4YOLOv10 as LayoutRecognizer
from .table_structure_recognizer import TableStructureRecognizer

__all__ = ["OCR", "Recognizer", "LayoutRecognizer", "TableStructureRecognizer"]
""",
)
|
|
|
|
# ── common ─────────────────────────────────────────────────────────────
# The real common.settings imports rag.utils.es_conn and other
# database/storage connectors. The server only needs PARALLEL_DEVICES for
# OCR, so the stub exposes a ``settings`` object with just that attribute.
#
# The generated code treats an empty PARALLEL_DEVICES the same as an unset
# one: int("") raises ValueError, which would otherwise abort the import of
# ``common`` when the variable is declared without a value.

write(
    "common/__init__.py",
    """
# Stub common.__init__ for Docker deepdoc service.
import os


class _Settings:
    # Unset or empty PARALLEL_DEVICES means 0.
    PARALLEL_DEVICES = int(os.environ.get("PARALLEL_DEVICES") or "0")


settings = _Settings()
""",
)
|
|
|
|
# The real common.file_utils derives the project base from __file__. In
# Docker the project root is /app, which the stub returns unless the
# RAGFLOW_PROJECT_BASE environment variable overrides it.

write(
    "common/file_utils.py",
    """
# Stub common.file_utils for Docker deepdoc service.
import os

_PROJECT_BASE = None


def get_project_base_directory(*args):
    global _PROJECT_BASE
    if _PROJECT_BASE is None:
        _PROJECT_BASE = os.environ.get("RAGFLOW_PROJECT_BASE", "/app")
    if args:
        return os.path.join(_PROJECT_BASE, *args)
    return _PROJECT_BASE
""",
)
|
|
|
|
# The real common.misc_utils imports 15+ modules. The server only calls
# pip_install_torch() inside the cuda_is_available() guard of load_model().
# On CPU-only images torch is not installed, so the try/except silently
# returns False and onnxruntime falls back to CPUExecutionProvider.
# NOTE(review): the stub below only swallows ImportError and returns None,
# so "returns False" presumably refers to the caller's guard — confirm.

write(
    "common/misc_utils.py",
    """
# Stub common.misc_utils for Docker deepdoc service.


def pip_install_torch(*args, **kwargs):
    try:
        import torch # noqa: F401
    except ImportError:
        pass
""",
)
|
|
|
|
# ── rag ────────────────────────────────────────────────────────────────
# Comment-only package marker so that ``rag`` is importable.

write(
    "rag/__init__.py",
    """
# Stub rag package for Docker deepdoc service.
""",
)
|
|
|
|
# table_structure_recognizer.py imports rag_tokenizer at module level.
# The tokenizer's tokenize/tag methods are only used by blockType() and
# construct_table(), neither of which is invoked on the TSR adapter's
# __call__() path. The stub therefore exists purely to satisfy the
# module-level import; its methods are never called at server runtime.

write(
    "rag/nlp/__init__.py",
    """
# Stub rag.nlp module for Docker deepdoc service.
# Provides minimal rag_tokenizer to satisfy table_structure_recognizer import.


class _StubTokenizer:
    def tokenize(self, text):
        return text

    def tag(self, word):
        return ""


rag_tokenizer = _StubTokenizer()
""",
)
|
|
|
|
# operators.py imports ensure_pil_image at module level and calls it in
# NormalizeImage.__call__ / ToCHWImage.__call__ (the OCR text detection
# path). The real rag.utils.lazy_image imports concat_img from rag.nlp,
# which drags in the entire NLP stack; the stub depends on PIL only.

write(
    "rag/utils/lazy_image.py",
    """
# Stub rag.utils.lazy_image for Docker.
from PIL import Image


def ensure_pil_image(img):
    if isinstance(img, Image.Image):
        return img
    return None
""",
)
|
|
|
|
|
|
# Only the confirmation message is guarded: the module-level write() calls
# run whenever this file is executed or imported.
if __name__ == "__main__":
    print(f"Docker stubs written to {TARGET}")
|