Refactor: migrate pdf_parser.py to golang (#16323)

### What problem does this PR solve?

Adds an HTTP API that serves the ONNX models.
Migrates pdf_parser.py to Go.

### Type of change

- [x] Refactoring
This commit is contained in:
Jack
2026-06-25 20:16:16 +08:00
committed by GitHub
parent c7052f4dd1
commit 304d9e02bb
98 changed files with 24591 additions and 8 deletions

204
deepdoc/server/README.md Normal file
View File

@@ -0,0 +1,204 @@
# OSS DeepDoc HTTP API Service
Serves DLA (Document Layout Analysis), OCR (Optical Character Recognition), and
TSR (Table Structure Recognition) models via a unified HTTP API using
[LitServe](https://github.com/Lightning-AI/litserve) and OSS ONNX Runtime models.
## Quick Start
```bash
# Build
docker build -f Dockerfile_deepdoc_oss -t deepdoc_oss:latest .
# Run (CPU only; no GPU required)
docker run -p 9390:9390 deepdoc_oss:latest
# Or via docker compose
docker compose -f docker/docker-compose.yml up -d
```
The service listens on port **9390** by default. Pass `--port` to change it:
```bash
python deepdoc/server/deepdoc_server.py --port 9000 --model-dir /path/to/models
```
## Endpoints
All prediction endpoints accept JPEG images via `multipart/form-data`. The form
field for file uploads is named `request`.
| Method | Path | Description |
|--------|------|-------------|
| `GET` | `/health` | Liveness probe. Returns `ok`. |
| `GET` | `/model` | Model metadata. Returns `{"model":"oss","version":"1.0"}`. |
| `POST` | `/predict/dla` | Document Layout Analysis. |
| `POST` | `/predict/tsr` | Table Structure Recognition. |
| `POST` | `/predict/ocr` | OCR — use form field `operator=det` for detection or `operator=rec` for recognition. |
### `POST /predict/dla`
Analyzes a full page image and returns labelled layout regions.
**Request**
```
curl -X POST http://localhost:9390/predict/dla \
-F "request=@page.jpg;type=image/jpeg"
```
**Response**
```json
{
"bboxes": [
[x0, y0, x1, y1, score, class_id],
...
]
}
```
| class_id | Label |
|:--------:|-------|
| 0 | title |
| 1 | text |
| 2 | reference |
| 3 | figure |
| 4 | figure caption |
| 5 | table |
| 6 | table caption |
| 8 | equation |
> The OSS model uses 8 unique class IDs. IDs 7 and 9 are reserved for
> compatibility with the SaaS label scheme but are never produced by the
> OSS model.
### `POST /predict/tsr`
Recognizes table structure from a cropped table image.
**Request**
```
curl -X POST http://localhost:9390/predict/tsr \
-F "request=@table_crop.jpg;type=image/jpeg"
```
**Response**
```json
{
"bboxes": [
[x0, y0, x1, y1, score, class_id],
...
]
}
```
| class_id | Label |
|:--------:|-------|
| 0 | table |
| 1 | table column |
| 2 | table row |
| 3 | table column header |
| 4 | table projected row header |
| 5 | table spanning cell |
### `POST /predict/ocr`
Two modes controlled by the `operator` form field.
#### Detection (`operator=det`)
Returns quadrilateral bounding boxes for detected text regions.
```
curl -X POST "http://localhost:9390/predict/ocr" \
-F "operator=det" \
-F "request=@page.jpg;type=image/jpeg"
```
**Response** (5-level nested array):
```json
{
  "output": [
    [
      [
        [[x0,y0],[x1,y1],[x2,y2],[x3,y3]],
        ...
      ]
    ]
  ]
}
```
#### Recognition (`operator=rec`)
Recognizes text within a cropped region.
```
curl -X POST "http://localhost:9390/predict/ocr" \
-F "operator=rec" \
-F "request=@char_crop.jpg;type=image/jpeg"
```
**Response** (4-level nested array):
```json
{
"output": [
[
[
["recognized text", 1.0],
...
]
]
]
}
```
> Confidence is always `1.0` — the OSS recognition model does not return
> per-character confidence scores.
## Error Responses
| Scenario | HTTP Status |
|----------|:-----------:|
| Missing `operator` field (OCR) | 400 |
| Invalid `operator` value | 400 |
| Empty or corrupt image | 400 |
| Image exceeds 4096×4096 | 400 |
| Internal inference error | 500 |
## Models
All ONNX models are from the [InfiniFlow/deepdoc](https://huggingface.co/InfiniFlow/deepdoc)
HuggingFace repository (Apache 2.0 license):
| File | Size | Purpose |
|------|------|---------|
| `layout.onnx` | 75.7 MB | DLA (YOLOv10) |
| `det.onnx` | 4.7 MB | OCR text detection (PP-OCRv4) |
| `rec.onnx` | 10.8 MB | OCR text recognition (PP-OCRv4) |
| `tsr.onnx` | 12.2 MB | TSR (PaddleDetection) |
| `ocr.res` | 26 KB | OCR character dictionary |
## Architecture
```
deepdoc/server/
├── deepdoc_server.py # LitServe entry point
├── endpoints/ # LitAPI endpoints (HTTP layer)
│ ├── dla_endpoint.py
│ ├── tsr_endpoint.py
│ └── ocr_endpoint.py
└── adapters/ # Model wrappers (inference + format conversion)
├── dla_adapter.py
├── tsr_adapter.py
└── ocr_adapter.py
```
Endpoints → Adapters → `deepdoc/vision/` (reused OSS model classes) → ONNX Runtime.

View File

View File

@@ -0,0 +1,80 @@
"""DLA adapter — wraps LayoutRecognizer and converts output to wire format."""
import io
import logging
from typing import List
from PIL import Image
from deepdoc.vision import LayoutRecognizer
logger = logging.getLogger(__name__)
# Wire-format class IDs for the OSS layout labels. Each ID is the label's
# index in the Go-side table (internal/parser/deepdoc.go):
#   var dlaClassLabels = []string{
#       "title", "text", "reference", "figure", "figure caption",
#       "table", "table caption", "table caption", "equation", "figure caption",
#   }
# Index 7 repeats "table caption" (index 6) and index 9 repeats
# "figure caption" (index 4). Every label is mapped to its first index, so
# IDs 7 and 9 are never emitted.
def _first_index_by_label(labels):
    """Map each label to the index of its first occurrence in *labels*."""
    mapping = {}
    for position, label in enumerate(labels):
        mapping.setdefault(label, position)
    return mapping


DLA_CLASS_MAP = _first_index_by_label((
    "title",
    "text",
    "reference",
    "figure",
    "figure caption",
    "table",
    "table caption",
    "table caption",
    "equation",
    "figure caption",
))
class DLAAdapter:
    """Run layout analysis on one image and convert the result to wire format.

    The adapter calls ``LayoutRecognizer.forward()`` and turns each detected
    region into ``[x0, y0, x1, y1, score, class_id]``, with every element a
    Python float so the payload is homogeneous.
    """

    def __init__(self, model_dir: str, thr: float = 0.2):
        """Store configuration; the model itself is created by :meth:`load`.

        Args:
            model_dir: Model directory supplied by the server.
            thr: Score threshold forwarded to the recognizer.
        """
        # NOTE(review): model_dir is stored but not passed to LayoutRecognizer
        # in load() — confirm the recognizer locates its weights on its own.
        self.model_dir = model_dir
        self.thr = thr
        self._layouter: LayoutRecognizer | None = None

    def load(self):
        """Initialize the layout recognizer. Called once per worker."""
        self._layouter = LayoutRecognizer("layout")

    @staticmethod
    def _clamp(value, upper) -> float:
        """Clamp *value* into ``[0, upper]`` and always return a float.

        ``upper`` comes from ``PIL.Image.size`` and is an int; converting it
        keeps the result a float even when the upper bound wins.
        """
        return max(0.0, min(float(value), float(upper)))

    def __call__(self, image_data: bytes) -> List[List[float]]:
        """Detect layout regions in an encoded image.

        Args:
            image_data: JPEG image bytes.

        Returns:
            List of ``[x0, y0, x1, y1, score, class_id]`` for each detected
            layout region whose label is present in ``DLA_CLASS_MAP``.

        Raises:
            RuntimeError: If :meth:`load` has not been called.
        """
        if self._layouter is None:
            raise RuntimeError("DLAAdapter.load() must be called before inference")

        img = Image.open(io.BytesIO(image_data)).convert("RGB")
        width, height = img.size

        # forward() returns raw Recognizer output (no OCR integration); a
        # single-image batch yields a single result list.
        raw_bboxes = self._layouter.forward([img], thr=self.thr, batch_size=1)[0]

        result = []
        for b in raw_bboxes:
            label = b["type"].lower()
            class_id = DLA_CLASS_MAP.get(label)
            if class_id is None:
                logger.warning("DLA: unknown label '%s', skipping", label)
                continue

            x0, y0, x1, y1 = b["bbox"]
            result.append([
                # Keep boxes inside the image bounds.
                self._clamp(x0, width),
                self._clamp(y0, height),
                self._clamp(x1, width),
                self._clamp(y1, height),
                float(b["score"]),
                float(class_id),
            ])
        return result

View File

@@ -0,0 +1,103 @@
"""OCR adapter — wraps OCR model and converts output to wire format.
Two modes:
- detect: 5-level nested JSON matching Go [][][][][]float64
- rec: 4-level nested JSON matching Go [][][][]any
"""
import logging
from typing import Any, Dict
import cv2
import numpy as np
from deepdoc.vision.ocr import OCR
logger = logging.getLogger(__name__)
# Placeholder confidence paired with every recognized text in the "rec"
# response. OSS recognize_batch does not return confidence scores, so this
# constant keeps the [text, confidence] wire shape intact.
_CONFIDENCE_FILL = 1.0
class OCRAdapter:
    """Call ``OCR.detect()`` / ``OCR.recognize_batch()`` and emit wire format."""

    def __init__(self, model_dir: str):
        """Store configuration; the model itself is created by :meth:`load`.

        Args:
            model_dir: Model directory supplied by the server.
        """
        # NOTE(review): model_dir is stored but not passed to OCR() in load()
        # — confirm the OCR class locates its weights on its own.
        self.model_dir = model_dir
        self._ocr: OCR | None = None

    def load(self):
        """Initialize the OCR model. Called once per worker."""
        self._ocr = OCR()

    def close(self):
        """Release OCR model resources; safe to call more than once.

        Cleanup is best-effort: a failure while closing one component is
        logged (instead of being silently discarded) and does not prevent the
        other component from being closed or the adapter from being reset.
        """
        ocr, self._ocr = self._ocr, None
        if ocr is None:
            return

        log = logging.getLogger(__name__)
        for attr in ("detector", "text_recognizer"):
            try:
                component = getattr(ocr, attr, None)
                if component is not None:
                    component.close()
            except Exception:
                log.warning("OCR: failed to close %s", attr, exc_info=True)

    def detect(self, image_data: bytes) -> Dict[str, Any]:
        """Run text detection.

        Args:
            image_data: Encoded image bytes.

        Returns:
            ``{"output": 5-level nested list}`` matching Go ``[][][][][]float64``.

        Raises:
            RuntimeError: If :meth:`load` has not been called.
            ValueError: If the image bytes cannot be decoded.
        """
        if self._ocr is None:
            raise RuntimeError("OCRAdapter.load() must be called before inference")

        img = self._decode_bgr(image_data)

        # OCR.detect() → [(quad_ndarray, ("", 0)), ...]
        det_result = self._ocr.detect(img)

        # tolist() gives [[x0,y0],[x1,y1],[x2,y2],[x3,y3]]; coordinates are
        # converted to plain Python floats for JSON compatibility.
        quads = [
            [[float(p[0]), float(p[1])] for p in quad_ndarray.tolist()]
            for quad_ndarray, _ in det_result
        ]

        # Two wrapper lists around the list of quads give five list levels in
        # total: wrapper, wrapper, quads, quad, point.
        return {"output": [[quads]]}

    def recognize(self, image_data: bytes) -> Dict[str, Any]:
        """Run text recognition on a cropped text region.

        Args:
            image_data: Encoded image bytes of the cropped region.

        Returns:
            ``{"output": 4-level nested list}`` matching Go ``[][][][]any``.

        Raises:
            RuntimeError: If :meth:`load` has not been called.
            ValueError: If the image bytes cannot be decoded.
        """
        if self._ocr is None:
            raise RuntimeError("OCRAdapter.load() must be called before inference")

        img = self._decode_bgr(image_data)

        # OCR.recognize_batch() returns List[str]; a single cropped image is
        # passed as a batch of one.
        texts = self._ocr.recognize_batch([img])
        items = [[text, _CONFIDENCE_FILL] for text in texts]

        # Two wrapper lists around the items give four list levels in total:
        # wrapper, wrapper, items, [text, confidence] pair.
        return {"output": [[items]]}

    @staticmethod
    def _decode_bgr(data: bytes) -> np.ndarray:
        """Decode encoded image bytes to a BGR numpy array (OCR expects BGR).

        Raises:
            ValueError: If OpenCV cannot decode the bytes.
        """
        arr = np.frombuffer(data, np.uint8)
        img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
        if img is None:
            raise ValueError("Failed to decode image")
        return img

View File

@@ -0,0 +1,75 @@
"""TSR adapter — wraps TableStructureRecognizer and converts output to wire format."""
import io
import logging
from typing import List
from PIL import Image
from deepdoc.vision.table_structure_recognizer import TableStructureRecognizer
logger = logging.getLogger(__name__)
# Wire-format class IDs for the OSS table-structure labels. The label strings
# are identical to the Go-side table (internal/parser/deepdoc.go):
#   var tsrLabels = []string{
#       "table", "table column", "table row",
#       "table column header", "table projected row header",
#       "table spanning cell",
#   }
# so each class ID is simply the label's position in that list.
TSR_CLASS_MAP = {
    label: class_id
    for class_id, label in enumerate((
        "table",
        "table column",
        "table row",
        "table column header",
        "table projected row header",
        "table spanning cell",
    ))
}
class TSRAdapter:
    """Run table structure recognition and convert elements to wire format.

    Each recognized element becomes ``[x0, y0, x1, y1, score, class_id]``,
    with every element a Python float so the payload is homogeneous.
    """

    def __init__(self, model_dir: str, thr: float = 0.2):
        """Store configuration; the model itself is created by :meth:`load`.

        Args:
            model_dir: Model directory supplied by the server.
            thr: Score threshold forwarded to the recognizer.
        """
        # NOTE(review): model_dir is stored but not passed to
        # TableStructureRecognizer() in load() — confirm the recognizer
        # locates its weights on its own.
        self.model_dir = model_dir
        self.thr = thr
        self._tsr: TableStructureRecognizer | None = None

    def load(self):
        """Initialize the TSR model. Called once per worker."""
        self._tsr = TableStructureRecognizer()

    @staticmethod
    def _clamp(value, upper) -> float:
        """Clamp *value* into ``[0, upper]`` and always return a float.

        ``upper`` comes from ``PIL.Image.size`` and is an int; converting it
        keeps the result a float even when the upper bound wins.
        """
        return max(0.0, min(float(value), float(upper)))

    def __call__(self, image_data: bytes) -> List[List[float]]:
        """Recognize the structure of a cropped table image.

        Args:
            image_data: JPEG image bytes (cropped table region).

        Returns:
            List of ``[x0, y0, x1, y1, score, class_id]`` for each structural
            element whose label is present in ``TSR_CLASS_MAP``.

        Raises:
            RuntimeError: If :meth:`load` has not been called.
        """
        if self._tsr is None:
            raise RuntimeError("TSRAdapter.load() must be called before inference")

        img = Image.open(io.BytesIO(image_data)).convert("RGB")
        width, height = img.size

        tables = self._tsr([img], thr=self.thr)

        result = []
        for tbl_elements in tables:
            for elem in tbl_elements:
                label = elem["label"]
                class_id = TSR_CLASS_MAP.get(label)
                if class_id is None:
                    logger.warning("TSR: unknown label '%s', skipping", label)
                    continue

                result.append([
                    # Keep boxes inside the image bounds.
                    self._clamp(elem["x0"], width),
                    self._clamp(elem["top"], height),
                    self._clamp(elem["x1"], width),
                    self._clamp(elem["bottom"], height),
                    float(elem["score"]),
                    float(class_id),
                ])
        return result

View File

@@ -0,0 +1,105 @@
#!/usr/bin/env python3
"""Unified OSS DeepDoc Model Server.
Serves DLA, OCR, and TSR models via LitServe using OSS ONNX Runtime models.
Endpoints:
POST /predict/dla — Document Layout Analysis
POST /predict/ocr — OCR (form field operator=det to detect, operator=rec to recognize)
POST /predict/tsr — Table Structure Recognition
GET /health — Health check
"""
import argparse
import logging
import os
import litserve as ls
from deepdoc.server.endpoints.dla_endpoint import DLAEndpoint
from deepdoc.server.endpoints.ocr_endpoint import OCREndpoint
from deepdoc.server.endpoints.tsr_endpoint import TSREndpoint
# Configure root logging at import time with INFO as the initial level;
# main() later resets the root level from the --log-level option.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
# Module-level logger used by main() for startup messages.
logger = logging.getLogger(__name__)
def parse_args():
    """Parse the command-line options of the DeepDoc model server.

    Returns:
        argparse.Namespace with ``port``, ``timeout``, ``model_dir``,
        ``disable_dla``, ``disable_ocr``, ``disable_tsr`` and ``log_level``.
    """
    # This file lives in <root>/deepdoc/server/, so the project root is two
    # directories up; the bundled models live under <root>/rag/res/deepdoc.
    # (Three ".." components would resolve to a directory above the root.)
    default_model_dir = os.path.join(
        os.path.dirname(__file__), "..", "..", "rag", "res", "deepdoc"
    )

    parser = argparse.ArgumentParser(
        description="Unified OSS DeepDoc Model Server",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--port", type=int, default=9390, help="Serving port (default: 9390)"
    )
    parser.add_argument(
        "--timeout", type=int, default=100, help="Request timeout in seconds (default: 100)"
    )
    parser.add_argument(
        "--model-dir",
        type=str,
        default=default_model_dir,
        help="Model file directory",
    )
    parser.add_argument(
        "--disable-dla", action="store_true", dest="disable_dla", default=False,
        help="Disable DLA endpoint"
    )
    parser.add_argument(
        "--disable-ocr", action="store_true", dest="disable_ocr", default=False,
        help="Disable OCR endpoint"
    )
    parser.add_argument(
        "--disable-tsr", action="store_true", dest="disable_tsr", default=False,
        help="Disable TSR endpoint"
    )
    parser.add_argument("--log-level", type=str, default="INFO", help="Logging level")
    return parser.parse_args()
def main():
    """Build the enabled endpoints and run the LitServe server.

    Reads the command-line options, applies the requested log level, creates
    one endpoint per model that has not been disabled, registers the ``/model``
    metadata route and starts serving.

    Raises:
        SystemExit: With status 1 when every endpoint has been disabled, so
            that supervisors see the misconfiguration as a failure.
    """
    args = parse_args()

    # Only accept names that resolve to a numeric logging level; other
    # attributes of the logging module (e.g. BASIC_FORMAT) would make
    # setLevel() raise.
    level = getattr(logging, args.log_level.upper(), None)
    if not isinstance(level, int):
        logger.warning("Unknown log level '%s', falling back to INFO", args.log_level)
        level = logging.INFO
    logging.getLogger().setLevel(level)

    model_dir = os.path.abspath(args.model_dir)
    logger.info("Model directory: %s", model_dir)

    apis = []
    if not args.disable_dla:
        apis.append(DLAEndpoint(model_dir=model_dir))
        logger.info("DLA endpoint enabled")
    if not args.disable_ocr:
        apis.append(OCREndpoint(model_dir=model_dir))
        logger.info("OCR endpoint enabled")
    if not args.disable_tsr:
        apis.append(TSREndpoint(model_dir=model_dir))
        logger.info("TSR endpoint enabled")

    if not apis:
        logger.error("No endpoints enabled")
        raise SystemExit(1)

    server = ls.LitServer(
        lit_api=apis,
        accelerator="cpu",
        workers_per_device=1,
        timeout=args.timeout,
        restart_workers=True,
    )

    # /model — returns OSS model metadata (no LitServe path conflict)
    @server.app.get("/model")
    async def model_info():
        return {"model": "oss", "version": "1.0"}

    logger.info("Starting server on port %d...", args.port)
    server.run(port=args.port)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,150 @@
#!/usr/bin/env python3
"""Generate minimal stub packages for the OSS DeepDoc Docker image.
The deepdoc vision modules (ocr.py, recognizer.py, etc.) import from
``common``, ``rag``, and ``deepdoc`` at module level. In the full
RAGFlow environment these packages pull in heavy dependencies (torch,
pdfplumber, database connectors, beartype) that are not needed by the
ONNX-only inference server.
This script writes lightweight replacement modules under /app so the
import chain succeeds without pulling in the full dependency tree.
Why stubs instead of conditionally lazy imports in the vision code?
The vision modules are shared between the full Python backend and the
Docker server. Keeping the stubs here avoids adding Docker-specific
guards to the shared code.
"""
import os
# Root directory the stub tree is written under; override with STUB_TARGET.
TARGET = os.environ.get("STUB_TARGET", "/app")


def write(path: str, content: str) -> None:
    """Write one stub module below ``TARGET``, creating parent directories.

    Args:
        path: File path relative to ``TARGET``.
        content: Module source; leading newlines are stripped so callers can
            start a triple-quoted literal on its own line.
    """
    full = os.path.join(TARGET, path)
    os.makedirs(os.path.dirname(full), exist_ok=True)
    # The stub sources contain non-ASCII characters (em dashes in comments).
    # Python reads source files as UTF-8, so write UTF-8 explicitly instead
    # of relying on the locale's default encoding.
    with open(full, "w", encoding="utf-8") as f:
        f.write(content.lstrip("\n"))
# Each write() call below emits the source of one replacement module. Bodies
# of classes, functions and compound statements inside the literals are
# indented so that the generated files are valid Python.

# ── deepdoc ────────────────────────────────────────────────────────────
# Real deepdoc/__init__.py calls beartype_this_package() which requires
# the beartype library.
write("deepdoc/__init__.py", """
# Minimal deepdoc __init__ for Docker — avoids beartype dependency.
""")

# Real deepdoc/vision/__init__.py imports pdfplumber and
# AscendLayoutRecognizer (requires ais_bench). The Docker server only
# needs the four ONNX-based classes below.
write("deepdoc/vision/__init__.py", """
# Minimal deepdoc.vision __init__ for Docker — avoids pdfplumber and Ascend imports.
from .ocr import OCR
from .recognizer import Recognizer
from .layout_recognizer import LayoutRecognizer4YOLOv10 as LayoutRecognizer
from .table_structure_recognizer import TableStructureRecognizer
__all__ = ["OCR", "Recognizer", "LayoutRecognizer", "TableStructureRecognizer"]
""")

# ── common ─────────────────────────────────────────────────────────────
# Real common.settings imports rag.utils.es_conn and other database/storage
# connectors. The server only needs PARALLEL_DEVICES for OCR.
write("common/__init__.py", """
# Stub common.__init__ for Docker deepdoc service.
import os
class _Settings:
    PARALLEL_DEVICES = int(os.environ.get("PARALLEL_DEVICES", "0"))
settings = _Settings()
""")

# Real common.file_utils derives the project base from __file__. In
# Docker the project root is always /app.
write("common/file_utils.py", """
# Stub common.file_utils for Docker deepdoc service.
import os
_PROJECT_BASE = None
def get_project_base_directory(*args):
    global _PROJECT_BASE
    if _PROJECT_BASE is None:
        _PROJECT_BASE = os.environ.get("RAGFLOW_PROJECT_BASE", "/app")
    if args:
        return os.path.join(_PROJECT_BASE, *args)
    return _PROJECT_BASE
""")

# Real common.misc_utils imports 15+ modules. The server only calls
# pip_install_torch() inside load_model()'s cuda_is_available() guard.
# On CPU-only images torch is not installed, so the try/except silently
# returns False and onnxruntime falls back to CPUExecutionProvider.
write("common/misc_utils.py", """
# Stub common.misc_utils for Docker deepdoc service.
def pip_install_torch(*args, **kwargs):
    try:
        import torch  # noqa: F401
    except ImportError:
        pass
""")

# ── rag ────────────────────────────────────────────────────────────────
write("rag/__init__.py", """
# Stub rag package for Docker deepdoc service.
""")

# table_structure_recognizer.py imports rag_tokenizer at module level.
# Its tokenize/tag methods are only called from blockType() /
# construct_table(), which are NOT invoked by the TSR adapter's
# __call__() path. The stub exists solely to satisfy the module-level
# import; its methods are never called at server runtime.
write("rag/nlp/__init__.py", """
# Stub rag.nlp module for Docker deepdoc service.
# Provides minimal rag_tokenizer to satisfy table_structure_recognizer import.
class _StubTokenizer:
    def tokenize(self, text):
        return text
    def tag(self, word):
        return ""
rag_tokenizer = _StubTokenizer()
""")

# operators.py imports ensure_pil_image at module level and calls it in
# NormalizeImage.__call__ / ToCHWImage.__call__ (OCR text detection path).
# The real rag.utils.lazy_image imports concat_img from rag.nlp, pulling
# in the entire NLP stack.
write("rag/utils/lazy_image.py", """
# Stub rag.utils.lazy_image for Docker.
from PIL import Image
def ensure_pil_image(img):
    if isinstance(img, Image.Image):
        return img
    return None
""")

# The stubs above are written whenever this module is executed or imported;
# the confirmation message is printed only when run as a script.
if __name__ == "__main__":
    print(f"Docker stubs written to {TARGET}")

View File

@@ -0,0 +1,47 @@
#!/usr/bin/env python3
"""Download OSS DeepDoc ONNX models from HuggingFace."""
import os
import sys
# HuggingFace repository that hosts the OSS DeepDoc artifacts.
REPO_ID = "InfiniFlow/deepdoc"
# Files fetched by main(), in download order: four ONNX models plus the
# "ocr.res" resource file.
FILES = [
    "layout.onnx",
    "det.onnx",
    "rec.onnx",
    "tsr.onnx",
    "ocr.res",
]
def main():
    """Fetch every entry of ``FILES`` from ``REPO_ID`` into a local directory.

    The destination is the first command-line argument, or ``models`` when
    none is given; it is created if needed. Files already present are left
    alone. The hub endpoint can be overridden with ``HF_ENDPOINT``. Exits with
    status 1 when ``huggingface_hub`` is not installed.
    """
    cli_args = sys.argv[1:]
    dest = cli_args[0] if cli_args else "models"
    os.makedirs(dest, exist_ok=True)

    try:
        from huggingface_hub import hf_hub_download as fetch
    except ImportError:
        print("ERROR: huggingface_hub not installed. Run: pip install huggingface_hub")
        sys.exit(1)

    endpoint = os.environ.get("HF_ENDPOINT", "https://huggingface.co")

    for name in FILES:
        if os.path.exists(os.path.join(dest, name)):
            print(f" SKIP {name} (already exists)")
            continue

        print(f" DOWNLOAD {name} ...")
        fetch(
            repo_id=REPO_ID,
            filename=name,
            local_dir=dest,
            endpoint=endpoint,
        )
        print(f" OK {name}")

    print(f"\nAll models downloaded to {os.path.abspath(dest)}")


if __name__ == "__main__":
    main()

View File

View File

@@ -0,0 +1,43 @@
"""DLA LitServe endpoint."""
import logging
import litserve as ls
from deepdoc.server.adapters.dla_adapter import DLAAdapter
logger = logging.getLogger(__name__)
class DLAEndpoint(ls.LitAPI):
    """Document Layout Analysis endpoint at /predict/dla."""

    # Largest accepted upload, in bytes (50 MB).
    _MAX_IMAGE_BYTES = 50 * 1024 * 1024

    def __init__(self, model_dir: str, thr: float = 0.2):
        """Store configuration; the model is loaded later in :meth:`setup`.

        Args:
            model_dir: Model directory handed to the adapter.
            thr: Score threshold handed to the adapter.
        """
        super().__init__()
        self.api_path = "/predict/dla"
        self.model_dir = model_dir
        self.thr = thr
        self.adapter: DLAAdapter | None = None

    def setup(self, device):
        """Create the adapter and load the model."""
        self.adapter = DLAAdapter(model_dir=self.model_dir, thr=self.thr)
        self.adapter.load()
        logger.info("DLA model loaded")

    def decode_request(self, request):
        """Extract the uploaded image bytes from the incoming request.

        Args:
            request: Either an upload object exposing ``.file`` or a form
                mapping whose ``request`` field holds the upload.

        Returns:
            The raw image bytes.

        Raises:
            ValueError: If the ``request`` file field is missing, the body is
                empty, or the payload exceeds the size limit.
        """
        # Handle both Starlette UploadFile (old) and FormData (Starlette >=1.3)
        if hasattr(request, "file"):
            upload = request
        else:
            upload = request.get("request")
            # A missing or non-file field would otherwise surface as an
            # AttributeError instead of a validation error.
            if upload is None or not hasattr(upload, "file"):
                raise ValueError("Missing 'request' file field")

        data = upload.file.read()
        if not data:
            raise ValueError("Empty request body")
        if len(data) > self._MAX_IMAGE_BYTES:
            raise ValueError("Image too large")
        return data

    def predict(self, image_data: bytes):
        """Run layout analysis on the decoded image bytes."""
        return self.adapter(image_data)

    def encode_response(self, output):
        """Wrap the adapter output in the ``{"bboxes": ...}`` envelope."""
        return {"bboxes": output}

View File

@@ -0,0 +1,67 @@
"""OCR LitServe endpoint — detect + rec via operator form field."""
import logging
import litserve as ls
from deepdoc.server.adapters.ocr_adapter import OCRAdapter
logger = logging.getLogger(__name__)
class OCREndpoint(ls.LitAPI):
    """OCR endpoint at /predict/ocr.

    Form field 'operator' (det or rec) selects the mode.
    Form field 'request' carries the JPEG image bytes.
    """

    # Largest accepted upload, in bytes (50 MB).
    _MAX_IMAGE_BYTES = 50 * 1024 * 1024

    def __init__(self, model_dir: str):
        """Store configuration; the model is loaded later in :meth:`setup`.

        Args:
            model_dir: Model directory handed to the adapter.
        """
        super().__init__()
        self.api_path = "/predict/ocr"
        self.model_dir = model_dir
        self.adapter: OCRAdapter | None = None

    def setup(self, device):
        """Create the adapter and load the model."""
        self.adapter = OCRAdapter(model_dir=self.model_dir)
        self.adapter.load()
        logger.info("OCR model loaded")

    def decode_request(self, request):
        """Extract the operator and the uploaded image bytes.

        Args:
            request: Either an upload object exposing ``.file`` or a form
                mapping with ``request`` (file) and ``operator`` fields.

        Returns:
            Tuple ``(operator, data)`` where operator is ``"det"`` or ``"rec"``.

        Raises:
            ValueError: If the ``request`` file field is missing, the body is
                empty, the payload exceeds the size limit, or the operator is
                missing or not one of ``det`` / ``rec``.
        """
        # Handle both old Starlette UploadFile and new Starlette FormData
        if hasattr(request, "file"):
            upload = request
            # A bare upload carries no form fields, so the operator is looked
            # up in the query string of the underlying request context.
            # NOTE(review): nothing in this class assigns self._request; if it
            # is never provided, this path always yields an empty operator
            # and the request is rejected below — confirm.
            http_request = getattr(self, "_request", None)
            if http_request is not None:
                operator = http_request.query_params.get("operator", "")
            else:
                operator = ""
        else:
            # FormData: get file and operator form fields
            upload = request.get("request")
            # A missing or non-file field would otherwise surface as an
            # AttributeError instead of a validation error.
            if upload is None or not hasattr(upload, "file"):
                raise ValueError("Missing 'request' file field")
            op_val = request.get("operator")
            operator = str(op_val) if op_val else ""

        data = upload.file.read()
        if not data:
            raise ValueError("Empty request body")
        if len(data) > self._MAX_IMAGE_BYTES:
            raise ValueError("Image too large")

        operator = operator.strip().lower()
        if operator not in ("det", "rec"):
            raise ValueError(
                f"Invalid or missing operator '{operator}' (must be 'det' or 'rec')"
            )
        return operator, data

    def predict(self, inputs: tuple):
        """Dispatch to detection or recognition based on the operator."""
        operator, image_data = inputs
        if operator == "det":
            return self.adapter.detect(image_data)
        return self.adapter.recognize(image_data)

    def encode_response(self, output):
        """Return the adapter output unchanged; it is already wire format."""
        return output

View File

@@ -0,0 +1,43 @@
"""TSR LitServe endpoint."""
import logging
import litserve as ls
from deepdoc.server.adapters.tsr_adapter import TSRAdapter
logger = logging.getLogger(__name__)
class TSREndpoint(ls.LitAPI):
    """Table Structure Recognition endpoint at /predict/tsr."""

    # Largest accepted upload, in bytes (50 MB).
    _MAX_IMAGE_BYTES = 50 * 1024 * 1024

    def __init__(self, model_dir: str, thr: float = 0.2):
        """Store configuration; the model is loaded later in :meth:`setup`.

        Args:
            model_dir: Model directory handed to the adapter.
            thr: Score threshold handed to the adapter.
        """
        super().__init__()
        self.api_path = "/predict/tsr"
        self.model_dir = model_dir
        self.thr = thr
        self.adapter: TSRAdapter | None = None

    def setup(self, device):
        """Create the adapter and load the model."""
        self.adapter = TSRAdapter(model_dir=self.model_dir, thr=self.thr)
        self.adapter.load()
        logger.info("TSR model loaded")

    def decode_request(self, request):
        """Extract the uploaded image bytes from the incoming request.

        Args:
            request: Either an upload object exposing ``.file`` or a form
                mapping whose ``request`` field holds the upload.

        Returns:
            The raw image bytes.

        Raises:
            ValueError: If the ``request`` file field is missing, the body is
                empty, or the payload exceeds the size limit.
        """
        # Handle both Starlette UploadFile (old) and FormData (Starlette >=1.3)
        if hasattr(request, "file"):
            upload = request
        else:
            upload = request.get("request")
            # A missing or non-file field would otherwise surface as an
            # AttributeError instead of a validation error.
            if upload is None or not hasattr(upload, "file"):
                raise ValueError("Missing 'request' file field")

        data = upload.file.read()
        if not data:
            raise ValueError("Empty request body")
        if len(data) > self._MAX_IMAGE_BYTES:
            raise ValueError("Image too large")
        return data

    def predict(self, image_data: bytes):
        """Run table structure recognition on the decoded image bytes."""
        return self.adapter(image_data)

    def encode_response(self, output):
        """Wrap the adapter output in the ``{"bboxes": ...}`` envelope."""
        return {"bboxes": output}

View File

@@ -0,0 +1,20 @@
[project]
name = "deepdoc-server-oss"
version = "0.1.0"
description = "OSS DeepDoc Server with DLA, OCR, and TSR models via ONNX Runtime"
requires-python = ">=3.11,<3.13"
dependencies = [
"litserve>=0.2.17",
"onnxruntime>=1.20.0",
"opencv-python-headless",
"numpy",
"pillow",
"pyclipper>=1.4.0",
"python-multipart",
"shapely",
"six",
]
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"