Files
ragflow/test/unit_test/deepdoc/parser/test_docling_parser_remote.py

77 lines
2.4 KiB
Python
Raw Permalink Normal View History

from __future__ import annotations
import importlib.util
import sys
import types
from pathlib import Path
import pytest
# Fifth ancestor of this file's resolved path (parents[0] is the containing
# directory); the parser source is later located at ROOT/deepdoc/parser/.
ROOT = Path(__file__).resolve().parents[4]
class _Response:
status_code = 200
text = ""
def __init__(self, payload):
self._payload = payload
def json(self):
return self._payload
def _load_docling_parser(monkeypatch):
    """Execute ``deepdoc/parser/docling_parser.py`` in isolation and return it.

    Stub modules for ``common``, ``common.constants``, ``deepdoc``,
    ``deepdoc.parser``, ``deepdoc.parser.utils``, ``pdfplumber``, ``PIL`` and
    ``PIL.Image`` are registered in ``sys.modules`` via
    ``monkeypatch.setitem`` before the parser source file is executed.
    Presumably these cover the imports the parser performs at load time --
    confirm against the parser module.

    Args:
        monkeypatch: pytest's ``monkeypatch`` fixture, used for every
            ``sys.modules`` registration made here.

    Returns:
        The module object created from the parser source file, registered
        under the name ``_docling_parser_under_test``.
    """

    def _stub(name, **attrs):
        # Build an empty module called *name* and attach the given attributes.
        mod = types.ModuleType(name)
        for attr, value in attrs.items():
            setattr(mod, attr, value)
        return mod

    # ``PIL.Image`` is created first because the ``PIL`` stub exposes it as
    # its ``Image`` attribute.
    pil_image = _stub("PIL.Image", Image=object)
    stubs = (
        _stub("common"),
        _stub("common.constants", MAXIMUM_PAGE_NUMBER=1000),
        _stub("deepdoc"),
        _stub("deepdoc.parser", __path__=[]),
        _stub("deepdoc.parser.utils", extract_pdf_outlines=lambda _source: []),
        _stub("pdfplumber"),
        _stub("PIL", Image=pil_image),
        pil_image,
    )
    for stub in stubs:
        monkeypatch.setitem(sys.modules, stub.__name__, stub)

    parser_path = ROOT / "deepdoc" / "parser" / "docling_parser.py"
    spec = importlib.util.spec_from_file_location(
        "_docling_parser_under_test",
        parser_path,
    )
    loaded = importlib.util.module_from_spec(spec)
    # Register the module under its spec name before executing its source.
    monkeypatch.setitem(sys.modules, spec.name, loaded)
    spec.loader.exec_module(loaded)
    return loaded
@pytest.mark.p2
def test_remote_chunked_200_standard_payload_falls_back(monkeypatch):
    """Check ``_parse_pdf_remote`` against a 200 reply with a standard payload.

    ``requests.post`` on the loaded parser module is replaced by a fake that
    records each call's ``json`` and ``timeout`` arguments and answers with a
    ``document.md_content`` payload. The test then checks the returned
    sections and tables, and that the first recorded request asked for
    chunking.
    """
    docling = _load_docling_parser(monkeypatch)
    recorded = []

    def fake_post(_url, json, timeout):
        # Keep what was sent so the request options can be inspected below.
        recorded.append((json, timeout))
        return _Response({"document": {"md_content": "# Parsed\n\nbody"}})

    monkeypatch.setattr(docling.requests, "post", fake_post)

    parser = docling.DoclingParser(docling_server_url="http://docling.local")
    sections, tables = parser._parse_pdf_remote(
        "sample.pdf",
        binary=b"%PDF",
        parse_method="raw",
    )

    assert sections == [("# Parsed\n\nbody", "")]
    assert tables == []

    first_request_json, _first_timeout = recorded[0]
    assert first_request_json["options"]["do_chunking"] is True