mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-07-01 00:05:43 +08:00
77 lines
2.4 KiB
Python
77 lines
2.4 KiB
Python
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
import importlib.util
|
||
|
|
import sys
|
||
|
|
import types
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
import pytest
|
||
|
|
|
||
|
|
|
||
|
|
# Directory four levels above the one that contains this file; the loader
# helper in this module resolves deepdoc/parser/docling_parser.py against it.
# NOTE(review): presumably the repository root — confirm against the test layout.
ROOT = Path(__file__).resolve().parents[4]
|
||
|
|
|
||
|
|
|
||
|
|
class _Response:
|
||
|
|
status_code = 200
|
||
|
|
text = ""
|
||
|
|
|
||
|
|
def __init__(self, payload):
|
||
|
|
self._payload = payload
|
||
|
|
|
||
|
|
def json(self):
|
||
|
|
return self._payload
|
||
|
|
|
||
|
|
|
||
|
|
def _load_docling_parser(monkeypatch):
    """Load ``deepdoc/parser/docling_parser.py`` from disk with stubbed imports.

    Placeholder modules for ``common``, ``common.constants``, ``deepdoc``,
    ``deepdoc.parser``, ``deepdoc.parser.utils``, ``pdfplumber``, ``PIL`` and
    ``PIL.Image`` are installed into ``sys.modules`` through *monkeypatch*
    before the parser source is executed under the module name
    ``_docling_parser_under_test``.

    Args:
        monkeypatch: pytest's ``monkeypatch`` fixture; every ``sys.modules``
            entry is set through it rather than assigned directly.

    Returns:
        The freshly executed parser module object.
    """

    def _stub(name, **attrs):
        # Build an empty module and attach the requested attributes to it.
        mod = types.ModuleType(name)
        for attr_name, attr_value in attrs.items():
            setattr(mod, attr_name, attr_value)
        return mod

    # PIL.Image is referenced twice: as a sys.modules entry and as PIL.Image.
    pil_image = _stub("PIL.Image", Image=object)

    stubs = {
        "common": _stub("common"),
        "common.constants": _stub("common.constants", MAXIMUM_PAGE_NUMBER=1000),
        "deepdoc": _stub("deepdoc"),
        # An (empty) __path__ makes the stub look like a package.
        "deepdoc.parser": _stub("deepdoc.parser", __path__=[]),
        "deepdoc.parser.utils": _stub(
            "deepdoc.parser.utils",
            extract_pdf_outlines=lambda _source: [],
        ),
        "pdfplumber": _stub("pdfplumber"),
        "PIL": _stub("PIL", Image=pil_image),
        "PIL.Image": pil_image,
    }
    for dotted_name, stub_module in stubs.items():
        monkeypatch.setitem(sys.modules, dotted_name, stub_module)

    source_path = ROOT / "deepdoc" / "parser" / "docling_parser.py"
    spec = importlib.util.spec_from_file_location(
        "_docling_parser_under_test",
        source_path,
    )
    loaded = importlib.util.module_from_spec(spec)
    # Register the module before executing it, as the import system would.
    monkeypatch.setitem(sys.modules, spec.name, loaded)
    spec.loader.exec_module(loaded)
    return loaded
|
||
|
|
|
||
|
|
|
||
|
|
@pytest.mark.p2
def test_remote_chunked_200_standard_payload_falls_back(monkeypatch):
    """Check ``_parse_pdf_remote`` against a 200 reply carrying ``md_content``.

    The patched ``requests.post`` always answers with a payload of the form
    ``{"document": {"md_content": ...}}``. The parser is expected to return
    that markdown as a single section with an empty tag and no tables, and
    the first request body must have ``options.do_chunking`` set to ``True``.
    NOTE(review): the "falls back" in the name presumably refers to the parser
    accepting a non-chunked payload for a chunked request — confirm against
    ``docling_parser.py``.
    """
    module = _load_docling_parser(monkeypatch)
    recorded = []

    def fake_post(_url, json, timeout):
        # Parameter names must match the keywords the parser passes.
        recorded.append((json, timeout))
        return _Response({"document": {"md_content": "# Parsed\n\nbody"}})

    monkeypatch.setattr(module.requests, "post", fake_post)

    docling = module.DoclingParser(docling_server_url="http://docling.local")
    sections, tables = docling._parse_pdf_remote(
        "sample.pdf",
        binary=b"%PDF",
        parse_method="raw",
    )

    assert sections == [("# Parsed\n\nbody", "")]
    assert tables == []

    first_body, _first_timeout = recorded[0]
    assert first_body["options"]["do_chunking"] is True
|