mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 23:41:12 +08:00
## Summary

- keep the native Docling chunking path when it returns usable chunks
- fall back to the standard Docling response parser when a chunked request gets HTTP 200 but returns no usable chunks
- add a regression test for older Docling servers that accept the chunking request but return a standard conversion payload

## Why

Older external Docling servers can accept a request containing `do_chunking: true` and still return the standard conversion response shape. The current code treats any HTTP 200 from the chunked request as a native chunk response, finds no chunk entries, and returns zero sections without trying the standard response parser.

Fixes #15569.

## Validation

- `python -m pytest test\unit_test\deepdoc\parser\test_docling_parser_remote.py -q`
- `python -m py_compile deepdoc\parser\docling_parser.py test\unit_test\deepdoc\parser\test_docling_parser_remote.py`
- `python -m ruff check deepdoc\parser\docling_parser.py test\unit_test\deepdoc\parser\test_docling_parser_remote.py`
- `git diff --check`
77 lines
2.4 KiB
Python
77 lines
2.4 KiB
Python
from __future__ import annotations
|
|
|
|
import importlib.util
|
|
import sys
|
|
import types
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
|
|
# Base directory used to locate deepdoc/parser/docling_parser.py on disk.
# NOTE(review): assumes this test lives at test/unit_test/deepdoc/parser/,
# which makes parents[4] the repository root -- confirm if the file is moved.
ROOT = Path(__file__).resolve().parents[4]
|
|
|
|
|
|
class _Response:
    """Minimal fake of an HTTP response carrying a canned JSON payload.

    Only the attributes defined here are available: ``status_code``,
    ``text`` and ``json()``.
    """

    # Class-level defaults: every fake response reports HTTP 200 with an
    # empty text body.
    status_code = 200
    text = ""

    def __init__(self, payload):
        """Remember *payload* so that ``json()`` can hand it back unchanged."""
        self._payload = payload

    def json(self):
        """Return the payload given at construction time (same object)."""
        return self._payload
|
|
|
|
|
|
def _load_docling_parser(monkeypatch):
    """Execute ``deepdoc/parser/docling_parser.py`` in isolation and return it.

    The parser file is loaded directly from its path under the private module
    name ``_docling_parser_under_test``. Before it is executed, stub modules
    are registered in ``sys.modules`` for ``common``, ``common.constants``,
    ``deepdoc``, ``deepdoc.parser``, ``deepdoc.parser.utils``, ``pdfplumber``,
    ``PIL`` and ``PIL.Image``. Every registration goes through
    ``monkeypatch.setitem`` so pytest restores ``sys.modules`` afterwards.

    Args:
        monkeypatch: pytest's ``monkeypatch`` fixture.

    Returns:
        The executed parser module object.

    Raises:
        ImportError: if no import spec or loader can be created for the
            parser source file.
    """
    # Stub for ``common.constants`` exposing the single constant set here.
    common_pkg = types.ModuleType("common")
    constants_mod = types.ModuleType("common.constants")
    constants_mod.MAXIMUM_PAGE_NUMBER = 1000

    # Stub ``deepdoc.parser`` package; an empty ``__path__`` marks it as a
    # package without pointing at any real directory.
    deepdoc_pkg = types.ModuleType("deepdoc")
    parser_pkg = types.ModuleType("deepdoc.parser")
    parser_pkg.__path__ = []
    utils_mod = types.ModuleType("deepdoc.parser.utils")
    utils_mod.extract_pdf_outlines = lambda _source: []

    # Stub ``PIL`` so that ``PIL.Image`` and ``PIL.Image.Image`` resolve.
    pil_pkg = types.ModuleType("PIL")
    image_mod = types.ModuleType("PIL.Image")
    image_mod.Image = object
    pil_pkg.Image = image_mod

    monkeypatch.setitem(sys.modules, "common", common_pkg)
    monkeypatch.setitem(sys.modules, "common.constants", constants_mod)
    monkeypatch.setitem(sys.modules, "deepdoc", deepdoc_pkg)
    monkeypatch.setitem(sys.modules, "deepdoc.parser", parser_pkg)
    monkeypatch.setitem(sys.modules, "deepdoc.parser.utils", utils_mod)
    monkeypatch.setitem(sys.modules, "pdfplumber", types.ModuleType("pdfplumber"))
    monkeypatch.setitem(sys.modules, "PIL", pil_pkg)
    monkeypatch.setitem(sys.modules, "PIL.Image", image_mod)

    source_path = ROOT / "deepdoc" / "parser" / "docling_parser.py"
    spec = importlib.util.spec_from_file_location(
        "_docling_parser_under_test",
        source_path,
    )
    # spec_from_file_location may return None (and a spec may lack a loader);
    # fail with a clear message instead of an AttributeError further down.
    if spec is None or spec.loader is None:
        raise ImportError(f"cannot build an import spec for {source_path}")

    module = importlib.util.module_from_spec(spec)
    # Register under the private name before executing the module body.
    monkeypatch.setitem(sys.modules, spec.name, module)
    spec.loader.exec_module(module)
    return module
|
|
|
|
|
|
@pytest.mark.p2
def test_remote_chunked_200_standard_payload_falls_back(monkeypatch):
    """A chunked request answered with a standard payload still yields sections.

    ``requests.post`` is replaced by a fake that always returns HTTP 200 with
    a standard conversion payload (``document.md_content``) and no chunk
    entries. The remote parse must return the markdown content as a single
    section with no tables, and the first request must have asked for
    chunking.
    """
    module = _load_docling_parser(monkeypatch)
    calls = []

    def fake_post(_url, json, timeout):
        # Record the request body and timeout for the assertions below.
        calls.append((json, timeout))
        return _Response({"document": {"md_content": "# Parsed\n\nbody"}})

    monkeypatch.setattr(module.requests, "post", fake_post)

    parser = module.DoclingParser(docling_server_url="http://docling.local")
    sections, tables = parser._parse_pdf_remote("sample.pdf", binary=b"%PDF", parse_method="raw")

    assert sections == [("# Parsed\n\nbody", "")]
    assert tables == []
    # Guard the index below so a missing request fails with a readable message.
    assert calls, "expected at least one POST to the Docling server"
    assert calls[0][0]["options"]["do_chunking"] is True
|