From 5db1b296fbea33ddf5bca78d9d69afeab403cfb3 Mon Sep 17 00:00:00 2001
From: Yufeng He <40085740+he-yufeng@users.noreply.github.com>
Date: Thu, 4 Jun 2026 13:42:58 +0800
Subject: [PATCH] fix: fall back from empty Docling native chunks (#15601)

## Summary
- keep the native Docling chunking path when it returns usable chunks
- fall back to the standard Docling response parser when a chunked request gets HTTP 200 but returns no usable chunks
- add a regression test for older Docling servers that accept the chunking request but return a standard conversion payload

## Why
Older external Docling servers can accept a request containing `do_chunking: true` and still return the standard conversion response shape. The current code treats any HTTP 200 from the chunked request as a native chunk response, finds no chunk entries, and returns zero sections without trying the standard response parser.

Fixes #15569.

## Validation
- `python -m pytest test\\unit_test\\deepdoc\\parser\\test_docling_parser_remote.py -q`
- `python -m py_compile deepdoc\\parser\\docling_parser.py test\\unit_test\\deepdoc\\parser\\test_docling_parser_remote.py`
- `python -m ruff check deepdoc\\parser\\docling_parser.py test\\unit_test\\deepdoc\\parser\\test_docling_parser_remote.py`
- `git diff --check`
---
 deepdoc/parser/docling_parser.py         |  5 +-
 .../parser/test_docling_parser_remote.py | 76 +++++++++++++++++++
 2 files changed, 80 insertions(+), 1 deletion(-)
 create mode 100644 test/unit_test/deepdoc/parser/test_docling_parser_remote.py

diff --git a/deepdoc/parser/docling_parser.py b/deepdoc/parser/docling_parser.py
index 948a7acb0c..097e9c9045 100644
--- a/deepdoc/parser/docling_parser.py
+++ b/deepdoc/parser/docling_parser.py
@@ -477,7 +477,10 @@ class DoclingParser(RAGFlowPdfParser):
 
             if callback:
                 callback(0.95, f"[Docling] Native chunks received: {len(sections)}")
-            return sections, tables
+            if sections:
+                return sections, tables
+
+            self.logger.warning("[Docling] Native chunking returned no usable chunks; trying standard response parsing.")
 
         # --- FALLBACK: Standard RAGFlow parsing for older docling servers ---
         docs = self._extract_remote_document_entries(response_json)
diff --git a/test/unit_test/deepdoc/parser/test_docling_parser_remote.py b/test/unit_test/deepdoc/parser/test_docling_parser_remote.py
new file mode 100644
index 0000000000..e3410c65e2
--- /dev/null
+++ b/test/unit_test/deepdoc/parser/test_docling_parser_remote.py
@@ -0,0 +1,76 @@
+from __future__ import annotations
+
+import importlib.util
+import sys
+import types
+from pathlib import Path
+
+import pytest
+
+
+ROOT = Path(__file__).resolve().parents[4]
+
+
+class _Response:
+    status_code = 200
+    text = ""
+
+    def __init__(self, payload):
+        self._payload = payload
+
+    def json(self):
+        return self._payload
+
+
+def _load_docling_parser(monkeypatch):
+    common_pkg = types.ModuleType("common")
+    constants_mod = types.ModuleType("common.constants")
+    constants_mod.MAXIMUM_PAGE_NUMBER = 1000
+
+    deepdoc_pkg = types.ModuleType("deepdoc")
+    parser_pkg = types.ModuleType("deepdoc.parser")
+    parser_pkg.__path__ = []
+    utils_mod = types.ModuleType("deepdoc.parser.utils")
+    utils_mod.extract_pdf_outlines = lambda _source: []
+
+    pil_pkg = types.ModuleType("PIL")
+    image_mod = types.ModuleType("PIL.Image")
+    image_mod.Image = object
+    pil_pkg.Image = image_mod
+
+    monkeypatch.setitem(sys.modules, "common", common_pkg)
+    monkeypatch.setitem(sys.modules, "common.constants", constants_mod)
+    monkeypatch.setitem(sys.modules, "deepdoc", deepdoc_pkg)
+    monkeypatch.setitem(sys.modules, "deepdoc.parser", parser_pkg)
+    monkeypatch.setitem(sys.modules, "deepdoc.parser.utils", utils_mod)
+    monkeypatch.setitem(sys.modules, "pdfplumber", types.ModuleType("pdfplumber"))
+    monkeypatch.setitem(sys.modules, "PIL", pil_pkg)
+    monkeypatch.setitem(sys.modules, "PIL.Image", image_mod)
+
+    spec = importlib.util.spec_from_file_location(
+        "_docling_parser_under_test",
+        ROOT / "deepdoc" / "parser" / "docling_parser.py",
+    )
+    module = importlib.util.module_from_spec(spec)
+    monkeypatch.setitem(sys.modules, spec.name, module)
+    spec.loader.exec_module(module)
+    return module
+
+
+@pytest.mark.p2
+def test_remote_chunked_200_standard_payload_falls_back(monkeypatch):
+    module = _load_docling_parser(monkeypatch)
+    calls = []
+
+    def fake_post(_url, json, timeout):
+        calls.append((json, timeout))
+        return _Response({"document": {"md_content": "# Parsed\n\nbody"}})
+
+    monkeypatch.setattr(module.requests, "post", fake_post)
+
+    parser = module.DoclingParser(docling_server_url="http://docling.local")
+    sections, tables = parser._parse_pdf_remote("sample.pdf", binary=b"%PDF", parse_method="raw")
+
+    assert sections == [("# Parsed\n\nbody", "")]
+    assert tables == []
+    assert calls[0][0]["options"]["do_chunking"] is True