fix: fall back from empty Docling native chunks (#15601)

## Summary
- keep the native Docling chunking path when it returns usable chunks
- fall back to the standard Docling response parser when a chunked
request gets HTTP 200 but returns no usable chunks
- add a regression test for older Docling servers that accept the
chunking request but return a standard conversion payload

## Why
Older external Docling servers can accept a request containing
`do_chunking: true` and still return the standard conversion response
shape. The current code treats any HTTP 200 from the chunked request as
a native chunk response, finds no chunk entries, and returns zero
sections without trying the standard response parser.

Fixes #15569.

## Validation
- `python -m pytest
test\unit_test\deepdoc\parser\test_docling_parser_remote.py -q`
- `python -m py_compile deepdoc\parser\docling_parser.py
test\unit_test\deepdoc\parser\test_docling_parser_remote.py`
- `python -m ruff check deepdoc\parser\docling_parser.py
test\unit_test\deepdoc\parser\test_docling_parser_remote.py`
- `git diff --check`
This commit is contained in:
Yufeng He
2026-06-04 13:42:58 +08:00
committed by GitHub
parent 01a5598aa5
commit 5db1b296fb
2 changed files with 80 additions and 1 deletions

View File

@@ -477,7 +477,10 @@ class DoclingParser(RAGFlowPdfParser):
if callback:
callback(0.95, f"[Docling] Native chunks received: {len(sections)}")
return sections, tables
if sections:
return sections, tables
self.logger.warning("[Docling] Native chunking returned no usable chunks; trying standard response parsing.")
# --- FALLBACK: Standard RAGFlow parsing for older docling servers ---
docs = self._extract_remote_document_entries(response_json)

View File

@@ -0,0 +1,76 @@
from __future__ import annotations
import importlib.util
import sys
import types
from pathlib import Path
import pytest
# Repository root, used to locate deepdoc/parser/docling_parser.py on disk.
# Assumes this test file lives at test/unit_test/deepdoc/parser/, so the
# root is four levels above the file's own directory (parents[0]).
ROOT = Path(__file__).resolve().parents[4]
class _Response:
status_code = 200
text = ""
def __init__(self, payload):
self._payload = payload
def json(self):
return self._payload
def _load_docling_parser(monkeypatch) -> types.ModuleType:
    """Load ``deepdoc/parser/docling_parser.py`` in isolation and return the module.

    The parser source file is executed directly from its path under ``ROOT``
    rather than imported through its package. Before that, lightweight stub
    modules are installed in ``sys.modules`` for ``common``, ``deepdoc``,
    ``pdfplumber`` and ``PIL`` so the parser's own imports resolve to them.

    Args:
        monkeypatch: pytest's ``monkeypatch`` fixture. All ``sys.modules``
            entries are set through it, so they are undone after the test.

    Returns:
        The freshly executed module object, registered in ``sys.modules``
        as ``_docling_parser_under_test``.

    Raises:
        ImportError: if no import spec or loader can be built for the file.
    """
    # --- Stub modules standing in for the parser's import-time dependencies ---
    common_pkg = types.ModuleType("common")
    constants_mod = types.ModuleType("common.constants")
    constants_mod.MAXIMUM_PAGE_NUMBER = 1000

    deepdoc_pkg = types.ModuleType("deepdoc")
    parser_pkg = types.ModuleType("deepdoc.parser")
    # A module needs a __path__ attribute to be treated as a package.
    parser_pkg.__path__ = []
    utils_mod = types.ModuleType("deepdoc.parser.utils")
    # Outline extraction is stubbed to always report "no outlines".
    utils_mod.extract_pdf_outlines = lambda _source: []

    pil_pkg = types.ModuleType("PIL")
    image_mod = types.ModuleType("PIL.Image")
    image_mod.Image = object
    pil_pkg.Image = image_mod

    stub_modules = {
        "common": common_pkg,
        "common.constants": constants_mod,
        "deepdoc": deepdoc_pkg,
        "deepdoc.parser": parser_pkg,
        "deepdoc.parser.utils": utils_mod,
        "pdfplumber": types.ModuleType("pdfplumber"),
        "PIL": pil_pkg,
        "PIL.Image": image_mod,
    }
    for module_name, stub in stub_modules.items():
        monkeypatch.setitem(sys.modules, module_name, stub)

    # --- Execute the real parser source under a private module name ---
    parser_path = ROOT / "deepdoc" / "parser" / "docling_parser.py"
    spec = importlib.util.spec_from_file_location(
        "_docling_parser_under_test",
        parser_path,
    )
    # spec_from_file_location may return None, and a spec may have no loader;
    # fail with a clear message instead of an AttributeError further down.
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot build an import spec for {parser_path}")

    module = importlib.util.module_from_spec(spec)
    # Register the module before executing it so that any lookup of the
    # module by name during its own execution succeeds.
    monkeypatch.setitem(sys.modules, spec.name, module)
    spec.loader.exec_module(module)
    return module
@pytest.mark.p2
def test_remote_chunked_200_standard_payload_falls_back(monkeypatch):
    """A chunked request answered with a standard payload must still yield sections.

    The fake server replies HTTP 200 with a standard conversion payload
    (``document.md_content``) and no native chunk entries. The parser is
    expected to fall back to standard response parsing and return the
    markdown content as a single section, with no tables.
    """
    module = _load_docling_parser(monkeypatch)
    calls = []

    def fake_post(_url, json, timeout):
        # Record the request body and timeout so the test can inspect them.
        calls.append((json, timeout))
        return _Response({"document": {"md_content": "# Parsed\n\nbody"}})

    monkeypatch.setattr(module.requests, "post", fake_post)

    parser = module.DoclingParser(docling_server_url="http://docling.local")
    sections, tables = parser._parse_pdf_remote("sample.pdf", binary=b"%PDF", parse_method="raw")

    assert sections == [("# Parsed\n\nbody", "")]
    assert tables == []
    # Fail with a readable message rather than an IndexError if nothing was sent.
    assert calls, "expected the parser to POST to the Docling server"
    # The first request must have asked the server for native chunking.
    assert calls[0][0]["options"]["do_chunking"] is True