mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 23:41:12 +08:00
fix: fall back from empty Docling native chunks (#15601)
## Summary

- keep the native Docling chunking path when it returns usable chunks
- fall back to the standard Docling response parser when a chunked request gets HTTP 200 but returns no usable chunks
- add a regression test for older Docling servers that accept the chunking request but return a standard conversion payload

## Why

Older external Docling servers can accept a request containing `do_chunking: true` and still return the standard conversion response shape. The current code treats any HTTP 200 from the chunked request as a native chunk response, finds no chunk entries, and returns zero sections without trying the standard response parser.

Fixes #15569.

## Validation

- `python -m pytest test\\unit_test\\deepdoc\\parser\\test_docling_parser_remote.py -q`
- `python -m py_compile deepdoc\\parser\\docling_parser.py test\\unit_test\\deepdoc\\parser\\test_docling_parser_remote.py`
- `python -m ruff check deepdoc\\parser\\docling_parser.py test\\unit_test\\deepdoc\\parser\\test_docling_parser_remote.py`
- `git diff --check`
This commit is contained in:
@@ -477,7 +477,10 @@ class DoclingParser(RAGFlowPdfParser):
|
||||
|
||||
if callback:
|
||||
callback(0.95, f"[Docling] Native chunks received: {len(sections)}")
|
||||
return sections, tables
|
||||
if sections:
|
||||
return sections, tables
|
||||
|
||||
self.logger.warning("[Docling] Native chunking returned no usable chunks; trying standard response parsing.")
|
||||
|
||||
# --- FALLBACK: Standard RAGFlow parsing for older docling servers ---
|
||||
docs = self._extract_remote_document_entries(response_json)
|
||||
|
||||
76
test/unit_test/deepdoc/parser/test_docling_parser_remote.py
Normal file
76
test/unit_test/deepdoc/parser/test_docling_parser_remote.py
Normal file
@@ -0,0 +1,76 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import importlib.util
|
||||
import sys
|
||||
import types
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
# Directory four levels above this test file's own directory; used below as the
# base for locating deepdoc/parser/docling_parser.py.
ROOT = Path(__file__).resolve().parents[4]
|
||||
|
||||
|
||||
class _Response:
|
||||
status_code = 200
|
||||
text = ""
|
||||
|
||||
def __init__(self, payload):
|
||||
self._payload = payload
|
||||
|
||||
def json(self):
|
||||
return self._payload
|
||||
|
||||
|
||||
def _load_docling_parser(monkeypatch):
    """Execute ``deepdoc/parser/docling_parser.py`` from ``ROOT`` and return the module.

    Before the file is executed, placeholder modules for ``common``,
    ``common.constants``, ``deepdoc``, ``deepdoc.parser``,
    ``deepdoc.parser.utils``, ``pdfplumber``, ``PIL`` and ``PIL.Image`` are
    registered in ``sys.modules`` through ``monkeypatch`` (so pytest restores
    the previous entries afterwards). Presumably this lets the parser module's
    imports resolve without the real packages -- confirm against the parser's
    import block.

    Args:
        monkeypatch: pytest's ``monkeypatch`` fixture.

    Returns:
        The freshly executed module, registered as ``_docling_parser_under_test``.
    """

    def _stub(name, **attributes):
        # Build an empty module object and attach the requested attributes.
        placeholder = types.ModuleType(name)
        for attr_name, attr_value in attributes.items():
            setattr(placeholder, attr_name, attr_value)
        return placeholder

    pil_image_stub = _stub("PIL.Image", Image=object)

    # Insertion order is the registration order.
    placeholders = {
        "common": _stub("common"),
        "common.constants": _stub("common.constants", MAXIMUM_PAGE_NUMBER=1000),
        "deepdoc": _stub("deepdoc"),
        "deepdoc.parser": _stub("deepdoc.parser", __path__=[]),
        "deepdoc.parser.utils": _stub(
            "deepdoc.parser.utils",
            extract_pdf_outlines=lambda _source: [],
        ),
        "pdfplumber": _stub("pdfplumber"),
        "PIL": _stub("PIL", Image=pil_image_stub),
        "PIL.Image": pil_image_stub,
    }
    for module_name, placeholder in placeholders.items():
        monkeypatch.setitem(sys.modules, module_name, placeholder)

    source_path = ROOT / "deepdoc" / "parser" / "docling_parser.py"
    spec = importlib.util.spec_from_file_location("_docling_parser_under_test", source_path)
    loaded = importlib.util.module_from_spec(spec)
    # Register under the spec name before executing the module body.
    monkeypatch.setitem(sys.modules, spec.name, loaded)
    spec.loader.exec_module(loaded)
    return loaded
|
||||
|
||||
|
||||
@pytest.mark.p2
def test_remote_chunked_200_standard_payload_falls_back(monkeypatch):
    """A chunked request answered with a standard payload must still yield sections.

    The patched ``requests.post`` always answers HTTP 200 with a standard
    conversion payload (a ``document`` entry carrying ``md_content``), and the
    test checks that ``_parse_pdf_remote`` returns that markdown as one section.
    """
    docling = _load_docling_parser(monkeypatch)
    recorded = []

    def fake_post(_url, json, timeout):
        # Remember what was sent so the request options can be inspected below.
        recorded.append((json, timeout))
        return _Response({"document": {"md_content": "# Parsed\n\nbody"}})

    monkeypatch.setattr(docling.requests, "post", fake_post)

    remote_parser = docling.DoclingParser(docling_server_url="http://docling.local")
    sections, tables = remote_parser._parse_pdf_remote(
        "sample.pdf",
        binary=b"%PDF",
        parse_method="raw",
    )

    assert sections == [("# Parsed\n\nbody", "")]
    assert tables == []

    # The first request must have asked the server for native chunking.
    first_request_body = recorded[0][0]
    assert first_request_body["options"]["do_chunking"] is True
|
||||
Reference in New Issue
Block a user