Fix OpenDataLoader naive parsing by normalizing @OpenDataLoader and filtering unsupported parser kwargs (#14581)

### What problem does this PR solve?
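This PR fixes a bug where `layout_recognize="<name>@OpenDataLoader"` was
misrouted and then failed during parsing in the naive parser path. The
`@OpenDataLoader` suffix is now normalized so the request routes to
OpenDataLoader with `<name>` as the model name, and only the `hybrid`,
`image_output`, and `sanitize` options are forwarded to the parser, which
avoids passing the unsupported arguments that caused runtime errors.
Fixes #14572.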

### Type of change
- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
Idriss Sbaaoui
2026-05-06 15:00:55 +08:00
committed by GitHub
parent 7e83c5f421
commit 38f6484e98
2 changed files with 9 additions and 1 deletion

View File

@@ -29,5 +29,8 @@ def normalize_layout_recognizer(layout_recognizer_raw: Any) -> tuple[Any, str |
elif lowered.endswith("@paddleocr"):
parser_model_name = layout_recognizer_raw.rsplit("@", 1)[0]
layout_recognizer = "PaddleOCR"
elif lowered.endswith("@opendataloader"):
parser_model_name = layout_recognizer_raw.rsplit("@", 1)[0]
layout_recognizer = "OpenDataLoader"
return layout_recognizer, parser_model_name

View File

@@ -201,12 +201,13 @@ def by_opendataloader(
ocr_model_config = get_model_config_by_type_and_name(tenant_id, LLMType.OCR, opendataloader_llm_name)
ocr_model = LLMBundle(tenant_id=tenant_id, model_config=ocr_model_config, lang=lang)
pdf_parser = ocr_model.mdl
parse_options = {k: kwargs[k] for k in ("hybrid", "image_output", "sanitize") if k in kwargs}
sections, tables = pdf_parser.parse_pdf(
filepath=filename,
binary=binary,
callback=callback,
parse_method=parse_method,
**kwargs,
**parse_options,
)
return sections, tables, pdf_parser
except Exception as e:
@@ -867,6 +868,9 @@ def chunk(filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, lang=
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
layout_recognizer, parser_model_name = normalize_layout_recognizer(parser_config.get("layout_recognize", "DeepDOC"))
opendataloader_llm_name = kwargs.pop("opendataloader_llm_name", None)
if layout_recognizer == "OpenDataLoader" and parser_model_name:
opendataloader_llm_name = parser_model_name
if parser_config.get("analyze_hyperlink", False) and is_root:
urls = extract_links_from_pdf(binary)
@@ -888,6 +892,7 @@ def chunk(filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, lang=
layout_recognizer=layout_recognizer,
mineru_llm_name=parser_model_name,
paddleocr_llm_name=parser_model_name,
opendataloader_llm_name=opendataloader_llm_name,
**kwargs,
)
sections = _normalize_section_text_for_rtl_presentation_forms(sections)