mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 23:41:12 +08:00
Fix OpenDataLoader naive parsing by normalizing @OpenDataLoader and filtering unsupported parser kwargs (#14581)
### What problem does this PR solve?

This PR fixes a bug where `layout_recognize="<name>@OpenDataLoader"` was misrouted and then failed during parsing in the naive parser path. The value is now normalized so that it routes correctly to OpenDataLoader, and only the parser options OpenDataLoader supports (`hybrid`, `image_output`, `sanitize`) are forwarded, which avoids passing the unsupported arguments that caused runtime errors.

Fixes #14572

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
@@ -29,5 +29,8 @@ def normalize_layout_recognizer(layout_recognizer_raw: Any) -> tuple[Any, str |
|
||||
elif lowered.endswith("@paddleocr"):
|
||||
parser_model_name = layout_recognizer_raw.rsplit("@", 1)[0]
|
||||
layout_recognizer = "PaddleOCR"
|
||||
elif lowered.endswith("@opendataloader"):
|
||||
parser_model_name = layout_recognizer_raw.rsplit("@", 1)[0]
|
||||
layout_recognizer = "OpenDataLoader"
|
||||
|
||||
return layout_recognizer, parser_model_name
|
||||
|
||||
@@ -201,12 +201,13 @@ def by_opendataloader(
|
||||
ocr_model_config = get_model_config_by_type_and_name(tenant_id, LLMType.OCR, opendataloader_llm_name)
|
||||
ocr_model = LLMBundle(tenant_id=tenant_id, model_config=ocr_model_config, lang=lang)
|
||||
pdf_parser = ocr_model.mdl
|
||||
parse_options = {k: kwargs[k] for k in ("hybrid", "image_output", "sanitize") if k in kwargs}
|
||||
sections, tables = pdf_parser.parse_pdf(
|
||||
filepath=filename,
|
||||
binary=binary,
|
||||
callback=callback,
|
||||
parse_method=parse_method,
|
||||
**kwargs,
|
||||
**parse_options,
|
||||
)
|
||||
return sections, tables, pdf_parser
|
||||
except Exception as e:
|
||||
@@ -867,6 +868,9 @@ def chunk(filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, lang=
|
||||
|
||||
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
||||
layout_recognizer, parser_model_name = normalize_layout_recognizer(parser_config.get("layout_recognize", "DeepDOC"))
|
||||
opendataloader_llm_name = kwargs.pop("opendataloader_llm_name", None)
|
||||
if layout_recognizer == "OpenDataLoader" and parser_model_name:
|
||||
opendataloader_llm_name = parser_model_name
|
||||
|
||||
if parser_config.get("analyze_hyperlink", False) and is_root:
|
||||
urls = extract_links_from_pdf(binary)
|
||||
@@ -888,6 +892,7 @@ def chunk(filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, lang=
|
||||
layout_recognizer=layout_recognizer,
|
||||
mineru_llm_name=parser_model_name,
|
||||
paddleocr_llm_name=parser_model_name,
|
||||
opendataloader_llm_name=opendataloader_llm_name,
|
||||
**kwargs,
|
||||
)
|
||||
sections = _normalize_section_text_for_rtl_presentation_forms(sections)
|
||||
|
||||
Reference in New Issue
Block a user