diff --git a/common/parser_config_utils.py b/common/parser_config_utils.py index 0bc7ffc28b..daf91cc8e1 100644 --- a/common/parser_config_utils.py +++ b/common/parser_config_utils.py @@ -29,5 +29,8 @@ def normalize_layout_recognizer(layout_recognizer_raw: Any) -> tuple[Any, str | elif lowered.endswith("@paddleocr"): parser_model_name = layout_recognizer_raw.rsplit("@", 1)[0] layout_recognizer = "PaddleOCR" + elif lowered.endswith("@opendataloader"): + parser_model_name = layout_recognizer_raw.rsplit("@", 1)[0] + layout_recognizer = "OpenDataLoader" return layout_recognizer, parser_model_name diff --git a/rag/app/naive.py b/rag/app/naive.py index 90d1b42858..f91e2a8f94 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -201,12 +201,13 @@ def by_opendataloader( ocr_model_config = get_model_config_by_type_and_name(tenant_id, LLMType.OCR, opendataloader_llm_name) ocr_model = LLMBundle(tenant_id=tenant_id, model_config=ocr_model_config, lang=lang) pdf_parser = ocr_model.mdl + parse_options = {k: kwargs[k] for k in ("hybrid", "image_output", "sanitize") if k in kwargs} sections, tables = pdf_parser.parse_pdf( filepath=filename, binary=binary, callback=callback, parse_method=parse_method, - **kwargs, + **parse_options, ) return sections, tables, pdf_parser except Exception as e: @@ -867,6 +868,9 @@ def chunk(filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, lang= elif re.search(r"\.pdf$", filename, re.IGNORECASE): layout_recognizer, parser_model_name = normalize_layout_recognizer(parser_config.get("layout_recognize", "DeepDOC")) + opendataloader_llm_name = kwargs.pop("opendataloader_llm_name", None) + if layout_recognizer == "OpenDataLoader" and parser_model_name: + opendataloader_llm_name = parser_model_name if parser_config.get("analyze_hyperlink", False) and is_root: urls = extract_links_from_pdf(binary) @@ -888,6 +892,7 @@ def chunk(filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, lang= layout_recognizer=layout_recognizer, mineru_llm_name=parser_model_name, paddleocr_llm_name=parser_model_name, + opendataloader_llm_name=opendataloader_llm_name, **kwargs, ) sections = _normalize_section_text_for_rtl_presentation_forms(sections)