mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 15:31:05 +08:00
### What problem does this PR solve?

Fix: update based on #14436

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
@@ -45,6 +45,12 @@ AlgorithmType = Literal["PaddleOCR-VL", "PP-OCRv5", "PP-StructureV3", "PaddleOCR
|
|||||||
SectionTuple = tuple[str, ...]
|
SectionTuple = tuple[str, ...]
|
||||||
TableTuple = tuple[str, ...]
|
TableTuple = tuple[str, ...]
|
||||||
ParseResult = tuple[list[SectionTuple], list[TableTuple]]
|
ParseResult = tuple[list[SectionTuple], list[TableTuple]]
|
||||||
|
SUPPORTED_PADDLEOCR_ALGORITHMS: tuple[AlgorithmType, ...] = (
|
||||||
|
"PaddleOCR-VL",
|
||||||
|
"PP-OCRv5",
|
||||||
|
"PP-StructureV3",
|
||||||
|
"PaddleOCR-VL-1.5",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
_MARKDOWN_IMAGE_PATTERN = re.compile(
|
_MARKDOWN_IMAGE_PATTERN = re.compile(
|
||||||
@@ -130,12 +136,12 @@ class PaddleOCRConfig:
|
|||||||
algorithm = cfg.get("algorithm", "PaddleOCR-VL")
|
algorithm = cfg.get("algorithm", "PaddleOCR-VL")
|
||||||
|
|
||||||
# Validate algorithm
|
# Validate algorithm
|
||||||
if algorithm not in ("PaddleOCR-VL"):
|
if algorithm not in SUPPORTED_PADDLEOCR_ALGORITHMS:
|
||||||
raise ValueError(f"Unsupported algorithm: {algorithm}")
|
raise ValueError(f"Unsupported algorithm: {algorithm}")
|
||||||
|
|
||||||
# Extract algorithm-specific configuration
|
# Extract algorithm-specific configuration
|
||||||
algorithm_config: dict[str, Any] = {}
|
algorithm_config: dict[str, Any] = {}
|
||||||
if algorithm == "PaddleOCR-VL":
|
if algorithm in SUPPORTED_PADDLEOCR_ALGORITHMS:
|
||||||
algorithm_config = asdict(PaddleOCRVLConfig())
|
algorithm_config = asdict(PaddleOCRVLConfig())
|
||||||
algorithm_config_user = cfg.get("algorithm_config")
|
algorithm_config_user = cfg.get("algorithm_config")
|
||||||
if isinstance(algorithm_config_user, dict):
|
if isinstance(algorithm_config_user, dict):
|
||||||
@@ -173,34 +179,39 @@ class PaddleOCRParser(RAGFlowPdfParser):
|
|||||||
"visualize": "visualize",
|
"visualize": "visualize",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
_VL_FIELD_MAPPING: ClassVar[dict[str, str]] = {
|
||||||
|
"use_doc_orientation_classify": "useDocOrientationClassify",
|
||||||
|
"use_doc_unwarping": "useDocUnwarping",
|
||||||
|
"use_layout_detection": "useLayoutDetection",
|
||||||
|
"use_chart_recognition": "useChartRecognition",
|
||||||
|
"use_seal_recognition": "useSealRecognition",
|
||||||
|
"use_ocr_for_image_block": "useOcrForImageBlock",
|
||||||
|
"layout_threshold": "layoutThreshold",
|
||||||
|
"layout_nms": "layoutNms",
|
||||||
|
"layout_unclip_ratio": "layoutUnclipRatio",
|
||||||
|
"layout_merge_bboxes_mode": "layoutMergeBboxesMode",
|
||||||
|
"layout_shape_mode": "layoutShapeMode",
|
||||||
|
"prompt_label": "promptLabel",
|
||||||
|
"format_block_content": "formatBlockContent",
|
||||||
|
"repetition_penalty": "repetitionPenalty",
|
||||||
|
"temperature": "temperature",
|
||||||
|
"top_p": "topP",
|
||||||
|
"min_pixels": "minPixels",
|
||||||
|
"max_pixels": "maxPixels",
|
||||||
|
"max_new_tokens": "maxNewTokens",
|
||||||
|
"merge_layout_blocks": "mergeLayoutBlocks",
|
||||||
|
"markdown_ignore_labels": "markdownIgnoreLabels",
|
||||||
|
"vlm_extra_args": "vlmExtraArgs",
|
||||||
|
"restructure_pages": "restructurePages",
|
||||||
|
"merge_tables": "mergeTables",
|
||||||
|
"relevel_titles": "relevelTitles",
|
||||||
|
}
|
||||||
|
|
||||||
_ALGORITHM_FIELD_MAPPINGS: ClassVar[dict[str, dict[str, str]]] = {
|
_ALGORITHM_FIELD_MAPPINGS: ClassVar[dict[str, dict[str, str]]] = {
|
||||||
"PaddleOCR-VL": {
|
"PaddleOCR-VL": _VL_FIELD_MAPPING,
|
||||||
"use_doc_orientation_classify": "useDocOrientationClassify",
|
"PP-OCRv5": _VL_FIELD_MAPPING,
|
||||||
"use_doc_unwarping": "useDocUnwarping",
|
"PP-StructureV3": _VL_FIELD_MAPPING,
|
||||||
"use_layout_detection": "useLayoutDetection",
|
"PaddleOCR-VL-1.5": _VL_FIELD_MAPPING,
|
||||||
"use_chart_recognition": "useChartRecognition",
|
|
||||||
"use_seal_recognition": "useSealRecognition",
|
|
||||||
"use_ocr_for_image_block": "useOcrForImageBlock",
|
|
||||||
"layout_threshold": "layoutThreshold",
|
|
||||||
"layout_nms": "layoutNms",
|
|
||||||
"layout_unclip_ratio": "layoutUnclipRatio",
|
|
||||||
"layout_merge_bboxes_mode": "layoutMergeBboxesMode",
|
|
||||||
"layout_shape_mode": "layoutShapeMode",
|
|
||||||
"prompt_label": "promptLabel",
|
|
||||||
"format_block_content": "formatBlockContent",
|
|
||||||
"repetition_penalty": "repetitionPenalty",
|
|
||||||
"temperature": "temperature",
|
|
||||||
"top_p": "topP",
|
|
||||||
"min_pixels": "minPixels",
|
|
||||||
"max_pixels": "maxPixels",
|
|
||||||
"max_new_tokens": "maxNewTokens",
|
|
||||||
"merge_layout_blocks": "mergeLayoutBlocks",
|
|
||||||
"markdown_ignore_labels": "markdownIgnoreLabels",
|
|
||||||
"vlm_extra_args": "vlmExtraArgs",
|
|
||||||
"restructure_pages": "restructurePages",
|
|
||||||
"merge_tables": "mergeTables",
|
|
||||||
"relevel_titles": "relevelTitles",
|
|
||||||
},
|
|
||||||
}
|
}
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
@@ -393,7 +404,7 @@ class PaddleOCRParser(RAGFlowPdfParser):
|
|||||||
"""Convert API response to section tuples."""
|
"""Convert API response to section tuples."""
|
||||||
sections: list[SectionTuple] = []
|
sections: list[SectionTuple] = []
|
||||||
|
|
||||||
if algorithm in ("PaddleOCR-VL",):
|
if algorithm in SUPPORTED_PADDLEOCR_ALGORITHMS:
|
||||||
layout_parsing_results = result.get("layoutParsingResults", [])
|
layout_parsing_results = result.get("layoutParsingResults", [])
|
||||||
|
|
||||||
for page_idx, layout_result in enumerate(layout_parsing_results):
|
for page_idx, layout_result in enumerate(layout_parsing_results):
|
||||||
|
|||||||
Reference in New Issue
Block a user