From 2ee39f64fe4b0f5aadc21d901a48ea322a58a087 Mon Sep 17 00:00:00 2001
From: Stephen Hu
Date: Mon, 9 Feb 2026 11:59:24 +0800
Subject: [PATCH] Refactor: improve ppt shape order logic (#13054)

### What problem does this PR solve?

improve ppt shape order logic

### Type of change

- [x] Refactoring
---
 deepdoc/parser/ppt_parser.py | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/deepdoc/parser/ppt_parser.py b/deepdoc/parser/ppt_parser.py
index 1b04b4d7c3..afff23d7de 100644
--- a/deepdoc/parser/ppt_parser.py
+++ b/deepdoc/parser/ppt_parser.py
@@ -22,6 +22,21 @@ from pptx import Presentation
 class RAGFlowPptParser:
     def __init__(self):
         super().__init__()
+
+    def __sort_shapes(self, shapes):
+        """Return ``shapes`` as a new list sorted by (top // 10, left).
+
+        Tops are bucketed in steps of 10 so that near-equal tops tie and are
+        then ordered by their left offset; a missing (None) offset counts as 0.
+        """
+        # Deliberately uncached: id(shapes) can be reused once a collection is
+        # garbage-collected, and a cache would pin every parsed shape in memory.
+        def position(shape):
+            top = shape.top if shape.top is not None else 0
+            left = shape.left if shape.left is not None else 0
+            return top // 10, left
+
+        return sorted(shapes, key=position)
 
     def __get_bulleted_text(self, paragraph):
         is_bulleted = bool(paragraph._p.xpath("./a:pPr/a:buChar")) or bool(paragraph._p.xpath("./a:pPr/a:buAutoNum")) or bool(paragraph._p.xpath("./a:pPr/a:buBlip"))
@@ -62,7 +77,7 @@ class RAGFlowPptParser:
             # Handle group shape
             if shape_type == 6:
                 texts = []
-                for p in sorted(shape.shapes, key=lambda x: (x.top // 10, x.left)):
+                for p in self.__sort_shapes(shape.shapes):
                     t = self.__extract(p)
                     if t:
                         texts.append(t)
@@ -86,8 +101,7 @@ class RAGFlowPptParser:
             if i >= to_page:
                 break
             texts = []
-            for shape in sorted(
-                    slide.shapes, key=lambda x: ((x.top if x.top is not None else 0) // 10, x.left if x.left is not None else 0)):
+            for shape in self.__sort_shapes(slide.shapes):
                 txt = self.__extract(shape)
                 if txt:
                     texts.append(txt)