From fb3bd3de029b0ec327c0b9aac1f1ec0aa8ac2a21 Mon Sep 17 00:00:00 2001 From: euvre <93761161+euvre@users.noreply.github.com> Date: Mon, 1 Jun 2026 04:22:11 -0700 Subject: [PATCH] fix(deepdoc): add English caption patterns to fix missing figure/table numbering (#15481) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What problem does this PR solve? ## Problem When parsing PDFs containing English figure/table captions (e.g. "Fig. 20", "Figure 20", "Table 20"), the `is_caption` method in `TableStructureRecognizer` failed to recognize them as captions. This caused figure numbering gaps in the parsed output (e.g. Fig. 19 → Fig. 21, skipping Fig. 20). ## Root Cause The `is_caption` regex only matched Chinese caption formats: ```python patt = [r"[图表]+[ 0-9::]{2,}"] ``` When the layout recognizer also failed to assign a `caption` layout type to a given text block, English captions were entirely missed. ## Fix Added three case-insensitive English caption patterns to `is_caption` in `deepdoc/vision/table_structure_recognizer.py`: - `(?i)Fig\.?\s*\d+` — matches `Fig. 20`, `Fig 20`, `FIG. 20`, etc. - `(?i)Figure\s+\d+` — matches `Figure 20`, `FIGURE 20`, etc. - `(?i)Table\s+\d+` — matches `Table 20`, `TABLE 20`, etc. ## Files Changed - `deepdoc/vision/table_structure_recognizer.py` — extended `is_caption` regex patterns - [x] Bug Fix (non-breaking change which fixes an issue) Signed-off-by: noob --- deepdoc/vision/table_structure_recognizer.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/deepdoc/vision/table_structure_recognizer.py b/deepdoc/vision/table_structure_recognizer.py index e0892c2d72..997bc84b62 100644 --- a/deepdoc/vision/table_structure_recognizer.py +++ b/deepdoc/vision/table_structure_recognizer.py @@ -112,7 +112,12 @@ class TableStructureRecognizer(Recognizer): @staticmethod def is_caption(bx): - patt = [r"[图表]+[ 0-9::]{2,}"] + patt = [ + r"[图表]+[ 0-9::]{2,}", + r"(?i)Fig\.?\s*\d+", + r"(?i)Figure\s+\d+", + r"(?i)Table\s+\d+", + ] if any([re.match(p, bx["text"].strip()) for p in patt]) or bx.get("layout_type", "").find("caption") >= 0: return True return False