fix: detect garbled PDF text and fall back to OCR (#13366) (#13404)

## Problem

When PDF fonts lack ToUnicode/CMap mappings, pdfplumber (pdfminer) cannot map CIDs to the correct Unicode characters and outputs PUA characters (U+E000–U+F8FF) or `(cid:xxx)` placeholders instead. The original code fully trusted the pdfplumber text without any garbled-text detection, so the garbled output ended up in the final parsed result.

Relates to #13366

## Solution

### 1. Garbled text detection functions

- `_is_garbled_char(ch)`: Detects PUA characters (BMP / Plane 15 / Plane 16), the replacement character U+FFFD, control characters, and unassigned/surrogate codepoints
- `_is_garbled_text(text, threshold)`: Calculates the garbled ratio and detects `(cid:xxx)` patterns

### 2. Box-level fallback (in `__ocr()`)

When a text box has ≥50% garbled characters, discard the pdfplumber text and fall back to OCR recognition.

### 3. Page-level detection (in `__images__()`)

Sample characters from each page; if the garbled rate is ≥30%, clear all pdfplumber characters for that page, forcing full OCR.

### 4. Layout recognizer CID filtering

Filter out `(cid:xxx)` patterns in `layout_recognizer.py` text processing to prevent them from polluting layout analysis.

## Testing

- 29 unit tests covering: normal CJK/English text, PUA characters, CID patterns, mixed text, boundary thresholds, edge cases
- All 85 existing project unit tests pass without regression
2026-07-03 17:21:59 +08:00 · 2026-03-10 11:20:31 +08:00
parent 7f6a9e8ee9
commit 292a1a8566
3 changed files with 619 additions and 6 deletions
--- a/deepdoc/parser/pdf_parser.py
+++ b/deepdoc/parser/pdf_parser.py
@@ -22,6 +22,7 @@ import random
 import re
 import sys
 import threading
+import unicodedata
 from collections import Counter, defaultdict
 from copy import deepcopy
 from io import BytesIO
@@ -197,6 +198,127 @@ class RAGFlowPdfParser:
                    return False
        return True

+    # Matches pdfminer's "(cid:NNN)" placeholder (optional spaces allowed) for an unmapped glyph
+    _CID_PATTERN = re.compile(r"\(cid\s*:\s*\d+\s*\)")
+
+    @staticmethod
+    def _is_garbled_char(ch):
+        """Return True if ``ch`` looks like an unmappable glyph from a PDF font.
+
+        A character is treated as garbled when it is the replacement character
+        U+FFFD, or when its Unicode general category is one of:
+
+        * ``Co`` - private use (BMP U+E000-U+F8FF and planes 15/16),
+        * ``Cn`` / ``Cs`` - unassigned codepoints, noncharacters, surrogates,
+        * ``Cc`` - C0/C1 control characters and U+007F (DEL), except the
+          ordinary text whitespace tab, line feed and carriage return.
+
+        Args:
+            ch: A single-character string; empty or ``None`` yields False.
+
+        Returns:
+            bool: True when the character should count as garbled.
+        """
+        if not ch:
+            return False
+        if ch == "\ufffd":
+            return True
+        category = unicodedata.category(ch)
+        if category == "Cc":
+            # Tab / LF / CR are legitimate text; every other control is noise.
+            return ch not in ("\t", "\n", "\r")
+        return category in ("Co", "Cn", "Cs")
+
+    @staticmethod
+    def _is_garbled_text(text, threshold=0.5):
+        """Decide whether ``text`` is dominated by garbled characters.
+
+        Any pdfminer CID placeholder such as '(cid:123)' marks the text as
+        garbled outright. Otherwise the share of garbled characters among
+        the non-whitespace characters is compared against ``threshold``.
+
+        Empty, None or whitespace-only input is never reported as garbled.
+        """
+        if not text or not text.strip():
+            return False
+        parser = RAGFlowPdfParser
+        if parser._CID_PATTERN.search(text) is not None:
+            return True
+        # Whitespace carries no signal, so it is left out of both counts.
+        visible = [ch for ch in text if not ch.isspace()]
+        if not visible:
+            return False
+        is_bad = parser._is_garbled_char
+        # Each True counts as 1, so the sum is the number of garbled chars.
+        bad = sum(is_bad(ch) for ch in visible)
+        return bad / len(visible) >= threshold
+
+    @staticmethod
+    def _has_subset_font_prefix(fontname):
+        """Tell whether ``fontname`` starts with a subset tag like 'DY1+'.
+
+        The PDF convention is six uppercase letters then '+', but shorter
+        and alphanumeric tags occur too, so 2-6 of [A-Z0-9] are accepted.
+        """
+        if fontname:
+            return re.match(r"^[A-Z0-9]{2,6}\+", fontname) is not None
+        return False
+
+    @staticmethod
+    def _is_garbled_by_font_encoding(page_chars, min_chars=20):
+        """Spot pages whose subset fonts decode CJK glyphs as ASCII symbols.
+
+        Some PDFs (notably older Chinese standards) embed custom fonts whose
+        encoding maps CJK glyphs onto ASCII codepoints, so the extracted
+        text is a stream of punctuation/symbols instead of real characters.
+
+        The input is flagged only when all of the following hold:
+
+        * at least ``min_chars`` non-whitespace entries are present,
+        * at least 30% of them use a subset-prefixed font name,
+        * fewer than 5% start with a CJK/Hangul/Kana codepoint,
+        * more than 40% start with an ASCII punctuation/symbol codepoint.
+
+        Args:
+            page_chars: pdfplumber-like char dicts; only the "text" and
+                "fontname" keys are read.
+            min_chars: Minimum number of non-whitespace entries required.
+
+        Returns:
+            bool: True when the characters look font-encoding garbled.
+        """
+        if not page_chars or len(page_chars) < min_chars:
+            return False
+
+        # Inclusive codepoint ranges, tested against each entry's first char.
+        cjk_ranges = (
+            (0x2E80, 0x9FFF), (0xF900, 0xFAFF), (0x20000, 0x2FA1F),
+            (0xAC00, 0xD7AF), (0x3040, 0x30FF),
+        )
+        punct_ranges = ((0x21, 0x2F), (0x3A, 0x40), (0x5B, 0x60), (0x7B, 0x7E))
+
+        n_visible = n_subset = n_cjk = n_punct = 0
+        for entry in page_chars:
+            glyph = entry.get("text", "")
+            if not glyph or glyph.isspace():
+                continue
+            n_visible += 1
+            if RAGFlowPdfParser._has_subset_font_prefix(entry.get("fontname", "")):
+                n_subset += 1
+
+            code = ord(glyph[0])
+            if any(lo <= code <= hi for lo, hi in cjk_ranges):
+                n_cjk += 1
+            elif any(lo <= code <= hi for lo, hi in punct_ranges):
+                n_punct += 1
+
+        if n_visible < min_chars:
+            return False
+        if n_subset / n_visible < 0.3:
+            return False
+        # Both conditions are required: almost no CJK and mostly ASCII symbols.
+        return n_cjk / n_visible < 0.05 and n_punct / n_visible > 0.4
+
    def _evaluate_table_orientation(self, table_img, sample_ratio=0.3):
        """
        Evaluate the best rotation orientation for a table image.
@@ -618,14 +740,40 @@ class RAGFlowPdfParser:
            if not b["chars"]:
                del b["chars"]
                continue
-            m_ht = np.mean([c["height"] for c in b["chars"]])
-            for c in Recognizer.sort_Y_firstly(b["chars"], m_ht):
+            box_chars = b["chars"]
+            m_ht = np.mean([c["height"] for c in box_chars])
+            garbled_count = 0
+            total_count = 0
+            for c in Recognizer.sort_Y_firstly(box_chars, m_ht):
                if c["text"] == " " and b["text"]:
                    if re.match(r"[0-9a-zA-Zа-яА-Я,.?;:!%%]", b["text"][-1]):
                        b["text"] += " "
                else:
                    b["text"] += c["text"]
+                    for ch in c["text"]:
+                        if not ch.isspace():
+                            total_count += 1
+                            if self._is_garbled_char(ch):
+                                garbled_count += 1
            del b["chars"]
+            # If the majority of characters from pdfplumber are garbled,
+            # clear the text so OCR recognition will be used as fallback.
+            # Strategy 1: PUA / unmapped CID characters
+            if total_count > 0 and garbled_count / total_count >= 0.5:
+                logging.info(
+                    "Page %d: detected garbled pdfplumber text (garbled=%d/%d), falling back to OCR for box at (%.1f, %.1f)",
+                    pagenum, garbled_count, total_count, b["x0"], b["top"],
+                )
+                b["text"] = ""
+                continue
+            # Strategy 2: font-encoding garbling — all chars are ASCII
+            # punctuation from subset fonts (no CJK output)
+            if total_count > 0 and self._is_garbled_by_font_encoding(box_chars, min_chars=5):
+                logging.info(
+                    "Page %d: detected font-encoding garbled text (%d chars), falling back to OCR for box at (%.1f, %.1f)",
+                    pagenum, total_count, b["x0"], b["top"],
+                )
+                b["text"] = ""

        logging.info(f"__ocr sorting {len(chars)} chars cost {timer() - start}s")
        start = timer()
@@ -1400,6 +1548,34 @@ class RAGFlowPdfParser:
                        logging.warning(f"Failed to extract characters for pages {page_from}-{page_to}: {str(e)}")
                        self.page_chars = [[] for _ in range(page_to - page_from)]  # If failed to extract, using empty list instead.

+                    # Detect garbled pages and clear their chars so the OCR
+                    # path will be used instead. Two detection strategies:
+                    # 1) PUA / unmapped CID characters (threshold=0.3)
+                    # 2) Font-encoding garbling: subset fonts mapping CJK to ASCII
+                    for pi, page_ch in enumerate(self.page_chars):
+                        if not page_ch:
+                            continue
+                        # Strategy 1: PUA / CID garbling
+                        sample = page_ch if len(page_ch) <= 200 else page_ch[:200]
+                        sample_text = "".join(c.get("text", "") for c in sample)
+                        if self._is_garbled_text(sample_text, threshold=0.3):
+                            logging.warning(
+                                "Page %d: pdfplumber extracted mostly garbled characters (%d chars), "
+                                "clearing to use OCR fallback.",
+                                page_from + pi + 1, len(page_ch),
+                            )
+                            self.page_chars[pi] = []
+                            continue
+                        # Strategy 2: font-encoding garbling (CJK mapped to ASCII)
+                        if self._is_garbled_by_font_encoding(page_ch):
+                            logging.warning(
+                                "Page %d: detected font-encoding garbled text "
+                                "(subset fonts with no CJK output, %d chars), "
+                                "clearing to use OCR fallback.",
+                                page_from + pi + 1, len(page_ch),
+                            )
+                            self.page_chars[pi] = []
+
                    self.total_page = len(self.pdf.pages)

        except Exception as e:
--- a/deepdoc/vision/layout_recognizer.py
+++ b/deepdoc/vision/layout_recognizer.py
@@ -17,7 +17,7 @@
 import logging
 import math
 import os
-# import re
+import re
 from collections import Counter
 from copy import deepcopy

@@ -62,9 +62,8 @@ class LayoutRecognizer(Recognizer):

    def __call__(self, image_list, ocr_res, scale_factor=3, thr=0.2, batch_size=16, drop=True):
        def __is_garbage(b):
-            return False
-            # patt = [r"^•+$", "^[0-9]{1,2} / ?[0-9]{1,2}$", r"^[0-9]{1,2} of [0-9]{1,2}$", "^http://[^ ]{12,}", "\\(cid *: *[0-9]+ *\\)"]
-            # return any([re.search(p, b["text"]) for p in patt])
+            patt = [r"\(cid\s*:\s*\d+\s*\)"]
+            return any([re.search(p, b.get("text", "")) for p in patt])

        if self.client:
            layouts = self.client.predict(image_list)
--- a/test/unit_test/deepdoc/parser/test_pdf_garbled_detection.py
+++ b/test/unit_test/deepdoc/parser/test_pdf_garbled_detection.py
@@ -0,0 +1,438 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+"""Unit tests for PDF garbled text detection and layout garbage filtering.
+
+Tests cover:
+- RAGFlowPdfParser static methods: _is_garbled_char, _is_garbled_text,
+  _has_subset_font_prefix, _is_garbled_by_font_encoding
+- layout_recognizer.__is_garbage: CID pattern filtering
+"""
+
+import re
+import sys
+import os
+import importlib.util
+from unittest import mock
+
+# Load pdf_parser.py by file path instead of importing the deepdoc.parser
+# package, and stub the heavy third-party / project modules it imports
+# (pdfplumber, xgboost, ...) so that only the static detection helpers
+# need to work. Names already present in sys.modules are left untouched.
+# NOTE(review): the stubs are never removed from sys.modules, so they stay
+# visible to any test module imported later in the same process.
+_MOCK_MODULES = [
+    "numpy", "np", "pdfplumber", "xgboost", "xgb",
+    "huggingface_hub", "PIL", "PIL.Image", "pypdf",
+    "sklearn", "sklearn.cluster", "sklearn.metrics",
+    "common", "common.file_utils", "common.misc_utils", "common.settings",
+    "common.token_utils",
+    "deepdoc", "deepdoc.vision", "deepdoc.parser",
+    "rag", "rag.nlp", "rag.prompts", "rag.prompts.generator",
+]
+for _m in _MOCK_MODULES:
+    if _m not in sys.modules:
+        sys.modules[_m] = mock.MagicMock()
+
+def _find_project_root(marker="pyproject.toml"):
+    """Return the nearest ancestor directory of this file that holds *marker*."""
+    here = os.path.dirname(os.path.abspath(__file__))
+    while not os.path.exists(os.path.join(here, marker)):
+        above = os.path.dirname(here)
+        if above == here:
+            # dirname() is a fixed point only at the filesystem root.
+            raise FileNotFoundError(f"Could not locate project root (missing {marker})")
+        here = above
+    return here
+
+
+_MODULE_PATH = os.path.join(_find_project_root(), "deepdoc", "parser", "pdf_parser.py")
+_spec = importlib.util.spec_from_file_location("pdf_parser", _MODULE_PATH)
+_mod = importlib.util.module_from_spec(_spec)
+_spec.loader.exec_module(_mod)
+
+_Parser = _mod.RAGFlowPdfParser
+is_garbled_char = _Parser._is_garbled_char
+is_garbled_text = _Parser._is_garbled_text
+has_subset_font_prefix = _Parser._has_subset_font_prefix
+is_garbled_by_font_encoding = _Parser._is_garbled_by_font_encoding
+
+
+# ---------------------------------------------------------------------------
+# Tests for is_garbled_char
+# ---------------------------------------------------------------------------
+
+
+class TestIsGarbledChar:
+    """Per-character checks: ordinary text is clean; PUA, U+FFFD and C0/C1 controls are garbled."""
+
+    def test_normal_ascii_chars(self):
+        for ch in "Hello World 123 !@#":
+            assert is_garbled_char(ch) is False
+
+    def test_normal_chinese_chars(self):
+        for ch in "中文测试你好世界":
+            assert is_garbled_char(ch) is False
+
+    def test_normal_japanese_chars(self):
+        for ch in "日本語テスト":
+            assert is_garbled_char(ch) is False
+
+    def test_normal_korean_chars(self):
+        for ch in "한국어테스트":
+            assert is_garbled_char(ch) is False
+
+    def test_common_whitespace_not_garbled(self):
+        assert is_garbled_char('\t') is False
+        assert is_garbled_char('\n') is False
+        assert is_garbled_char('\r') is False
+        assert is_garbled_char(' ') is False
+
+    def test_pua_chars_are_garbled(self):
+        assert is_garbled_char('\uE000') is True
+        assert is_garbled_char('\uF000') is True
+        assert is_garbled_char('\uF8FF') is True
+
+    def test_supplementary_pua_a(self):
+        assert is_garbled_char(chr(0xF0000)) is True
+        assert is_garbled_char(chr(0xFFFFF)) is True
+
+    def test_supplementary_pua_b(self):
+        assert is_garbled_char(chr(0x100000)) is True
+        assert is_garbled_char(chr(0x10FFFF)) is True
+
+    def test_replacement_char(self):
+        assert is_garbled_char('\uFFFD') is True
+
+    def test_c0_control_chars(self):
+        assert is_garbled_char('\x00') is True
+        assert is_garbled_char('\x01') is True
+        assert is_garbled_char('\x1F') is True
+
+    def test_c1_control_chars(self):
+        assert is_garbled_char('\x80') is True
+        assert is_garbled_char('\x8F') is True
+        assert is_garbled_char('\x9F') is True
+
+    def test_empty_string(self):
+        assert is_garbled_char('') is False
+
+    def test_common_punctuation(self):
+        for ch in ".,;:!?()[]{}\"'-/\\@#$%^&*+=<>~`|":
+            assert is_garbled_char(ch) is False
+
+    def test_unicode_symbols(self):
+        for ch in "©®™°±²³µ¶·¹º»¼½¾":
+            assert is_garbled_char(ch) is False
+
+
+# ---------------------------------------------------------------------------
+# Tests for is_garbled_text
+# ---------------------------------------------------------------------------
+
+
+class TestIsGarbledText:
+    """Ratio-threshold and CID-placeholder checks for the is_garbled_text function."""
+
+    def test_normal_chinese_text(self):
+        assert is_garbled_text("这是一段正常的中文文本") is False
+
+    def test_normal_english_text(self):
+        assert is_garbled_text("This is normal English text.") is False
+
+    def test_mixed_normal_text(self):
+        assert is_garbled_text("Hello 你好 World 世界 123") is False
+
+    def test_empty_text(self):
+        assert is_garbled_text("") is False
+        assert is_garbled_text("   ") is False
+
+    def test_none_text(self):
+        assert is_garbled_text(None) is False
+
+    def test_all_pua_chars(self):
+        text = "\uE000\uE001\uE002\uE003\uE004"
+        assert is_garbled_text(text) is True
+
+    def test_mostly_garbled(self):
+        text = "\uE000\uE001\uE002好"
+        assert is_garbled_text(text, threshold=0.5) is True
+
+    def test_few_garbled_below_threshold(self):
+        text = "这是正常文本\uE000"
+        assert is_garbled_text(text, threshold=0.5) is False
+
+    def test_cid_pattern_detected(self):
+        assert is_garbled_text("Hello (cid:123) World") is True
+        assert is_garbled_text("(cid : 45)") is True
+        assert is_garbled_text("(cid:0)") is True
+
+    def test_cid_like_but_not_matching(self):
+        assert is_garbled_text("This is a valid cid reference") is False
+
+    def test_whitespace_only_text(self):
+        assert is_garbled_text("   \t\n  ") is False
+
+    def test_custom_threshold(self):
+        text = "\uE000正常"
+        assert is_garbled_text(text, threshold=0.3) is True
+        assert is_garbled_text(text, threshold=0.5) is False
+
+    def test_replacement_chars_in_text(self):
+        text = "文档\uFFFD\uFFFD解析"
+        assert is_garbled_text(text, threshold=0.5) is False
+        assert is_garbled_text(text, threshold=0.3) is True
+
+    def test_real_world_garbled_pattern(self):
+        text = "\uE000\uE001\uE002\uE003\uE004\uE005\uE006\uE007"
+        assert is_garbled_text(text) is True
+
+    def test_mixed_garbled_and_normal_at_boundary(self):
+        text = "AB\uE000\uE001"
+        assert is_garbled_text(text, threshold=0.5) is True
+        text2 = "ABC\uE000"
+        assert is_garbled_text(text2, threshold=0.5) is False
+
+
+# ---------------------------------------------------------------------------
+# Tests for has_subset_font_prefix
+# ---------------------------------------------------------------------------
+
+
+class TestHasSubsetFontPrefix:
+    """Subset-tag checks: an uppercase/digit tag followed by '+' at the start of the font name."""
+
+    def test_standard_subset_prefix(self):
+        assert has_subset_font_prefix("ABCDEF+Arial") is True
+        assert has_subset_font_prefix("XYZABC+TimesNewRoman") is True
+
+    def test_short_subset_prefix(self):
+        assert has_subset_font_prefix("DY1+ZLQDm1-1") is True
+        assert has_subset_font_prefix("AB+Font") is True
+
+    def test_alphanumeric_prefix(self):
+        assert has_subset_font_prefix("DY2+ZLQDnC-2") is True
+        assert has_subset_font_prefix("A1B2C3+MyFont") is True
+
+    def test_no_prefix(self):
+        assert has_subset_font_prefix("Arial") is False
+        assert has_subset_font_prefix("TimesNewRoman") is False
+
+    def test_empty_or_none(self):
+        assert has_subset_font_prefix("") is False
+        assert has_subset_font_prefix(None) is False
+
+    def test_plus_in_middle_not_prefix(self):
+        assert has_subset_font_prefix("Font+Name") is False
+
+    def test_lowercase_not_prefix(self):
+        assert has_subset_font_prefix("abc+Font") is False
+
+
+# ---------------------------------------------------------------------------
+# Tests for is_garbled_by_font_encoding
+# ---------------------------------------------------------------------------
+
+
+def _make_chars(texts, fontname="DY1+ZLQDm1-1"):
+    """Build pdfplumber-like char dicts, one per item of *texts*, all sharing *fontname*."""
+    return [dict(text=item, fontname=fontname) for item in texts]
+
+
+class TestIsGarbledByFontEncoding:
+    """Tests for font-encoding garbled text detection.
+
+    This covers the scenario where PDF fonts with broken ToUnicode
+    mappings cause CJK characters to be extracted as ASCII
+    punctuation/symbols (e.g. GB.18067-2000.pdf).
+    """
+
+    def test_ascii_punct_from_subset_font_is_garbled(self):
+        """Simulates GB.18067-2000.pdf: all chars are ASCII punct from subset fonts."""
+        chars = _make_chars(
+            list('!"#$%&\'(\'&)\'"*$!"#$%&\'\'()*+,$-'),
+            fontname="DY1+ZLQDm1-1",
+        )
+        assert is_garbled_by_font_encoding(chars) is True
+
+    def test_normal_cjk_text_not_garbled(self):
+        """Normal Chinese text from subset fonts should not be flagged."""
+        chars = _make_chars(
+            list("这是一段正常的中文文本用于测试的示例内容没有问题"),
+            fontname="ABCDEF+SimSun",
+        )
+        assert is_garbled_by_font_encoding(chars) is False
+
+    def test_mixed_cjk_and_ascii_not_garbled(self):
+        """Mixed CJK and ASCII content should not be flagged."""
+        chars = _make_chars(
+            list("GB18067-2000居住区大气中酚卫生标准"),
+            fontname="DY1+ZLQDm1-1",
+        )
+        assert is_garbled_by_font_encoding(chars) is False
+
+    def test_non_subset_font_not_flagged(self):
+        """ASCII punct from non-subset fonts should not be flagged."""
+        chars = _make_chars(
+            list('!"#$%&\'()*+,-./!"#$%&\'()*+,-./'),
+            fontname="Arial",
+        )
+        assert is_garbled_by_font_encoding(chars) is False
+
+    def test_too_few_chars_not_flagged(self):
+        """Pages with very few chars should not trigger detection."""
+        chars = _make_chars(list('!"#$'), fontname="DY1+ZLQDm1-1")
+        assert is_garbled_by_font_encoding(chars) is False
+
+    def test_mostly_digits_not_garbled(self):
+        """Pages with lots of digits (like data tables) should not be flagged."""
+        chars = _make_chars(
+            list("1234567890" * 3),
+            fontname="DY1+ZLQDm1-1",
+        )
+        assert is_garbled_by_font_encoding(chars) is False
+
+    def test_english_letters_not_garbled(self):
+        """Pages with English letters should not be flagged."""
+        chars = _make_chars(
+            list("The quick brown fox jumps over the lazy dog"),
+            fontname="ABCDEF+Arial",
+        )
+        assert is_garbled_by_font_encoding(chars) is False
+
+    def test_real_world_gb18067_page1(self):
+        """Simulate actual GB.18067-2000.pdf Page 1 character distribution."""
+        page_text = '!"#$%&\'(\'&)\'"*$!"#$%&\'\'()*+,$-'
+        chars = _make_chars(list(page_text), fontname="DY1+ZLQDm1-1")
+        assert is_garbled_by_font_encoding(chars) is True
+
+    def test_real_world_gb18067_page3(self):
+        """Simulate actual GB.18067-2000.pdf Page 3 character distribution."""
+        page_text = '!"#$%&\'()*+,-.*+/0+123456789:;<'
+        chars = _make_chars(list(page_text), fontname="DY1+ZLQDnC-1")
+        assert is_garbled_by_font_encoding(chars) is True
+
+    def test_empty_chars(self):
+        assert is_garbled_by_font_encoding([]) is False
+        assert is_garbled_by_font_encoding(None) is False
+
+    def test_only_spaces(self):
+        chars = _make_chars([" "] * 30, fontname="DY1+ZLQDm1-1")
+        assert is_garbled_by_font_encoding(chars) is False
+
+    def test_small_min_chars_threshold(self):
+        """With reduced min_chars, even small boxes can be detected."""
+        chars = _make_chars(list('!"#$%&'), fontname="DY1+ZLQDm1-1")
+        assert is_garbled_by_font_encoding(chars, min_chars=5) is True
+        assert is_garbled_by_font_encoding(chars, min_chars=20) is False
+
+    def test_boundary_cjk_ratio(self):
+        """Just below 5% CJK threshold should still be flagged."""
+        # 1 CJK out of 24 chars = ~4.2% CJK, rest are punct from subset font
+        chars = _make_chars(list('!"#$%&\'()*+,-./!@#$%^&*'), fontname="DY1+Font")
+        chars.append({"text": "中", "fontname": "DY1+Font"})
+        assert is_garbled_by_font_encoding(chars, min_chars=5) is True
+
+    def test_boundary_above_cjk_threshold(self):
+        """Above 5% CJK ratio should NOT be flagged."""
+        # 3 CJK out of 22 chars = ~14% CJK
+        chars = _make_chars(list('!"#$%&\'()*+,-./!@#$'), fontname="DY1+Font")
+        for ch in "中文字":
+            chars.append({"text": ch, "fontname": "DY1+Font"})
+        assert is_garbled_by_font_encoding(chars, min_chars=5) is False
+
+    def test_low_subset_ratio_not_flagged(self):
+        """When only a few chars come from subset fonts, should not be flagged.
+
+        Addresses reviewer feedback: a single subset font should not cause
+        the entire page to be flagged as garbled.
+        """
+        # 5 chars from subset font, 20 from normal font -> 20% subset ratio < 30%
+        chars = _make_chars(list('!"#$%'), fontname="DY1+Font")
+        chars.extend(_make_chars(list('!"#$%&\'()*+,-./!@#$%'), fontname="Arial"))
+        assert is_garbled_by_font_encoding(chars, min_chars=5) is False
+
+    def test_high_subset_ratio_flagged(self):
+        """When most chars come from subset fonts, detection should trigger."""
+        # All 29 chars from subset font with punct -> garbled
+        chars = _make_chars(
+            list('!"#$%&\'()*+,-./!@#$%^&*()[]{}'),
+            fontname="BCDGEE+R0015",
+        )
+        assert is_garbled_by_font_encoding(chars) is True
+
+
+# ---------------------------------------------------------------------------
+# Tests for layout_recognizer.__is_garbage
+# ---------------------------------------------------------------------------
+
+
+def _is_garbage(b):
+    """Local stand-in for the ``__is_garbage`` closure of LayoutRecognizer.
+
+    The real helper is nested inside ``LayoutRecognizer.__call__``
+    (deepdoc/vision/layout_recognizer.py) and cannot be imported, so its
+    logic is duplicated here and must be kept in sync by hand.
+    """
+    text = b.get("text", "")
+    return any(re.search(pattern, text) is not None for pattern in (r"\(cid\s*:\s*\d+\s*\)",))
+
+
+class TestLayoutRecognizerIsGarbage:
+    """Tests for the local `_is_garbage` replica of layout_recognizer's __is_garbage.
+
+    The helper reports a text box as garbage when its text contains a CID
+    placeholder such as (cid:123), i.e. an unmapped character in a PDF font.
+    """
+
+    def test_cid_pattern_simple(self):
+        assert _is_garbage({"text": "(cid:123)"}) is True
+
+    def test_cid_pattern_with_spaces(self):
+        assert _is_garbage({"text": "(cid : 45)"}) is True
+        assert _is_garbage({"text": "(cid :  0)"}) is True
+
+    def test_cid_pattern_embedded_in_text(self):
+        assert _is_garbage({"text": "Hello (cid:99) World"}) is True
+
+    def test_cid_pattern_multiple(self):
+        assert _is_garbage({"text": "(cid:1)(cid:2)(cid:3)"}) is True
+
+    def test_normal_text_not_garbage(self):
+        assert _is_garbage({"text": "This is normal text."}) is False
+
+    def test_chinese_text_not_garbage(self):
+        assert _is_garbage({"text": "这是正常的中文内容"}) is False
+
+    def test_empty_text_not_garbage(self):
+        assert _is_garbage({"text": ""}) is False
+
+    def test_missing_text_key_not_garbage(self):
+        assert _is_garbage({}) is False
+
+    def test_parentheses_without_cid_not_garbage(self):
+        assert _is_garbage({"text": "(hello:123)"}) is False
+        assert _is_garbage({"text": "cid:123"}) is False
+
+    def test_partial_cid_not_garbage(self):
+        assert _is_garbage({"text": "(cid:)"}) is False
+        assert _is_garbage({"text": "(cid)"}) is False
+
+    def test_cid_with_zero(self):
+        assert _is_garbage({"text": "(cid:0)"}) is True
+
+    def test_cid_with_large_number(self):
+        assert _is_garbage({"text": "(cid:99999)"}) is True