import re
import unicodedata


class RAGFlowPdfParser:
    # Garbled-text detection helpers of RAGFlowPdfParser. Only these members
    # are visible in this part of the file; other members are not shown here.

    # pdfminer placeholder emitted for glyphs it cannot map, e.g. "(cid:123)".
    _CID_PATTERN = re.compile(r"\(cid\s*:\s*\d+\s*\)")

    # Subset-font tag: 2-6 upper-case letters/digits followed by "+".
    # Compiled once because the check runs for every extracted character.
    _SUBSET_PREFIX_PATTERN = re.compile(r"[A-Z0-9]{2,6}\+")

    # Inclusive code point ranges counted as CJK-like output: CJK radicals
    # through unified ideographs (this span also contains Hiragana/Katakana),
    # compatibility ideographs, supplementary ideographs, Hangul syllables.
    _CJK_LIKE_RANGES = (
        (0x2E80, 0x9FFF),
        (0xF900, 0xFAFF),
        (0x20000, 0x2FA1F),
        (0xAC00, 0xD7AF),
    )

    # Inclusive ASCII ranges holding punctuation and symbols only
    # (digits and letters are deliberately excluded).
    _ASCII_PUNCT_RANGES = (
        (0x21, 0x2F),
        (0x3A, 0x40),
        (0x5B, 0x60),
        (0x7B, 0x7E),
    )

    @staticmethod
    def _is_garbled_char(ch):
        """Return True if a single character looks like a failed font mapping.

        A character counts as garbled when it is in a Unicode Private Use
        Area, is the replacement character U+FFFD, is a control character
        (C0 except tab/LF/CR, DEL, or C1), or has Unicode general category
        ``Cn`` (unassigned) or ``Cs`` (surrogate).

        Args:
            ch: A string of length 0 or 1.

        Returns:
            bool: False for an empty/None value, otherwise the verdict above.

        Raises:
            TypeError: If ``ch`` holds more than one character (from ``ord``).
        """
        if not ch:
            return False
        cp = ord(ch)
        # BMP Private Use Area, plus planes 15 and 16 which are contiguous.
        if 0xE000 <= cp <= 0xF8FF or 0xF0000 <= cp <= 0x10FFFF:
            return True
        if cp == 0xFFFD:
            return True
        if cp < 0x20 and ch not in ("\t", "\n", "\r"):
            return True
        # DEL (0x7F) sits directly below the C1 block and is a control too.
        if 0x7F <= cp <= 0x9F:
            return True
        return unicodedata.category(ch) in ("Cn", "Cs")

    @staticmethod
    def _is_garbled_text(text, threshold=0.5):
        """Return True if ``text`` contains too many garbled characters.

        Any pdfminer CID placeholder such as ``(cid:123)`` marks the text as
        garbled immediately, regardless of ``threshold``. Otherwise the share
        of garbled characters among the non-whitespace characters is compared
        with ``threshold``.

        Args:
            text: The string to inspect; None/empty/blank yields False.
            threshold: Minimum garbled share (inclusive) that flags the text.

        Returns:
            bool: Whether the text is considered garbled.
        """
        if not text or not text.strip():
            return False
        if RAGFlowPdfParser._CID_PATTERN.search(text):
            return True
        visible = [ch for ch in text if not ch.isspace()]
        if not visible:
            return False
        garbled = sum(1 for ch in visible if RAGFlowPdfParser._is_garbled_char(ch))
        return garbled / len(visible) >= threshold

    @staticmethod
    def _has_subset_font_prefix(fontname):
        """Return True if ``fontname`` starts with a subset tag (e.g. 'DY1+ZLQDm1-1').

        PDF subset fonts use a 6-letter uppercase tag followed by '+' before
        the actual font name. Some tools use shorter tags (e.g. 'DY1+'), so
        2-6 upper-case letters or digits are accepted.

        Args:
            fontname: Font name string; None/empty yields False.

        Returns:
            bool: Whether a subset prefix is present.
        """
        if not fontname:
            return False
        return RAGFlowPdfParser._SUBSET_PREFIX_PATTERN.match(fontname) is not None

    @staticmethod
    def _is_garbled_by_font_encoding(
        page_chars,
        min_chars=20,
        min_subset_ratio=0.3,
        max_cjk_ratio=0.05,
        min_punct_ratio=0.4,
    ):
        """Detect garbled text caused by broken font encoding mappings.

        Some PDFs (especially older Chinese standards) embed custom fonts that
        map CJK glyphs to ASCII code points, so the extracted text appears as
        ASCII punctuation/symbols instead of CJK characters.

        The input is flagged when enough characters come from subset-embedded
        fonts, almost none are CJK/Hangul/Kana, and a large share are ASCII
        punctuation or symbols. Only the first code point of each char's
        ``text`` is classified; blank entries are ignored.

        Args:
            page_chars: Iterable of dicts with ``"text"`` and ``"fontname"``
                keys (missing keys are treated as empty strings).
            min_chars: Minimum number of entries, and of non-blank entries,
                required before any verdict is attempted.
            min_subset_ratio: Minimum share of non-blank chars that must come
                from subset fonts (below this the result is False).
            max_cjk_ratio: The CJK-like share must be strictly below this.
            min_punct_ratio: The ASCII punctuation share must be strictly
                above this.

        Returns:
            bool: True if the characters look garbled by font encoding.
        """
        # NOTE(review): legitimate subset-font content that is mostly ASCII
        # punctuation (e.g. dot leaders in a table of contents) would also
        # satisfy these ratios — confirm whether that is acceptable.
        if not page_chars or len(page_chars) < min_chars:
            return False

        total = subset = punct = cjk = 0
        for c in page_chars:
            text = c.get("text", "")
            if not text or text.isspace():
                continue
            total += 1
            if RAGFlowPdfParser._has_subset_font_prefix(c.get("fontname", "")):
                subset += 1

            cp = ord(text[0])
            if any(lo <= cp <= hi for lo, hi in RAGFlowPdfParser._CJK_LIKE_RANGES):
                cjk += 1
            elif any(lo <= cp <= hi for lo, hi in RAGFlowPdfParser._ASCII_PUNCT_RANGES):
                punct += 1

        # total == 0 must bail out explicitly: with min_chars <= 0 the
        # comparison below would not, and the ratios would divide by zero.
        if total == 0 or total < min_chars:
            return False
        if subset / total < min_subset_ratio:
            return False
        return cjk / total < max_cjk_ratio and punct / total > min_punct_ratio
+ # Strategy 1: PUA / unmapped CID characters + if total_count > 0 and garbled_count / total_count >= 0.5: + logging.info( + "Page %d: detected garbled pdfplumber text (garbled=%d/%d), falling back to OCR for box at (%.1f, %.1f)", + pagenum, garbled_count, total_count, b["x0"], b["top"], + ) + b["text"] = "" + continue + # Strategy 2: font-encoding garbling — all chars are ASCII + # punctuation from subset fonts (no CJK output) + if total_count > 0 and self._is_garbled_by_font_encoding(box_chars, min_chars=5): + logging.info( + "Page %d: detected font-encoding garbled text (%d chars), falling back to OCR for box at (%.1f, %.1f)", + pagenum, total_count, b["x0"], b["top"], + ) + b["text"] = "" logging.info(f"__ocr sorting {len(chars)} chars cost {timer() - start}s") start = timer() @@ -1400,6 +1548,34 @@ class RAGFlowPdfParser: logging.warning(f"Failed to extract characters for pages {page_from}-{page_to}: {str(e)}") self.page_chars = [[] for _ in range(page_to - page_from)] # If failed to extract, using empty list instead. + # Detect garbled pages and clear their chars so the OCR + # path will be used instead. 
import re


def __is_garbage(b):
    """Return True if a text box contains a pdfminer CID placeholder.

    Placeholders such as ``(cid:123)`` indicate characters that could not be
    mapped from the PDF font. A single occurrence anywhere in the box text
    marks the whole box as garbage.

    Args:
        b: Box dict; its ``"text"`` entry may be missing, empty or None.

    Returns:
        bool: Whether the box text matches the CID placeholder pattern.
    """
    # ``or ""`` also covers an explicit None value, which ``get``'s default
    # does not, so ``re.search`` never receives a non-string.
    text = b.get("text") or ""
    return re.search(r"\(cid\s*:\s*\d+\s*\)", text) is not None
#
#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

"""Unit tests for PDF garbled text detection and layout garbage filtering.

Tests cover:
- RAGFlowPdfParser static methods: _is_garbled_char, _is_garbled_text,
  _has_subset_font_prefix, _is_garbled_by_font_encoding
- layout_recognizer.__is_garbage: CID pattern filtering
"""

import importlib.util
import os
import re
import sys
from unittest import mock

# RAGFlowPdfParser is imported directly by file path to avoid triggering
# deepdoc/parser/__init__.py, which pulls in heavy dependencies
# (pdfplumber, xgboost, etc.) that may not be available in test environments.
#
# The heavy third-party and project modules below are stubbed so that
# pdf_parser.py can be loaded purely for its static detection methods.
_MOCK_MODULES = [
    "numpy", "np", "pdfplumber", "xgboost", "xgb",
    "huggingface_hub", "PIL", "PIL.Image", "pypdf",
    "sklearn", "sklearn.cluster", "sklearn.metrics",
    "common", "common.file_utils", "common.misc_utils", "common.settings",
    "common.token_utils",
    "deepdoc", "deepdoc.vision", "deepdoc.parser",
    "rag", "rag.nlp", "rag.prompts", "rag.prompts.generator",
]


def _find_project_root(marker="pyproject.toml"):
    """Walk up from this file until a directory containing *marker* is found."""
    cur = os.path.dirname(os.path.abspath(__file__))
    while True:
        if os.path.exists(os.path.join(cur, marker)):
            return cur
        parent = os.path.dirname(cur)
        if parent == cur:
            raise FileNotFoundError(f"Could not locate project root (missing {marker})")
        cur = parent


_MODULE_PATH = os.path.join(_find_project_root(), "deepdoc", "parser", "pdf_parser.py")


def _load_pdf_parser_module(module_path=_MODULE_PATH):
    """Execute pdf_parser.py with heavy dependencies stubbed during the import only.

    The stubs are removed from ``sys.modules`` afterwards so that other test
    modules collected in the same session still import the real packages.
    Modules that were already imported before this call are never replaced.
    """
    spec = importlib.util.spec_from_file_location("pdf_parser", module_path)
    module = importlib.util.module_from_spec(spec)
    stubbed = [name for name in _MOCK_MODULES if name not in sys.modules]
    for name in stubbed:
        sys.modules[name] = mock.MagicMock()
    try:
        spec.loader.exec_module(module)
    finally:
        for name in stubbed:
            sys.modules.pop(name, None)
    return module


_mod = _load_pdf_parser_module()

_Parser = _mod.RAGFlowPdfParser
is_garbled_char = _Parser._is_garbled_char
is_garbled_text = _Parser._is_garbled_text
has_subset_font_prefix = _Parser._has_subset_font_prefix
is_garbled_by_font_encoding = _Parser._is_garbled_by_font_encoding


# ---------------------------------------------------------------------------
# Tests for is_garbled_char
# ---------------------------------------------------------------------------


class TestIsGarbledChar:
    """Tests for the is_garbled_char function."""

    def test_normal_ascii_chars(self):
        for ch in "Hello World 123 !@#":
            assert is_garbled_char(ch) is False

    def test_normal_chinese_chars(self):
        for ch in "中文测试你好世界":
            assert is_garbled_char(ch) is False

    def test_normal_japanese_chars(self):
        for ch in "日本語テスト":
            assert is_garbled_char(ch) is False

    def test_normal_korean_chars(self):
        for ch in "한국어테스트":
            assert is_garbled_char(ch) is False

    def test_common_whitespace_not_garbled(self):
        assert is_garbled_char('\t') is False
        assert is_garbled_char('\n') is False
        assert is_garbled_char('\r') is False
        assert is_garbled_char(' ') is False

    def test_pua_chars_are_garbled(self):
        assert is_garbled_char('\uE000') is True
        assert is_garbled_char('\uF000') is True
        assert is_garbled_char('\uF8FF') is True

    def test_supplementary_pua_a(self):
        assert is_garbled_char(chr(0xF0000)) is True
        assert is_garbled_char(chr(0xFFFFF)) is True

    def test_supplementary_pua_b(self):
        assert is_garbled_char(chr(0x100000)) is True
        assert is_garbled_char(chr(0x10FFFF)) is True

    def test_replacement_char(self):
        assert is_garbled_char('\uFFFD') is True

    def test_c0_control_chars(self):
        assert is_garbled_char('\x00') is True
        assert is_garbled_char('\x01') is True
        assert is_garbled_char('\x1F') is True

    def test_c1_control_chars(self):
        assert is_garbled_char('\x80') is True
        assert is_garbled_char('\x8F') is True
        assert is_garbled_char('\x9F') is True

    def test_empty_string(self):
        assert is_garbled_char('') is False

    def test_common_punctuation(self):
        for ch in ".,;:!?()[]{}\"'-/\\@#$%^&*+=<>~`|":
            assert is_garbled_char(ch) is False

    def test_unicode_symbols(self):
        for ch in "©®™°±²³µ¶·¹º»¼½¾":
            assert is_garbled_char(ch) is False


# ---------------------------------------------------------------------------
# Tests for is_garbled_text
# ---------------------------------------------------------------------------


class TestIsGarbledText:
    """Tests for the is_garbled_text function."""

    def test_normal_chinese_text(self):
        assert is_garbled_text("这是一段正常的中文文本") is False

    def test_normal_english_text(self):
        assert is_garbled_text("This is normal English text.") is False

    def test_mixed_normal_text(self):
        assert is_garbled_text("Hello 你好 World 世界 123") is False

    def test_empty_text(self):
        assert is_garbled_text("") is False
        assert is_garbled_text(" ") is False

    def test_none_text(self):
        assert is_garbled_text(None) is False

    def test_all_pua_chars(self):
        text = "\uE000\uE001\uE002\uE003\uE004"
        assert is_garbled_text(text) is True

    def test_mostly_garbled(self):
        text = "\uE000\uE001\uE002好"
        assert is_garbled_text(text, threshold=0.5) is True

    def test_few_garbled_below_threshold(self):
        text = "这是正常文本\uE000"
        assert is_garbled_text(text, threshold=0.5) is False

    def test_cid_pattern_detected(self):
        assert is_garbled_text("Hello (cid:123) World") is True
        assert is_garbled_text("(cid : 45)") is True
        assert is_garbled_text("(cid:0)") is True

    def test_cid_like_but_not_matching(self):
        assert is_garbled_text("This is a valid cid reference") is False

    def test_whitespace_only_text(self):
        assert is_garbled_text(" \t\n ") is False

    def test_custom_threshold(self):
        text = "\uE000正常"
        assert is_garbled_text(text, threshold=0.3) is True
        assert is_garbled_text(text, threshold=0.5) is False

    def test_replacement_chars_in_text(self):
        text = "文档\uFFFD\uFFFD解析"
        assert is_garbled_text(text, threshold=0.5) is False
        assert is_garbled_text(text, threshold=0.3) is True

    def test_real_world_garbled_pattern(self):
        text = "\uE000\uE001\uE002\uE003\uE004\uE005\uE006\uE007"
        assert is_garbled_text(text) is True

    def test_mixed_garbled_and_normal_at_boundary(self):
        text = "AB\uE000\uE001"
        assert is_garbled_text(text, threshold=0.5) is True
        text2 = "ABC\uE000"
        assert is_garbled_text(text2, threshold=0.5) is False


# ---------------------------------------------------------------------------
# Tests for has_subset_font_prefix
# ---------------------------------------------------------------------------


class TestHasSubsetFontPrefix:
    """Tests for the has_subset_font_prefix function."""

    def test_standard_subset_prefix(self):
        assert has_subset_font_prefix("ABCDEF+Arial") is True
        assert has_subset_font_prefix("XYZABC+TimesNewRoman") is True

    def test_short_subset_prefix(self):
        assert has_subset_font_prefix("DY1+ZLQDm1-1") is True
        assert has_subset_font_prefix("AB+Font") is True

    def test_alphanumeric_prefix(self):
        assert has_subset_font_prefix("DY2+ZLQDnC-2") is True
        assert has_subset_font_prefix("A1B2C3+MyFont") is True

    def test_no_prefix(self):
        assert has_subset_font_prefix("Arial") is False
        assert has_subset_font_prefix("TimesNewRoman") is False

    def test_empty_or_none(self):
        assert has_subset_font_prefix("") is False
        assert has_subset_font_prefix(None) is False

    def test_plus_in_middle_not_prefix(self):
        assert has_subset_font_prefix("Font+Name") is False

    def test_lowercase_not_prefix(self):
        assert has_subset_font_prefix("abc+Font") is False


# ---------------------------------------------------------------------------
# Tests for is_garbled_by_font_encoding
# ---------------------------------------------------------------------------


def _make_chars(texts, fontname="DY1+ZLQDm1-1"):
    """Helper to create a list of pdfplumber-like char dicts."""
    return [{"text": t, "fontname": fontname} for t in texts]


class TestIsGarbledByFontEncoding:
    """Tests for font-encoding garbled text detection.

    This covers the scenario where PDF fonts with broken ToUnicode
    mappings cause CJK characters to be extracted as ASCII
    punctuation/symbols (e.g. GB.18067-2000.pdf).
    """

    def test_ascii_punct_from_subset_font_is_garbled(self):
        """Simulates GB.18067-2000.pdf: all chars are ASCII punct from subset fonts."""
        chars = _make_chars(
            list('!"#$%&\'(\'&)\'"*$!"#$%&\'\'()*+,$-'),
            fontname="DY1+ZLQDm1-1",
        )
        assert is_garbled_by_font_encoding(chars) is True

    def test_normal_cjk_text_not_garbled(self):
        """Normal Chinese text from subset fonts should not be flagged."""
        chars = _make_chars(
            list("这是一段正常的中文文本用于测试的示例内容没有问题"),
            fontname="ABCDEF+SimSun",
        )
        assert is_garbled_by_font_encoding(chars) is False

    def test_mixed_cjk_and_ascii_not_garbled(self):
        """Mixed CJK and ASCII content should not be flagged."""
        chars = _make_chars(
            list("GB18067-2000居住区大气中酚卫生标准"),
            fontname="DY1+ZLQDm1-1",
        )
        assert is_garbled_by_font_encoding(chars) is False

    def test_non_subset_font_not_flagged(self):
        """ASCII punct from non-subset fonts should not be flagged."""
        chars = _make_chars(
            list('!"#$%&\'()*+,-./!"#$%&\'()*+,-./'),
            fontname="Arial",
        )
        assert is_garbled_by_font_encoding(chars) is False

    def test_too_few_chars_not_flagged(self):
        """Pages with very few chars should not trigger detection."""
        chars = _make_chars(list('!"#$'), fontname="DY1+ZLQDm1-1")
        assert is_garbled_by_font_encoding(chars) is False

    def test_mostly_digits_not_garbled(self):
        """Pages with lots of digits (like data tables) should not be flagged."""
        chars = _make_chars(
            list("1234567890" * 3),
            fontname="DY1+ZLQDm1-1",
        )
        assert is_garbled_by_font_encoding(chars) is False

    def test_english_letters_not_garbled(self):
        """Pages with English letters should not be flagged."""
        chars = _make_chars(
            list("The quick brown fox jumps over the lazy dog"),
            fontname="ABCDEF+Arial",
        )
        assert is_garbled_by_font_encoding(chars) is False

    def test_real_world_gb18067_page1(self):
        """Simulate actual GB.18067-2000.pdf Page 1 character distribution."""
        page_text = '!"#$%&\'(\'&)\'"*$!"#$%&\'\'()*+,$-'
        chars = _make_chars(list(page_text), fontname="DY1+ZLQDm1-1")
        assert is_garbled_by_font_encoding(chars) is True

    def test_real_world_gb18067_page3(self):
        """Simulate actual GB.18067-2000.pdf Page 3 character distribution."""
        page_text = '!"#$%&\'()*+,-.*+/0+123456789:;<'
        chars = _make_chars(list(page_text), fontname="DY1+ZLQDnC-1")
        assert is_garbled_by_font_encoding(chars) is True

    def test_empty_chars(self):
        assert is_garbled_by_font_encoding([]) is False
        assert is_garbled_by_font_encoding(None) is False

    def test_only_spaces(self):
        chars = _make_chars([" "] * 30, fontname="DY1+ZLQDm1-1")
        assert is_garbled_by_font_encoding(chars) is False

    def test_small_min_chars_threshold(self):
        """With reduced min_chars, even small boxes can be detected."""
        chars = _make_chars(list('!"#$%&'), fontname="DY1+ZLQDm1-1")
        assert is_garbled_by_font_encoding(chars, min_chars=5) is True
        assert is_garbled_by_font_encoding(chars, min_chars=20) is False

    def test_boundary_cjk_ratio(self):
        """Just below 5% CJK threshold should still be flagged."""
        # 1 CJK out of 24 chars ~= 4.2% CJK, rest are punct from subset font
        chars = _make_chars(list('!"#$%&\'()*+,-./!@#$%^&*'), fontname="DY1+Font")
        chars.append({"text": "中", "fontname": "DY1+Font"})
        assert is_garbled_by_font_encoding(chars, min_chars=5) is True

    def test_boundary_above_cjk_threshold(self):
        """Above 5% CJK ratio should NOT be flagged."""
        # 3 CJK out of 22 chars ~= 13.6% CJK
        chars = _make_chars(list('!"#$%&\'()*+,-./!@#$'), fontname="DY1+Font")
        for ch in "中文字":
            chars.append({"text": ch, "fontname": "DY1+Font"})
        assert is_garbled_by_font_encoding(chars, min_chars=5) is False

    def test_low_subset_ratio_not_flagged(self):
        """When only a few chars come from subset fonts, should not be flagged.

        Addresses reviewer feedback: a single subset font should not cause
        the entire page to be flagged as garbled.
        """
        # 5 chars from subset font, 20 from normal font -> 20% subset ratio < 30%
        chars = _make_chars(list('!"#$%'), fontname="DY1+Font")
        chars.extend(_make_chars(list('!"#$%&\'()*+,-./!@#$%'), fontname="Arial"))
        assert is_garbled_by_font_encoding(chars, min_chars=5) is False

    def test_high_subset_ratio_flagged(self):
        """When most chars come from subset fonts, detection should trigger."""
        # All 29 chars from subset font with punct -> garbled
        chars = _make_chars(
            list('!"#$%&\'()*+,-./!@#$%^&*()[]{}'),
            fontname="BCDGEE+R0015",
        )
        assert is_garbled_by_font_encoding(chars) is True


# ---------------------------------------------------------------------------
# Tests for layout_recognizer.__is_garbage
# ---------------------------------------------------------------------------


def _is_garbage(b):
    """Reproduce LayoutRecognizer.__is_garbage for unit testing.

    The original is a closure nested inside LayoutRecognizer.__call__
    (deepdoc/vision/layout_recognizer.py). It is replicated here because
    it cannot be directly imported, so these tests exercise this copy and
    must be kept in sync with the closure by hand.
    """
    patt = [r"\(cid\s*:\s*\d+\s*\)"]
    return any([re.search(p, b.get("text", "")) for p in patt])


class TestLayoutRecognizerIsGarbage:
    """Tests for the layout_recognizer __is_garbage function.

    This function filters out text boxes containing CID patterns like
    (cid:123) which indicate unmapped characters in PDF fonts.
    """

    def test_cid_pattern_simple(self):
        assert _is_garbage({"text": "(cid:123)"}) is True

    def test_cid_pattern_with_spaces(self):
        assert _is_garbage({"text": "(cid : 45)"}) is True
        assert _is_garbage({"text": "(cid : 0)"}) is True

    def test_cid_pattern_embedded_in_text(self):
        assert _is_garbage({"text": "Hello (cid:99) World"}) is True

    def test_cid_pattern_multiple(self):
        assert _is_garbage({"text": "(cid:1)(cid:2)(cid:3)"}) is True

    def test_normal_text_not_garbage(self):
        assert _is_garbage({"text": "This is normal text."}) is False

    def test_chinese_text_not_garbage(self):
        assert _is_garbage({"text": "这是正常的中文内容"}) is False

    def test_empty_text_not_garbage(self):
        assert _is_garbage({"text": ""}) is False

    def test_missing_text_key_not_garbage(self):
        assert _is_garbage({}) is False

    def test_parentheses_without_cid_not_garbage(self):
        assert _is_garbage({"text": "(hello:123)"}) is False
        assert _is_garbage({"text": "cid:123"}) is False

    def test_partial_cid_not_garbage(self):
        assert _is_garbage({"text": "(cid:)"}) is False
        assert _is_garbage({"text": "(cid)"}) is False

    def test_cid_with_zero(self):
        assert _is_garbage({"text": "(cid:0)"}) is True

    def test_cid_with_large_number(self):
        assert _is_garbage({"text": "(cid:99999)"}) is True