fix: detect and fallback garbled PDF text to OCR (#13366) (#13404)

## Problem

When PDF fonts lack ToUnicode/CMap mappings, pdfplumber (pdfminer)
cannot map CIDs to the correct Unicode characters and instead outputs PUA
characters (U+E000–U+F8FF) or `(cid:xxx)` placeholders. The original code
fully trusted pdfplumber's text without any garbled-text detection, so the
garbled output ended up in the final parsed result.

Relates to #13366

## Solution

### 1. Garbled text detection functions
- `_is_garbled_char(ch)`: Detects PUA characters (BMP/Plane 15/16),
replacement character U+FFFD, control characters, and
unassigned/surrogate codepoints
- `_is_garbled_text(text, threshold)`: Calculates garbled ratio and
detects `(cid:xxx)` patterns

### 2. Box-level fallback (in `__ocr()`)
When ≥50% of a text box's non-whitespace characters are garbled, discard
the pdfplumber text and fall back to OCR recognition.

### 3. Page-level detection (in `__images__()`)
Sample up to the first 200 characters of each page; if the garbled rate is
≥30%, clear all pdfplumber characters for that page, forcing full OCR.

### 4. Layout recognizer CID filtering
Filter out `(cid:xxx)` patterns in `layout_recognizer.py` text
processing to prevent them from polluting layout analysis.

## Testing
- 29 unit tests covering: normal CJK/English text, PUA characters, CID
patterns, mixed text, boundary thresholds, edge cases
- All 85 existing project unit tests pass without regression
This commit is contained in:
tunsuy
2026-03-10 11:20:31 +08:00
committed by GitHub
parent 7f6a9e8ee9
commit 292a1a8566
3 changed files with 619 additions and 6 deletions

View File

@@ -22,6 +22,7 @@ import random
import re
import sys
import threading
import unicodedata
from collections import Counter, defaultdict
from copy import deepcopy
from io import BytesIO
@@ -197,6 +198,127 @@ class RAGFlowPdfParser:
return False
return True
# Matches pdfminer's "(cid:N)" placeholder for font characters it could not
# map to Unicode. Whitespace is tolerated on either side of the colon and
# before the closing parenthesis, e.g. "(cid : 45 )".
_CID_PATTERN = re.compile(r"\(cid\s*:\s*\d+\s*\)")
@staticmethod
def _is_garbled_char(ch):
"""Check if a single character is garbled (unmappable from PDF font encoding).
A character is considered garbled if it falls into Unicode Private Use Areas
or certain replacement/control character ranges that typically indicate
pdfminer failed to map a CID to a valid Unicode codepoint.
"""
if not ch:
return False
cp = ord(ch)
if 0xE000 <= cp <= 0xF8FF:
return True
if 0xF0000 <= cp <= 0xFFFFF:
return True
if 0x100000 <= cp <= 0x10FFFF:
return True
if cp == 0xFFFD:
return True
if cp < 0x20 and ch not in ('\t', '\n', '\r'):
return True
if 0x80 <= cp <= 0x9F:
return True
cat = unicodedata.category(ch)
if cat in ("Cn", "Cs"):
return True
return False
@staticmethod
def _is_garbled_text(text, threshold=0.5):
"""Check if a text string contains too many garbled characters.
Examines each character and determines if the overall proportion
of garbled characters exceeds the given threshold. Also detects
pdfminer's CID placeholder patterns like '(cid:123)'.
"""
if not text or not text.strip():
return False
if RAGFlowPdfParser._CID_PATTERN.search(text):
return True
garbled_count = 0
total = 0
for ch in text:
if ch.isspace():
continue
total += 1
if RAGFlowPdfParser._is_garbled_char(ch):
garbled_count += 1
if total == 0:
return False
return garbled_count / total >= threshold
@staticmethod
def _has_subset_font_prefix(fontname):
"""Check if a font name has a subset prefix (e.g. 'DY1+ZLQDm1-1').
PDF subset fonts use a 6-letter uppercase tag followed by '+' before
the actual font name. Some tools use shorter tags (e.g. 'DY1+').
"""
if not fontname:
return False
return bool(re.match(r"^[A-Z0-9]{2,6}\+", fontname))
@staticmethod
def _is_garbled_by_font_encoding(page_chars, min_chars=20):
"""Detect garbled text caused by broken font encoding mappings.
Some PDFs (especially older Chinese standards) embed custom fonts that
map CJK glyphs to ASCII codepoints. The extracted text appears as
random ASCII punctuation/symbols instead of actual CJK characters.
Detection strategy: if a significant proportion of characters come from
subset-embedded fonts and the page produces overwhelmingly ASCII
(punctuation, digits, symbols) with virtually no CJK/Hangul/Kana
characters, the page is likely garbled due to broken font encoding.
"""
if not page_chars or len(page_chars) < min_chars:
return False
subset_font_count = 0
total_non_space = 0
ascii_punct_sym = 0
cjk_like = 0
for c in page_chars:
text = c.get("text", "")
fontname = c.get("fontname", "")
if not text or text.isspace():
continue
total_non_space += 1
if RAGFlowPdfParser._has_subset_font_prefix(fontname):
subset_font_count += 1
cp = ord(text[0])
if (0x2E80 <= cp <= 0x9FFF or 0xF900 <= cp <= 0xFAFF
or 0x20000 <= cp <= 0x2FA1F
or 0xAC00 <= cp <= 0xD7AF
or 0x3040 <= cp <= 0x30FF):
cjk_like += 1
elif (0x21 <= cp <= 0x2F or 0x3A <= cp <= 0x40
or 0x5B <= cp <= 0x60 or 0x7B <= cp <= 0x7E):
ascii_punct_sym += 1
if total_non_space < min_chars:
return False
subset_ratio = subset_font_count / total_non_space
if subset_ratio < 0.3:
return False
cjk_ratio = cjk_like / total_non_space
punct_ratio = ascii_punct_sym / total_non_space
if cjk_ratio < 0.05 and punct_ratio > 0.4:
return True
return False
def _evaluate_table_orientation(self, table_img, sample_ratio=0.3):
"""
Evaluate the best rotation orientation for a table image.
@@ -618,14 +740,40 @@ class RAGFlowPdfParser:
if not b["chars"]:
del b["chars"]
continue
m_ht = np.mean([c["height"] for c in b["chars"]])
for c in Recognizer.sort_Y_firstly(b["chars"], m_ht):
box_chars = b["chars"]
m_ht = np.mean([c["height"] for c in box_chars])
garbled_count = 0
total_count = 0
for c in Recognizer.sort_Y_firstly(box_chars, m_ht):
if c["text"] == " " and b["text"]:
if re.match(r"[0-9a-zA-Zа-яА-Я,.?;:!%%]", b["text"][-1]):
b["text"] += " "
else:
b["text"] += c["text"]
for ch in c["text"]:
if not ch.isspace():
total_count += 1
if self._is_garbled_char(ch):
garbled_count += 1
del b["chars"]
# If the majority of characters from pdfplumber are garbled,
# clear the text so OCR recognition will be used as fallback.
# Strategy 1: PUA / unmapped CID characters
if total_count > 0 and garbled_count / total_count >= 0.5:
logging.info(
"Page %d: detected garbled pdfplumber text (garbled=%d/%d), falling back to OCR for box at (%.1f, %.1f)",
pagenum, garbled_count, total_count, b["x0"], b["top"],
)
b["text"] = ""
continue
# Strategy 2: font-encoding garbling — all chars are ASCII
# punctuation from subset fonts (no CJK output)
if total_count > 0 and self._is_garbled_by_font_encoding(box_chars, min_chars=5):
logging.info(
"Page %d: detected font-encoding garbled text (%d chars), falling back to OCR for box at (%.1f, %.1f)",
pagenum, total_count, b["x0"], b["top"],
)
b["text"] = ""
logging.info(f"__ocr sorting {len(chars)} chars cost {timer() - start}s")
start = timer()
@@ -1400,6 +1548,34 @@ class RAGFlowPdfParser:
logging.warning(f"Failed to extract characters for pages {page_from}-{page_to}: {str(e)}")
self.page_chars = [[] for _ in range(page_to - page_from)] # If failed to extract, using empty list instead.
# Detect garbled pages and clear their chars so the OCR
# path will be used instead. Two detection strategies:
# 1) PUA / unmapped CID characters (threshold=0.3)
# 2) Font-encoding garbling: subset fonts mapping CJK to ASCII
for pi, page_ch in enumerate(self.page_chars):
if not page_ch:
continue
# Strategy 1: PUA / CID garbling
sample = page_ch if len(page_ch) <= 200 else page_ch[:200]
sample_text = "".join(c.get("text", "") for c in sample)
if self._is_garbled_text(sample_text, threshold=0.3):
logging.warning(
"Page %d: pdfplumber extracted mostly garbled characters (%d chars), "
"clearing to use OCR fallback.",
page_from + pi + 1, len(page_ch),
)
self.page_chars[pi] = []
continue
# Strategy 2: font-encoding garbling (CJK mapped to ASCII)
if self._is_garbled_by_font_encoding(page_ch):
logging.warning(
"Page %d: detected font-encoding garbled text "
"(subset fonts with no CJK output, %d chars), "
"clearing to use OCR fallback.",
page_from + pi + 1, len(page_ch),
)
self.page_chars[pi] = []
self.total_page = len(self.pdf.pages)
except Exception as e:

View File

@@ -17,7 +17,7 @@
import logging
import math
import os
# import re
import re
from collections import Counter
from copy import deepcopy
@@ -62,9 +62,8 @@ class LayoutRecognizer(Recognizer):
def __call__(self, image_list, ocr_res, scale_factor=3, thr=0.2, batch_size=16, drop=True):
def __is_garbage(b):
return False
# patt = [r"^•+$", "^[0-9]{1,2} / ?[0-9]{1,2}$", r"^[0-9]{1,2} of [0-9]{1,2}$", "^http://[^ ]{12,}", "\\(cid *: *[0-9]+ *\\)"]
# return any([re.search(p, b["text"]) for p in patt])
patt = [r"\(cid\s*:\s*\d+\s*\)"]
return any([re.search(p, b.get("text", "")) for p in patt])
if self.client:
layouts = self.client.predict(image_list)

View File

@@ -0,0 +1,438 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Unit tests for PDF garbled text detection and layout garbage filtering.
Tests cover:
- RAGFlowPdfParser static methods: _is_garbled_char, _is_garbled_text,
_has_subset_font_prefix, _is_garbled_by_font_encoding
- layout_recognizer.__is_garbage: CID pattern filtering
"""
import re
import sys
import os
import importlib.util
from unittest import mock
# Import RAGFlowPdfParser directly by file path to avoid triggering
# deepdoc/parser/__init__.py which pulls in heavy dependencies
# (pdfplumber, xgboost, etc.) that may not be available in test environments.
#
# We mock the heavy third-party modules so that pdf_parser.py can be loaded
# purely for its static detection methods.
_MOCK_MODULES = [
    "numpy", "np", "pdfplumber", "xgboost", "xgb",
    "huggingface_hub", "PIL", "PIL.Image", "pypdf",
    "sklearn", "sklearn.cluster", "sklearn.metrics",
    "common", "common.file_utils", "common.misc_utils", "common.settings",
    "common.token_utils",
    "deepdoc", "deepdoc.vision", "deepdoc.parser",
    "rag", "rag.nlp", "rag.prompts", "rag.prompts.generator",
]


def _find_project_root(marker="pyproject.toml"):
    """Walk up from this file until a directory containing *marker* is found.

    Raises:
        FileNotFoundError: If the filesystem root is reached without
            finding *marker*.
    """
    cur = os.path.dirname(os.path.abspath(__file__))
    while True:
        if os.path.exists(os.path.join(cur, marker)):
            return cur
        parent = os.path.dirname(cur)
        if parent == cur:
            # dirname() of the filesystem root is the root itself.
            raise FileNotFoundError(f"Could not locate project root (missing {marker})")
        cur = parent


_MODULE_PATH = os.path.join(_find_project_root(), "deepdoc", "parser", "pdf_parser.py")
_spec = importlib.util.spec_from_file_location("pdf_parser", _MODULE_PATH)
_mod = importlib.util.module_from_spec(_spec)

# Install stand-ins only for modules that are not already imported, and
# remember exactly which ones were added so they can be removed again.
# Leaving MagicMock objects in sys.modules would make every later
# `import numpy` / `import deepdoc` in the same test session return a mock.
_installed_stubs = {}
for _m in _MOCK_MODULES:
    if _m not in sys.modules:
        _installed_stubs[_m] = sys.modules[_m] = mock.MagicMock()
try:
    _spec.loader.exec_module(_mod)
finally:
    for _m, _stub in _installed_stubs.items():
        # Only drop the entry if it is still the stub installed above.
        if sys.modules.get(_m) is _stub:
            del sys.modules[_m]

_Parser = _mod.RAGFlowPdfParser
is_garbled_char = _Parser._is_garbled_char
is_garbled_text = _Parser._is_garbled_text
has_subset_font_prefix = _Parser._has_subset_font_prefix
is_garbled_by_font_encoding = _Parser._is_garbled_by_font_encoding
# ---------------------------------------------------------------------------
# Tests for is_garbled_char
# ---------------------------------------------------------------------------
class TestIsGarbledChar:
    """Per-character checks for is_garbled_char."""

    @staticmethod
    def _expect(chars, expected):
        # Shared assertion loop: every character must yield exactly `expected`.
        for ch in chars:
            assert is_garbled_char(ch) is expected

    def test_normal_ascii_chars(self):
        self._expect("Hello World 123 !@#", False)

    def test_normal_chinese_chars(self):
        self._expect("中文测试你好世界", False)

    def test_normal_japanese_chars(self):
        self._expect("日本語テスト", False)

    def test_normal_korean_chars(self):
        self._expect("한국어테스트", False)

    def test_common_whitespace_not_garbled(self):
        self._expect(['\t', '\n', '\r', ' '], False)

    def test_pua_chars_are_garbled(self):
        # First, middle and last codepoints of the BMP Private Use Area.
        self._expect(['\uE000', '\uF000', '\uF8FF'], True)

    def test_supplementary_pua_a(self):
        self._expect([chr(0xF0000), chr(0xFFFFF)], True)

    def test_supplementary_pua_b(self):
        self._expect([chr(0x100000), chr(0x10FFFF)], True)

    def test_replacement_char(self):
        assert is_garbled_char('\uFFFD') is True

    def test_c0_control_chars(self):
        self._expect(['\x00', '\x01', '\x1F'], True)

    def test_c1_control_chars(self):
        self._expect(['\x80', '\x8F', '\x9F'], True)

    def test_empty_string(self):
        assert is_garbled_char('') is False

    def test_common_punctuation(self):
        self._expect(".,;:!?()[]{}\"'-/\\@#$%^&*+=<>~`|", False)

    def test_unicode_symbols(self):
        self._expect("©®™°±²³µ¶·¹º»¼½¾", False)
# ---------------------------------------------------------------------------
# Tests for is_garbled_text
# ---------------------------------------------------------------------------
class TestIsGarbledText:
    """String-level checks for is_garbled_text (ratio threshold and CID patterns)."""

    def test_normal_chinese_text(self):
        assert is_garbled_text("这是一段正常的中文文本") is False

    def test_normal_english_text(self):
        assert is_garbled_text("This is normal English text.") is False

    def test_mixed_normal_text(self):
        assert is_garbled_text("Hello 你好 World 世界 123") is False

    def test_empty_text(self):
        for blank in ("", " "):
            assert is_garbled_text(blank) is False

    def test_none_text(self):
        assert is_garbled_text(None) is False

    def test_all_pua_chars(self):
        all_pua = "\uE000\uE001\uE002\uE003\uE004"
        assert is_garbled_text(all_pua) is True

    def test_mostly_garbled(self):
        sample = "\uE000\uE001\uE002"
        assert is_garbled_text(sample, threshold=0.5) is True

    def test_few_garbled_below_threshold(self):
        # One PUA character among seven visible characters.
        sample = "这是正常文本\uE000"
        assert is_garbled_text(sample, threshold=0.5) is False

    def test_cid_pattern_detected(self):
        for sample in ("Hello (cid:123) World", "(cid : 45)", "(cid:0)"):
            assert is_garbled_text(sample) is True

    def test_cid_like_but_not_matching(self):
        assert is_garbled_text("This is a valid cid reference") is False

    def test_whitespace_only_text(self):
        assert is_garbled_text(" \t\n ") is False

    def test_custom_threshold(self):
        # 1 garbled out of 3 visible characters.
        sample = "\uE000正常"
        assert is_garbled_text(sample, threshold=0.3) is True
        assert is_garbled_text(sample, threshold=0.5) is False

    def test_replacement_chars_in_text(self):
        # 2 replacement characters out of 6 visible characters.
        sample = "文档\uFFFD\uFFFD解析"
        assert is_garbled_text(sample, threshold=0.5) is False
        assert is_garbled_text(sample, threshold=0.3) is True

    def test_real_world_garbled_pattern(self):
        sample = "\uE000\uE001\uE002\uE003\uE004\uE005\uE006\uE007"
        assert is_garbled_text(sample) is True

    def test_mixed_garbled_and_normal_at_boundary(self):
        # Exactly half garbled: the threshold comparison is inclusive.
        at_threshold = "AB\uE000\uE001"
        assert is_garbled_text(at_threshold, threshold=0.5) is True
        # One in four garbled: below the threshold.
        below_threshold = "ABC\uE000"
        assert is_garbled_text(below_threshold, threshold=0.5) is False
# ---------------------------------------------------------------------------
# Tests for has_subset_font_prefix
# ---------------------------------------------------------------------------
class TestHasSubsetFontPrefix:
    """Checks for subset-tag detection in font names."""

    def test_standard_subset_prefix(self):
        for name in ("ABCDEF+Arial", "XYZABC+TimesNewRoman"):
            assert has_subset_font_prefix(name) is True

    def test_short_subset_prefix(self):
        for name in ("DY1+ZLQDm1-1", "AB+Font"):
            assert has_subset_font_prefix(name) is True

    def test_alphanumeric_prefix(self):
        for name in ("DY2+ZLQDnC-2", "A1B2C3+MyFont"):
            assert has_subset_font_prefix(name) is True

    def test_no_prefix(self):
        for name in ("Arial", "TimesNewRoman"):
            assert has_subset_font_prefix(name) is False

    def test_empty_or_none(self):
        for name in ("", None):
            assert has_subset_font_prefix(name) is False

    def test_plus_in_middle_not_prefix(self):
        # A mixed-case word before '+' is not a subset tag.
        assert has_subset_font_prefix("Font+Name") is False

    def test_lowercase_not_prefix(self):
        assert has_subset_font_prefix("abc+Font") is False
# ---------------------------------------------------------------------------
# Tests for is_garbled_by_font_encoding
# ---------------------------------------------------------------------------
def _make_chars(texts, fontname="DY1+ZLQDm1-1"):
"""Helper to create a list of pdfplumber-like char dicts."""
return [{"text": t, "fontname": fontname} for t in texts]
class TestIsGarbledByFontEncoding:
    """Tests for font-encoding garbled text detection.

    This covers the scenario where PDF fonts with broken ToUnicode
    mappings cause CJK characters to be extracted as ASCII
    punctuation/symbols (e.g. GB.18067-2000.pdf).
    """

    def test_ascii_punct_from_subset_font_is_garbled(self):
        """Simulates GB.18067-2000.pdf: all chars are ASCII punct from subset fonts."""
        chars = _make_chars(
            list('!"#$%&\'(\'&)\'"*$!"#$%&\'\'()*+,$-'),
            fontname="DY1+ZLQDm1-1",
        )
        assert is_garbled_by_font_encoding(chars) is True

    def test_normal_cjk_text_not_garbled(self):
        """Normal Chinese text from subset fonts should not be flagged."""
        chars = _make_chars(
            list("这是一段正常的中文文本用于测试的示例内容没有问题"),
            fontname="ABCDEF+SimSun",
        )
        assert is_garbled_by_font_encoding(chars) is False

    def test_mixed_cjk_and_ascii_not_garbled(self):
        """Mixed CJK and ASCII content should not be flagged."""
        chars = _make_chars(
            list("GB18067-2000居住区大气中酚卫生标准"),
            fontname="DY1+ZLQDm1-1",
        )
        assert is_garbled_by_font_encoding(chars) is False

    def test_non_subset_font_not_flagged(self):
        """ASCII punct from non-subset fonts should not be flagged."""
        chars = _make_chars(
            list('!"#$%&\'()*+,-./!"#$%&\'()*+,-./'),
            fontname="Arial",
        )
        assert is_garbled_by_font_encoding(chars) is False

    def test_too_few_chars_not_flagged(self):
        """Pages with very few chars should not trigger detection."""
        chars = _make_chars(list('!"#$'), fontname="DY1+ZLQDm1-1")
        assert is_garbled_by_font_encoding(chars) is False

    def test_mostly_digits_not_garbled(self):
        """Pages with lots of digits (like data tables) should not be flagged."""
        chars = _make_chars(
            list("1234567890" * 3),
            fontname="DY1+ZLQDm1-1",
        )
        assert is_garbled_by_font_encoding(chars) is False

    def test_english_letters_not_garbled(self):
        """Pages with English letters should not be flagged."""
        chars = _make_chars(
            list("The quick brown fox jumps over the lazy dog"),
            fontname="ABCDEF+Arial",
        )
        assert is_garbled_by_font_encoding(chars) is False

    def test_real_world_gb18067_page1(self):
        """Simulate actual GB.18067-2000.pdf Page 1 character distribution."""
        page_text = '!"#$%&\'(\'&)\'"*$!"#$%&\'\'()*+,$-'
        chars = _make_chars(list(page_text), fontname="DY1+ZLQDm1-1")
        assert is_garbled_by_font_encoding(chars) is True

    def test_real_world_gb18067_page3(self):
        """Simulate actual GB.18067-2000.pdf Page 3 character distribution."""
        page_text = '!"#$%&\'()*+,-.*+/0+123456789:;<'
        chars = _make_chars(list(page_text), fontname="DY1+ZLQDnC-1")
        assert is_garbled_by_font_encoding(chars) is True

    def test_empty_chars(self):
        assert is_garbled_by_font_encoding([]) is False
        assert is_garbled_by_font_encoding(None) is False

    def test_only_spaces(self):
        chars = _make_chars([" "] * 30, fontname="DY1+ZLQDm1-1")
        assert is_garbled_by_font_encoding(chars) is False

    def test_small_min_chars_threshold(self):
        """With reduced min_chars, even small boxes can be detected."""
        chars = _make_chars(list('!"#$%&'), fontname="DY1+ZLQDm1-1")
        assert is_garbled_by_font_encoding(chars, min_chars=5) is True
        assert is_garbled_by_font_encoding(chars, min_chars=20) is False

    def test_boundary_cjk_ratio(self):
        """Just below 5% CJK threshold should still be flagged."""
        # 23 punctuation chars + 1 CJK char = 24 visible chars, ~4.2% CJK.
        chars = _make_chars(list('!"#$%&\'()*+,-./!@#$%^&*'), fontname="DY1+Font")
        # A real CJK character is required here: an empty text entry is
        # skipped by the detector and would leave the CJK ratio at 0%.
        chars.append({"text": "中", "fontname": "DY1+Font"})
        assert is_garbled_by_font_encoding(chars, min_chars=5) is True

    def test_boundary_above_cjk_threshold(self):
        """Above 5% CJK ratio should NOT be flagged."""
        # 3 CJK out of 22 chars = ~13.6% CJK
        chars = _make_chars(list('!"#$%&\'()*+,-./!@#$'), fontname="DY1+Font")
        for ch in "中文字":
            chars.append({"text": ch, "fontname": "DY1+Font"})
        assert is_garbled_by_font_encoding(chars, min_chars=5) is False

    def test_low_subset_ratio_not_flagged(self):
        """When only a few chars come from subset fonts, should not be flagged.

        Addresses reviewer feedback: a single subset font should not cause
        the entire page to be flagged as garbled.
        """
        # 5 chars from subset font, 20 from normal font -> 20% subset ratio < 30%
        chars = _make_chars(list('!"#$%'), fontname="DY1+Font")
        chars.extend(_make_chars(list('!"#$%&\'()*+,-./!@#$%'), fontname="Arial"))
        assert is_garbled_by_font_encoding(chars, min_chars=5) is False

    def test_high_subset_ratio_flagged(self):
        """When most chars come from subset fonts, detection should trigger."""
        # All 30 chars from subset font with punct -> garbled
        chars = _make_chars(
            list('!"#$%&\'()*+,-./!@#$%^&*()[]{}'),
            fontname="BCDGEE+R0015",
        )
        assert is_garbled_by_font_encoding(chars) is True
# ---------------------------------------------------------------------------
# Tests for layout_recognizer.__is_garbage
# ---------------------------------------------------------------------------
def _is_garbage(b):
"""Reproduce LayoutRecognizer.__is_garbage for unit testing.
The original is a closure nested inside LayoutRecognizer.__call__
(deepdoc/vision/layout_recognizer.py). We replicate it here because
it cannot be directly imported.
"""
patt = [r"\(cid\s*:\s*\d+\s*\)"]
return any([re.search(p, b.get("text", "")) for p in patt])
class TestLayoutRecognizerIsGarbage:
    """Checks for the replicated layout_recognizer __is_garbage filter.

    The filter flags text boxes containing CID patterns like (cid:123),
    which indicate unmapped characters in PDF fonts.
    """

    @staticmethod
    def _verdict(text):
        # Wrap the text in the box-dict shape the filter expects.
        return _is_garbage({"text": text})

    def test_cid_pattern_simple(self):
        assert self._verdict("(cid:123)") is True

    def test_cid_pattern_with_spaces(self):
        for text in ("(cid : 45)", "(cid : 0)"):
            assert self._verdict(text) is True

    def test_cid_pattern_embedded_in_text(self):
        assert self._verdict("Hello (cid:99) World") is True

    def test_cid_pattern_multiple(self):
        assert self._verdict("(cid:1)(cid:2)(cid:3)") is True

    def test_normal_text_not_garbage(self):
        assert self._verdict("This is normal text.") is False

    def test_chinese_text_not_garbage(self):
        assert self._verdict("这是正常的中文内容") is False

    def test_empty_text_not_garbage(self):
        assert self._verdict("") is False

    def test_missing_text_key_not_garbage(self):
        # No "text" key at all: the filter falls back to an empty string.
        assert _is_garbage({}) is False

    def test_parentheses_without_cid_not_garbage(self):
        for text in ("(hello:123)", "cid:123"):
            assert self._verdict(text) is False

    def test_partial_cid_not_garbage(self):
        for text in ("(cid:)", "(cid)"):
            assert self._verdict(text) is False

    def test_cid_with_zero(self):
        assert self._verdict("(cid:0)") is True

    def test_cid_with_large_number(self):
        assert self._verdict("(cid:99999)") is True