From f2aadd3871a645b0523047e3fdec534c1c79ec4f Mon Sep 17 00:00:00 2001 From: Yash Raj Pandey <55940078+devYRPauli@users.noreply.github.com> Date: Mon, 8 Jun 2026 08:25:23 -0400 Subject: [PATCH] Fix: is_english() returns False for any list argument (broken language detection) (#15489) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What problem does this PR solve? `is_english()` in `rag/nlp/__init__.py` compiles a **single-character** regex class and `fullmatch`es it against each item: ```python pattern = re.compile(r"[`a-zA-Z0-9\s.,':;/\"?<>!\(\)\-]") # no quantifier ... eng = sum(1 for t in texts if pattern.fullmatch(t.strip())) ``` For a **string** argument the text is first split into single characters (`texts = list(texts)`), so each `fullmatch` sees one character and works. But for a **list** argument each item is a whole multi-character string, and `fullmatch` of a one-character pattern against a multi-character string always fails — so `is_english()` returns `False` for **any** list, regardless of content. ```python is_english("This is English") # True (ok) is_english(["The quick brown fox jumps.", "Hello world."]) # False (bug — should be True) is_english(["这是中文。"]) # False (right answer, wrong reason) ``` Many call sites pass lists and were therefore silently always-`False`, e.g.: - `rag/llm/chat_model.py:1088`, `rag/llm/cv_model.py:168,1155` — `is_english([ans])` when an answer is truncated at `max_tokens`, so an English reply gets the Chinese "······由于长度的原因,回答被截断了,要继续吗?" continuation suffix instead of the English one. - `rag/app/book.py` — `remove_contents_table(..., eng=is_english([...sections...]))`, so English books have their contents table stripped in Chinese mode. - `common/doc_store/es_conn_base.py:339`, `rag/utils/opensearch_conn.py:733` — `is_english(txt.split())` in highlight handling. - plus `rag/app/qa.py`, `rag/flow/parser/utils.py`, `common/doc_store/infinity_conn_base.py`. ### Fix Add a `+` quantifier so an all-English multi-character item matches: ```python pattern = re.compile(r"[`a-zA-Z0-9\s.,':;/\"?<>!\(\)\-]+") ``` The string path is unchanged (single characters still match) and non-English lists still return `False`. Adds `test/unit_test/rag/test_is_english.py`; the two list cases fail before this change and pass after. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) Used the Claude CLI while working on this. --- rag/nlp/__init__.py | 2 +- test/unit_test/rag/test_is_english.py | 60 +++++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 1 deletion(-) create mode 100644 test/unit_test/rag/test_is_english.py diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py index b6a02f2713..9e7f332dbd 100644 --- a/rag/nlp/__init__.py +++ b/rag/nlp/__init__.py @@ -237,7 +237,7 @@ def is_english(texts): if not texts: return False - pattern = re.compile(r"[`a-zA-Z0-9\s.,':;/\"?<>!\(\)\-]") + pattern = re.compile(r"[`a-zA-Z0-9\s.,':;/\"?<>!\(\)\-]+") if isinstance(texts, str): texts = list(texts) diff --git a/test/unit_test/rag/test_is_english.py b/test/unit_test/rag/test_is_english.py new file mode 100644 index 0000000000..3b589065f6 --- /dev/null +++ b/test/unit_test/rag/test_is_english.py @@ -0,0 +1,60 @@ +# +# Copyright 2025 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import sys +import types + +import pytest + + +def _stub(name, **attrs): + mod = types.ModuleType(name) + for key, value in attrs.items(): + setattr(mod, key, value) + sys.modules.setdefault(name, mod) + return mod + + +# Stub heavy module-level imports so rag.nlp can be imported in isolation. +_stub("common.token_utils", num_tokens_from_string=lambda *a, **k: 0) +_stub("roman_numbers") +_stub("word2number", w2n=types.SimpleNamespace()) +_stub("cn2an", cn2an=lambda *a, **k: 0) +_pil = _stub("PIL") +_pil.Image = _stub("PIL.Image") +_stub("chardet") + +from rag.nlp import is_english + + +@pytest.mark.p2 +def test_is_english_string_path_unchanged(): + assert is_english("This is English") is True + + +@pytest.mark.p2 +def test_is_english_list_of_english_sentences(): + assert is_english(["The quick brown fox jumps.", "Hello world today.", "Good morning sir."]) is True + + +@pytest.mark.p2 +def test_is_english_single_english_answer_in_list(): + assert is_english(["This is a normal English answer."]) is True + + +@pytest.mark.p2 +def test_is_english_chinese_list_is_false(): + assert is_english(["这是中文段落。", "另一个中文段落。"]) is False