From f2aadd3871a645b0523047e3fdec534c1c79ec4f Mon Sep 17 00:00:00 2001
From: Yash Raj Pandey <55940078+devYRPauli@users.noreply.github.com>
Date: Mon, 8 Jun 2026 08:25:23 -0400
Subject: [PATCH] Fix: is_english() returns False for any list argument (broken
 language detection) (#15489)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### What problem does this PR solve?

`is_english()` in `rag/nlp/__init__.py` compiles a **single-character**
regex class and `fullmatch`es it against each item:

```python
pattern = re.compile(r"[`a-zA-Z0-9\s.,':;/\"?<>!\(\)\-]")   # no quantifier
...
eng = sum(1 for t in texts if pattern.fullmatch(t.strip()))
```

For a **string** argument the text is first split into single characters
(`texts = list(texts)`), so each `fullmatch` sees one character and
works. But for a **list** argument each item is typically a whole
multi-character string, and `fullmatch` of a one-character pattern
against a multi-character string always fails — so `is_english()` returns
`False` for **any** list of multi-character items, regardless of content.

```python
is_english("This is English")                              # True   (ok)
is_english(["The quick brown fox jumps.", "Hello world."]) # False  (bug — should be True)
is_english(["这是中文。"])                                    # False  (right answer, wrong reason)
```

Many call sites pass lists and have therefore silently always received
`False`, e.g.:

- `rag/llm/chat_model.py:1088`, `rag/llm/cv_model.py:168,1155` —
`is_english([ans])` when an answer is truncated at `max_tokens`, so an
English reply gets the Chinese "······由于长度的原因，回答被截断了，要继续吗？" continuation
suffix instead of the English one.
- `rag/app/book.py` — `remove_contents_table(...,
eng=is_english([...sections...]))`, so English books have their contents
table stripped in Chinese mode.
- `common/doc_store/es_conn_base.py:339`,
`rag/utils/opensearch_conn.py:733` — `is_english(txt.split())` in
highlight handling.
- plus `rag/app/qa.py`, `rag/flow/parser/utils.py`,
`common/doc_store/infinity_conn_base.py`.

### Fix

Add a `+` quantifier so an all-English multi-character item matches:

```python
pattern = re.compile(r"[`a-zA-Z0-9\s.,':;/\"?<>!\(\)\-]+")
```

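The string path is unchanged (single characters still match) and
non-English lists still return `False`. Because `fullmatch` must consume
the whole item, a list item counts as English only if every one of its
characters is in the class. Adds
`test/unit_test/rag/test_is_english.py`; the two English list cases fail
before this change and pass after.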

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

Used the Claude CLI while working on this.
---
 rag/nlp/__init__.py                   |  2 +-
 test/unit_test/rag/test_is_english.py | 60 +++++++++++++++++++++++++++
 2 files changed, 61 insertions(+), 1 deletion(-)
 create mode 100644 test/unit_test/rag/test_is_english.py

diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py
index b6a02f2713..9e7f332dbd 100644
--- a/rag/nlp/__init__.py
+++ b/rag/nlp/__init__.py
@@ -237,7 +237,7 @@ def is_english(texts):
     if not texts:
         return False
 
-    pattern = re.compile(r"[`a-zA-Z0-9\s.,':;/\"?<>!\(\)\-]")
+    pattern = re.compile(r"[`a-zA-Z0-9\s.,':;/\"?<>!\(\)\-]+")
 
     if isinstance(texts, str):
         texts = list(texts)
diff --git a/test/unit_test/rag/test_is_english.py b/test/unit_test/rag/test_is_english.py
new file mode 100644
index 0000000000..3b589065f6
--- /dev/null
+++ b/test/unit_test/rag/test_is_english.py
@@ -0,0 +1,60 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import sys
+import types
+
+import pytest
+
+
+def _stub(name, **attrs):
+    mod = types.ModuleType(name)
+    for key, value in attrs.items():
+        setattr(mod, key, value)
+    sys.modules.setdefault(name, mod)
+    return mod
+
+
+# Stub heavy module-level imports so rag.nlp can be imported in isolation.
+_stub("common.token_utils", num_tokens_from_string=lambda *a, **k: 0)
+_stub("roman_numbers")
+_stub("word2number", w2n=types.SimpleNamespace())
+_stub("cn2an", cn2an=lambda *a, **k: 0)
+_pil = _stub("PIL")
+_pil.Image = _stub("PIL.Image")
+_stub("chardet")
+
+from rag.nlp import is_english
+
+
+@pytest.mark.p2
+def test_is_english_string_path_unchanged():
+    assert is_english("This is English") is True
+
+
+@pytest.mark.p2
+def test_is_english_list_of_english_sentences():
+    assert is_english(["The quick brown fox jumps.", "Hello world today.", "Good morning sir."]) is True
+
+
+@pytest.mark.p2
+def test_is_english_single_english_answer_in_list():
+    assert is_english(["This is a normal English answer."]) is True
+
+
+@pytest.mark.p2
+def test_is_english_chinese_list_is_false():
+    assert is_english(["这是中文段落。", "另一个中文段落。"]) is False