From c2ce49e037ded8eada92411ad6c14a2382dedcd1 Mon Sep 17 00:00:00 2001
From: Octopus <liyuan851277048@icloud.com>
Date: Thu, 9 Apr 2026 19:10:34 +0800
Subject: [PATCH] fix: strip single quotes from synonym terms to prevent
 Infinity TokenError (#13969)

Fixes #13823

## Problem

When querying with words like `cat`, RAGFlow's query expansion system
looks up synonyms via WordNet, which can return terms containing single
quotes (e.g., `cat-o'-nine-tails`). When Infinity is the document
store, these unescaped single quotes in the query string cause a
`TokenError` because Infinity's lexer treats `'` as a string delimiter.

```
TokenError: Error tokenizing ' OR "big cat" OR "computerized tomography")^0.7)': Missing ' from 1:531
```

## Solution

Strip single quotes from synonym terms before they are inserted into
query expressions, consistent with how single quotes are already
stripped from the input query text (line 51 of `query.py`):

- **`common/query_base.py`**: In `sub_special_char()`, strip `'` before
escaping other special characters. This fixes the Chinese text
processing path and the `paragraph()` method.
- **`rag/nlp/query.py`**: In the English text path, strip `'` from
tokenized synonym terms.
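- **`memory/services/query.py`**: Same fix for the memory query English
text path.
- **`deepdoc/vision/ocr.py`**: Also removes the `time_dict` timing
bookkeeping from the text-detection path; its early exits now return
`None` instead of `(None, None, time_dict)`.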

## Testing

The fix can be verified by:
1. Using Infinity as the document store (`DOC_ENGINE=infinity`)
2. Creating a dataset and running a retrieval test with the keyword
`cat`
3. Confirming no `TokenError` is raised and results are returned
normally

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->

## Summary by CodeRabbit

* **Bug Fixes**
  * Enhanced special character handling in query processing and synonym
    expansion by properly sanitizing single quotes before text processing.
  * Simplified OCR detection output by removing timing metadata while
    preserving core detection accuracy.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Co-authored-by: ximi <octo-patch@github.com>
---
 common/query_base.py     |  4 +++-
 deepdoc/vision/ocr.py    | 12 +++---------
 memory/services/query.py |  4 +++-
 rag/nlp/query.py         |  4 +++-
 4 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/common/query_base.py b/common/query_base.py
index c728304639..ef7ba23d1f 100644
--- a/common/query_base.py
+++ b/common/query_base.py
@@ -32,7 +32,9 @@ class QueryBase(ABC):
 
     @staticmethod
     def sub_special_char(line):
-        return re.sub(r"([:\{\}/\[\]\-\*\?\"\(\)\|\+~\^])", r"\\\1", line).strip()
+        # Strip single quotes first to avoid Infinity's lexer treating them as string delimiters,
+        # then escape remaining Infinity/Lucene special characters.
+        return re.sub(r"([:\{\}/\[\]\-\*\?\"\(\)\|\+~\^])", r"\\\1", line.replace("'", "")).strip()
 
     @staticmethod
     def rmWWW(txt):
diff --git a/deepdoc/vision/ocr.py b/deepdoc/vision/ocr.py
index 1f573bda59..d5e546a3c5 100644
--- a/deepdoc/vision/ocr.py
+++ b/deepdoc/vision/ocr.py
@@ -670,19 +670,13 @@ class OCR:
         if device_id is None:
             device_id = 0
 
-        time_dict = {'det': 0, 'rec': 0, 'cls': 0, 'all': 0}
-
         if img is None:
-            return None, None, time_dict
+            return None
 
-        start = time.time()
-        dt_boxes, elapse = self.text_detector[device_id](img)
-        time_dict['det'] = elapse
+        dt_boxes, _ = self.text_detector[device_id](img)
 
         if dt_boxes is None:
-            end = time.time()
-            time_dict['all'] = end - start
-            return None, None, time_dict
+            return None
 
         return zip(self.sorted_boxes(dt_boxes), [
                    ("", 0) for _ in range(len(dt_boxes))])
diff --git a/memory/services/query.py b/memory/services/query.py
index 06f253f6b5..0e97f1fc2b 100644
--- a/memory/services/query.py
+++ b/memory/services/query.py
@@ -72,7 +72,9 @@ class MsgTextQuery(QueryBase):
             syns = []
             for tk, w in tks_w[:256]:
                 syn = self.syn.lookup(tk)
-                syn = rag_tokenizer.tokenize(" ".join(syn)).split()
+                # Strip single quotes to avoid Infinity lexer TokenError
+                # (e.g. WordNet returns "cat-o'-nine-tails" for "cat")
+                syn = re.sub(r"'", "", rag_tokenizer.tokenize(" ".join(syn))).split()
                 keywords.extend(syn)
                 syn = ["\"{}\"^{:.4f}".format(s, w / 4.) for s in syn if s.strip()]
                 syns.append(" ".join(syn))
diff --git a/rag/nlp/query.py b/rag/nlp/query.py
index 2f5807147c..2d50eea343 100644
--- a/rag/nlp/query.py
+++ b/rag/nlp/query.py
@@ -65,7 +65,9 @@ class FulltextQueryer(QueryBase):
             tks_w = [(tk.strip(), w) for tk, w in tks_w if tk.strip()]
             syns = []
             for tk, w in tks_w[:256]:
-                syn = [rag_tokenizer.tokenize(s) for s in self.syn.lookup(tk)]
+                # Strip single quotes from synonym terms to avoid Infinity lexer TokenError
+                # (e.g. WordNet returns "cat-o'-nine-tails" for "cat")
+                syn = [rag_tokenizer.tokenize(s).replace("'", "") for s in self.syn.lookup(tk)]
                 keywords.extend(syn)
                 syn = ["\"{}\"^{:.4f}".format(s, w / 4.) for s in syn if s.strip()]
                 syns.append(" ".join(syn))