diff --git a/common/query_base.py b/common/query_base.py index c728304639..ef7ba23d1f 100644 --- a/common/query_base.py +++ b/common/query_base.py @@ -32,7 +32,9 @@ class QueryBase(ABC): @staticmethod def sub_special_char(line): - return re.sub(r"([:\{\}/\[\]\-\*\?\"\(\)\|\+~\^])", r"\\\1", line).strip() + # Strip single quotes first to avoid Infinity's lexer treating them as string delimiters, + # then escape remaining Infinity/Lucene special characters. + return re.sub(r"([:\{\}/\[\]\-\*\?\"\(\)\|\+~\^])", r"\\\1", line.replace("'", "")).strip() @staticmethod def rmWWW(txt): diff --git a/deepdoc/vision/ocr.py b/deepdoc/vision/ocr.py index 1f573bda59..d5e546a3c5 100644 --- a/deepdoc/vision/ocr.py +++ b/deepdoc/vision/ocr.py @@ -670,19 +670,13 @@ class OCR: if device_id is None: device_id = 0 - time_dict = {'det': 0, 'rec': 0, 'cls': 0, 'all': 0} - if img is None: - return None, None, time_dict + return None - start = time.time() - dt_boxes, elapse = self.text_detector[device_id](img) - time_dict['det'] = elapse + dt_boxes, _ = self.text_detector[device_id](img) if dt_boxes is None: - end = time.time() - time_dict['all'] = end - start - return None, None, time_dict + return None return zip(self.sorted_boxes(dt_boxes), [ ("", 0) for _ in range(len(dt_boxes))]) diff --git a/memory/services/query.py b/memory/services/query.py index 06f253f6b5..0e97f1fc2b 100644 --- a/memory/services/query.py +++ b/memory/services/query.py @@ -72,7 +72,9 @@ class MsgTextQuery(QueryBase): syns = [] for tk, w in tks_w[:256]: syn = self.syn.lookup(tk) - syn = rag_tokenizer.tokenize(" ".join(syn)).split() + # Strip single quotes to avoid Infinity lexer TokenError + # (e.g. WordNet returns "cat-o'-nine-tails" for "cat") + syn = re.sub(r"'", "", rag_tokenizer.tokenize(" ".join(syn))).split() keywords.extend(syn) syn = ["\"{}\"^{:.4f}".format(s, w / 4.) for s in syn if s.strip()] syns.append(" ".join(syn)) diff --git a/rag/nlp/query.py b/rag/nlp/query.py index 2f5807147c..2d50eea343 100644 --- a/rag/nlp/query.py +++ b/rag/nlp/query.py @@ -65,7 +65,9 @@ class FulltextQueryer(QueryBase): tks_w = [(tk.strip(), w) for tk, w in tks_w if tk.strip()] syns = [] for tk, w in tks_w[:256]: - syn = [rag_tokenizer.tokenize(s) for s in self.syn.lookup(tk)] + # Strip single quotes from synonym terms to avoid Infinity lexer TokenError + # (e.g. WordNet returns "cat-o'-nine-tails" for "cat") + syn = [rag_tokenizer.tokenize(s).replace("'", "") for s in self.syn.lookup(tk)] keywords.extend(syn) syn = ["\"{}\"^{:.4f}".format(s, w / 4.) for s in syn if s.strip()] syns.append(" ".join(syn))