diff --git a/common/query_base.py b/common/query_base.py
index eae44514f1..c728304639 100644
--- a/common/query_base.py
+++ b/common/query_base.py
@@ -32,7 +32,7 @@ class QueryBase(ABC):
 
     @staticmethod
     def sub_special_char(line):
-        return re.sub(r"([:\{\}/\[\]\-\*\"\(\)\|\+~\^])", r"\\\1", line).strip()
+        return re.sub(r"([:\{\}/\[\]\-\*\?\"\(\)\|\+~\^])", r"\\\1", line).strip()
 
     @staticmethod
     def rmWWW(txt):
diff --git a/rag/nlp/query.py b/rag/nlp/query.py
index 39b6b439d0..2f5807147c 100644
--- a/rag/nlp/query.py
+++ b/rag/nlp/query.py
@@ -41,8 +41,14 @@ class FulltextQueryer(QueryBase):
     def question(self, txt, tbl="qa", min_match: float = 0.6):
         original_query = txt
         txt = self.add_space_between_eng_zh(txt)
+
+        # Strip Infinity ESCAPABLE characters from the query.
+        #
+        # Infinity's search_lexer.l defines ESCAPABLE characters [\x20()^"'~*?:\\]
+        # If these characters appear unescaped in a query, Infinity's lexer will
+        # interpret them as special tokens, causing parsing errors.
         txt = re.sub(
-            r"[ :|\r\n\t,,。??/`!!&^%%()\[\]{}<>]+",
+            r"[ :|\r\n\t,,。??/`!!&^%%()\[\]{}<>*~'\"\\]+",
             " ",
             rag_tokenizer.tradi2simp(rag_tokenizer.strQ2B(txt.lower())),
         ).strip()
diff --git a/rag/utils/infinity_conn.py b/rag/utils/infinity_conn.py
index 1976e14270..c039a7f7c1 100644
--- a/rag/utils/infinity_conn.py
+++ b/rag/utils/infinity_conn.py
@@ -243,6 +243,7 @@ class InfinityConnection(InfinityConnectionBase):
         for matchExpr in match_expressions:
             if isinstance(matchExpr, MatchTextExpr):
                 fields = ",".join(matchExpr.fields)
+                self.logger.info(f"INFINITY search match_text: {matchExpr.matching_text}")
                 builder = builder.match_text(
                     fields,
                     matchExpr.matching_text,