mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 23:41:12 +08:00
### What problem does this PR solve? Add validation logic for parser_config and refactor the processing flow. Before this change, validation logic and update logic were interleaved: some validation logic ran, followed by some update logic, and then another such "validate-and-then-update" step, which is not good. After this change, all validation logic runs first, and update logic is executed only after ALL validation logic has executed. Parameters that come from the front end are validated using Pydantic. Validation logic that depends on data from the DB lives in separate methods. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) - [x] Refactoring
111 lines
3.3 KiB
Python
111 lines
3.3 KiB
Python
#
|
|
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
|
|
import logging
|
|
import json
|
|
import os
|
|
import time
|
|
import re
|
|
from nltk.corpus import wordnet
|
|
from common.file_utils import get_project_base_directory
|
|
|
|
|
|
# Force NLTK to load the WordNet corpus synchronously, once, at import time so
# that concurrent tasks cannot trigger the lazy-loading race condition later.
# Loading is best-effort: a failure is logged (with its traceback, so the
# actual cause is visible) and the module still imports.
try:
    wordnet.ensure_loaded()
except Exception:
    logging.warning("Fail to load wordnet.ensure_loaded()", exc_info=True)
|
|
|
|
class Dealer:
    """Synonym lookup over a JSON dictionary, an optional Redis override and WordNet.

    The dictionary is first read from ``rag/res/synonym.json`` under the
    project base directory. When a ``redis`` client is supplied, the value
    stored under the ``"kevin_synonyms"`` key periodically replaces it.
    Dictionary keys are always stored lowercased.
    """

    def __init__(self, redis=None):
        """Load the bundled synonym file and, if possible, the Redis override.

        Args:
            redis: Optional client object exposing ``get(key)``. When falsy,
                only the bundled dictionary (and WordNet) are used.
        """
        # Start above the reload threshold and with a stale timestamp so the
        # very first ``load()`` call is allowed to query Redis.
        self.lookup_num = 100000000
        self.load_tm = time.time() - 1000000
        self.dictionary = None
        path = os.path.join(get_project_base_directory(), "rag/res", "synonym.json")
        try:
            # JSON text is UTF-8; do not depend on the platform default encoding.
            with open(path, "r", encoding="utf-8") as f:
                self.dictionary = self._normalize_keys(json.load(f))
        except Exception:
            # Best-effort: a missing or unreadable file yields an empty dictionary.
            logging.warning("Missing synonym.json")
            self.dictionary = {}

        if not redis:
            logging.warning(
                "Realtime synonym is disabled, since no redis connection.")
        if not self.dictionary:
            logging.warning("Fail to load synonym")

        self.redis = redis
        self.load()

    @staticmethod
    def _normalize_keys(mapping):
        """Return a copy of ``mapping`` whose string keys are lowercased."""
        return {
            (k.lower() if isinstance(k, str) else k): v
            for k, v in mapping.items()
        }

    def load(self):
        """Refresh the dictionary from Redis when a refresh is due.

        A refresh is attempted only when a Redis client is configured, at
        least 100 lookups have happened since the previous attempt, and at
        least 3600 seconds have elapsed since it. A payload that is empty,
        is not valid JSON, or is not a JSON object leaves the current
        dictionary untouched.
        """
        if not self.redis:
            return

        if self.lookup_num < 100:
            return
        if time.time() - self.load_tm < 3600:
            return

        # Reset the throttling state before fetching so that a failed fetch
        # is not retried on every subsequent lookup.
        self.load_tm = time.time()
        self.lookup_num = 0
        d = self.redis.get("kevin_synonyms")
        if not d:
            return
        try:
            loaded = json.loads(d)
            if not isinstance(loaded, dict):
                raise TypeError(
                    "expected a JSON object, got " + type(loaded).__name__)
            # Keep keys lowercased, matching the dictionary built in __init__.
            self.dictionary = self._normalize_keys(loaded)
        except Exception as e:
            logging.error("Fail to load synonym!" + str(e))

    def lookup(self, tk, topn=8):
        """Return up to ``topn`` synonyms for the token ``tk``.

        Args:
            tk: Token to look up. Anything that is not a non-empty ``str``
                yields an empty list.
            topn: Maximum number of synonyms to return.

        Returns:
            A list of synonyms: the custom dictionary entry when one exists,
            otherwise WordNet lemma names for tokens consisting solely of
            lowercase ASCII letters, otherwise an empty list.
        """
        if not tk or not isinstance(tk, str):
            return []

        # 1) Check the custom dictionary first. Keys are stored lowercased, so
        #    the token is lowercased (and its whitespace collapsed) to match.
        self.lookup_num += 1
        self.load()
        key = re.sub(r"[ \t]+", " ", tk.strip()).lower()
        res = self.dictionary.get(key, [])
        if isinstance(res, str):
            res = [res]
        if res:  # Found in dictionary -> return directly
            return res[:topn]

        # 2) If not found and tk is purely lowercase alphabetical -> WordNet.
        if re.fullmatch(r"[a-z]+", tk):
            names = (
                syn.name().split(".")[0].replace("_", " ")
                for syn in wordnet.synsets(tk)
            )
            # De-duplicate while keeping the order WordNet returned, dropping
            # empty names and the token itself, so that the ``topn`` cut is
            # deterministic rather than dependent on set iteration order.
            unique = dict.fromkeys(n for n in names if n and n != tk)
            return list(unique)[:topn]

        # 3) Nothing found in either source
        return []
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke check: build a Dealer without Redis and dump the synonym
    # dictionary it ended up with.
    print(Dealer().dictionary)
|