mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-07-04 18:45:38 +08:00
51 lines
2.1 KiB
Python
51 lines
2.1 KiB
Python
|
|
#
|
||
|
|
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||
|
|
#
|
||
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
|
# you may not use this file except in compliance with the License.
|
||
|
|
# You may obtain a copy of the License at
|
||
|
|
#
|
||
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||
|
|
#
|
||
|
|
# Unless required by applicable law or agreed to in writing, software
|
||
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
|
# See the License for the specific language governing permissions and
|
||
|
|
# limitations under the License.
|
||
|
|
#
|
||
|
|
|
||
|
|
"""Shared setup for RAGFlow unit tests.
|
||
|
|
|
||
|
|
Several parsers and the chunking pipeline tokenize text with NLTK, which needs
|
||
|
|
the ``punkt_tab`` and ``wordnet`` data sets. Production provisions these via
|
||
|
|
``download_deps.py`` (into ``nltk_data``, exported as ``NLTK_DATA`` by
|
||
|
|
``docker/launch_backend_service.sh``) and ``api.validation`` at startup, but the
|
||
|
|
unit-test runner has neither. Without the data, tokenizer-backed tests such as
|
||
|
|
``test_epub_parser`` and ``test_dataflow_service`` fail with
|
||
|
|
``LookupError: Resource 'punkt_tab' not found``. Make sure the data is reachable
|
||
|
|
before any test imports a tokenizer: reuse a provisioned ``nltk_data`` directory
|
||
|
|
when present, and download only what is still missing.
|
||
|
|
"""
|
||
|
|
|
||
|
|
import os
|
||
|
|
|
||
|
|
import nltk
|
||
|
|
|
||
|
|
# Reuse data already fetched by download_deps.py (the directory the app exports
|
||
|
|
# as NLTK_DATA) so provisioned environments do not download it again.
|
||
|
|
_REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, os.pardir))
|
||
|
|
_LOCAL_NLTK_DATA = os.path.join(_REPO_ROOT, "nltk_data")
|
||
|
|
if os.path.isdir(_LOCAL_NLTK_DATA) and _LOCAL_NLTK_DATA not in nltk.data.path:
|
||
|
|
nltk.data.path.insert(0, _LOCAL_NLTK_DATA)
|
||
|
|
|
||
|
|
# (download name, resource path used by nltk.data.find)
|
||
|
|
_REQUIRED_NLTK_DATA = (
|
||
|
|
("punkt_tab", "tokenizers/punkt_tab"),
|
||
|
|
("wordnet", "corpora/wordnet"),
|
||
|
|
)
|
||
|
|
for _name, _find_path in _REQUIRED_NLTK_DATA:
|
||
|
|
try:
|
||
|
|
nltk.data.find(_find_path)
|
||
|
|
except LookupError:
|
||
|
|
nltk.download(_name, quiet=True)
|