diff --git a/agent/templates/stock_market_research_assistant.json b/agent/templates/stock_market_research_assistant.json index 00e9cecd4a..60c56192a0 100644 --- a/agent/templates/stock_market_research_assistant.json +++ b/agent/templates/stock_market_research_assistant.json @@ -57,7 +57,7 @@ "component_name": "TavilySearch", "name": "TavilySearch", "params": { - "api_key": "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", + "api_key": "", "days": 7, "exclude_domains": [], "include_answer": false, @@ -651,7 +651,7 @@ "component_name": "TavilySearch", "name": "TavilySearch", "params": { - "api_key": "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", + "api_key": "", "days": 7, "exclude_domains": [], "include_answer": false, diff --git a/api/apps/services/models_api_service.py b/api/apps/services/models_api_service.py index ee850f6aef..19c533d872 100644 --- a/api/apps/services/models_api_service.py +++ b/api/apps/services/models_api_service.py @@ -16,6 +16,7 @@ import os import logging +from api.db.joint_services.tenant_model_service import ensure_mineru_from_env, ensure_paddleocr_from_env from common.constants import ActiveStatusEnum, LLMType from common.settings import FACTORY_LLM_INFOS from api.db.services.tenant_model_provider_service import TenantModelProviderService @@ -301,6 +302,9 @@ def list_tenant_added_models(tenant_id: str, model_type_filter: str=None): if not e: return False, "Tenant not found" + ensure_mineru_from_env(tenant_id) + ensure_paddleocr_from_env(tenant_id) + if model_type_filter: model_type_filter = model_type_filter.lower() diff --git a/api/db/joint_services/tenant_model_service.py b/api/db/joint_services/tenant_model_service.py index 797715b889..bccd7ecdbb 100644 --- a/api/db/joint_services/tenant_model_service.py +++ b/api/db/joint_services/tenant_model_service.py @@ -18,8 +18,8 @@ import os import enum import json from common import settings -from common.constants import LLMType, ActiveStatusEnum -from api.db.services.tenant_llm_service import TenantLLMService, TenantService +from common.constants import ActiveStatusEnum, LLMType, MINERU_DEFAULT_CONFIG, MINERU_ENV_KEYS, PADDLEOCR_DEFAULT_CONFIG, PADDLEOCR_ENV_KEYS +from api.db.services.tenant_llm_service import TenantService from api.db.services.tenant_model_provider_service import TenantModelProviderService from api.db.services.tenant_model_instance_service import TenantModelInstanceService from api.db.services.tenant_model_service import TenantModelService @@ -27,6 +27,106 @@ from api.db.services.tenant_model_service import TenantModelService logger = logging.getLogger(__name__) +def _decode_api_key_config(raw_api_key: str) -> tuple[str, bool | None, str | None]: + if not raw_api_key: + return raw_api_key, None, None + + try: + parsed = json.loads(raw_api_key) + except Exception: + return raw_api_key, None, None + + if not isinstance(parsed, dict): + return raw_api_key, None, None + + is_tools = bool(parsed["is_tools"]) if "is_tools" in parsed else None + if set(parsed.keys()) <= {"api_key", "is_tools"}: + return parsed.get("api_key", ""), is_tools, None + + return parsed.get("api_key", raw_api_key), is_tools, raw_api_key + + +def get_first_provider_model_name(tenant_id: str, provider_name: str, model_type: str | enum.Enum) -> str | None: + model_type_val = model_type if isinstance(model_type, str) else model_type.value + provider_obj = TenantModelProviderService.get_by_tenant_id_and_provider_name(tenant_id, provider_name) + if not provider_obj: + return None + + for instance_obj in TenantModelInstanceService.get_all_by_provider_id(provider_obj.id): + if instance_obj.status != ActiveStatusEnum.ACTIVE.value: + continue + for model_obj in TenantModelService.get_models_by_instance_id(instance_obj.id): + if model_obj.model_type == model_type_val and model_obj.status == ActiveStatusEnum.ACTIVE.value: + return f"{model_obj.model_name}@{instance_obj.instance_name}@{provider_name}" + return None + + +def _collect_env_config(env_keys: list[str], default_config: dict) -> dict | None: + config = dict(default_config) + found = False + for key in env_keys: + value = os.environ.get(key) + if value: + found = True + config[key] = value + return config if found else None + + +def _ensure_ocr_provider_from_env(tenant_id: str, provider_name: str, model_name: str, config: dict | None) -> str | None: + if not config: + return None + + provider_obj = TenantModelProviderService.get_by_tenant_id_and_provider_name(tenant_id, provider_name) + if not provider_obj: + TenantModelProviderService.insert(tenant_id=tenant_id, provider_name=provider_name) + provider_obj = TenantModelProviderService.get_by_tenant_id_and_provider_name(tenant_id, provider_name) + + api_key = json.dumps(config) + instance_obj = TenantModelInstanceService.get_by_provider_id_and_api_key(provider_obj.id, api_key) + if not instance_obj: + instance_obj = TenantModelInstanceService.create_instance( + provider_id=provider_obj.id, + instance_name=model_name, + api_key=api_key, + extra="{}", + ) + + model_obj = TenantModelService.get_by_provider_id_and_instance_id_and_model_type_and_model_name( + provider_obj.id, + instance_obj.id, + LLMType.OCR.value, + model_name, + ) + if not model_obj: + TenantModelService.insert( + model_name=model_name, + provider_id=provider_obj.id, + instance_id=instance_obj.id, + model_type=LLMType.OCR.value, + extra=json.dumps({"max_tokens": 0}), + ) + + return f"{model_name}@{instance_obj.instance_name}@{provider_name}" + + +def ensure_mineru_from_env(tenant_id: str) -> str | None: + return _ensure_ocr_provider_from_env( + tenant_id, + "MinerU", + "mineru-from-env", + _collect_env_config(MINERU_ENV_KEYS, MINERU_DEFAULT_CONFIG), + ) + + +def ensure_paddleocr_from_env(tenant_id: str) -> str | None: + return _ensure_ocr_provider_from_env( + tenant_id, + "PaddleOCR", + "paddleocr-from-env", + _collect_env_config(PADDLEOCR_ENV_KEYS, PADDLEOCR_DEFAULT_CONFIG), + ) + + def get_tenant_default_model_by_type(tenant_id: str, model_type: str|enum.Enum): exist, tenant = TenantService.get_by_id(tenant_id) if not exist: @@ -103,7 +203,7 @@ def get_model_config_from_provider_instance(tenant_id, model_type: str|enum.Enum raise LookupError(f"Instance {instance_name} not found for model {model_name}.") model_obj = TenantModelService.get_by_provider_id_and_instance_id_and_model_type_and_model_name(provider_obj.id, instance_obj.id, model_type_val, pure_model_name) - api_key, is_tool, api_key_payload = TenantLLMService._decode_api_key_config(instance_obj.api_key) + api_key, is_tool, api_key_payload = _decode_api_key_config(instance_obj.api_key) extra_fields = json.loads(instance_obj.extra) if instance_obj.extra else {} if model_obj: diff --git a/common/parser_config_utils.py b/common/parser_config_utils.py index daf91cc8e1..c73baf6381 100644 --- a/common/parser_config_utils.py +++ b/common/parser_config_utils.py @@ -24,13 +24,13 @@ def normalize_layout_recognizer(layout_recognizer_raw: Any) -> tuple[Any, str | if isinstance(layout_recognizer_raw, str): lowered = layout_recognizer_raw.lower() if lowered.endswith("@mineru"): - parser_model_name = layout_recognizer_raw.rsplit("@", 1)[0] + parser_model_name = layout_recognizer_raw layout_recognizer = "MinerU" elif lowered.endswith("@paddleocr"): - parser_model_name = layout_recognizer_raw.rsplit("@", 1)[0] + parser_model_name = layout_recognizer_raw layout_recognizer = "PaddleOCR" elif lowered.endswith("@opendataloader"): - parser_model_name = layout_recognizer_raw.rsplit("@", 1)[0] + parser_model_name = layout_recognizer_raw layout_recognizer = "OpenDataLoader" return layout_recognizer, parser_model_name diff --git a/deepdoc/parser/paddleocr_parser.py b/deepdoc/parser/paddleocr_parser.py index c3afebdff1..218f0c0186 100644 --- a/deepdoc/parser/paddleocr_parser.py +++ b/deepdoc/parser/paddleocr_parser.py @@ -223,8 +223,6 @@ class PaddleOCRParser(RAGFlowPdfParser): request_timeout: int = 600, ): """Initialize PaddleOCR parser.""" - super().__init__() - self.outlines = [] self.api_url = api_url.rstrip("/") if api_url else os.getenv("PADDLEOCR_API_URL", "") self.access_token = access_token or os.getenv("PADDLEOCR_ACCESS_TOKEN") diff --git a/rag/app/naive.py b/rag/app/naive.py index 18f790003a..ff0fa3d3f9 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -31,7 +31,13 @@ from common.token_utils import num_tokens_from_string from common.constants import LLMType, MAXIMUM_PAGE_NUMBER from api.db.services.llm_service import LLMBundle -from api.db.joint_services.tenant_model_service import get_tenant_default_model_by_type, get_model_config_from_provider_instance +from api.db.joint_services.tenant_model_service import ( + ensure_mineru_from_env, + ensure_paddleocr_from_env, + get_first_provider_model_name, + get_model_config_from_provider_instance, + get_tenant_default_model_by_type, +) from rag.utils.file_utils import extract_embed_file, extract_links_from_pdf, extract_links_from_docx, extract_html from deepdoc.parser import DocxParser, EpubParser, ExcelParser, HtmlParser, JsonParser, MarkdownElementExtractor, MarkdownParser, PdfParser, TxtParser from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_docx_wrapper_naive, vision_figure_parser_pdf_wrapper @@ -137,14 +143,7 @@ def by_mineru( if tenant_id: if not mineru_llm_name: try: - from api.db.services.tenant_llm_service import TenantLLMService - - env_name = TenantLLMService.ensure_mineru_from_env(tenant_id) - candidates = TenantLLMService.query(tenant_id=tenant_id, llm_factory="MinerU", model_type=LLMType.OCR) - if candidates: - mineru_llm_name = candidates[0].llm_name - elif env_name: - mineru_llm_name = env_name + mineru_llm_name = get_first_provider_model_name(tenant_id, "MinerU", LLMType.OCR) or ensure_mineru_from_env(tenant_id) except Exception as e: # best-effort fallback logging.warning(f"fallback to env mineru: {e}") @@ -281,14 +280,7 @@ def by_paddleocr( if tenant_id: if not paddleocr_llm_name: try: - from api.db.services.tenant_llm_service import TenantLLMService - - env_name = TenantLLMService.ensure_paddleocr_from_env(tenant_id) - candidates = TenantLLMService.query(tenant_id=tenant_id, llm_factory="PaddleOCR", model_type=LLMType.OCR) - if candidates: - paddleocr_llm_name = candidates[0].llm_name - elif env_name: - paddleocr_llm_name = env_name + paddleocr_llm_name = get_first_provider_model_name(tenant_id, "PaddleOCR", LLMType.OCR) or ensure_paddleocr_from_env(tenant_id) except Exception as e: # best-effort fallback logging.warning(f"fallback to env paddleocr: {e}") diff --git a/rag/flow/parser/parser.py b/rag/flow/parser/parser.py index 5395f1c031..b765675b96 100644 --- a/rag/flow/parser/parser.py +++ b/rag/flow/parser/parser.py @@ -27,7 +27,13 @@ from PIL import Image from api.db.services.file2document_service import File2DocumentService from api.db.services.file_service import FileService from api.db.services.llm_service import LLMBundle -from api.db.joint_services.tenant_model_service import get_tenant_default_model_by_type, get_model_config_from_provider_instance +from api.db.joint_services.tenant_model_service import ( + ensure_mineru_from_env, + ensure_paddleocr_from_env, + get_first_provider_model_name, + get_model_config_from_provider_instance, + get_tenant_default_model_by_type, +) from common import settings from common.constants import LLMType from common.misc_utils import get_uuid, thread_pool_exec @@ -336,10 +342,10 @@ class Parser(ProcessBase): if isinstance(raw_parse_method, str): lowered = raw_parse_method.lower() if lowered.endswith("@mineru"): - parser_model_name = raw_parse_method.rsplit("@", 1)[0] + parser_model_name = raw_parse_method parse_method = "MinerU" elif lowered.endswith("@paddleocr"): - parser_model_name = raw_parse_method.rsplit("@", 1)[0] + parser_model_name = raw_parse_method parse_method = "PaddleOCR" # DeepDOC returns structured page boxes directly. @@ -368,13 +374,7 @@ class Parser(ProcessBase): if not tenant_id: return None - from api.db.services.tenant_llm_service import TenantLLMService - - env_name = TenantLLMService.ensure_mineru_from_env(tenant_id) - candidates = TenantLLMService.query(tenant_id=tenant_id, llm_factory="MinerU", model_type=LLMType.OCR.value) - if candidates: - return candidates[0].llm_name - return env_name + return get_first_provider_model_name(tenant_id, "MinerU", LLMType.OCR) or ensure_mineru_from_env(tenant_id) parser_model_name = resolve_mineru_llm_name() if not parser_model_name: @@ -550,13 +550,7 @@ class Parser(ProcessBase): if not tenant_id: return None - from api.db.services.tenant_llm_service import TenantLLMService - - env_name = TenantLLMService.ensure_paddleocr_from_env(tenant_id) - candidates = TenantLLMService.query(tenant_id=tenant_id, llm_factory="PaddleOCR", model_type=LLMType.OCR.value) - if candidates: - return candidates[0].llm_name - return env_name + return get_first_provider_model_name(tenant_id, "PaddleOCR", LLMType.OCR) or ensure_paddleocr_from_env(tenant_id) parser_model_name = resolve_paddleocr_llm_name() if not parser_model_name: diff --git a/web/src/pages/user-setting/setting-model/modal/provider-modal/field-config/provider-config-map.ts b/web/src/pages/user-setting/setting-model/modal/provider-modal/field-config/provider-config-map.ts index 1085ccbb0b..5c51a82cc3 100644 --- a/web/src/pages/user-setting/setting-model/modal/provider-modal/field-config/provider-config-map.ts +++ b/web/src/pages/user-setting/setting-model/modal/provider-modal/field-config/provider-config-map.ts @@ -766,7 +766,10 @@ export const ProviderConfigMap: Record = { return { apiKey: cfg, baseUrl: values.paddleocr_api_url, - modelInfo: buildModelInfoFromValues(values), + modelInfo: buildModelInfoFromValues({ + ...values, + model_type: ['ocr'], + }), }; }, submitTransform: (values) => { @@ -782,7 +785,10 @@ export const ProviderConfigMap: Record = { llm_factory: LLMFactory.PaddleOCR, api_key: cfg, api_base: '', - model_info: buildModelInfoFromValues(values), + model_info: buildModelInfoFromValues({ + ...values, + model_type: ['ocr'], + }), }; }, },