mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 15:31:05 +08:00
Fix paddle ocr / minerU cannot add (#15858)
Fix paddle ocr / minerU cannot add
This commit is contained in:
@@ -57,7 +57,7 @@
|
||||
"component_name": "TavilySearch",
|
||||
"name": "TavilySearch",
|
||||
"params": {
|
||||
"api_key": "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",
|
||||
"api_key": "",
|
||||
"days": 7,
|
||||
"exclude_domains": [],
|
||||
"include_answer": false,
|
||||
@@ -651,7 +651,7 @@
|
||||
"component_name": "TavilySearch",
|
||||
"name": "TavilySearch",
|
||||
"params": {
|
||||
"api_key": "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",
|
||||
"api_key": "",
|
||||
"days": 7,
|
||||
"exclude_domains": [],
|
||||
"include_answer": false,
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
import os
|
||||
import logging
|
||||
|
||||
from api.db.joint_services.tenant_model_service import ensure_mineru_from_env, ensure_paddleocr_from_env
|
||||
from common.constants import ActiveStatusEnum, LLMType
|
||||
from common.settings import FACTORY_LLM_INFOS
|
||||
from api.db.services.tenant_model_provider_service import TenantModelProviderService
|
||||
@@ -301,6 +302,9 @@ def list_tenant_added_models(tenant_id: str, model_type_filter: str=None):
|
||||
if not e:
|
||||
return False, "Tenant not found"
|
||||
|
||||
ensure_mineru_from_env(tenant_id)
|
||||
ensure_paddleocr_from_env(tenant_id)
|
||||
|
||||
if model_type_filter:
|
||||
model_type_filter = model_type_filter.lower()
|
||||
|
||||
|
||||
@@ -18,8 +18,8 @@ import os
|
||||
import enum
|
||||
import json
|
||||
from common import settings
|
||||
from common.constants import LLMType, ActiveStatusEnum
|
||||
from api.db.services.tenant_llm_service import TenantLLMService, TenantService
|
||||
from common.constants import ActiveStatusEnum, LLMType, MINERU_DEFAULT_CONFIG, MINERU_ENV_KEYS, PADDLEOCR_DEFAULT_CONFIG, PADDLEOCR_ENV_KEYS
|
||||
from api.db.services.tenant_llm_service import TenantService
|
||||
from api.db.services.tenant_model_provider_service import TenantModelProviderService
|
||||
from api.db.services.tenant_model_instance_service import TenantModelInstanceService
|
||||
from api.db.services.tenant_model_service import TenantModelService
|
||||
@@ -27,6 +27,106 @@ from api.db.services.tenant_model_service import TenantModelService
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _decode_api_key_config(raw_api_key: str) -> tuple[str, bool | None, str | None]:
|
||||
if not raw_api_key:
|
||||
return raw_api_key, None, None
|
||||
|
||||
try:
|
||||
parsed = json.loads(raw_api_key)
|
||||
except Exception:
|
||||
return raw_api_key, None, None
|
||||
|
||||
if not isinstance(parsed, dict):
|
||||
return raw_api_key, None, None
|
||||
|
||||
is_tools = bool(parsed["is_tools"]) if "is_tools" in parsed else None
|
||||
if set(parsed.keys()) <= {"api_key", "is_tools"}:
|
||||
return parsed.get("api_key", ""), is_tools, None
|
||||
|
||||
return parsed.get("api_key", raw_api_key), is_tools, raw_api_key
|
||||
|
||||
|
||||
def get_first_provider_model_name(tenant_id: str, provider_name: str, model_type: str | enum.Enum) -> str | None:
    """Return the first active model of a given type under a tenant's provider.

    Args:
        tenant_id: Tenant whose provider is looked up.
        provider_name: Name of the provider to search.
        model_type: Wanted model type, as a string or an enum whose ``value``
            is compared against each model's ``model_type``.

    Returns:
        ``"<model_name>@<instance_name>@<provider_name>"`` for the first
        active model of that type found on an active instance, or ``None``
        when the provider does not exist or nothing matches.
    """
    wanted_type = model_type if isinstance(model_type, str) else model_type.value
    provider = TenantModelProviderService.get_by_tenant_id_and_provider_name(tenant_id, provider_name)
    if not provider:
        return None

    # Lazy generator: models are only fetched for active instances, and the
    # search stops at the first hit.
    qualified_names = (
        f"{model.model_name}@{instance.instance_name}@{provider_name}"
        for instance in TenantModelInstanceService.get_all_by_provider_id(provider.id)
        if instance.status == ActiveStatusEnum.ACTIVE.value
        for model in TenantModelService.get_models_by_instance_id(instance.id)
        if model.model_type == wanted_type and model.status == ActiveStatusEnum.ACTIVE.value
    )
    return next(qualified_names, None)
|
||||
|
||||
|
||||
def _collect_env_config(env_keys: list[str], default_config: dict) -> dict | None:
|
||||
config = dict(default_config)
|
||||
found = False
|
||||
for key in env_keys:
|
||||
value = os.environ.get(key)
|
||||
if value:
|
||||
found = True
|
||||
config[key] = value
|
||||
return config if found else None
|
||||
|
||||
|
||||
def _ensure_ocr_provider_from_env(tenant_id: str, provider_name: str, model_name: str, config: dict | None) -> str | None:
|
||||
if not config:
|
||||
return None
|
||||
|
||||
provider_obj = TenantModelProviderService.get_by_tenant_id_and_provider_name(tenant_id, provider_name)
|
||||
if not provider_obj:
|
||||
TenantModelProviderService.insert(tenant_id=tenant_id, provider_name=provider_name)
|
||||
provider_obj = TenantModelProviderService.get_by_tenant_id_and_provider_name(tenant_id, provider_name)
|
||||
|
||||
api_key = json.dumps(config)
|
||||
instance_obj = TenantModelInstanceService.get_by_provider_id_and_api_key(provider_obj.id, api_key)
|
||||
if not instance_obj:
|
||||
instance_obj = TenantModelInstanceService.create_instance(
|
||||
provider_id=provider_obj.id,
|
||||
instance_name=model_name,
|
||||
api_key=api_key,
|
||||
extra="{}",
|
||||
)
|
||||
|
||||
model_obj = TenantModelService.get_by_provider_id_and_instance_id_and_model_type_and_model_name(
|
||||
provider_obj.id,
|
||||
instance_obj.id,
|
||||
LLMType.OCR.value,
|
||||
model_name,
|
||||
)
|
||||
if not model_obj:
|
||||
TenantModelService.insert(
|
||||
model_name=model_name,
|
||||
provider_id=provider_obj.id,
|
||||
instance_id=instance_obj.id,
|
||||
model_type=LLMType.OCR.value,
|
||||
extra=json.dumps({"max_tokens": 0}),
|
||||
)
|
||||
|
||||
return f"{model_name}@{instance_obj.instance_name}@{provider_name}"
|
||||
|
||||
|
||||
def ensure_mineru_from_env(tenant_id: str) -> str | None:
    """Register the MinerU OCR model described by environment variables.

    Args:
        tenant_id: Tenant the provider is registered for.

    Returns:
        Whatever ``_ensure_ocr_provider_from_env`` returns for the provider
        ``"MinerU"`` and the model ``"mineru-from-env"``.
    """
    env_config = _collect_env_config(MINERU_ENV_KEYS, MINERU_DEFAULT_CONFIG)
    return _ensure_ocr_provider_from_env(tenant_id, "MinerU", "mineru-from-env", env_config)
|
||||
|
||||
|
||||
def ensure_paddleocr_from_env(tenant_id: str) -> str | None:
    """Register the PaddleOCR model described by environment variables.

    Args:
        tenant_id: Tenant the provider is registered for.

    Returns:
        Whatever ``_ensure_ocr_provider_from_env`` returns for the provider
        ``"PaddleOCR"`` and the model ``"paddleocr-from-env"``.
    """
    env_config = _collect_env_config(PADDLEOCR_ENV_KEYS, PADDLEOCR_DEFAULT_CONFIG)
    return _ensure_ocr_provider_from_env(tenant_id, "PaddleOCR", "paddleocr-from-env", env_config)
|
||||
|
||||
|
||||
def get_tenant_default_model_by_type(tenant_id: str, model_type: str|enum.Enum):
|
||||
exist, tenant = TenantService.get_by_id(tenant_id)
|
||||
if not exist:
|
||||
@@ -103,7 +203,7 @@ def get_model_config_from_provider_instance(tenant_id, model_type: str|enum.Enum
|
||||
raise LookupError(f"Instance {instance_name} not found for model {model_name}.")
|
||||
model_obj = TenantModelService.get_by_provider_id_and_instance_id_and_model_type_and_model_name(provider_obj.id, instance_obj.id, model_type_val, pure_model_name)
|
||||
|
||||
api_key, is_tool, api_key_payload = TenantLLMService._decode_api_key_config(instance_obj.api_key)
|
||||
api_key, is_tool, api_key_payload = _decode_api_key_config(instance_obj.api_key)
|
||||
extra_fields = json.loads(instance_obj.extra) if instance_obj.extra else {}
|
||||
|
||||
if model_obj:
|
||||
|
||||
@@ -24,13 +24,13 @@ def normalize_layout_recognizer(layout_recognizer_raw: Any) -> tuple[Any, str |
|
||||
if isinstance(layout_recognizer_raw, str):
|
||||
lowered = layout_recognizer_raw.lower()
|
||||
if lowered.endswith("@mineru"):
|
||||
parser_model_name = layout_recognizer_raw.rsplit("@", 1)[0]
|
||||
parser_model_name = layout_recognizer_raw
|
||||
layout_recognizer = "MinerU"
|
||||
elif lowered.endswith("@paddleocr"):
|
||||
parser_model_name = layout_recognizer_raw.rsplit("@", 1)[0]
|
||||
parser_model_name = layout_recognizer_raw
|
||||
layout_recognizer = "PaddleOCR"
|
||||
elif lowered.endswith("@opendataloader"):
|
||||
parser_model_name = layout_recognizer_raw.rsplit("@", 1)[0]
|
||||
parser_model_name = layout_recognizer_raw
|
||||
layout_recognizer = "OpenDataLoader"
|
||||
|
||||
return layout_recognizer, parser_model_name
|
||||
|
||||
@@ -223,8 +223,6 @@ class PaddleOCRParser(RAGFlowPdfParser):
|
||||
request_timeout: int = 600,
|
||||
):
|
||||
"""Initialize PaddleOCR parser."""
|
||||
super().__init__()
|
||||
|
||||
self.outlines = []
|
||||
self.api_url = api_url.rstrip("/") if api_url else os.getenv("PADDLEOCR_API_URL", "")
|
||||
self.access_token = access_token or os.getenv("PADDLEOCR_ACCESS_TOKEN")
|
||||
|
||||
@@ -31,7 +31,13 @@ from common.token_utils import num_tokens_from_string
|
||||
|
||||
from common.constants import LLMType, MAXIMUM_PAGE_NUMBER
|
||||
from api.db.services.llm_service import LLMBundle
|
||||
from api.db.joint_services.tenant_model_service import get_tenant_default_model_by_type, get_model_config_from_provider_instance
|
||||
from api.db.joint_services.tenant_model_service import (
|
||||
ensure_mineru_from_env,
|
||||
ensure_paddleocr_from_env,
|
||||
get_first_provider_model_name,
|
||||
get_model_config_from_provider_instance,
|
||||
get_tenant_default_model_by_type,
|
||||
)
|
||||
from rag.utils.file_utils import extract_embed_file, extract_links_from_pdf, extract_links_from_docx, extract_html
|
||||
from deepdoc.parser import DocxParser, EpubParser, ExcelParser, HtmlParser, JsonParser, MarkdownElementExtractor, MarkdownParser, PdfParser, TxtParser
|
||||
from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_docx_wrapper_naive, vision_figure_parser_pdf_wrapper
|
||||
@@ -137,14 +143,7 @@ def by_mineru(
|
||||
if tenant_id:
|
||||
if not mineru_llm_name:
|
||||
try:
|
||||
from api.db.services.tenant_llm_service import TenantLLMService
|
||||
|
||||
env_name = TenantLLMService.ensure_mineru_from_env(tenant_id)
|
||||
candidates = TenantLLMService.query(tenant_id=tenant_id, llm_factory="MinerU", model_type=LLMType.OCR)
|
||||
if candidates:
|
||||
mineru_llm_name = candidates[0].llm_name
|
||||
elif env_name:
|
||||
mineru_llm_name = env_name
|
||||
mineru_llm_name = get_first_provider_model_name(tenant_id, "MinerU", LLMType.OCR) or ensure_mineru_from_env(tenant_id)
|
||||
except Exception as e: # best-effort fallback
|
||||
logging.warning(f"fallback to env mineru: {e}")
|
||||
|
||||
@@ -281,14 +280,7 @@ def by_paddleocr(
|
||||
if tenant_id:
|
||||
if not paddleocr_llm_name:
|
||||
try:
|
||||
from api.db.services.tenant_llm_service import TenantLLMService
|
||||
|
||||
env_name = TenantLLMService.ensure_paddleocr_from_env(tenant_id)
|
||||
candidates = TenantLLMService.query(tenant_id=tenant_id, llm_factory="PaddleOCR", model_type=LLMType.OCR)
|
||||
if candidates:
|
||||
paddleocr_llm_name = candidates[0].llm_name
|
||||
elif env_name:
|
||||
paddleocr_llm_name = env_name
|
||||
paddleocr_llm_name = get_first_provider_model_name(tenant_id, "PaddleOCR", LLMType.OCR) or ensure_paddleocr_from_env(tenant_id)
|
||||
except Exception as e: # best-effort fallback
|
||||
logging.warning(f"fallback to env paddleocr: {e}")
|
||||
|
||||
|
||||
@@ -27,7 +27,13 @@ from PIL import Image
|
||||
from api.db.services.file2document_service import File2DocumentService
|
||||
from api.db.services.file_service import FileService
|
||||
from api.db.services.llm_service import LLMBundle
|
||||
from api.db.joint_services.tenant_model_service import get_tenant_default_model_by_type, get_model_config_from_provider_instance
|
||||
from api.db.joint_services.tenant_model_service import (
|
||||
ensure_mineru_from_env,
|
||||
ensure_paddleocr_from_env,
|
||||
get_first_provider_model_name,
|
||||
get_model_config_from_provider_instance,
|
||||
get_tenant_default_model_by_type,
|
||||
)
|
||||
from common import settings
|
||||
from common.constants import LLMType
|
||||
from common.misc_utils import get_uuid, thread_pool_exec
|
||||
@@ -336,10 +342,10 @@ class Parser(ProcessBase):
|
||||
if isinstance(raw_parse_method, str):
|
||||
lowered = raw_parse_method.lower()
|
||||
if lowered.endswith("@mineru"):
|
||||
parser_model_name = raw_parse_method.rsplit("@", 1)[0]
|
||||
parser_model_name = raw_parse_method
|
||||
parse_method = "MinerU"
|
||||
elif lowered.endswith("@paddleocr"):
|
||||
parser_model_name = raw_parse_method.rsplit("@", 1)[0]
|
||||
parser_model_name = raw_parse_method
|
||||
parse_method = "PaddleOCR"
|
||||
|
||||
# DeepDOC returns structured page boxes directly.
|
||||
@@ -368,13 +374,7 @@ class Parser(ProcessBase):
|
||||
if not tenant_id:
|
||||
return None
|
||||
|
||||
from api.db.services.tenant_llm_service import TenantLLMService
|
||||
|
||||
env_name = TenantLLMService.ensure_mineru_from_env(tenant_id)
|
||||
candidates = TenantLLMService.query(tenant_id=tenant_id, llm_factory="MinerU", model_type=LLMType.OCR.value)
|
||||
if candidates:
|
||||
return candidates[0].llm_name
|
||||
return env_name
|
||||
return get_first_provider_model_name(tenant_id, "MinerU", LLMType.OCR) or ensure_mineru_from_env(tenant_id)
|
||||
|
||||
parser_model_name = resolve_mineru_llm_name()
|
||||
if not parser_model_name:
|
||||
@@ -550,13 +550,7 @@ class Parser(ProcessBase):
|
||||
if not tenant_id:
|
||||
return None
|
||||
|
||||
from api.db.services.tenant_llm_service import TenantLLMService
|
||||
|
||||
env_name = TenantLLMService.ensure_paddleocr_from_env(tenant_id)
|
||||
candidates = TenantLLMService.query(tenant_id=tenant_id, llm_factory="PaddleOCR", model_type=LLMType.OCR.value)
|
||||
if candidates:
|
||||
return candidates[0].llm_name
|
||||
return env_name
|
||||
return get_first_provider_model_name(tenant_id, "PaddleOCR", LLMType.OCR) or ensure_paddleocr_from_env(tenant_id)
|
||||
|
||||
parser_model_name = resolve_paddleocr_llm_name()
|
||||
if not parser_model_name:
|
||||
|
||||
@@ -766,7 +766,10 @@ export const ProviderConfigMap: Record<string, ProviderConfig> = {
|
||||
return {
|
||||
apiKey: cfg,
|
||||
baseUrl: values.paddleocr_api_url,
|
||||
modelInfo: buildModelInfoFromValues(values),
|
||||
modelInfo: buildModelInfoFromValues({
|
||||
...values,
|
||||
model_type: ['ocr'],
|
||||
}),
|
||||
};
|
||||
},
|
||||
submitTransform: (values) => {
|
||||
@@ -782,7 +785,10 @@ export const ProviderConfigMap: Record<string, ProviderConfig> = {
|
||||
llm_factory: LLMFactory.PaddleOCR,
|
||||
api_key: cfg,
|
||||
api_base: '',
|
||||
model_info: buildModelInfoFromValues(values),
|
||||
model_info: buildModelInfoFromValues({
|
||||
...values,
|
||||
model_type: ['ocr'],
|
||||
}),
|
||||
};
|
||||
},
|
||||
},
|
||||
|
||||
Reference in New Issue
Block a user