From 4303be223fba929fe2982249ce6faafd764cd1b3 Mon Sep 17 00:00:00 2001 From: Idriss Sbaaoui <112825897+6ba3i@users.noreply.github.com> Date: Mon, 27 Apr 2026 16:18:06 +0800 Subject: [PATCH] Fix metadata parsing regression for upgraded v0.24 datasets (#14383) ### What problem does this PR solve? This PR fixes issue #14371 where file parsing failed after upgrading from v0.24.0 to v0.25.0, because metadata config could be a JSON Schema object but was handled like a list and later caused `KeyError: 'properties'`. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- rag/svr/task_executor.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/rag/svr/task_executor.py b/rag/svr/task_executor.py index 94ad77a0b2..4144e9cbb8 100644 --- a/rag/svr/task_executor.py +++ b/rag/svr/task_executor.py @@ -427,7 +427,23 @@ async def build_chunks(task, progress_callback): chat_mdl = LLMBundle(task["tenant_id"], chat_model_config, lang=task["language"]) async def gen_metadata_task(chat_mdl, d): - metadata_conf = list(task["parser_config"].get("metadata", [])) + list(task["parser_config"].get("built_in_metadata") or []) + metadata_conf = task["parser_config"].get("metadata", []) + built_in_metadata = list(task["parser_config"].get("built_in_metadata") or []) + if isinstance(metadata_conf, dict): + if not isinstance(metadata_conf.get("properties"), dict): + metadata_conf = {"type": "object", "properties": {}} + if built_in_metadata: + metadata_conf = { + **metadata_conf, + "properties": { + **metadata_conf.get("properties", {}), + **turn2jsonschema(built_in_metadata).get("properties", {}), + }, + } + elif isinstance(metadata_conf, list): + metadata_conf = metadata_conf + built_in_metadata + else: + metadata_conf = built_in_metadata cached = get_llm_cache(chat_mdl.llm_name, d["content_with_weight"], "metadata", metadata_conf) if not cached: