Fix metadata parsing regression for upgraded v0.24 datasets (#14383)

### What problem does this PR solve?

This PR fixes issue #14371, where file parsing failed after upgrading
from v0.24.0 to v0.25.0. The metadata config could be a JSON Schema
object (a dict), but it was handled as if it were a list, which later
caused `KeyError: 'properties'`.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
Idriss Sbaaoui
2026-04-27 16:18:06 +08:00
committed by GitHub
parent d88f7ac8d2
commit 4303be223f

View File

@@ -427,7 +427,23 @@ async def build_chunks(task, progress_callback):
chat_mdl = LLMBundle(task["tenant_id"], chat_model_config, lang=task["language"])
async def gen_metadata_task(chat_mdl, d):
metadata_conf = list(task["parser_config"].get("metadata", [])) + list(task["parser_config"].get("built_in_metadata") or [])
metadata_conf = task["parser_config"].get("metadata", [])
built_in_metadata = list(task["parser_config"].get("built_in_metadata") or [])
if isinstance(metadata_conf, dict):
if not isinstance(metadata_conf.get("properties"), dict):
metadata_conf = {"type": "object", "properties": {}}
if built_in_metadata:
metadata_conf = {
**metadata_conf,
"properties": {
**metadata_conf.get("properties", {}),
**turn2jsonschema(built_in_metadata).get("properties", {}),
},
}
elif isinstance(metadata_conf, list):
metadata_conf = metadata_conf + built_in_metadata
else:
metadata_conf = built_in_metadata
cached = get_llm_cache(chat_mdl.llm_name, d["content_with_weight"], "metadata",
metadata_conf)
if not cached: