Fix metadata parsing regression for upgraded v0.24 datasets (#14383)

### What problem does this PR solve?

This PR fixes issue #14371, where file parsing failed after upgrading from v0.24.0 to v0.25.0. In upgraded datasets the metadata config could be a JSON Schema object, but it was handled as if it were a list, which later caused `KeyError: 'properties'`.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
2026-07-01 16:25:44 +08:00 · 2026-04-27 16:18:06 +08:00
parent d88f7ac8d2
commit 4303be223f
1 changed files with 17 additions and 1 deletions
--- a/rag/svr/task_executor.py
+++ b/rag/svr/task_executor.py
@@ -427,7 +427,23 @@ async def build_chunks(task, progress_callback):
        chat_mdl = LLMBundle(task["tenant_id"], chat_model_config, lang=task["language"])

        async def gen_metadata_task(chat_mdl, d):
-            metadata_conf = list(task["parser_config"].get("metadata", [])) + list(task["parser_config"].get("built_in_metadata") or [])
+            metadata_conf = task["parser_config"].get("metadata", [])
+            built_in_metadata = list(task["parser_config"].get("built_in_metadata") or [])
+            if isinstance(metadata_conf, dict):
+                if not isinstance(metadata_conf.get("properties"), dict):
+                    metadata_conf = {"type": "object", "properties": {}}
+                if built_in_metadata:
+                    metadata_conf = {
+                        **metadata_conf,
+                        "properties": {
+                            **metadata_conf.get("properties", {}),
+                            **turn2jsonschema(built_in_metadata).get("properties", {}),
+                        },
+                    }
+            elif isinstance(metadata_conf, list):
+                metadata_conf = metadata_conf + built_in_metadata
+            else:
+                metadata_conf = built_in_metadata
            cached = get_llm_cache(chat_mdl.llm_name, d["content_with_weight"], "metadata",
                                   metadata_conf)
            if not cached: