From 3eff41361b4dc6c3515230f99b2d63fd70658960 Mon Sep 17 00:00:00 2001
From: Jack <xugangqiang@hotmail.com>
Date: Tue, 9 Jun 2026 19:10:48 +0800
Subject: [PATCH] fix: prevent None values in auto-metadata from causing
 KeyError (#15842)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Problem

When users configure auto-metadata for a dataset, parsing crashes with:

```
KeyError: 'properties' in gen_metadata → schema["properties"]
```

## Root Cause

The Pydantic model `AutoMetadataField` defaults `enum` and `description`
to `None` when the frontend omits these fields:

```python
class AutoMetadataField(Base):
    enum: Annotated[list[str] | None, Field(default=None)]
    description: Annotated[str | None, Field(default=None)]
```

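These `None` values propagate through the call chain and cause two
crashes. This patch guards against them in two places:

- `common/metadata_utils.py`: `_is_metadata_list` and `turn2jsonschema`
  now treat a `None` value for `enum`, `description`, or `descriptions`
  as if the field were absent.
- `rag/prompts/generator.py`: `gen_metadata` returns early when the
  schema is empty or has no `properties` key instead of raising
  `KeyError`.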
---
 common/metadata_utils.py                      | 10 +--
 rag/prompts/generator.py                      |  5 ++
 .../svr/task_executor_refactor/conftest.py    |  3 +
 .../test_chunk_post_processor.py              | 90 +++++++++++++++----
 4 files changed, 87 insertions(+), 21 deletions(-)

diff --git a/common/metadata_utils.py b/common/metadata_utils.py
index 591acb8053..a6c6d273dc 100644
--- a/common/metadata_utils.py
+++ b/common/metadata_utils.py
@@ -399,11 +399,11 @@ def _is_metadata_list(obj: list) -> bool:
         key = item.get("key")
         if not isinstance(key, str) or not key:
             return False
-        if "enum" in item and not isinstance(item["enum"], list):
+        if "enum" in item and item["enum"] is not None and not isinstance(item["enum"], list):
             return False
-        if "description" in item and not isinstance(item["description"], str):
+        if "description" in item and item["description"] is not None and not isinstance(item["description"], str):
             return False
-        if "descriptions" in item and not isinstance(item["descriptions"], str):
+        if "descriptions" in item and item["descriptions"] is not None and not isinstance(item["descriptions"], str):
             return False
     return True
 
@@ -414,12 +414,12 @@ def turn2jsonschema(obj: dict | list) -> Dict[str, Any]:
     if isinstance(obj, list) and _is_metadata_list(obj):
         normalized = []
         for item in obj:
-            description = item.get("description", item.get("descriptions", ""))
+            description = item.get("description") or item.get("descriptions") or ""
             normalized_item = {
                 "key": item.get("key"),
                 "description": description,
             }
-            if "enum" in item:
+            if "enum" in item and item["enum"] is not None:
                 normalized_item["enum"] = item["enum"]
             normalized.append(normalized_item)
         return metadata_schema(normalized)
diff --git a/rag/prompts/generator.py b/rag/prompts/generator.py
index edba89c808..fd58ddff7a 100644
--- a/rag/prompts/generator.py
+++ b/rag/prompts/generator.py
@@ -955,6 +955,11 @@ async def relevant_chunks_with_toc(query: str, toc: list[dict], chat_mdl, topn:
 
 META_DATA = load_prompt("meta_data")
 async def gen_metadata(chat_mdl, schema: dict, content: str):
+    if not schema:
+        return ""
+    if "properties" not in schema:
+        logging.warning("gen_metadata: schema has no 'properties' key: %s", schema)
+        return ""
     template = PROMPT_JINJA_ENV.from_string(META_DATA)
     for k, desc in schema["properties"].items():
         if "enum" in desc and not desc.get("enum"):
diff --git a/test/unit_test/rag/svr/task_executor_refactor/conftest.py b/test/unit_test/rag/svr/task_executor_refactor/conftest.py
index 8f57701d04..84d06cc959 100644
--- a/test/unit_test/rag/svr/task_executor_refactor/conftest.py
+++ b/test/unit_test/rag/svr/task_executor_refactor/conftest.py
@@ -280,6 +280,9 @@ class MockChatModel:
     def __exit__(self, *args):
         pass
 
+    async def async_chat(self, system_prompt, messages, **kwargs):
+        return '{"key": "value"}'
+
 
 @pytest.fixture
 def mock_embedding_model():
diff --git a/test/unit_test/rag/svr/task_executor_refactor/test_chunk_post_processor.py b/test/unit_test/rag/svr/task_executor_refactor/test_chunk_post_processor.py
index a5e5427d86..c1394c4fae 100644
--- a/test/unit_test/rag/svr/task_executor_refactor/test_chunk_post_processor.py
+++ b/test/unit_test/rag/svr/task_executor_refactor/test_chunk_post_processor.py
@@ -154,8 +154,8 @@ class TestGenerateMetadata(_BasePostProcessorTest):
         ctx = make_task_context(
             parser_config={
                 "enable_metadata": True,
-                "metadata": [{"name": "category", "type": "string"}],
-                "built_in_metadata": ["author", "date"],
+                "metadata": [{"key": "category", "type": "string"}],
+                "built_in_metadata": [{"key": "update_time", "type": "time"}],
             },
         )
         docs = [{"content_with_weight": "This is test content"}]
@@ -164,12 +164,8 @@ class TestGenerateMetadata(_BasePostProcessorTest):
         p3 = patch("rag.svr.task_executor_refactor.chunk_post_processor.get_llm_cache",
                    return_value=None)
         p4 = patch("rag.svr.task_executor_refactor.chunk_post_processor.set_llm_cache")
-        p5 = self._patch_prompt_func(
-            "rag.svr.task_executor_refactor.chunk_post_processor.gen_metadata",
-            return_value={"category": "test"},
-        )
-        p6 = patch("rag.svr.task_executor_refactor.chunk_post_processor.DocMetadataService")
-        with p1, p2, p3, p4, p5, p6 as mock_meta:
+        p5 = patch("rag.svr.task_executor_refactor.chunk_post_processor.DocMetadataService")
+        with p1, p2, p3, p4, p5 as mock_meta:
             mock_meta.get_document_metadata.return_value = {}
             mock_meta.update_document_metadata = MagicMock()
             await generate_metadata(docs, ctx)
@@ -181,8 +177,8 @@ class TestGenerateMetadata(_BasePostProcessorTest):
         ctx = make_task_context(
             parser_config={
                 "enable_metadata": True,
-                "metadata": [{"name": "category", "type": "string"}],
-                "built_in_metadata": ["author", "date"],
+                "metadata": [{"key": "category", "type": "string"}],
+                "built_in_metadata": [{"key": "update_time", "type": "time"}],
             },
             write_interceptor=MagicMock(),
         )
@@ -192,17 +188,79 @@ class TestGenerateMetadata(_BasePostProcessorTest):
         p3 = patch("rag.svr.task_executor_refactor.chunk_post_processor.get_llm_cache",
                    return_value=None)
         p4 = patch("rag.svr.task_executor_refactor.chunk_post_processor.set_llm_cache")
-        p5 = self._patch_prompt_func(
-            "rag.svr.task_executor_refactor.chunk_post_processor.gen_metadata",
-            return_value={"category": "test"},
-        )
-        p6 = patch("rag.svr.task_executor_refactor.chunk_post_processor.DocMetadataService")
-        with p1, p2, p3, p4, p5, p6:
+        p5 = patch("rag.svr.task_executor_refactor.chunk_post_processor.DocMetadataService")
+        with p1, p2, p3, p4, p5:
             await generate_metadata(docs, ctx)
             ctx.write_interceptor.intercept.assert_called_once_with(
                 "DocMetadataService.update_document_metadata"
             )
 
+    @pytest.mark.asyncio
+    async def test_generate_metadata_empty_config_does_not_crash(self):
+        """Empty parser_config — no metadata configured — should not crash."""
+        ctx = make_task_context(parser_config={})
+        docs = [{"content_with_weight": "test"}]
+        p1, p2 = self._mock_llm_binding()
+        p3 = patch("rag.svr.task_executor_refactor.chunk_post_processor.DocMetadataService")
+        with p1, p2, p3:
+            await generate_metadata(docs, ctx)  # no exception = pass
+
+    @pytest.mark.asyncio
+    async def test_generate_metadata_enum_none_accepted(self):
+        """enum: None in metadata — treated as absent, should not crash."""
+        ctx = make_task_context(
+            parser_config={
+                "enable_metadata": True,
+                "metadata": [{"key": "format", "type": "string", "enum": None}],
+            },
+        )
+        docs = [{"content_with_weight": "test"}]
+        p1, p2 = self._mock_llm_binding()
+        p3 = patch("rag.svr.task_executor_refactor.chunk_post_processor.get_llm_cache",
+                   return_value=None)
+        p4 = patch("rag.svr.task_executor_refactor.chunk_post_processor.set_llm_cache")
+        p5 = patch("rag.svr.task_executor_refactor.chunk_post_processor.DocMetadataService")
+        with p1, p2, p3, p4, p5:
+            await generate_metadata(docs, ctx)  # no exception = pass
+
+    @pytest.mark.asyncio
+    async def test_generate_metadata_description_none_accepted(self):
+        """description: None in metadata — should not crash."""
+        ctx = make_task_context(
+            parser_config={
+                "enable_metadata": True,
+                "metadata": [{"key": "test", "type": "string", "description": None}],
+            },
+        )
+        docs = [{"content_with_weight": "test"}]
+        p1, p2 = self._mock_llm_binding()
+        p3 = patch("rag.svr.task_executor_refactor.chunk_post_processor.get_llm_cache",
+                   return_value=None)
+        p4 = patch("rag.svr.task_executor_refactor.chunk_post_processor.set_llm_cache")
+        p5 = patch("rag.svr.task_executor_refactor.chunk_post_processor.DocMetadataService")
+        with p1, p2, p3, p4, p5:
+            await generate_metadata(docs, ctx)  # no exception = pass
+
+    @pytest.mark.asyncio
+    async def test_generate_metadata_built_in_with_enum_none(self):
+        """built_in_metadata with enum: None — should not crash."""
+        ctx = make_task_context(
+            parser_config={
+                "enable_metadata": True,
+                "built_in_metadata": [
+                    {"key": "update_time", "type": "time", "description": None, "enum": None},
+                ],
+            },
+        )
+        docs = [{"content_with_weight": "test"}]
+        p1, p2 = self._mock_llm_binding()
+        p3 = patch("rag.svr.task_executor_refactor.chunk_post_processor.get_llm_cache",
+                   return_value=None)
+        p4 = patch("rag.svr.task_executor_refactor.chunk_post_processor.set_llm_cache")
+        p5 = patch("rag.svr.task_executor_refactor.chunk_post_processor.DocMetadataService")
+        with p1, p2, p3, p4, p5:
+            await generate_metadata(docs, ctx)  # no exception = pass
+
 
 class TestApplyTags(_BasePostProcessorTest):
     """Tests for apply_tags function."""