fix: prevent None values in auto-metadata from causing KeyError (#15842)

## Problem

When users configure auto-metadata for a dataset, parsing crashes with:

```
KeyError: 'properties' in gen_metadata → schema["properties"]
```

## Root Cause

The Pydantic model `AutoMetadataField` defaults `enum` and `description`
to `None` when the frontend omits these fields:

```python
class AutoMetadataField(Base):
    enum: Annotated[list[str] | None, Field(default=None)]
    description: Annotated[str | None, Field(default=None)]
```

These `None` values propagate through the call chain and cause two
crashes:
This commit is contained in:
Jack
2026-06-09 19:10:48 +08:00
committed by GitHub
parent 2773208159
commit 3eff41361b
4 changed files with 87 additions and 21 deletions

View File

@@ -399,11 +399,11 @@ def _is_metadata_list(obj: list) -> bool:
key = item.get("key")
if not isinstance(key, str) or not key:
return False
if "enum" in item and not isinstance(item["enum"], list):
if "enum" in item and item["enum"] is not None and not isinstance(item["enum"], list):
return False
if "description" in item and not isinstance(item["description"], str):
if "description" in item and item["description"] is not None and not isinstance(item["description"], str):
return False
if "descriptions" in item and not isinstance(item["descriptions"], str):
if "descriptions" in item and item["descriptions"] is not None and not isinstance(item["descriptions"], str):
return False
return True
@@ -414,12 +414,12 @@ def turn2jsonschema(obj: dict | list) -> Dict[str, Any]:
if isinstance(obj, list) and _is_metadata_list(obj):
normalized = []
for item in obj:
description = item.get("description", item.get("descriptions", ""))
description = item.get("description") or item.get("descriptions") or ""
normalized_item = {
"key": item.get("key"),
"description": description,
}
if "enum" in item:
if "enum" in item and item["enum"] is not None:
normalized_item["enum"] = item["enum"]
normalized.append(normalized_item)
return metadata_schema(normalized)

View File

@@ -955,6 +955,11 @@ async def relevant_chunks_with_toc(query: str, toc: list[dict], chat_mdl, topn:
META_DATA = load_prompt("meta_data")
async def gen_metadata(chat_mdl, schema: dict, content: str):
if not schema:
return ""
if "properties" not in schema:
logging.warning("gen_metadata: schema has no 'properties' key: %s", schema)
return ""
template = PROMPT_JINJA_ENV.from_string(META_DATA)
for k, desc in schema["properties"].items():
if "enum" in desc and not desc.get("enum"):

View File

@@ -280,6 +280,9 @@ class MockChatModel:
def __exit__(self, *args):
pass
async def async_chat(self, system_prompt, messages, **kwargs):
return '{"key": "value"}'
@pytest.fixture
def mock_embedding_model():

View File

@@ -154,8 +154,8 @@ class TestGenerateMetadata(_BasePostProcessorTest):
ctx = make_task_context(
parser_config={
"enable_metadata": True,
"metadata": [{"name": "category", "type": "string"}],
"built_in_metadata": ["author", "date"],
"metadata": [{"key": "category", "type": "string"}],
"built_in_metadata": [{"key": "update_time", "type": "time"}],
},
)
docs = [{"content_with_weight": "This is test content"}]
@@ -164,12 +164,8 @@ class TestGenerateMetadata(_BasePostProcessorTest):
p3 = patch("rag.svr.task_executor_refactor.chunk_post_processor.get_llm_cache",
return_value=None)
p4 = patch("rag.svr.task_executor_refactor.chunk_post_processor.set_llm_cache")
p5 = self._patch_prompt_func(
"rag.svr.task_executor_refactor.chunk_post_processor.gen_metadata",
return_value={"category": "test"},
)
p6 = patch("rag.svr.task_executor_refactor.chunk_post_processor.DocMetadataService")
with p1, p2, p3, p4, p5, p6 as mock_meta:
p5 = patch("rag.svr.task_executor_refactor.chunk_post_processor.DocMetadataService")
with p1, p2, p3, p4, p5 as mock_meta:
mock_meta.get_document_metadata.return_value = {}
mock_meta.update_document_metadata = MagicMock()
await generate_metadata(docs, ctx)
@@ -181,8 +177,8 @@ class TestGenerateMetadata(_BasePostProcessorTest):
ctx = make_task_context(
parser_config={
"enable_metadata": True,
"metadata": [{"name": "category", "type": "string"}],
"built_in_metadata": ["author", "date"],
"metadata": [{"key": "category", "type": "string"}],
"built_in_metadata": [{"key": "update_time", "type": "time"}],
},
write_interceptor=MagicMock(),
)
@@ -192,17 +188,79 @@ class TestGenerateMetadata(_BasePostProcessorTest):
p3 = patch("rag.svr.task_executor_refactor.chunk_post_processor.get_llm_cache",
return_value=None)
p4 = patch("rag.svr.task_executor_refactor.chunk_post_processor.set_llm_cache")
p5 = self._patch_prompt_func(
"rag.svr.task_executor_refactor.chunk_post_processor.gen_metadata",
return_value={"category": "test"},
)
p6 = patch("rag.svr.task_executor_refactor.chunk_post_processor.DocMetadataService")
with p1, p2, p3, p4, p5, p6:
p5 = patch("rag.svr.task_executor_refactor.chunk_post_processor.DocMetadataService")
with p1, p2, p3, p4, p5:
await generate_metadata(docs, ctx)
ctx.write_interceptor.intercept.assert_called_once_with(
"DocMetadataService.update_document_metadata"
)
@pytest.mark.asyncio
async def test_generate_metadata_empty_config_does_not_crash(self):
"""Empty parser_config — no metadata configured — should not crash."""
ctx = make_task_context(parser_config={})
docs = [{"content_with_weight": "test"}]
p1, p2 = self._mock_llm_binding()
p3 = patch("rag.svr.task_executor_refactor.chunk_post_processor.DocMetadataService")
with p1, p2, p3:
await generate_metadata(docs, ctx) # no exception = pass
@pytest.mark.asyncio
async def test_generate_metadata_enum_none_accepted(self):
"""enum: None in metadata — treated as absent, should not crash."""
ctx = make_task_context(
parser_config={
"enable_metadata": True,
"metadata": [{"key": "format", "type": "string", "enum": None}],
},
)
docs = [{"content_with_weight": "test"}]
p1, p2 = self._mock_llm_binding()
p3 = patch("rag.svr.task_executor_refactor.chunk_post_processor.get_llm_cache",
return_value=None)
p4 = patch("rag.svr.task_executor_refactor.chunk_post_processor.set_llm_cache")
p5 = patch("rag.svr.task_executor_refactor.chunk_post_processor.DocMetadataService")
with p1, p2, p3, p4, p5:
await generate_metadata(docs, ctx) # no exception = pass
@pytest.mark.asyncio
async def test_generate_metadata_description_none_accepted(self):
"""description: None in metadata — should not crash."""
ctx = make_task_context(
parser_config={
"enable_metadata": True,
"metadata": [{"key": "test", "type": "string", "description": None}],
},
)
docs = [{"content_with_weight": "test"}]
p1, p2 = self._mock_llm_binding()
p3 = patch("rag.svr.task_executor_refactor.chunk_post_processor.get_llm_cache",
return_value=None)
p4 = patch("rag.svr.task_executor_refactor.chunk_post_processor.set_llm_cache")
p5 = patch("rag.svr.task_executor_refactor.chunk_post_processor.DocMetadataService")
with p1, p2, p3, p4, p5:
await generate_metadata(docs, ctx) # no exception = pass
@pytest.mark.asyncio
async def test_generate_metadata_built_in_with_enum_none(self):
"""built_in_metadata with enum: None — should not crash."""
ctx = make_task_context(
parser_config={
"enable_metadata": True,
"built_in_metadata": [
{"key": "update_time", "type": "time", "description": None, "enum": None},
],
},
)
docs = [{"content_with_weight": "test"}]
p1, p2 = self._mock_llm_binding()
p3 = patch("rag.svr.task_executor_refactor.chunk_post_processor.get_llm_cache",
return_value=None)
p4 = patch("rag.svr.task_executor_refactor.chunk_post_processor.set_llm_cache")
p5 = patch("rag.svr.task_executor_refactor.chunk_post_processor.DocMetadataService")
with p1, p2, p3, p4, p5:
await generate_metadata(docs, ctx) # no exception = pass
class TestApplyTags(_BasePostProcessorTest):
"""Tests for apply_tags function."""