mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 15:31:05 +08:00
fix: prevent None values in auto-metadata from causing KeyError (#15842)
## Problem
When users configure auto-metadata for a dataset, parsing crashes with:
```
KeyError: 'properties' in gen_metadata → schema["properties"]
```
## Root Cause
The Pydantic model `AutoMetadataField` defaults `enum` and `description` to `None`
when the frontend omits these fields:
```python
class AutoMetadataField(Base):
    enum: Annotated[list[str] | None, Field(default=None)]
    description: Annotated[str | None, Field(default=None)]
```
These `None` values propagate through the call chain and cause two
crashes:
This commit is contained in:
@@ -399,11 +399,11 @@ def _is_metadata_list(obj: list) -> bool:
|
||||
key = item.get("key")
|
||||
if not isinstance(key, str) or not key:
|
||||
return False
|
||||
if "enum" in item and not isinstance(item["enum"], list):
|
||||
if "enum" in item and item["enum"] is not None and not isinstance(item["enum"], list):
|
||||
return False
|
||||
if "description" in item and not isinstance(item["description"], str):
|
||||
if "description" in item and item["description"] is not None and not isinstance(item["description"], str):
|
||||
return False
|
||||
if "descriptions" in item and not isinstance(item["descriptions"], str):
|
||||
if "descriptions" in item and item["descriptions"] is not None and not isinstance(item["descriptions"], str):
|
||||
return False
|
||||
return True
|
||||
|
||||
@@ -414,12 +414,12 @@ def turn2jsonschema(obj: dict | list) -> Dict[str, Any]:
|
||||
if isinstance(obj, list) and _is_metadata_list(obj):
|
||||
normalized = []
|
||||
for item in obj:
|
||||
description = item.get("description", item.get("descriptions", ""))
|
||||
description = item.get("description") or item.get("descriptions") or ""
|
||||
normalized_item = {
|
||||
"key": item.get("key"),
|
||||
"description": description,
|
||||
}
|
||||
if "enum" in item:
|
||||
if "enum" in item and item["enum"] is not None:
|
||||
normalized_item["enum"] = item["enum"]
|
||||
normalized.append(normalized_item)
|
||||
return metadata_schema(normalized)
|
||||
|
||||
@@ -955,6 +955,11 @@ async def relevant_chunks_with_toc(query: str, toc: list[dict], chat_mdl, topn:
|
||||
|
||||
META_DATA = load_prompt("meta_data")
|
||||
async def gen_metadata(chat_mdl, schema: dict, content: str):
|
||||
if not schema:
|
||||
return ""
|
||||
if "properties" not in schema:
|
||||
logging.warning("gen_metadata: schema has no 'properties' key: %s", schema)
|
||||
return ""
|
||||
template = PROMPT_JINJA_ENV.from_string(META_DATA)
|
||||
for k, desc in schema["properties"].items():
|
||||
if "enum" in desc and not desc.get("enum"):
|
||||
|
||||
@@ -280,6 +280,9 @@ class MockChatModel:
|
||||
def __exit__(self, *args):
|
||||
pass
|
||||
|
||||
async def async_chat(self, system_prompt, messages, **kwargs):
|
||||
return '{"key": "value"}'
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_embedding_model():
|
||||
|
||||
@@ -154,8 +154,8 @@ class TestGenerateMetadata(_BasePostProcessorTest):
|
||||
ctx = make_task_context(
|
||||
parser_config={
|
||||
"enable_metadata": True,
|
||||
"metadata": [{"name": "category", "type": "string"}],
|
||||
"built_in_metadata": ["author", "date"],
|
||||
"metadata": [{"key": "category", "type": "string"}],
|
||||
"built_in_metadata": [{"key": "update_time", "type": "time"}],
|
||||
},
|
||||
)
|
||||
docs = [{"content_with_weight": "This is test content"}]
|
||||
@@ -164,12 +164,8 @@ class TestGenerateMetadata(_BasePostProcessorTest):
|
||||
p3 = patch("rag.svr.task_executor_refactor.chunk_post_processor.get_llm_cache",
|
||||
return_value=None)
|
||||
p4 = patch("rag.svr.task_executor_refactor.chunk_post_processor.set_llm_cache")
|
||||
p5 = self._patch_prompt_func(
|
||||
"rag.svr.task_executor_refactor.chunk_post_processor.gen_metadata",
|
||||
return_value={"category": "test"},
|
||||
)
|
||||
p6 = patch("rag.svr.task_executor_refactor.chunk_post_processor.DocMetadataService")
|
||||
with p1, p2, p3, p4, p5, p6 as mock_meta:
|
||||
p5 = patch("rag.svr.task_executor_refactor.chunk_post_processor.DocMetadataService")
|
||||
with p1, p2, p3, p4, p5 as mock_meta:
|
||||
mock_meta.get_document_metadata.return_value = {}
|
||||
mock_meta.update_document_metadata = MagicMock()
|
||||
await generate_metadata(docs, ctx)
|
||||
@@ -181,8 +177,8 @@ class TestGenerateMetadata(_BasePostProcessorTest):
|
||||
ctx = make_task_context(
|
||||
parser_config={
|
||||
"enable_metadata": True,
|
||||
"metadata": [{"name": "category", "type": "string"}],
|
||||
"built_in_metadata": ["author", "date"],
|
||||
"metadata": [{"key": "category", "type": "string"}],
|
||||
"built_in_metadata": [{"key": "update_time", "type": "time"}],
|
||||
},
|
||||
write_interceptor=MagicMock(),
|
||||
)
|
||||
@@ -192,17 +188,79 @@ class TestGenerateMetadata(_BasePostProcessorTest):
|
||||
p3 = patch("rag.svr.task_executor_refactor.chunk_post_processor.get_llm_cache",
|
||||
return_value=None)
|
||||
p4 = patch("rag.svr.task_executor_refactor.chunk_post_processor.set_llm_cache")
|
||||
p5 = self._patch_prompt_func(
|
||||
"rag.svr.task_executor_refactor.chunk_post_processor.gen_metadata",
|
||||
return_value={"category": "test"},
|
||||
)
|
||||
p6 = patch("rag.svr.task_executor_refactor.chunk_post_processor.DocMetadataService")
|
||||
with p1, p2, p3, p4, p5, p6:
|
||||
p5 = patch("rag.svr.task_executor_refactor.chunk_post_processor.DocMetadataService")
|
||||
with p1, p2, p3, p4, p5:
|
||||
await generate_metadata(docs, ctx)
|
||||
ctx.write_interceptor.intercept.assert_called_once_with(
|
||||
"DocMetadataService.update_document_metadata"
|
||||
)
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_generate_metadata_empty_config_does_not_crash(self):
|
||||
"""Empty parser_config — no metadata configured — should not crash."""
|
||||
ctx = make_task_context(parser_config={})
|
||||
docs = [{"content_with_weight": "test"}]
|
||||
p1, p2 = self._mock_llm_binding()
|
||||
p3 = patch("rag.svr.task_executor_refactor.chunk_post_processor.DocMetadataService")
|
||||
with p1, p2, p3:
|
||||
await generate_metadata(docs, ctx) # no exception = pass
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_generate_metadata_enum_none_accepted(self):
|
||||
"""enum: None in metadata — treated as absent, should not crash."""
|
||||
ctx = make_task_context(
|
||||
parser_config={
|
||||
"enable_metadata": True,
|
||||
"metadata": [{"key": "format", "type": "string", "enum": None}],
|
||||
},
|
||||
)
|
||||
docs = [{"content_with_weight": "test"}]
|
||||
p1, p2 = self._mock_llm_binding()
|
||||
p3 = patch("rag.svr.task_executor_refactor.chunk_post_processor.get_llm_cache",
|
||||
return_value=None)
|
||||
p4 = patch("rag.svr.task_executor_refactor.chunk_post_processor.set_llm_cache")
|
||||
p5 = patch("rag.svr.task_executor_refactor.chunk_post_processor.DocMetadataService")
|
||||
with p1, p2, p3, p4, p5:
|
||||
await generate_metadata(docs, ctx) # no exception = pass
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_generate_metadata_description_none_accepted(self):
|
||||
"""description: None in metadata — should not crash."""
|
||||
ctx = make_task_context(
|
||||
parser_config={
|
||||
"enable_metadata": True,
|
||||
"metadata": [{"key": "test", "type": "string", "description": None}],
|
||||
},
|
||||
)
|
||||
docs = [{"content_with_weight": "test"}]
|
||||
p1, p2 = self._mock_llm_binding()
|
||||
p3 = patch("rag.svr.task_executor_refactor.chunk_post_processor.get_llm_cache",
|
||||
return_value=None)
|
||||
p4 = patch("rag.svr.task_executor_refactor.chunk_post_processor.set_llm_cache")
|
||||
p5 = patch("rag.svr.task_executor_refactor.chunk_post_processor.DocMetadataService")
|
||||
with p1, p2, p3, p4, p5:
|
||||
await generate_metadata(docs, ctx) # no exception = pass
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_generate_metadata_built_in_with_enum_none(self):
|
||||
"""built_in_metadata with enum: None — should not crash."""
|
||||
ctx = make_task_context(
|
||||
parser_config={
|
||||
"enable_metadata": True,
|
||||
"built_in_metadata": [
|
||||
{"key": "update_time", "type": "time", "description": None, "enum": None},
|
||||
],
|
||||
},
|
||||
)
|
||||
docs = [{"content_with_weight": "test"}]
|
||||
p1, p2 = self._mock_llm_binding()
|
||||
p3 = patch("rag.svr.task_executor_refactor.chunk_post_processor.get_llm_cache",
|
||||
return_value=None)
|
||||
p4 = patch("rag.svr.task_executor_refactor.chunk_post_processor.set_llm_cache")
|
||||
p5 = patch("rag.svr.task_executor_refactor.chunk_post_processor.DocMetadataService")
|
||||
with p1, p2, p3, p4, p5:
|
||||
await generate_metadata(docs, ctx) # no exception = pass
|
||||
|
||||
|
||||
class TestApplyTags(_BasePostProcessorTest):
|
||||
"""Tests for apply_tags function."""
|
||||
|
||||
Reference in New Issue
Block a user