mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 15:31:05 +08:00
fix: prevent None values in auto-metadata from causing KeyError (#15842)
## Problem
When users configure auto-metadata for a dataset, parsing crashes with:
```
KeyError: 'properties' in gen_metadata → schema["properties"]
```
## Root Cause
The Pydantic model `AutoMetadataField` defaults `enum` and `description` to
`None` when the frontend omits these fields:
```python
class AutoMetadataField(Base):
    enum: Annotated[list[str] | None, Field(default=None)]
    description: Annotated[str | None, Field(default=None)]
```
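These `None` values propagate through the call chain and cause two
crashes (the list is truncated in this view; the diff below shows the
affected code in `_is_metadata_list`, `turn2jsonschema`, and `gen_metadata`):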
This commit is contained in:
@@ -399,11 +399,11 @@ def _is_metadata_list(obj: list) -> bool:
|
|||||||
key = item.get("key")
|
key = item.get("key")
|
||||||
if not isinstance(key, str) or not key:
|
if not isinstance(key, str) or not key:
|
||||||
return False
|
return False
|
||||||
if "enum" in item and not isinstance(item["enum"], list):
|
if "enum" in item and item["enum"] is not None and not isinstance(item["enum"], list):
|
||||||
return False
|
return False
|
||||||
if "description" in item and not isinstance(item["description"], str):
|
if "description" in item and item["description"] is not None and not isinstance(item["description"], str):
|
||||||
return False
|
return False
|
||||||
if "descriptions" in item and not isinstance(item["descriptions"], str):
|
if "descriptions" in item and item["descriptions"] is not None and not isinstance(item["descriptions"], str):
|
||||||
return False
|
return False
|
||||||
return True
|
return True
|
||||||
|
|
||||||
@@ -414,12 +414,12 @@ def turn2jsonschema(obj: dict | list) -> Dict[str, Any]:
|
|||||||
if isinstance(obj, list) and _is_metadata_list(obj):
|
if isinstance(obj, list) and _is_metadata_list(obj):
|
||||||
normalized = []
|
normalized = []
|
||||||
for item in obj:
|
for item in obj:
|
||||||
description = item.get("description", item.get("descriptions", ""))
|
description = item.get("description") or item.get("descriptions") or ""
|
||||||
normalized_item = {
|
normalized_item = {
|
||||||
"key": item.get("key"),
|
"key": item.get("key"),
|
||||||
"description": description,
|
"description": description,
|
||||||
}
|
}
|
||||||
if "enum" in item:
|
if "enum" in item and item["enum"] is not None:
|
||||||
normalized_item["enum"] = item["enum"]
|
normalized_item["enum"] = item["enum"]
|
||||||
normalized.append(normalized_item)
|
normalized.append(normalized_item)
|
||||||
return metadata_schema(normalized)
|
return metadata_schema(normalized)
|
||||||
|
|||||||
@@ -955,6 +955,11 @@ async def relevant_chunks_with_toc(query: str, toc: list[dict], chat_mdl, topn:
|
|||||||
|
|
||||||
META_DATA = load_prompt("meta_data")
|
META_DATA = load_prompt("meta_data")
|
||||||
async def gen_metadata(chat_mdl, schema: dict, content: str):
|
async def gen_metadata(chat_mdl, schema: dict, content: str):
|
||||||
|
if not schema:
|
||||||
|
return ""
|
||||||
|
if "properties" not in schema:
|
||||||
|
logging.warning("gen_metadata: schema has no 'properties' key: %s", schema)
|
||||||
|
return ""
|
||||||
template = PROMPT_JINJA_ENV.from_string(META_DATA)
|
template = PROMPT_JINJA_ENV.from_string(META_DATA)
|
||||||
for k, desc in schema["properties"].items():
|
for k, desc in schema["properties"].items():
|
||||||
if "enum" in desc and not desc.get("enum"):
|
if "enum" in desc and not desc.get("enum"):
|
||||||
|
|||||||
@@ -280,6 +280,9 @@ class MockChatModel:
|
|||||||
def __exit__(self, *args):
|
def __exit__(self, *args):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
async def async_chat(self, system_prompt, messages, **kwargs):
|
||||||
|
return '{"key": "value"}'
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def mock_embedding_model():
|
def mock_embedding_model():
|
||||||
|
|||||||
@@ -154,8 +154,8 @@ class TestGenerateMetadata(_BasePostProcessorTest):
|
|||||||
ctx = make_task_context(
|
ctx = make_task_context(
|
||||||
parser_config={
|
parser_config={
|
||||||
"enable_metadata": True,
|
"enable_metadata": True,
|
||||||
"metadata": [{"name": "category", "type": "string"}],
|
"metadata": [{"key": "category", "type": "string"}],
|
||||||
"built_in_metadata": ["author", "date"],
|
"built_in_metadata": [{"key": "update_time", "type": "time"}],
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
docs = [{"content_with_weight": "This is test content"}]
|
docs = [{"content_with_weight": "This is test content"}]
|
||||||
@@ -164,12 +164,8 @@ class TestGenerateMetadata(_BasePostProcessorTest):
|
|||||||
p3 = patch("rag.svr.task_executor_refactor.chunk_post_processor.get_llm_cache",
|
p3 = patch("rag.svr.task_executor_refactor.chunk_post_processor.get_llm_cache",
|
||||||
return_value=None)
|
return_value=None)
|
||||||
p4 = patch("rag.svr.task_executor_refactor.chunk_post_processor.set_llm_cache")
|
p4 = patch("rag.svr.task_executor_refactor.chunk_post_processor.set_llm_cache")
|
||||||
p5 = self._patch_prompt_func(
|
p5 = patch("rag.svr.task_executor_refactor.chunk_post_processor.DocMetadataService")
|
||||||
"rag.svr.task_executor_refactor.chunk_post_processor.gen_metadata",
|
with p1, p2, p3, p4, p5 as mock_meta:
|
||||||
return_value={"category": "test"},
|
|
||||||
)
|
|
||||||
p6 = patch("rag.svr.task_executor_refactor.chunk_post_processor.DocMetadataService")
|
|
||||||
with p1, p2, p3, p4, p5, p6 as mock_meta:
|
|
||||||
mock_meta.get_document_metadata.return_value = {}
|
mock_meta.get_document_metadata.return_value = {}
|
||||||
mock_meta.update_document_metadata = MagicMock()
|
mock_meta.update_document_metadata = MagicMock()
|
||||||
await generate_metadata(docs, ctx)
|
await generate_metadata(docs, ctx)
|
||||||
@@ -181,8 +177,8 @@ class TestGenerateMetadata(_BasePostProcessorTest):
|
|||||||
ctx = make_task_context(
|
ctx = make_task_context(
|
||||||
parser_config={
|
parser_config={
|
||||||
"enable_metadata": True,
|
"enable_metadata": True,
|
||||||
"metadata": [{"name": "category", "type": "string"}],
|
"metadata": [{"key": "category", "type": "string"}],
|
||||||
"built_in_metadata": ["author", "date"],
|
"built_in_metadata": [{"key": "update_time", "type": "time"}],
|
||||||
},
|
},
|
||||||
write_interceptor=MagicMock(),
|
write_interceptor=MagicMock(),
|
||||||
)
|
)
|
||||||
@@ -192,17 +188,79 @@ class TestGenerateMetadata(_BasePostProcessorTest):
|
|||||||
p3 = patch("rag.svr.task_executor_refactor.chunk_post_processor.get_llm_cache",
|
p3 = patch("rag.svr.task_executor_refactor.chunk_post_processor.get_llm_cache",
|
||||||
return_value=None)
|
return_value=None)
|
||||||
p4 = patch("rag.svr.task_executor_refactor.chunk_post_processor.set_llm_cache")
|
p4 = patch("rag.svr.task_executor_refactor.chunk_post_processor.set_llm_cache")
|
||||||
p5 = self._patch_prompt_func(
|
p5 = patch("rag.svr.task_executor_refactor.chunk_post_processor.DocMetadataService")
|
||||||
"rag.svr.task_executor_refactor.chunk_post_processor.gen_metadata",
|
with p1, p2, p3, p4, p5:
|
||||||
return_value={"category": "test"},
|
|
||||||
)
|
|
||||||
p6 = patch("rag.svr.task_executor_refactor.chunk_post_processor.DocMetadataService")
|
|
||||||
with p1, p2, p3, p4, p5, p6:
|
|
||||||
await generate_metadata(docs, ctx)
|
await generate_metadata(docs, ctx)
|
||||||
ctx.write_interceptor.intercept.assert_called_once_with(
|
ctx.write_interceptor.intercept.assert_called_once_with(
|
||||||
"DocMetadataService.update_document_metadata"
|
"DocMetadataService.update_document_metadata"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_generate_metadata_empty_config_does_not_crash(self):
|
||||||
|
"""Empty parser_config — no metadata configured — should not crash."""
|
||||||
|
ctx = make_task_context(parser_config={})
|
||||||
|
docs = [{"content_with_weight": "test"}]
|
||||||
|
p1, p2 = self._mock_llm_binding()
|
||||||
|
p3 = patch("rag.svr.task_executor_refactor.chunk_post_processor.DocMetadataService")
|
||||||
|
with p1, p2, p3:
|
||||||
|
await generate_metadata(docs, ctx) # no exception = pass
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_generate_metadata_enum_none_accepted(self):
|
||||||
|
"""enum: None in metadata — treated as absent, should not crash."""
|
||||||
|
ctx = make_task_context(
|
||||||
|
parser_config={
|
||||||
|
"enable_metadata": True,
|
||||||
|
"metadata": [{"key": "format", "type": "string", "enum": None}],
|
||||||
|
},
|
||||||
|
)
|
||||||
|
docs = [{"content_with_weight": "test"}]
|
||||||
|
p1, p2 = self._mock_llm_binding()
|
||||||
|
p3 = patch("rag.svr.task_executor_refactor.chunk_post_processor.get_llm_cache",
|
||||||
|
return_value=None)
|
||||||
|
p4 = patch("rag.svr.task_executor_refactor.chunk_post_processor.set_llm_cache")
|
||||||
|
p5 = patch("rag.svr.task_executor_refactor.chunk_post_processor.DocMetadataService")
|
||||||
|
with p1, p2, p3, p4, p5:
|
||||||
|
await generate_metadata(docs, ctx) # no exception = pass
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_generate_metadata_description_none_accepted(self):
|
||||||
|
"""description: None in metadata — should not crash."""
|
||||||
|
ctx = make_task_context(
|
||||||
|
parser_config={
|
||||||
|
"enable_metadata": True,
|
||||||
|
"metadata": [{"key": "test", "type": "string", "description": None}],
|
||||||
|
},
|
||||||
|
)
|
||||||
|
docs = [{"content_with_weight": "test"}]
|
||||||
|
p1, p2 = self._mock_llm_binding()
|
||||||
|
p3 = patch("rag.svr.task_executor_refactor.chunk_post_processor.get_llm_cache",
|
||||||
|
return_value=None)
|
||||||
|
p4 = patch("rag.svr.task_executor_refactor.chunk_post_processor.set_llm_cache")
|
||||||
|
p5 = patch("rag.svr.task_executor_refactor.chunk_post_processor.DocMetadataService")
|
||||||
|
with p1, p2, p3, p4, p5:
|
||||||
|
await generate_metadata(docs, ctx) # no exception = pass
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_generate_metadata_built_in_with_enum_none(self):
|
||||||
|
"""built_in_metadata with enum: None — should not crash."""
|
||||||
|
ctx = make_task_context(
|
||||||
|
parser_config={
|
||||||
|
"enable_metadata": True,
|
||||||
|
"built_in_metadata": [
|
||||||
|
{"key": "update_time", "type": "time", "description": None, "enum": None},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
)
|
||||||
|
docs = [{"content_with_weight": "test"}]
|
||||||
|
p1, p2 = self._mock_llm_binding()
|
||||||
|
p3 = patch("rag.svr.task_executor_refactor.chunk_post_processor.get_llm_cache",
|
||||||
|
return_value=None)
|
||||||
|
p4 = patch("rag.svr.task_executor_refactor.chunk_post_processor.set_llm_cache")
|
||||||
|
p5 = patch("rag.svr.task_executor_refactor.chunk_post_processor.DocMetadataService")
|
||||||
|
with p1, p2, p3, p4, p5:
|
||||||
|
await generate_metadata(docs, ctx) # no exception = pass
|
||||||
|
|
||||||
|
|
||||||
class TestApplyTags(_BasePostProcessorTest):
|
class TestApplyTags(_BasePostProcessorTest):
|
||||||
"""Tests for apply_tags function."""
|
"""Tests for apply_tags function."""
|
||||||
|
|||||||
Reference in New Issue
Block a user