From 3eff41361b4dc6c3515230f99b2d63fd70658960 Mon Sep 17 00:00:00 2001 From: Jack Date: Tue, 9 Jun 2026 19:10:48 +0800 Subject: [PATCH] fix: prevent None values in auto-metadata from causing KeyError (#15842) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem When users configure auto-metadata for a dataset, parsing crashes with: ``` KeyError: 'properties' in gen_metadata → schema["properties"] ``` ## Root Cause Pydantic `AutoMetadataField` defaults `enum` and `description` to `None` when the frontend omits these fields: ```python class AutoMetadataField(Base): enum: Annotated[list[str] | None, Field(default=None)] description: Annotated[str | None, Field(default=None)] ``` These `None` values propagate through the call chain and cause two crashes: --- common/metadata_utils.py | 10 +-- rag/prompts/generator.py | 5 ++ .../svr/task_executor_refactor/conftest.py | 3 + .../test_chunk_post_processor.py | 90 +++++++++++++++---- 4 files changed, 87 insertions(+), 21 deletions(-) diff --git a/common/metadata_utils.py b/common/metadata_utils.py index 591acb8053..a6c6d273dc 100644 --- a/common/metadata_utils.py +++ b/common/metadata_utils.py @@ -399,11 +399,11 @@ def _is_metadata_list(obj: list) -> bool: key = item.get("key") if not isinstance(key, str) or not key: return False - if "enum" in item and not isinstance(item["enum"], list): + if "enum" in item and item["enum"] is not None and not isinstance(item["enum"], list): return False - if "description" in item and not isinstance(item["description"], str): + if "description" in item and item["description"] is not None and not isinstance(item["description"], str): return False - if "descriptions" in item and not isinstance(item["descriptions"], str): + if "descriptions" in item and item["descriptions"] is not None and not isinstance(item["descriptions"], str): return False return True @@ -414,12 +414,12 @@ def turn2jsonschema(obj: dict | list) -> Dict[str, Any]: if isinstance(obj, list) and _is_metadata_list(obj): normalized = [] for item in obj: - description = item.get("description", item.get("descriptions", "")) + description = item.get("description") or item.get("descriptions") or "" normalized_item = { "key": item.get("key"), "description": description, } - if "enum" in item: + if "enum" in item and item["enum"] is not None: normalized_item["enum"] = item["enum"] normalized.append(normalized_item) return metadata_schema(normalized) diff --git a/rag/prompts/generator.py b/rag/prompts/generator.py index edba89c808..fd58ddff7a 100644 --- a/rag/prompts/generator.py +++ b/rag/prompts/generator.py @@ -955,6 +955,11 @@ async def relevant_chunks_with_toc(query: str, toc: list[dict], chat_mdl, topn: META_DATA = load_prompt("meta_data") async def gen_metadata(chat_mdl, schema: dict, content: str): + if not schema: + return "" + if "properties" not in schema: + logging.warning("gen_metadata: schema has no 'properties' key: %s", schema) + return "" template = PROMPT_JINJA_ENV.from_string(META_DATA) for k, desc in schema["properties"].items(): if "enum" in desc and not desc.get("enum"): diff --git a/test/unit_test/rag/svr/task_executor_refactor/conftest.py b/test/unit_test/rag/svr/task_executor_refactor/conftest.py index 8f57701d04..84d06cc959 100644 --- a/test/unit_test/rag/svr/task_executor_refactor/conftest.py +++ b/test/unit_test/rag/svr/task_executor_refactor/conftest.py @@ -280,6 +280,9 @@ class MockChatModel: def __exit__(self, *args): pass + async def async_chat(self, system_prompt, messages, **kwargs): + return '{"key": "value"}' + @pytest.fixture def mock_embedding_model(): diff --git a/test/unit_test/rag/svr/task_executor_refactor/test_chunk_post_processor.py b/test/unit_test/rag/svr/task_executor_refactor/test_chunk_post_processor.py index a5e5427d86..c1394c4fae 100644 --- a/test/unit_test/rag/svr/task_executor_refactor/test_chunk_post_processor.py +++ b/test/unit_test/rag/svr/task_executor_refactor/test_chunk_post_processor.py @@ -154,8 +154,8 @@ class TestGenerateMetadata(_BasePostProcessorTest): ctx = make_task_context( parser_config={ "enable_metadata": True, - "metadata": [{"name": "category", "type": "string"}], - "built_in_metadata": ["author", "date"], + "metadata": [{"key": "category", "type": "string"}], + "built_in_metadata": [{"key": "update_time", "type": "time"}], }, ) docs = [{"content_with_weight": "This is test content"}] @@ -164,12 +164,8 @@ class TestGenerateMetadata(_BasePostProcessorTest): p3 = patch("rag.svr.task_executor_refactor.chunk_post_processor.get_llm_cache", return_value=None) p4 = patch("rag.svr.task_executor_refactor.chunk_post_processor.set_llm_cache") - p5 = self._patch_prompt_func( - "rag.svr.task_executor_refactor.chunk_post_processor.gen_metadata", - return_value={"category": "test"}, - ) - p6 = patch("rag.svr.task_executor_refactor.chunk_post_processor.DocMetadataService") - with p1, p2, p3, p4, p5, p6 as mock_meta: + p5 = patch("rag.svr.task_executor_refactor.chunk_post_processor.DocMetadataService") + with p1, p2, p3, p4, p5 as mock_meta: mock_meta.get_document_metadata.return_value = {} mock_meta.update_document_metadata = MagicMock() await generate_metadata(docs, ctx) @@ -181,8 +177,8 @@ class TestGenerateMetadata(_BasePostProcessorTest): ctx = make_task_context( parser_config={ "enable_metadata": True, - "metadata": [{"name": "category", "type": "string"}], - "built_in_metadata": ["author", "date"], + "metadata": [{"key": "category", "type": "string"}], + "built_in_metadata": [{"key": "update_time", "type": "time"}], }, write_interceptor=MagicMock(), ) @@ -192,17 +188,79 @@ class TestGenerateMetadata(_BasePostProcessorTest): p3 = patch("rag.svr.task_executor_refactor.chunk_post_processor.get_llm_cache", return_value=None) p4 = patch("rag.svr.task_executor_refactor.chunk_post_processor.set_llm_cache") - p5 = self._patch_prompt_func( - "rag.svr.task_executor_refactor.chunk_post_processor.gen_metadata", - return_value={"category": "test"}, - ) - p6 = patch("rag.svr.task_executor_refactor.chunk_post_processor.DocMetadataService") - with p1, p2, p3, p4, p5, p6: + p5 = patch("rag.svr.task_executor_refactor.chunk_post_processor.DocMetadataService") + with p1, p2, p3, p4, p5: await generate_metadata(docs, ctx) ctx.write_interceptor.intercept.assert_called_once_with( "DocMetadataService.update_document_metadata" ) + @pytest.mark.asyncio + async def test_generate_metadata_empty_config_does_not_crash(self): + """Empty parser_config — no metadata configured — should not crash.""" + ctx = make_task_context(parser_config={}) + docs = [{"content_with_weight": "test"}] + p1, p2 = self._mock_llm_binding() + p3 = patch("rag.svr.task_executor_refactor.chunk_post_processor.DocMetadataService") + with p1, p2, p3: + await generate_metadata(docs, ctx) # no exception = pass + + @pytest.mark.asyncio + async def test_generate_metadata_enum_none_accepted(self): + """enum: None in metadata — treated as absent, should not crash.""" + ctx = make_task_context( + parser_config={ + "enable_metadata": True, + "metadata": [{"key": "format", "type": "string", "enum": None}], + }, + ) + docs = [{"content_with_weight": "test"}] + p1, p2 = self._mock_llm_binding() + p3 = patch("rag.svr.task_executor_refactor.chunk_post_processor.get_llm_cache", + return_value=None) + p4 = patch("rag.svr.task_executor_refactor.chunk_post_processor.set_llm_cache") + p5 = patch("rag.svr.task_executor_refactor.chunk_post_processor.DocMetadataService") + with p1, p2, p3, p4, p5: + await generate_metadata(docs, ctx) # no exception = pass + + @pytest.mark.asyncio + async def test_generate_metadata_description_none_accepted(self): + """description: None in metadata — should not crash.""" + ctx = make_task_context( + parser_config={ + "enable_metadata": True, + "metadata": [{"key": "test", "type": "string", "description": None}], + }, + ) + docs = [{"content_with_weight": "test"}] + p1, p2 = self._mock_llm_binding() + p3 = patch("rag.svr.task_executor_refactor.chunk_post_processor.get_llm_cache", + return_value=None) + p4 = patch("rag.svr.task_executor_refactor.chunk_post_processor.set_llm_cache") + p5 = patch("rag.svr.task_executor_refactor.chunk_post_processor.DocMetadataService") + with p1, p2, p3, p4, p5: + await generate_metadata(docs, ctx) # no exception = pass + + @pytest.mark.asyncio + async def test_generate_metadata_built_in_with_enum_none(self): + """built_in_metadata with enum: None — should not crash.""" + ctx = make_task_context( + parser_config={ + "enable_metadata": True, + "built_in_metadata": [ + {"key": "update_time", "type": "time", "description": None, "enum": None}, + ], + }, + ) + docs = [{"content_with_weight": "test"}] + p1, p2 = self._mock_llm_binding() + p3 = patch("rag.svr.task_executor_refactor.chunk_post_processor.get_llm_cache", + return_value=None) + p4 = patch("rag.svr.task_executor_refactor.chunk_post_processor.set_llm_cache") + p5 = patch("rag.svr.task_executor_refactor.chunk_post_processor.DocMetadataService") + with p1, p2, p3, p4, p5: + await generate_metadata(docs, ctx) # no exception = pass + class TestApplyTags(_BasePostProcessorTest): """Tests for apply_tags function."""