From a32ebf32bde3d10e95fef0a46966db9548ad6cd1 Mon Sep 17 00:00:00 2001
From: web-dev0521
Date: Fri, 8 May 2026 04:54:33 -0400
Subject: [PATCH] Fix: handle null document_metadata in kb_prompt to prevent citation crash (#14651) (#14666)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### What problem does this PR solve?

Fixes #14651. `kb_prompt()` in `rag/prompts/generator.py` crashes with `AttributeError: 'NoneType' object has no attribute 'items'` during agent citation generation when a retrieved chunk carries `document_metadata: null`.

**Root cause.** The crash happens at `rag/prompts/generator.py:132-133`:

```python
meta = ck.get("document_metadata", {})
for k, v in meta.items():
```

`dict.get(key, default)` only returns the default when the key is *missing*. When the key is present with an explicit `None` value, `.get()` returns `None`, and `.items()` crashes.

**How the chunk gets `None`.** It's a round-trip inside RAGFlow itself, not bad input from retrieval:

1. The agent stores retrieved chunks via `agent/canvas.py:814`, which routes them through `chunks_format()`.
2. `rag/prompts/generator.py:61` canonicalizes the field with `chunk.get("document_metadata")` (no default), so chunks without metadata become `{"document_metadata": None, ...}`.
3. `agent/component/agent_with_tools.py:314` feeds those canonicalized chunks back into `kb_prompt()` for citation generation, and `.get("document_metadata", {})` no longer protects us.

**Fix.** One-line change at `rag/prompts/generator.py:132`: use `ck.get("document_metadata") or {}` so an explicit `None` is also coerced to `{}`. The line-61 `None` is intentionally part of the API/UI contract — the frontend handles it via optional chaining (`web/src/components/markdown-content/index.tsx:184`, `web/src/pages/next-search/search-view.tsx:217`) — so the fix belongs at the consumer, not the producer.

### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) - [ ] New Feature (non-breaking change which adds functionality) - [ ] Documentation Update - [ ] Refactoring - [ ] Performance Improvement - [ ] Other (please describe): --- rag/prompts/generator.py | 2 +- .../rag/prompts/test_kb_prompt_metadata.py | 87 +++++++++++++++++++ 2 files changed, 88 insertions(+), 1 deletion(-) create mode 100644 test/unit_test/rag/prompts/test_kb_prompt_metadata.py diff --git a/rag/prompts/generator.py b/rag/prompts/generator.py index 2ef8b8f8c8..ddf99251b5 100644 --- a/rag/prompts/generator.py +++ b/rag/prompts/generator.py @@ -129,7 +129,7 @@ def kb_prompt(kbinfos, max_tokens, hash_id=False): cnt = "\nID: {}".format(i if not hash_id else hash_str2int(get_value(ck, "id", "chunk_id"), 500)) cnt += draw_node("Title", get_value(ck, "docnm_kwd", "document_name")) cnt += draw_node("URL", ck.get('url', '')) - meta = ck.get("document_metadata", {}) + meta = ck.get("document_metadata") or {} for k, v in meta.items(): cnt += draw_node(k, v) cnt += "\n└── Content:\n" diff --git a/test/unit_test/rag/prompts/test_kb_prompt_metadata.py b/test/unit_test/rag/prompts/test_kb_prompt_metadata.py new file mode 100644 index 0000000000..86d96eeec3 --- /dev/null +++ b/test/unit_test/rag/prompts/test_kb_prompt_metadata.py @@ -0,0 +1,87 @@ +# +# Copyright 2026 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import pytest + +from rag.prompts.generator import kb_prompt + + +@pytest.mark.p1 +class TestKbPromptDocumentMetadata: + """Regression tests for kb_prompt's handling of `document_metadata` on chunks.""" + + @pytest.mark.p1 + def test_null_document_metadata_does_not_crash(self): + """A chunk with `document_metadata: None` must not raise AttributeError. + + Regression for issue #14651: chunks retrieved from the index can carry + an explicit null metadata field, which made `dict.get(..., {})` return + `None` and crash citation generation with + `AttributeError: 'NoneType' object has no attribute 'items'`. + """ + kbinfos = { + "chunks": [ + { + "id": "chunk-1", + "content_with_weight": "hello world", + "docnm_kwd": "doc.pdf", + "document_metadata": None, + } + ] + } + + rendered = kb_prompt(kbinfos, max_tokens=10000) + + assert len(rendered) == 1 + assert "hello world" in rendered[0] + assert "doc.pdf" in rendered[0] + + @pytest.mark.p1 + def test_missing_document_metadata_key(self): + """A chunk with no `document_metadata` key at all should also work.""" + kbinfos = { + "chunks": [ + { + "id": "chunk-1", + "content_with_weight": "hello world", + "docnm_kwd": "doc.pdf", + } + ] + } + + rendered = kb_prompt(kbinfos, max_tokens=10000) + + assert len(rendered) == 1 + assert "hello world" in rendered[0] + + @pytest.mark.p1 + def test_populated_document_metadata_renders_fields(self): + """When metadata is a dict, its key/value pairs must be rendered.""" + kbinfos = { + "chunks": [ + { + "id": "chunk-1", + "content_with_weight": "hello world", + "docnm_kwd": "doc.pdf", + "document_metadata": {"author": "alice", "year": "2026"}, + } + ] + } + + rendered = kb_prompt(kbinfos, max_tokens=10000) + + assert len(rendered) == 1 + assert "author: alice" in rendered[0] + assert "year: 2026" in rendered[0]