diff --git a/agent/tools/pubmed.py b/agent/tools/pubmed.py index 05c222810b..48117f1567 100644 --- a/agent/tools/pubmed.py +++ b/agent/tools/pubmed.py @@ -117,8 +117,8 @@ class PubMed(ToolBase, ABC): def _format_pubmed_content(self, child): """Extract structured reference info from PubMed XML""" - def safe_find(path): - node = child + def safe_find(path, base=None): + node = child if base is None else base for p in path.split("/"): if node is None: return None @@ -135,8 +135,8 @@ class PubMed(ToolBase, ABC): # Authors authors = [] for author in child.findall(".//AuthorList/Author"): - lastname = safe_find("LastName") or "" - forename = safe_find("ForeName") or "" + lastname = safe_find("LastName", author) or "" + forename = safe_find("ForeName", author) or "" fullname = f"{forename} {lastname}".strip() if fullname: authors.append(fullname) diff --git a/test/testcases/test_web_api/test_canvas_app/test_pubmed_unit.py b/test/testcases/test_web_api/test_canvas_app/test_pubmed_unit.py new file mode 100644 index 0000000000..ed4f8bf274 --- /dev/null +++ b/test/testcases/test_web_api/test_canvas_app/test_pubmed_unit.py @@ -0,0 +1,91 @@ +# +# Copyright 2026 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import xml.etree.ElementTree as ET + +import pytest + +# PubMed depends on biopython (`Bio`); skip cleanly where it isn't installed. 
+pytest.importorskip("Bio") + +from agent.tools.pubmed import PubMed # noqa: E402 + + +SAMPLE_ARTICLE = """ + + + 12345678 +
+ Deep learning for retrieval augmented generation + A short abstract. + + Nature Machine Intelligence + 102 + + 101-110 + + KhanFurqan + SmithJane + +
+
+ + + 10.1000/example.doi + + +
+""" + + +def _format(article_xml: str) -> str: + # _format_pubmed_content only reads its `child` argument, so we can bypass + # the canvas-bound __init__ and exercise the pure parsing logic directly. + pm = PubMed.__new__(PubMed) + return pm._format_pubmed_content(ET.fromstring(article_xml)) + + +def test_authors_are_parsed_per_author(): + """Regression: authors used to collapse to 'Unknown Authors' because the + safe_find closure searched from the article root instead of each <Author>.""" + out = _format(SAMPLE_ARTICLE) + assert "Authors: Furqan Khan, Jane Smith" in out + assert "Unknown Authors" not in out + + +def test_other_fields_still_parse(): + out = _format(SAMPLE_ARTICLE) + assert "Title: Deep learning for retrieval augmented generation" in out + assert "Journal: Nature Machine Intelligence" in out + assert "DOI: 10.1000/example.doi" in out + + +NO_AUTHORS_ARTICLE = """ + + + 87654321 +
+ An article without an author list + No authors here. +
+
+
+""" + + +def test_missing_authors_falls_back(): + out = _format(NO_AUTHORS_ARTICLE) + assert "Authors: Unknown Authors" in out