fix(agent/tools): PubMed tool always returns "Unknown Authors" (#16330)

### What problem does this PR solve?

Fixes the PubMed tool always emitting `Authors: Unknown Authors`. The `safe_find` closure in `_format_pubmed_content` was hardcoded to search from the article root, so the per-author `LastName`/`ForeName` lookups never matched. `safe_find` now accepts an optional `base` node (defaults to `child`, preserving the existing field lookups), and the author loop passes the current `<Author>` element.

Closes #16328

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Add test cases

### Testing

Added `test/testcases/test_web_api/test_canvas_app/test_pubmed_unit.py` covering per-author parsing, intact title/journal/DOI fields, and the no-authors fallback.

Before: `Authors: Unknown Authors`
After: `Authors: Furqan Khan, Jane Smith`
2026-06-29 15:31:05 +08:00 · 2026-06-25 11:34:37 +05:00
parent b9445c67e2
commit 3747a6bfeb
2 changed files with 95 additions and 4 deletions
--- a/test/testcases/test_web_api/test_canvas_app/test_pubmed_unit.py
+++ b/test/testcases/test_web_api/test_canvas_app/test_pubmed_unit.py
@@ -0,0 +1,91 @@
+#
+#  Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import xml.etree.ElementTree as ET
+
+import pytest
+
+# PubMed depends on biopython (`Bio`); skip cleanly where it isn't installed.
+pytest.importorskip("Bio")
+
+from agent.tools.pubmed import PubMed  # noqa: E402
+
+
+SAMPLE_ARTICLE = """
+<PubmedArticle>
+  <MedlineCitation>
+    <PMID>12345678</PMID>
+    <Article>
+      <ArticleTitle>Deep learning for retrieval augmented generation</ArticleTitle>
+      <Abstract><AbstractText>A short abstract.</AbstractText></Abstract>
+      <Journal>
+        <Title>Nature Machine Intelligence</Title>
+        <JournalIssue><Volume>10</Volume><Issue>2</Issue></JournalIssue>
+      </Journal>
+      <Pagination><MedlinePgn>101-110</MedlinePgn></Pagination>
+      <AuthorList>
+        <Author><LastName>Khan</LastName><ForeName>Furqan</ForeName></Author>
+        <Author><LastName>Smith</LastName><ForeName>Jane</ForeName></Author>
+      </AuthorList>
+    </Article>
+  </MedlineCitation>
+  <PubmedData>
+    <ArticleIdList>
+      <ArticleId IdType="doi">10.1000/example.doi</ArticleId>
+    </ArticleIdList>
+  </PubmedData>
+</PubmedArticle>
+"""
+
+
+def _format(article_xml: str) -> str:
+    # Bare instance skips the canvas-bound __init__; only `child` is ever read.
+    bare_tool = PubMed.__new__(PubMed)
+    article = ET.fromstring(article_xml)
+    return bare_tool._format_pubmed_content(article)
+
+
+def test_authors_are_parsed_per_author():
+    """Regression: every author rendered as 'Unknown Authors' while the safe_find
+    closure looked names up from the article root rather than each <Author>."""
+    rendered = _format(SAMPLE_ARTICLE)
+    assert "Authors: Furqan Khan, Jane Smith" in rendered
+    assert "Unknown Authors" not in rendered
+
+
+def test_other_fields_still_parse():
+    rendered = _format(SAMPLE_ARTICLE)
+    assert "Title: Deep learning for retrieval augmented generation" in rendered
+    assert "Journal: Nature Machine Intelligence" in rendered
+    assert "DOI: 10.1000/example.doi" in rendered
+
+
+NO_AUTHORS_ARTICLE = """
+<PubmedArticle>
+  <MedlineCitation>
+    <PMID>87654321</PMID>
+    <Article>
+      <ArticleTitle>An article without an author list</ArticleTitle>
+      <Abstract><AbstractText>No authors here.</AbstractText></Abstract>
+    </Article>
+  </MedlineCitation>
+</PubmedArticle>
+"""
+
+
+def test_missing_authors_falls_back():
+    """A record with no <AuthorList> still yields the placeholder author line."""
+    assert "Authors: Unknown Authors" in _format(NO_AUTHORS_ARTICLE)