fix(agent/tools): PubMed tool always returns "Unknown Authors" (#16330)

### What problem does this PR solve?

Fixes the PubMed tool always emitting `Authors: Unknown Authors`. The
`safe_find` closure in `_format_pubmed_content` was hardcoded to search
from the article root, so the per-author `LastName`/`ForeName` lookups
never matched.

`safe_find` now accepts an optional `base` node (defaults to `child`,
preserving the existing field lookups), and the author loop passes the
current `<Author>` element.

Closes #16328

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Add test cases

### Testing

Added `test/testcases/test_web_api/test_canvas_app/test_pubmed_unit.py`
covering per-author parsing, intact title/journal/DOI fields, and the
no-authors fallback.

Before: `Authors: Unknown Authors`
After:  `Authors: Furqan Khan, Jane Smith`
This commit is contained in:
Muhammad Furqan
2026-06-25 11:34:37 +05:00
committed by GitHub
parent b9445c67e2
commit 3747a6bfeb
2 changed files with 95 additions and 4 deletions

View File

@@ -0,0 +1,91 @@
#
# Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import xml.etree.ElementTree as ET
import pytest
# PubMed depends on biopython (`Bio`); skip cleanly where it isn't installed.
pytest.importorskip("Bio")
from agent.tools.pubmed import PubMed # noqa: E402
SAMPLE_ARTICLE = """
<PubmedArticle>
<MedlineCitation>
<PMID>12345678</PMID>
<Article>
<ArticleTitle>Deep learning for retrieval augmented generation</ArticleTitle>
<Abstract><AbstractText>A short abstract.</AbstractText></Abstract>
<Journal>
<Title>Nature Machine Intelligence</Title>
<JournalIssue><Volume>10</Volume><Issue>2</Issue></JournalIssue>
</Journal>
<Pagination><MedlinePgn>101-110</MedlinePgn></Pagination>
<AuthorList>
<Author><LastName>Khan</LastName><ForeName>Furqan</ForeName></Author>
<Author><LastName>Smith</LastName><ForeName>Jane</ForeName></Author>
</AuthorList>
</Article>
</MedlineCitation>
<PubmedData>
<ArticleIdList>
<ArticleId IdType="doi">10.1000/example.doi</ArticleId>
</ArticleIdList>
</PubmedData>
</PubmedArticle>
"""
def _format(article_xml: str) -> str:
# _format_pubmed_content only reads its `child` argument, so we can bypass
# the canvas-bound __init__ and exercise the pure parsing logic directly.
pm = PubMed.__new__(PubMed)
return pm._format_pubmed_content(ET.fromstring(article_xml))
def test_authors_are_parsed_per_author():
"""Regression: authors used to collapse to 'Unknown Authors' because the
safe_find closure searched from the article root instead of each <Author>."""
out = _format(SAMPLE_ARTICLE)
assert "Authors: Furqan Khan, Jane Smith" in out
assert "Unknown Authors" not in out
def test_other_fields_still_parse():
out = _format(SAMPLE_ARTICLE)
assert "Title: Deep learning for retrieval augmented generation" in out
assert "Journal: Nature Machine Intelligence" in out
assert "DOI: 10.1000/example.doi" in out
NO_AUTHORS_ARTICLE = """
<PubmedArticle>
<MedlineCitation>
<PMID>87654321</PMID>
<Article>
<ArticleTitle>An article without an author list</ArticleTitle>
<Abstract><AbstractText>No authors here.</AbstractText></Abstract>
</Article>
</MedlineCitation>
</PubmedArticle>
"""
def test_missing_authors_falls_back():
out = _format(NO_AUTHORS_ARTICLE)
assert "Authors: Unknown Authors" in out