fix(agent/tools): GoogleScholar empty json output and ignored top_n (#16419)

### What problem does this PR solve?

Closes #16418.

`scholarly.search_pubs(...)` returns a **lazy generator**, but
`agent/tools/googlescholar.py` treated it as a re-iterable, bounded
list:

```python
scholar_client = scholarly.search_pubs(kwargs["query"], ...)   # lazy generator
self._retrieve_chunks(scholar_client, ...)                     # (1) iterates -> exhausts it
self.set_output("json", list(scholar_client))                  # (2) already empty -> []
```

1. **`json` output was always empty.** `_retrieve_chunks` iterates
`scholar_client`, exhausting the generator; `list(scholar_client)` then
returns `[]`.
2. **`top_n` was never applied.** Unlike `ArXiv`
(`max_results=self._param.top_n`), the unbounded generator was passed
straight to `_retrieve_chunks`, which has no internal limit — so the
tool kept paginating well past Top N (until an error, rate-limit/block,
or `COMPONENT_EXEC_TIMEOUT`).

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

### Changes

- Materialize at most `top_n` results once with `itertools.islice`, and
reuse that list for both `_retrieve_chunks` and the `json` output.
- Add regression tests
(`test/unit_test/agent/component/test_googlescholar.py`, stubbing
`scholarly.search_pubs`) covering the `top_n` bound, the non-empty
`json` output, and the empty-query short-circuit.

Verified: against `main` the new tests fail with `assert 30 == 5` (top_n
ignored) and `assert 0 == 5` (empty json); with this fix all pass.
Backend-only.

---------

Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
This commit is contained in:
Muhammad Furqan
2026-07-01 07:47:39 +05:00
committed by GitHub
parent 6648fe4151
commit 828c5789f6
2 changed files with 113 additions and 2 deletions

View File

@@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import itertools
import logging
import os
import time
@@ -70,6 +71,9 @@ class GoogleScholar(ToolBase, ABC):
if not kwargs.get("query"):
self.set_output("formalized_content", "")
# Reset json too, otherwise a reused instance keeps stale results
# from a previous successful call.
self.set_output("json", [])
return ""
last_e = ""
@@ -84,12 +88,18 @@ class GoogleScholar(ToolBase, ABC):
if self.check_if_canceled("GoogleScholar processing"):
return
self._retrieve_chunks(scholar_client,
# search_pubs returns a lazy generator: materialize at most top_n
# results once so the bound is respected and the same list feeds
# both _retrieve_chunks and the json output (iterating it twice
# would otherwise leave json empty).
results = list(itertools.islice(scholar_client, self._param.top_n))
self._retrieve_chunks(results,
get_title=lambda r: r['bib']['title'],
get_url=lambda r: r["pub_url"],
get_content=lambda r: "\n author: " + ",".join(r['bib']['author']) + '\n Abstract: ' + r['bib'].get('abstract', 'no abstract')
)
self.set_output("json", list(scholar_client))
self.set_output("json", results)
return self.output("formalized_content")
except Exception as e:
if self.check_if_canceled("GoogleScholar processing"):

View File

@@ -0,0 +1,101 @@
#
# Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import pytest
# GoogleScholar imports the `scholarly` SDK at module load; skip where absent.
pytest.importorskip("scholarly")
import agent.tools.googlescholar as gs_module # noqa: E402
from agent.tools.googlescholar import GoogleScholar, GoogleScholarParam # noqa: E402
def _fake_pubs(n, consumed=None):
"""A lazy generator, exactly like scholarly.search_pubs.
When ``consumed`` (a one-element list) is passed, it counts how many items
are actually pulled from the generator, so a test can prove the tool stops
after top_n instead of draining the whole result stream.
"""
for i in range(n):
if consumed is not None:
consumed[0] += 1
yield {"bib": {"title": f"t{i}", "author": ["A"], "abstract": "x"}, "pub_url": f"u{i}"}
def _make_tool(top_n):
# Bypass the canvas-bound __init__ (mirrors test_pubmed_unit.py) and stub the
# canvas-touching helpers so we can exercise _invoke's generator handling.
gs = GoogleScholar.__new__(GoogleScholar)
param = GoogleScholarParam()
param.top_n = top_n
gs._param = param
gs.check_if_canceled = lambda *a, **k: False
captured = {}
out = {}
def fake_retrieve(res_list, **_kw):
# The real _retrieve_chunks iterates its argument, which exhausts a
# generator; replicate that to expose the original bug.
items = list(res_list)
captured["chunks_count"] = len(items)
out["formalized_content"] = "FC"
gs._retrieve_chunks = fake_retrieve
gs.set_output = lambda k, v: out.__setitem__(k, v)
gs.output = lambda k=None: (out.get(k) if k else out)
return gs, captured, out
def test_respects_top_n(monkeypatch):
# Regression: top_n was never applied; the unbounded generator was passed
# straight to _retrieve_chunks. Assert both that _retrieve_chunks saw a
# sliced list AND that only top_n items were actually pulled from the
# generator (the stream is not drained past top_n).
consumed = [0]
monkeypatch.setattr(gs_module.scholarly, "search_pubs", lambda *a, **k: _fake_pubs(30, consumed))
gs, captured, _ = _make_tool(top_n=5)
gs._invoke(query="q")
assert captured["chunks_count"] == 5
assert consumed[0] == 5
def test_json_output_not_exhausted(monkeypatch):
# Regression: json was set from the already-consumed generator -> always [].
monkeypatch.setattr(gs_module.scholarly, "search_pubs", lambda *a, **k: _fake_pubs(30))
gs, _, out = _make_tool(top_n=5)
gs._invoke(query="q")
assert len(out["json"]) == 5
assert out["json"], "json output must not be empty when there are results"
def test_empty_query_short_circuits(monkeypatch):
# An empty query must short-circuit without hitting the SDK, and must clear
# json so a reused instance can't surface stale results from a prior call.
calls = [0]
def spy(*a, **k):
calls[0] += 1
return _fake_pubs(30)
monkeypatch.setattr(gs_module.scholarly, "search_pubs", spy)
gs, _, out = _make_tool(top_n=5)
out["json"] = ["stale"] # simulate leftover output from a previous call
assert gs._invoke(query="") == ""
assert calls[0] == 0, "search_pubs must not be called for an empty query"
assert out.get("formalized_content") == ""
assert out.get("json") == []