From 828c5789f651d4c4ebe4645190b8b8d244144fe0 Mon Sep 17 00:00:00 2001 From: Muhammad Furqan Date: Wed, 1 Jul 2026 07:47:39 +0500 Subject: [PATCH] fix(agent/tools): GoogleScholar empty json output and ignored top_n (#16419) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What problem does this PR solve? Closes #16418. `scholarly.search_pubs(...)` returns a **lazy generator**, but `agent/tools/googlescholar.py` treated it as a re-iterable, bounded list: ```python scholar_client = scholarly.search_pubs(kwargs["query"], ...) # lazy generator self._retrieve_chunks(scholar_client, ...) # (1) iterates -> exhausts it self.set_output("json", list(scholar_client)) # (2) already empty -> [] ``` 1. **`json` output was always empty.** `_retrieve_chunks` iterates `scholar_client`, exhausting the generator; `list(scholar_client)` then returns `[]`. 2. **`top_n` was never applied.** Unlike `ArXiv` (`max_results=self._param.top_n`), the unbounded generator was passed straight to `_retrieve_chunks`, which has no internal limit — so the tool kept paginating well past Top N (until an error, rate-limit/block, or `COMPONENT_EXEC_TIMEOUT`). ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) ### Changes - Materialize at most `top_n` results once with `itertools.islice`, and reuse that list for both `_retrieve_chunks` and the `json` output. - Add regression tests (`test/unit_test/agent/component/test_googlescholar.py`, stubbing `scholarly.search_pubs`) covering the `top_n` bound, the non-empty `json` output, and the empty-query short-circuit. Verified: against `main` the new tests fail with `assert 30 == 5` (top_n ignored) and `assert 0 == 5` (empty json); with this fix all pass. Backend-only. --------- Co-authored-by: Zhichang Yu --- agent/tools/googlescholar.py | 14 ++- .../agent/component/test_googlescholar.py | 101 ++++++++++++++++++ 2 files changed, 113 insertions(+), 2 deletions(-) create mode 100644 test/unit_test/agent/component/test_googlescholar.py diff --git a/agent/tools/googlescholar.py b/agent/tools/googlescholar.py index b5c4eb395d..8196304ee8 100644 --- a/agent/tools/googlescholar.py +++ b/agent/tools/googlescholar.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import itertools import logging import os import time @@ -70,6 +71,9 @@ class GoogleScholar(ToolBase, ABC): if not kwargs.get("query"): self.set_output("formalized_content", "") + # Reset json too, otherwise a reused instance keeps stale results + # from a previous successful call. + self.set_output("json", []) return "" last_e = "" @@ -84,12 +88,18 @@ class GoogleScholar(ToolBase, ABC): if self.check_if_canceled("GoogleScholar processing"): return - self._retrieve_chunks(scholar_client, + # search_pubs returns a lazy generator: materialize at most top_n + # results once so the bound is respected and the same list feeds + # both _retrieve_chunks and the json output (iterating it twice + # would otherwise leave json empty). + results = list(itertools.islice(scholar_client, self._param.top_n)) + + self._retrieve_chunks(results, get_title=lambda r: r['bib']['title'], get_url=lambda r: r["pub_url"], get_content=lambda r: "\n author: " + ",".join(r['bib']['author']) + '\n Abstract: ' + r['bib'].get('abstract', 'no abstract') ) - self.set_output("json", list(scholar_client)) + self.set_output("json", results) return self.output("formalized_content") except Exception as e: if self.check_if_canceled("GoogleScholar processing"): diff --git a/test/unit_test/agent/component/test_googlescholar.py b/test/unit_test/agent/component/test_googlescholar.py new file mode 100644 index 0000000000..0fa4b18e41 --- /dev/null +++ b/test/unit_test/agent/component/test_googlescholar.py @@ -0,0 +1,101 @@ +# +# Copyright 2026 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import pytest + +# GoogleScholar imports the `scholarly` SDK at module load; skip where absent. +pytest.importorskip("scholarly") + +import agent.tools.googlescholar as gs_module # noqa: E402 +from agent.tools.googlescholar import GoogleScholar, GoogleScholarParam # noqa: E402 + + +def _fake_pubs(n, consumed=None): + """A lazy generator, exactly like scholarly.search_pubs. + + When ``consumed`` (a one-element list) is passed, it counts how many items + are actually pulled from the generator, so a test can prove the tool stops + after top_n instead of draining the whole result stream. + """ + for i in range(n): + if consumed is not None: + consumed[0] += 1 + yield {"bib": {"title": f"t{i}", "author": ["A"], "abstract": "x"}, "pub_url": f"u{i}"} + + +def _make_tool(top_n): + # Bypass the canvas-bound __init__ (mirrors test_pubmed_unit.py) and stub the + # canvas-touching helpers so we can exercise _invoke's generator handling. + gs = GoogleScholar.__new__(GoogleScholar) + param = GoogleScholarParam() + param.top_n = top_n + gs._param = param + gs.check_if_canceled = lambda *a, **k: False + + captured = {} + out = {} + + def fake_retrieve(res_list, **_kw): + # The real _retrieve_chunks iterates its argument, which exhausts a + # generator; replicate that to expose the original bug. + items = list(res_list) + captured["chunks_count"] = len(items) + out["formalized_content"] = "FC" + + gs._retrieve_chunks = fake_retrieve + gs.set_output = lambda k, v: out.__setitem__(k, v) + gs.output = lambda k=None: (out.get(k) if k else out) + return gs, captured, out + + +def test_respects_top_n(monkeypatch): + # Regression: top_n was never applied; the unbounded generator was passed + # straight to _retrieve_chunks. Assert both that _retrieve_chunks saw a + # sliced list AND that only top_n items were actually pulled from the + # generator (the stream is not drained past top_n). + consumed = [0] + monkeypatch.setattr(gs_module.scholarly, "search_pubs", lambda *a, **k: _fake_pubs(30, consumed)) + gs, captured, _ = _make_tool(top_n=5) + gs._invoke(query="q") + assert captured["chunks_count"] == 5 + assert consumed[0] == 5 + + +def test_json_output_not_exhausted(monkeypatch): + # Regression: json was set from the already-consumed generator -> always []. + monkeypatch.setattr(gs_module.scholarly, "search_pubs", lambda *a, **k: _fake_pubs(30)) + gs, _, out = _make_tool(top_n=5) + gs._invoke(query="q") + assert len(out["json"]) == 5 + assert out["json"], "json output must not be empty when there are results" + + +def test_empty_query_short_circuits(monkeypatch): + # An empty query must short-circuit without hitting the SDK, and must clear + # json so a reused instance can't surface stale results from a prior call. + calls = [0] + + def spy(*a, **k): + calls[0] += 1 + return _fake_pubs(30) + + monkeypatch.setattr(gs_module.scholarly, "search_pubs", spy) + gs, _, out = _make_tool(top_n=5) + out["json"] = ["stale"] # simulate leftover output from a previous call + assert gs._invoke(query="") == "" + assert calls[0] == 0, "search_pubs must not be called for an empty query" + assert out.get("formalized_content") == "" + assert out.get("json") == []