From 828c5789f651d4c4ebe4645190b8b8d244144fe0 Mon Sep 17 00:00:00 2001
From: Muhammad Furqan <muhammadfurqan0100@gmail.com>
Date: Wed, 1 Jul 2026 07:47:39 +0500
Subject: [PATCH] fix(agent/tools): GoogleScholar empty json output and ignored
 top_n (#16419)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### What problem does this PR solve?

Closes #16418.

`scholarly.search_pubs(...)` returns a **lazy generator**, but
`agent/tools/googlescholar.py` treated it as a re-iterable, bounded
list:

```python
scholar_client = scholarly.search_pubs(kwargs["query"], ...)   # lazy generator
self._retrieve_chunks(scholar_client, ...)                     # (1) iterates -> exhausts it
self.set_output("json", list(scholar_client))                  # (2) already empty -> []
```

1. **`json` output was always empty.** `_retrieve_chunks` iterates
`scholar_client`, exhausting the generator; `list(scholar_client)` then
returns `[]`.
2. **`top_n` was never applied.** Unlike `ArXiv`
(`max_results=self._param.top_n`), the unbounded generator was passed
straight to `_retrieve_chunks`, which has no internal limit — so the
tool kept paginating well past Top N (until an error, rate-limit/block,
or `COMPONENT_EXEC_TIMEOUT`).

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

### Changes

- Materialize at most `top_n` results once with `itertools.islice`, and
reuse that list for both `_retrieve_chunks` and the `json` output.
- Add regression tests
(`test/unit_test/agent/component/test_googlescholar.py`, stubbing
`scholarly.search_pubs`) covering the `top_n` bound, the non-empty
`json` output, and the empty-query short-circuit.

Verified: against `main` the new tests fail with `assert 30 == 5` (top_n
ignored) and `assert 0 == 5` (empty json); with this fix all pass.
Backend-only.

---------

Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
---
 agent/tools/googlescholar.py                  |  14 ++-
 .../agent/component/test_googlescholar.py     | 101 ++++++++++++++++++
 2 files changed, 113 insertions(+), 2 deletions(-)
 create mode 100644 test/unit_test/agent/component/test_googlescholar.py

diff --git a/agent/tools/googlescholar.py b/agent/tools/googlescholar.py
index b5c4eb395d..8196304ee8 100644
--- a/agent/tools/googlescholar.py
+++ b/agent/tools/googlescholar.py
@@ -13,6 +13,7 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
+import itertools
 import logging
 import os
 import time
@@ -70,6 +71,9 @@ class GoogleScholar(ToolBase, ABC):
 
         if not kwargs.get("query"):
             self.set_output("formalized_content", "")
+            # Reset json too, otherwise a reused instance keeps stale results
+            # from a previous successful call.
+            self.set_output("json", [])
             return ""
 
         last_e = ""
@@ -84,12 +88,18 @@ class GoogleScholar(ToolBase, ABC):
                 if self.check_if_canceled("GoogleScholar processing"):
                     return
 
-                self._retrieve_chunks(scholar_client,
+                # search_pubs returns a lazy generator: materialize at most top_n
+                # results once so the bound is respected and the same list feeds
+                # both _retrieve_chunks and the json output (iterating it twice
+                # would otherwise leave json empty).
+                results = list(itertools.islice(scholar_client, self._param.top_n))
+
+                self._retrieve_chunks(results,
                                       get_title=lambda r: r['bib']['title'],
                                       get_url=lambda r: r["pub_url"],
                                       get_content=lambda r: "\n author: " + ",".join(r['bib']['author']) + '\n Abstract: ' + r['bib'].get('abstract', 'no abstract')
                                       )
-                self.set_output("json", list(scholar_client))
+                self.set_output("json", results)
                 return self.output("formalized_content")
             except Exception as e:
                 if self.check_if_canceled("GoogleScholar processing"):
diff --git a/test/unit_test/agent/component/test_googlescholar.py b/test/unit_test/agent/component/test_googlescholar.py
new file mode 100644
index 0000000000..0fa4b18e41
--- /dev/null
+++ b/test/unit_test/agent/component/test_googlescholar.py
@@ -0,0 +1,101 @@
+#
+#  Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import pytest
+
+# GoogleScholar imports the `scholarly` SDK at module load; skip where absent.
+pytest.importorskip("scholarly")
+
+import agent.tools.googlescholar as gs_module  # noqa: E402
+from agent.tools.googlescholar import GoogleScholar, GoogleScholarParam  # noqa: E402
+
+
+def _fake_pubs(n, consumed=None):
+    """A lazy generator, exactly like scholarly.search_pubs.
+
+    When ``consumed`` (a one-element list) is passed, it counts how many items
+    are actually pulled from the generator, so a test can prove the tool stops
+    after top_n instead of draining the whole result stream.
+    """
+    for i in range(n):
+        if consumed is not None:
+            consumed[0] += 1
+        yield {"bib": {"title": f"t{i}", "author": ["A"], "abstract": "x"}, "pub_url": f"u{i}"}
+
+
+def _make_tool(top_n):
+    # Bypass the canvas-bound __init__ (mirrors test_pubmed_unit.py) and stub the
+    # canvas-touching helpers so we can exercise _invoke's generator handling.
+    gs = GoogleScholar.__new__(GoogleScholar)
+    param = GoogleScholarParam()
+    param.top_n = top_n
+    gs._param = param
+    gs.check_if_canceled = lambda *a, **k: False
+
+    captured = {}
+    out = {}
+
+    def fake_retrieve(res_list, **_kw):
+        # The real _retrieve_chunks iterates its argument, which exhausts a
+        # generator; replicate that to expose the original bug.
+        items = list(res_list)
+        captured["chunks_count"] = len(items)
+        out["formalized_content"] = "FC"
+
+    gs._retrieve_chunks = fake_retrieve
+    gs.set_output = lambda k, v: out.__setitem__(k, v)
+    gs.output = lambda k=None: (out.get(k) if k else out)
+    return gs, captured, out
+
+
+def test_respects_top_n(monkeypatch):
+    # Regression: top_n was never applied; the unbounded generator was passed
+    # straight to _retrieve_chunks. Assert both that _retrieve_chunks saw a
+    # sliced list AND that only top_n items were actually pulled from the
+    # generator (the stream is not drained past top_n).
+    consumed = [0]
+    monkeypatch.setattr(gs_module.scholarly, "search_pubs", lambda *a, **k: _fake_pubs(30, consumed))
+    gs, captured, _ = _make_tool(top_n=5)
+    gs._invoke(query="q")
+    assert captured["chunks_count"] == 5
+    assert consumed[0] == 5
+
+
+def test_json_output_not_exhausted(monkeypatch):
+    # Regression: json was set from the already-consumed generator -> always [].
+    monkeypatch.setattr(gs_module.scholarly, "search_pubs", lambda *a, **k: _fake_pubs(30))
+    gs, _, out = _make_tool(top_n=5)
+    gs._invoke(query="q")
+    assert len(out["json"]) == 5
+    assert out["json"], "json output must not be empty when there are results"
+
+
+def test_empty_query_short_circuits(monkeypatch):
+    # An empty query must short-circuit without hitting the SDK, and must clear
+    # json so a reused instance can't surface stale results from a prior call.
+    calls = [0]
+
+    def spy(*a, **k):
+        calls[0] += 1
+        return _fake_pubs(30)
+
+    monkeypatch.setattr(gs_module.scholarly, "search_pubs", spy)
+    gs, _, out = _make_tool(top_n=5)
+    out["json"] = ["stale"]  # simulate leftover output from a previous call
+    assert gs._invoke(query="") == ""
+    assert calls[0] == 0, "search_pubs must not be called for an empty query"
+    assert out.get("formalized_content") == ""
+    assert out.get("json") == []