Files
ragflow/agent/tools/bgpt.py
connerlambden 9bf57600cf feat(agent): add BGPT structured literature evidence search tool (#16050)
## Summary

Adds a first-class **BGPT** Agent tool (backend + UI) in response to
[#15997](https://github.com/infiniflow/ragflow/issues/15997#issuecomment-4703864227).

BGPT calls `POST https://bgpt.pro/api/mcp-search` and returns structured
study evidence from full-text papers — not just titles/abstracts. Each
result is formatted for RAGFlow citations with:

- methods
- sample size / population
- results
- limitations
- conflicts of interest
- data availability
- study blind spots
- `how_to_falsify`

## Why this shape

- Mirrors existing literature tools (`PubMed`, `ArXiv`) and HTTP tools
(`SearXNG`).
- Works on the free tier (no API key required for first 50 results).
- Optional `api_key` and `days_back` in the node/tool config.
- Surfaces both `formalized_content` and raw `json` outputs (like
SearXNG).

## Files

- `agent/tools/bgpt.py` — REST client + evidence formatter
- Frontend: Operator enum, forms, tool picker, canvas accordion, en/zh
locales, icon

## Demo / docs

Runnable claim-interrogation demo:
https://github.com/connerlambden/bgpt-mcp/blob/main/EVIDENCE_DEMO.md

## Test plan

- [ ] Add BGPT node on Agent canvas, run query `GLP-1 alcohol craving`,
verify `formalized_content` includes limitations/COI fields
- [ ] Add BGPT as Agent sub-tool under Search, verify tool-calling works
- [ ] Confirm empty query / try-run returns gracefully
- [ ] Optional: paid-tier `api_key` path

---------

Co-authored-by: Cursor <cursoragent@cursor.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
2026-07-01 13:52:24 +08:00

189 lines
7.3 KiB
Python

#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import os
import time
from abc import ABC
import requests
from agent.tools.base import ToolBase, ToolMeta, ToolParamBase
from common.connection_utils import timeout
# Endpoint that BGPT._invoke sends its search requests to via HTTP POST.
BGPT_SEARCH_URL = "https://bgpt.pro/api/mcp-search"
class BGPTParam(ToolParamBase):
    """Parameters for the BGPT search tool.

    Carries the tool-calling metadata (``meta``) together with the
    configurable options ``top_n``, ``api_key`` and ``days_back``.
    """

    def __init__(self):
        # ``meta`` is assigned before the base initializer runs and the option
        # defaults are assigned after it; that ordering is kept as-is.
        tool_description = (
            "BGPT searches scientific papers and returns structured evidence extracted from full-text studies: "
            "methods, sample sizes, results, limitations, conflicts of interest, data/code availability, "
            "study blind spots, quality scores, and falsification prompts. "
            "Useful when the agent must judge a scientific claim, not just find abstracts."
        )
        query_spec = {
            "type": "string",
            "description": "Natural-language scientific search query. Use the most important terms from the user's request.",
            "default": "{sys.query}",
            "required": True,
        }
        self.meta: ToolMeta = {
            "name": "bgpt_search",
            "description": tool_description,
            "parameters": {"query": query_spec},
        }
        super().__init__()
        self.top_n = 10
        self.api_key = ""
        self.days_back = None

    def check(self):
        """Validate ``top_n`` and ``days_back``, coercing them to ``int`` first.

        Raises:
            ValueError: if ``days_back`` is set (neither ``None`` nor ``""``)
                but cannot be converted to an integer.
        """
        try:
            candidate = self.top_n
            if isinstance(candidate, str):
                self.top_n = int(candidate.strip())
        except Exception:
            # Best effort: an unconvertible value is left untouched and handed
            # to the positive-integer check below.
            pass
        self.check_positive_integer(self.top_n, "Top N")

        # ``days_back`` is optional; nothing more to validate when it is unset.
        if self.days_back in (None, ""):
            return
        try:
            self.days_back = int(self.days_back)
        except (TypeError, ValueError) as exc:
            raise ValueError("days_back must be an integer") from exc
        self.check_positive_integer(self.days_back, "Days back")

    def get_input_form(self) -> dict[str, dict]:
        """Return the input form description: a single-line ``query`` field."""
        query_field = {"name": "Query", "type": "line"}
        return {"query": query_field}
class BGPT(ToolBase, ABC):
    """Agent tool that queries the BGPT search endpoint for structured study evidence."""

    component_name = "BGPT"

    @timeout(int(os.environ.get("COMPONENT_EXEC_TIMEOUT", 30)))
    def _invoke(self, **kwargs):
        """Run one BGPT search and publish the results as component outputs.

        Args:
            **kwargs: must contain ``query`` (a non-blank string) for a search
                to be performed.

        Returns:
            ``None`` when the run is reported as canceled, ``""`` when the
            query is missing or blank, the ``formalized_content`` output on
            success, or a ``"BGPT error: ..."`` string once all attempts have
            failed (the message is also stored in the ``_ERROR`` output).

        Raises:
            AssertionError: if no attempt was made at all (for example when
                ``max_retries`` is negative) and therefore no error was recorded.
        """
        if self.check_if_canceled("BGPT processing"):
            return

        query = kwargs.get("query")
        if not query or not isinstance(query, str) or not query.strip():
            self.set_output("formalized_content", "")
            return ""

        payload = {"query": query.strip(), "num_results": self._param.top_n}
        if self._param.api_key:
            payload["api_key"] = self._param.api_key
        if self._param.days_back:
            payload["days_back"] = self._param.days_back

        last_e = ""
        total_attempts = self._param.max_retries + 1
        for attempt in range(total_attempts):
            if self.check_if_canceled("BGPT processing"):
                return
            # Sleeping after the final attempt would only delay the error
            # return, so the back-off is applied between attempts only.
            will_retry = attempt + 1 < total_attempts
            try:
                response = requests.post(BGPT_SEARCH_URL, json=payload, timeout=25)
                response.raise_for_status()
                data = response.json()
                if not data or not isinstance(data, dict):
                    raise ValueError("Invalid response from BGPT")
                results = data.get("results", [])
                if not isinstance(results, list):
                    raise ValueError("Invalid results format from BGPT")
                if self.check_if_canceled("BGPT processing"):
                    return
                self._retrieve_chunks(
                    results,
                    get_title=lambda paper: paper.get("title") or "Untitled",
                    get_url=lambda paper: self._paper_url(paper),
                    get_content=lambda paper: self._format_bgpt_paper(paper),
                )
                self.set_output("json", results)
                return self.output("formalized_content")
            except requests.HTTPError as e:
                # Non-retryable 4xx (e.g. 400/401/403/404) should fail fast
                # rather than wasting retries on bad requests or auth failures.
                status = e.response.status_code if e.response is not None else None
                if status is not None and 400 <= status < 500 and status != 429:
                    last_e = f"HTTP error: {e}"
                    logging.exception("BGPT non-retryable HTTP error: %s", e)
                    break
                if self.check_if_canceled("BGPT processing"):
                    return
                last_e = f"Network error: {e}"
                logging.exception("BGPT network error: %s", e)
                if will_retry:
                    time.sleep(self._param.delay_after_error)
            except requests.RequestException as e:
                if self.check_if_canceled("BGPT processing"):
                    return
                last_e = f"Network error: {e}"
                logging.exception("BGPT network error: %s", e)
                if will_retry:
                    time.sleep(self._param.delay_after_error)
            except Exception as e:
                if self.check_if_canceled("BGPT processing"):
                    return
                # Fall back to the exception type so that an exception with an
                # empty message is still reported as an error below.
                last_e = str(e) or type(e).__name__
                logging.exception("BGPT error: %s", e)
                if will_retry:
                    time.sleep(self._param.delay_after_error)

        if last_e:
            self.set_output("_ERROR", last_e)
            return f"BGPT error: {last_e}"

        # Explicit raise instead of ``assert False`` so this guard is not
        # stripped when Python runs with -O.
        raise AssertionError(self.output())

    @staticmethod
    def _paper_url(paper: dict) -> str:
        """Return a link for *paper*: its ``url``, else a link built from its ``doi``.

        A bare DOI (optionally prefixed with ``doi:``) is turned into a
        ``https://doi.org/`` URL; any other ``doi`` value is returned unchanged.
        An empty string is returned when neither key holds a truthy value.
        """
        url = paper.get("url")
        if url:
            return url
        doi = paper.get("doi")
        if not doi:
            return ""
        text = str(doi).strip()
        if text.lower().startswith("doi:"):
            text = text[4:].strip()
        # DOI names start with the "10." directory indicator; anything else
        # (e.g. an already complete URL) is passed through untouched.
        if text.startswith("10."):
            return f"https://doi.org/{text}"
        return doi

    @staticmethod
    def _to_text(value) -> str:
        """Render a JSON-like value as plain text; empty values become ``""``.

        Lists are joined with ``", "`` and dicts are rendered as
        ``"key: value"`` pairs joined with ``"; "``, so neither shows up as a
        Python ``repr`` and empty containers count as missing.
        """
        if value is None:
            return ""
        if isinstance(value, dict):
            pairs = ((key, BGPT._to_text(item)) for key, item in value.items())
            return "; ".join(f"{key}: {text}" for key, text in pairs if text)
        if isinstance(value, (list, tuple)):
            parts = (BGPT._to_text(item) for item in value)
            return ", ".join(text for text in parts if text)
        return str(value).strip()

    def _format_bgpt_paper(self, paper: dict) -> str:
        """Format one BGPT result dict as a labelled, newline-separated text block.

        Each label takes the first non-empty value among its candidate keys;
        ``"-"`` is used when none of them is present.
        """

        def field(*names: str) -> str:
            for name in names:
                text = self._to_text(paper.get(name))
                if text:
                    return text
            return "-"

        lines = [
            f"Title: {field('title')}",
            f"Authors: {field('authors')}",
            f"Journal: {field('journal')}",
            f"Year: {field('year')}",
            f"DOI: {field('doi')}",
            f"Abstract: {field('abstract')}",
            f"Methods: {field('methods_and_experimental_techniques', 'methods')}",
            f"Sample size / population: {field('sample_size_and_population_characteristics', 'sample_size_and_population')}",
            f"Results: {field('results_and_conclusions', 'results')}",
            f"Limitations: {field('paper_limitations_and_biases', 'limitations')}",
            f"Conflicts of interest: {field('conflict_of_interest_statements', 'conflict_of_interest')}",
            f"Data availability: {field('data_availability_statements', 'data_availability')}",
            f"Blind spots: {field('study_blindspots')}",
            f"How to falsify: {field('how_to_falsify')}",
        ]
        return "\n".join(lines)

    def thoughts(self) -> str:
        """Return a short status message naming the query being searched."""
        return "Searching BGPT for structured scientific evidence on `{}`.".format(self.get_input().get("query", "-_-!"))