mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-07-02 08:45:42 +08:00
## Summary

Adds a first-class **BGPT** Agent tool (backend + UI) in response to [#15997](https://github.com/infiniflow/ragflow/issues/15997#issuecomment-4703864227).

BGPT calls `POST https://bgpt.pro/api/mcp-search` and returns structured study evidence from full-text papers — not just titles/abstracts. Each result is formatted for RAGFlow citations with:

- methods
- sample size / population
- results
- limitations
- conflicts of interest
- data availability
- study blind spots
- `how_to_falsify`

## Why this shape

- Mirrors existing literature tools (`PubMed`, `ArXiv`) and HTTP tools (`SearXNG`).
- Works on the free tier (no API key required for first 50 results).
- Optional `api_key` and `days_back` in the node/tool config.
- Surfaces both `formalized_content` and raw `json` outputs (like SearXNG).

## Files

- `agent/tools/bgpt.py` — REST client + evidence formatter
- Frontend: Operator enum, forms, tool picker, canvas accordion, en/zh locales, icon

## Demo / docs

Runnable claim-interrogation demo: https://github.com/connerlambden/bgpt-mcp/blob/main/EVIDENCE_DEMO.md

## Test plan

- [ ] Add BGPT node on Agent canvas, run query `GLP-1 alcohol craving`, verify `formalized_content` includes limitations/COI fields
- [ ] Add BGPT as Agent sub-tool under Search, verify tool-calling works
- [ ] Confirm empty query / try-run returns gracefully
- [ ] Optional: paid-tier `api_key` path

---------

Co-authored-by: Cursor <cursoragent@cursor.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
189 lines
7.3 KiB
Python
189 lines
7.3 KiB
Python
#
|
|
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
import logging
|
|
import os
|
|
import time
|
|
from abc import ABC
|
|
|
|
import requests
|
|
|
|
from agent.tools.base import ToolBase, ToolMeta, ToolParamBase
|
|
from common.connection_utils import timeout
|
|
|
|
# HTTP endpoint that BGPT._invoke POSTs the JSON search payload to.
BGPT_SEARCH_URL = "https://bgpt.pro/api/mcp-search"
|
|
|
|
|
|
class BGPTParam(ToolParamBase):
    """Configuration for the BGPT scientific-evidence search tool.

    Holds the tool metadata advertised to the agent plus three settings:
    ``top_n`` (number of results requested), ``api_key`` (optional) and
    ``days_back`` (optional look-back window).
    """

    def __init__(self):
        tool_description = (
            "BGPT searches scientific papers and returns structured evidence extracted from full-text studies: "
            "methods, sample sizes, results, limitations, conflicts of interest, data/code availability, "
            "study blind spots, quality scores, and falsification prompts. "
            "Useful when the agent must judge a scientific claim, not just find abstracts."
        )
        query_spec = {
            "type": "string",
            "description": ("Natural-language scientific search query. Use the most important terms from the user's request."),
            "default": "{sys.query}",
            "required": True,
        }
        # Assigned before super().__init__() on purpose, matching the original
        # ordering — presumably the base initializer reads it; confirm against
        # ToolParamBase before reordering.
        self.meta: ToolMeta = {
            "name": "bgpt_search",
            "description": tool_description,
            "parameters": {"query": query_spec},
        }
        super().__init__()
        self.top_n = 10
        self.api_key = ""
        self.days_back = None

    def check(self):
        """Validate ``top_n`` and ``days_back``, coercing them to ``int`` where possible.

        Raises:
            ValueError: if ``days_back`` is set but cannot be converted to an
                integer. Whatever ``check_positive_integer`` raises for an
                unacceptable ``top_n`` / ``days_back`` propagates unchanged.
        """
        candidate = self.top_n
        if isinstance(candidate, str):
            try:
                self.top_n = int(candidate.strip())
            except Exception:
                # Left unconverted on purpose: the positive-integer check below
                # then receives the original string value.
                pass
        self.check_positive_integer(self.top_n, "Top N")

        # None and "" both mean "no look-back window configured".
        if self.days_back in (None, ""):
            return
        try:
            parsed_days = int(self.days_back)
        except (TypeError, ValueError) as exc:
            raise ValueError("days_back must be an integer") from exc
        self.days_back = parsed_days
        self.check_positive_integer(self.days_back, "Days back")

    def get_input_form(self) -> dict[str, dict]:
        """Return the input-form description: a single-line ``query`` field."""
        query_field = {"name": "Query", "type": "line"}
        return {"query": query_field}
|
|
|
|
|
|
class BGPT(ToolBase, ABC):
    """Agent tool that searches BGPT for structured evidence from scientific papers.

    The query is POSTed to ``BGPT_SEARCH_URL``. Every returned paper is rendered
    as a block of labelled lines and passed to ``_retrieve_chunks``; the parsed
    result list is also exposed through the ``json`` output.
    """

    component_name = "BGPT"

    # (label, candidate response keys) in output order. For each label the
    # first key that yields non-empty text wins.
    _EVIDENCE_FIELDS = (
        ("Title", ("title",)),
        ("Authors", ("authors",)),
        ("Journal", ("journal",)),
        ("Year", ("year",)),
        ("DOI", ("doi",)),
        ("Abstract", ("abstract",)),
        ("Methods", ("methods_and_experimental_techniques", "methods")),
        ("Sample size / population", ("sample_size_and_population_characteristics", "sample_size_and_population")),
        ("Results", ("results_and_conclusions", "results")),
        ("Limitations", ("paper_limitations_and_biases", "limitations")),
        ("Conflicts of interest", ("conflict_of_interest_statements", "conflict_of_interest")),
        ("Data availability", ("data_availability_statements", "data_availability")),
        ("Blind spots", ("study_blindspots",)),
        ("How to falsify", ("how_to_falsify",)),
    )

    @timeout(int(os.environ.get("COMPONENT_EXEC_TIMEOUT", 30)))
    def _invoke(self, **kwargs):
        """Run one BGPT search and publish the results as component outputs.

        Args:
            **kwargs: must contain ``query`` (a non-blank string) for a search
                to be performed.

        Returns:
            The ``formalized_content`` output on success, ``""`` for a missing
            or blank query, a ``"BGPT error: ..."`` string once every attempt
            has failed (the message is also stored in the ``_ERROR`` output),
            or ``None`` when the run was canceled.
        """
        if self.check_if_canceled("BGPT processing"):
            return

        query = kwargs.get("query")
        if not isinstance(query, str) or not query.strip():
            self.set_output("formalized_content", "")
            return ""

        payload = {"query": query.strip(), "num_results": self._param.top_n}
        if self._param.api_key:
            payload["api_key"] = self._param.api_key
        if self._param.days_back:
            payload["days_back"] = self._param.days_back

        # Always try at least once, even if max_retries is misconfigured as negative.
        attempts = max(1, self._param.max_retries + 1)
        last_error = ""
        for attempt in range(attempts):
            if self.check_if_canceled("BGPT processing"):
                return

            try:
                # The 25 s request timeout is below the 30 s default of
                # COMPONENT_EXEC_TIMEOUT used by the decorator above.
                response = requests.post(BGPT_SEARCH_URL, json=payload, timeout=25)
                response.raise_for_status()
                data = response.json()

                if not data or not isinstance(data, dict):
                    raise ValueError("Invalid response from BGPT")

                results = data.get("results", [])
                if not isinstance(results, list):
                    raise ValueError("Invalid results format from BGPT")

                # Drop malformed entries instead of letting one bad item fail
                # the whole search on every retry.
                papers = [paper for paper in results if isinstance(paper, dict)]

                if self.check_if_canceled("BGPT processing"):
                    return

                self._retrieve_chunks(
                    papers,
                    get_title=lambda paper: paper.get("title") or "Untitled",
                    get_url=self._paper_url,
                    get_content=self._format_bgpt_paper,
                )
                self.set_output("json", papers)
                return self.output("formalized_content")

            except requests.HTTPError as e:
                # Non-retryable 4xx (e.g. 400/401/403/404) should fail fast
                # rather than wasting retries on bad requests or auth failures.
                # 429 (rate limited) stays retryable.
                status = e.response.status_code if e.response is not None else None
                if status is not None and 400 <= status < 500 and status != 429:
                    last_error = f"HTTP error: {e}"
                    logging.exception("BGPT non-retryable HTTP error: %s", e)
                    break
                if self.check_if_canceled("BGPT processing"):
                    return

                last_error = f"Network error: {e}"
                logging.exception("BGPT network error: %s", e)
            except requests.RequestException as e:
                if self.check_if_canceled("BGPT processing"):
                    return

                last_error = f"Network error: {e}"
                logging.exception("BGPT network error: %s", e)
            except Exception as e:
                if self.check_if_canceled("BGPT processing"):
                    return

                # Some exceptions stringify to ""; fall back to the class name
                # so the failure is never reported as "no error".
                last_error = str(e) or type(e).__name__
                logging.exception("BGPT error: %s", e)

            # Back off only when another attempt will actually follow.
            if attempt < attempts - 1:
                time.sleep(self._param.delay_after_error)

        # Reaching this point means every attempt failed (success returns above).
        self.set_output("_ERROR", last_error)
        return f"BGPT error: {last_error}"

    @staticmethod
    def _paper_url(paper: dict) -> str:
        """Return a link for *paper*: its ``url``, else a resolver URL built from its DOI."""
        url = paper.get("url")
        if url:
            return url
        doi = paper.get("doi")
        if not doi:
            return ""
        doi = str(doi).strip()
        lowered = doi.lower()
        if lowered.startswith(("http://", "https://")):
            return doi
        if lowered.startswith("doi:"):
            doi = doi[4:].strip()
        # A bare DOI is an identifier, not a URL; route it through the resolver.
        return f"https://doi.org/{doi}" if doi else ""

    @staticmethod
    def _render_value(value) -> str:
        """Flatten a field value into display text; ``""`` means nothing to show."""
        if value is None:
            return ""
        if isinstance(value, (list, tuple)):
            rendered = (BGPT._render_value(item) for item in value)
            return ", ".join(text for text in rendered if text)
        if isinstance(value, dict):
            pairs = ((key, BGPT._render_value(item)) for key, item in value.items())
            return "; ".join(f"{key}: {text}" for key, text in pairs if text)
        return str(value).strip()

    def _format_bgpt_paper(self, paper: dict) -> str:
        """Render one BGPT result as newline-separated ``Label: value`` lines.

        Each label takes the first of its candidate keys that has non-empty
        text. Empty strings, lists and dicts count as missing, and ``-`` is
        used when no candidate key has content.
        """

        def field(*names: str) -> str:
            for name in names:
                text = self._render_value(paper.get(name))
                if text:
                    return text
            return "-"

        return "\n".join(f"{label}: {field(*keys)}" for label, keys in self._EVIDENCE_FIELDS)

    def thoughts(self) -> str:
        """Return the progress message shown while the search runs."""
        return "Searching BGPT for structured scientific evidence on `{}`.".format(self.get_input().get("query", "-_-!"))
|