Files
ragflow/internal/agent/tool/pubmed.go
Zhichang Yu 3fa15c0e2f feat(agent): Go port — canvas engine, 22 components, DSL v2, 13 endpoints (#15952)
Ports the agent canvas subsystem from Python to Go.

## What's included

### Canvas Engine (Phase 0/1)
- State engine, scheduler, variable resolver, Redis checkpoint store,
cancel protocol
- **209 tests** across canvas / component / io packages

### 22 Components (P0–P4)
| Tier | Components |
|---|---|
| P0 T1+T2+T3 | LLM, Agent, ExitLoop, Switch, Categorize, Begin,
Message, Invoke |
| P1 T3 | VariableAggregator, VariableAssigner, StringTransform,
ListOperations, DataOperations |
| P2 T3 | Iteration, IterationItem, Loop, LoopItem |
| P3 T3 | UserFillUp, Fillup |
| P4 T5 | Browser, ExcelProcessor, DocsGenerator |

### DSL v2 Schema (Phase 2.5)
- Typed v2 in-memory model with v1-to-v2 auto-detect converter
- v1 legacy field stripping per plan §2.11.7

### HTTP Endpoints & Bug Fixes (Plans PR1–PR3)
- **DELETE SQL bug fix**: gorm v2 `Where("id = ?", id).Delete(...)`
pattern
- **CreateAgent validation**: title/DSL required, duplicate check, 103
envelope
- **13 new endpoints**: templates, prompts, tags, sessions CRUD,
chat/completions (SSE + non-stream stubs), rerun, test_db_connection,
logs, webhook/logs
- **756 Go unit tests** (745 → 756, +18)
- **17 → 0 Python integration test failures** (test_agents.py +
test_session_management/)

### Tools
21 eino tools: HTTPHelper, search tools, financial/data tools, mandatory
stubs

### Infrastructure
OTel observability, NATS message queue, DeepDoc gRPC client, SSRF
guards, IDOR mitigation
2026-06-12 22:58:28 +08:00

317 lines
9.9 KiB
Go

//
// Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
package tool
import (
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"net/url"
"strconv"
"strings"
"github.com/cloudwego/eino/components/tool"
"github.com/cloudwego/eino/schema"
)
// pubmedToolName is the identifier reported as the tool's Name by Info.
const pubmedToolName = "pubmed"
// pubmedToolDescription is the summary reported as the tool's Desc by
// Info; it lists the fields carried by each returned record.
const pubmedToolDescription = "Search PubMed via NCBI E-utilities. Returns {pmid, title, authors, journal, year}."
// pubmedParams is the JSON shape the model sends into InvokableRun.
type pubmedParams struct {
// Query is the PubMed search term. InvokableRun rejects the call when
// it is empty or contains only whitespace.
Query string `json:"query"`
// MaxResults is the requested number of records. When the esearch URL
// is built, values <= 0 fall back to 5 and values above 100 are
// capped at 100.
MaxResults int `json:"max_results"`
}
// pubmedResult is one row in the returned record list. InvokableRun
// builds one row per PMID that esearch returned and that esummary
// supplied an article for.
type pubmedResult struct {
// PMID is the PubMed identifier as returned by esearch.
PMID string `json:"pmid"`
// Title is the article title with surrounding whitespace trimmed.
Title string `json:"title"`
// Authors is the author list flattened by joinAuthorNames.
Authors string `json:"authors"`
// Journal is the esummary "fulljournalname" value, copied verbatim.
Journal string `json:"journal"`
// Year is derived from the esummary "pubdate" value by
// firstFourDigitYear (defined outside this view).
Year string `json:"year"`
}
// pubmedEnvelope is what the model sees: the JSON document returned as
// the string result of InvokableRun, for successes and failures alike.
type pubmedEnvelope struct {
// Results holds the matched records. The tag has no omitempty, so a
// nil slice (as left by pubmedErrJSON) encodes as JSON null while an
// empty non-nil slice encodes as [].
Results []pubmedResult `json:"results"`
// Error carries the failure message under the "_ERROR" key and is
// omitted from the JSON when empty.
Error string `json:"_ERROR,omitempty"`
}
// pubmedUserAgent is the User-Agent header value sent with every
// E-utilities request made by this tool. The original note states that
// NCBI requires it and otherwise silently drops or rate-limits requests
// (NOTE(review): confirm against NCBI's current usage guidelines).
const pubmedUserAgent = "ragflow/1.0"
// pubmedESearchEndpoint is the E-utilities esearch URL. It is a
// package var rather than a const so tests can substitute an
// httptest.Server URL.
var pubmedESearchEndpoint = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
// pubmedESummaryEndpoint is the E-utilities esummary URL. It is a
// package var rather than a const so tests can substitute an
// httptest.Server URL.
var pubmedESummaryEndpoint = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
// PubMedTool is the Phase 3 batch 3 implementation of the PubMed
// search tool (plan §2.11.4 row 13, §5 Phase 3 batch 3). It uses NCBI
// E-utilities: esearch returns a list of PMIDs, then esummary fetches
// the full records for those PMIDs.
type PubMedTool struct {
// helper issues the outbound HTTP requests for both lookup steps.
helper *HTTPHelper
}
// NewPubMedTool builds a PubMedTool backed by a freshly constructed
// default HTTPHelper.
func NewPubMedTool() *PubMedTool {
	defaultHelper := NewHTTPHelper()
	return NewPubMedToolWith(defaultHelper)
}
// NewPubMedToolWith builds a PubMedTool around the supplied HTTPHelper,
// which lets tests inject their own. A nil helper is replaced by the
// default one from NewHTTPHelper.
func NewPubMedToolWith(h *HTTPHelper) *PubMedTool {
	t := &PubMedTool{helper: h}
	if t.helper == nil {
		t.helper = NewHTTPHelper()
	}
	return t
}
// Info describes the tool to the chat model: its name, its description,
// and the two parameters it accepts (a required "query" string and an
// optional "max_results" integer). It never returns an error.
func (p *PubMedTool) Info(_ context.Context) (*schema.ToolInfo, error) {
	queryParam := &schema.ParameterInfo{
		Type:     schema.String,
		Desc:     "PubMed search query (full PubMed query syntax supported).",
		Required: true,
	}
	limitParam := &schema.ParameterInfo{
		Type:     schema.Integer,
		Desc:     "Maximum number of records to return. Defaults to 5 (max 100 per request).",
		Required: false,
	}

	info := &schema.ToolInfo{
		Name: pubmedToolName,
		Desc: pubmedToolDescription,
		ParamsOneOf: schema.NewParamsOneOfByParams(map[string]*schema.ParameterInfo{
			"query":       queryParam,
			"max_results": limitParam,
		}),
	}
	return info, nil
}
// buildPubMedESearchURL returns the esearch request URL for query,
// asking for JSON output from the "pubmed" database. maxResults is
// normalised before use: values <= 0 become 5 and values above 100
// become 100. Kept as a standalone function so it can be tested
// without performing a request.
func buildPubMedESearchURL(query string, maxResults int) string {
	limit := maxResults
	switch {
	case limit <= 0:
		limit = 5
	case limit > 100:
		limit = 100
	}

	params := url.Values{
		"db":      {"pubmed"},
		"term":    {query},
		"retmax":  {strconv.Itoa(limit)},
		"retmode": {"json"},
	}
	return pubmedESearchEndpoint + "?" + params.Encode()
}
// buildPubMedESummaryURL returns the esummary request URL for the given
// PMIDs, which are sent as one comma-separated "id" parameter. JSON
// output from the "pubmed" database is requested. Kept as a standalone
// function so it can be tested without performing a request.
func buildPubMedESummaryURL(pmids []string) string {
	idList := strings.Join(pmids, ",")
	params := url.Values{
		"db":      {"pubmed"},
		"id":      {idList},
		"retmode": {"json"},
	}
	return pubmedESummaryEndpoint + "?" + params.Encode()
}
// pubmedESearchResponse is the upstream esearch envelope. Only the
// fields this tool reads are declared; any other keys in the response
// are ignored by the decoder.
type pubmedESearchResponse struct {
// ESearchResult mirrors the top-level "esearchresult" object.
ESearchResult struct {
// IDList is the list of matching PMIDs, in the order returned.
IDList []string `json:"idlist"`
} `json:"esearchresult"`
}
// pubmedESummaryAuthor is one author in the esummary author list. Only
// the display name is decoded.
type pubmedESummaryAuthor struct {
// Name is the author's name exactly as given in the "name" key.
Name string `json:"name"`
}
// pubmedESummaryArticle is one article in the esummary result map,
// limited to the fields that InvokableRun copies into a pubmedResult.
type pubmedESummaryArticle struct {
// Title is the article title; InvokableRun trims it before use.
Title string `json:"title"`
// Authors is the author list in upstream order.
Authors []pubmedESummaryAuthor `json:"authors"`
// FullJournalName becomes the result's Journal field unchanged.
FullJournalName string `json:"fulljournalname"`
// PubDate is the raw publication date string; the result's Year is
// extracted from it by firstFourDigitYear (defined outside this view).
PubDate string `json:"pubdate"`
}
// pubmedESummaryResponse is the upstream esummary envelope. The
// `result` value mixes per-PMID article objects with a string-list
// `uids` key, so it is decoded as a map of RawMessage first; the
// entries are then walked individually to extract the article objects
// (skipping the `uids` array). See decodePubMedESummary.
type pubmedESummaryResponse struct {
// Result maps each key of the "result" object to its undecoded JSON.
Result map[string]json.RawMessage `json:"result"`
}
// mustReadAll reads r to the end and returns the bytes. Despite the
// "must" prefix it does not panic: a read failure is swallowed and nil
// is returned, discarding both the error and any partially read data.
// Callers therefore cannot tell a failed read from one that produced
// nothing useful.
func mustReadAll(r io.Reader) []byte {
	if data, err := io.ReadAll(r); err == nil {
		return data
	}
	return nil
}
// decodePubMedESummary parses a raw esummary response body and returns
// a PMID → article map.
//
// The upstream "result" object is flat: its keys are PMIDs whose values
// are article objects, plus a `uids` key whose value is a list of
// strings. Decoding straight into map[string]pubmedESummaryArticle
// would fail on `uids`, because a JSON array cannot be unmarshalled
// into a struct, so each entry is kept as raw JSON and decoded
// individually.
//
// Entries whose raw JSON starts with '[' are skipped, and so are
// entries that do not decode into an article. An error is returned only
// when the body as a whole is not valid JSON for the envelope.
func decodePubMedESummary(body []byte) (map[string]pubmedESummaryArticle, error) {
	var envelope pubmedESummaryResponse
	err := json.Unmarshal(body, &envelope)
	if err != nil {
		return nil, err
	}

	articles := make(map[string]pubmedESummaryArticle, len(envelope.Result))
	for key, entry := range envelope.Result {
		// Array-valued entries (the `uids` list) are not articles.
		isArray := len(entry) != 0 && entry[0] == '['
		if isArray {
			continue
		}
		var parsed pubmedESummaryArticle
		if json.Unmarshal(entry, &parsed) == nil {
			articles[key] = parsed
		}
	}
	return articles, nil
}
// InvokableRun performs the two-step PubMed lookup: esearch resolves
// the query to a list of PMIDs, then esummary fetches the records for
// those PMIDs.
//
// argsJSON is the JSON-encoded pubmedParams sent by the model. The
// returned string is always a JSON-encoded pubmedEnvelope. On failure
// the envelope carries the message under "_ERROR" and the failure is
// also returned as the error, so both the model and the caller see it.
// A query that matches nothing yields an empty "results" array and a
// nil error.
func (p *PubMedTool) InvokableRun(ctx context.Context, argsJSON string, _ ...tool.Option) (string, error) {
	// fail pairs the model-facing envelope with the Go error so that each
	// error value is built exactly once.
	fail := func(err error) (string, error) {
		return pubmedErrJSON(err), err
	}

	var params pubmedParams
	if err := json.Unmarshal([]byte(argsJSON), &params); err != nil {
		return fail(fmt.Errorf("pubmed: parse arguments: %w", err))
	}
	if strings.TrimSpace(params.Query) == "" {
		// The envelope text and the returned error keep their existing,
		// slightly different wording.
		return pubmedErrJSON(fmt.Errorf("query is required")),
			fmt.Errorf("pubmed: query is required")
	}

	headers := map[string]string{
		"User-Agent": pubmedUserAgent,
		"Accept":     "application/json",
	}

	// Step 1: esearch → list of PMIDs. buildPubMedESearchURL applies the
	// default and upper bound for MaxResults, so it is passed through as-is.
	pmids, err := p.searchPMIDs(ctx, params.Query, params.MaxResults, headers)
	if err != nil {
		return fail(err)
	}
	if len(pmids) == 0 {
		return pubmedJSON(pubmedEnvelope{Results: []pubmedResult{}}), nil
	}

	// Step 2: esummary → full records.
	results, err := p.fetchResults(ctx, pmids, headers)
	if err != nil {
		return fail(err)
	}
	return pubmedJSON(pubmedEnvelope{Results: results}), nil
}

// searchPMIDs runs the esearch request and returns the matching PMIDs.
// The HTTP status is checked before the body is decoded so that a
// non-2xx reply is reported as such even when its body is not JSON.
func (p *PubMedTool) searchPMIDs(ctx context.Context, query string, maxResults int, headers map[string]string) ([]string, error) {
	resp, err := p.helper.Do(ctx, http.MethodGet, buildPubMedESearchURL(query, maxResults), "", "", headers)
	if err != nil {
		return nil, err
	}
	// Nothing useful can be done with a Close failure on a read-only body.
	defer func() { _ = resp.Body.Close() }()

	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return nil, fmt.Errorf("pubmed: esearch returned %d", resp.StatusCode)
	}
	var body pubmedESearchResponse
	if decErr := json.NewDecoder(resp.Body).Decode(&body); decErr != nil {
		return nil, fmt.Errorf("pubmed: decode esearch: %w", decErr)
	}
	return body.ESearchResult.IDList, nil
}

// fetchResults runs the esummary request for pmids and converts the
// returned articles into result rows, keeping the order of pmids.
// PMIDs without an article in the response are skipped.
func (p *PubMedTool) fetchResults(ctx context.Context, pmids []string, headers map[string]string) ([]pubmedResult, error) {
	resp, err := p.helper.Do(ctx, http.MethodGet, buildPubMedESummaryURL(pmids), "", "", headers)
	if err != nil {
		return nil, err
	}
	// Nothing useful can be done with a Close failure on a read-only body.
	defer func() { _ = resp.Body.Close() }()

	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return nil, fmt.Errorf("pubmed: esummary returned %d", resp.StatusCode)
	}
	// Read explicitly so that a transport failure is reported as a read
	// error rather than as a JSON parse error on a truncated body.
	raw, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("pubmed: read esummary: %w", err)
	}
	articles, err := decodePubMedESummary(raw)
	if err != nil {
		return nil, fmt.Errorf("pubmed: parse esummary: %w", err)
	}

	results := make([]pubmedResult, 0, len(pmids))
	for _, pmid := range pmids {
		article, ok := articles[pmid]
		if !ok {
			continue
		}
		results = append(results, pubmedResult{
			PMID:    pmid,
			Title:   strings.TrimSpace(article.Title),
			Authors: joinAuthorNames(article.Authors),
			Journal: article.FullJournalName,
			Year:    firstFourDigitYear(article.PubDate),
		})
	}
	return results, nil
}
// joinAuthorNames renders an author list as a single string. Up to
// three names are joined with ", "; when there are more than three,
// only the first three are listed and "et al." is appended as a final
// element. An empty list yields the empty string.
func joinAuthorNames(authors []pubmedESummaryAuthor) string {
	// maxListed is the number of names spelled out before "et al.".
	const maxListed = 3

	parts := make([]string, 0, maxListed+1)
	for idx, author := range authors {
		if idx == maxListed {
			parts = append(parts, "et al.")
			break
		}
		parts = append(parts, author.Name)
	}
	return strings.Join(parts, ", ")
}
// pubmedJSON serialises env to the JSON string handed back to the
// model. If marshalling fails, it returns a minimal object whose
// "_ERROR" value describes the failure. That message is itself JSON
// encoded so that quotes or control characters in the error text
// cannot produce malformed output.
func pubmedJSON(env pubmedEnvelope) string {
	b, err := json.Marshal(env)
	if err != nil {
		// Marshalling a plain string does not fail, so this error is
		// deliberately ignored.
		msg, _ := json.Marshal("pubmed: marshal result: " + err.Error())
		return fmt.Sprintf(`{"_ERROR":%s}`, msg)
	}
	return string(b)
}
func pubmedErrJSON(err error) string {
return pubmedJSON(pubmedEnvelope{Error: err.Error()})
}