mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-07-01 00:05:43 +08:00
Replaces the Python agent canvas runtime with a Go implementation that runs inside `cmd/server_main`. The canvas compiles into an eino Workflow that pauses on wait-for-user via native Interrupt/Resume (no sentinel flag) and resumes from a Redis-backed CheckPointStore. All 21 Python agent components and ~35 tools are ported with functional parity. Sandbox providers now read their JSON config from the admin-panel system_settings table with env fallback. 234 files / +35,413 / -6,111. All Go files are gofmt-clean (CI gate added); drops the v2 DSL E2E step and the gap-analysis plan (both redundant after the port ships). ## Type of change - [x] Refactoring - [x] New feature - [x] Bug fix 🤖 Generated with [Claude Code](https://claude.com/claude-code) --------- Co-authored-by: Claude <noreply@anthropic.com>
356 lines
10 KiB
Go
356 lines
10 KiB
Go
//
|
|
// Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
//
|
|
|
|
package tool
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"net/http"
|
|
"net/url"
|
|
"strconv"
|
|
"strings"
|
|
|
|
"github.com/cloudwego/eino/components/tool"
|
|
"github.com/cloudwego/eino/schema"
|
|
"golang.org/x/net/html"
|
|
)
|
|
|
|
// Identity strings for the Google Scholar tool; Info reports them as the
// tool's Name and Desc.
const (
	// googleScholarToolName is the tool name returned by Info.
	googleScholarToolName = "google_scholar"

	// googleScholarToolDescription is the tool description returned by Info.
	googleScholarToolDescription = "Search Google Scholar for academic articles. Returns {title, link, snippet, authors, year}."
)
|
|
|
|
// googleScholarParams mirrors the JSON arguments object that InvokableRun
// decodes from the model's tool call.
type googleScholarParams struct {
	// Query is the search text. InvokableRun rejects a blank query.
	Query string `json:"query"`
	// MaxResults is the requested number of results. Values <= 0 fall back
	// to 5 and the request URL caps the value at 20.
	MaxResults int `json:"max_results"`
}
|
|
|
|
// googleScholarResult describes a single article extracted from the
// Scholar results page.
type googleScholarResult struct {
	// Title is the trimmed text of the result's title anchor.
	Title string `json:"title"`
	// Link is the href of the title anchor.
	Link string `json:"link"`
	// Snippet is the trimmed text of the .gs_rs element, if present.
	Snippet string `json:"snippet"`
	// Authors is the author portion of the .gs_a line.
	Authors string `json:"authors"`
	// Year is the 4-digit year found in the .gs_a line, or "".
	Year string `json:"year"`
}
|
|
|
|
// googleScholarEnvelope is what the model sees.
|
|
type googleScholarEnvelope struct {
|
|
Results []googleScholarResult `json:"results"`
|
|
Error string `json:"_ERROR,omitempty"`
|
|
}
|
|
|
|
// googleScholarEndpoint is the base URL that buildGoogleScholarURL appends
// the query string to. It is a package-level variable rather than a
// constant so tests can point it at an httptest.Server.
var googleScholarEndpoint = "https://scholar.google.com/scholar"
|
|
|
|
// GoogleScholarTool is the
|
|
// Google Scholar search tool.
|
|
// There is no public Scholar API, so we fetch the search-results
|
|
// HTML and parse it with golang.org/x/net/html.
|
|
type GoogleScholarTool struct {
|
|
helper *HTTPHelper
|
|
}
|
|
|
|
// NewGoogleScholarTool returns a GoogleScholarTool using the default
|
|
// HTTPHelper.
|
|
func NewGoogleScholarTool() *GoogleScholarTool {
|
|
return NewGoogleScholarToolWith(NewHTTPHelper())
|
|
}
|
|
|
|
// NewGoogleScholarToolWith returns a GoogleScholarTool that uses the
|
|
// provided HTTPHelper. Useful for tests.
|
|
func NewGoogleScholarToolWith(h *HTTPHelper) *GoogleScholarTool {
|
|
if h == nil {
|
|
h = NewHTTPHelper()
|
|
}
|
|
return &GoogleScholarTool{helper: h}
|
|
}
|
|
|
|
// Info returns the tool's metadata for the chat model.
|
|
func (g *GoogleScholarTool) Info(_ context.Context) (*schema.ToolInfo, error) {
|
|
return &schema.ToolInfo{
|
|
Name: googleScholarToolName,
|
|
Desc: googleScholarToolDescription,
|
|
ParamsOneOf: schema.NewParamsOneOfByParams(map[string]*schema.ParameterInfo{
|
|
"query": {
|
|
Type: schema.String,
|
|
Desc: "Search query.",
|
|
Required: true,
|
|
},
|
|
"max_results": {
|
|
Type: schema.Integer,
|
|
Desc: "Maximum number of results to return. Defaults to 5 (max 20 per page).",
|
|
Required: false,
|
|
},
|
|
}),
|
|
}, nil
|
|
}
|
|
|
|
// buildGoogleScholarURL composes the Scholar query URL. Centralized
|
|
// for testability.
|
|
func buildGoogleScholarURL(query string, maxResults int) string {
|
|
if maxResults <= 0 {
|
|
maxResults = 5
|
|
}
|
|
if maxResults > 20 {
|
|
maxResults = 20
|
|
}
|
|
q := url.Values{}
|
|
q.Set("q", query)
|
|
q.Set("hl", "en")
|
|
q.Set("num", strconv.Itoa(maxResults))
|
|
return googleScholarEndpoint + "?" + q.Encode()
|
|
}
|
|
|
|
// InvokableRun performs the Google Scholar search.
|
|
func (g *GoogleScholarTool) InvokableRun(ctx context.Context, argsJSON string, _ ...tool.Option) (string, error) {
|
|
var p googleScholarParams
|
|
if err := json.Unmarshal([]byte(argsJSON), &p); err != nil {
|
|
return googleScholarErrJSON(fmt.Errorf("google_scholar: parse arguments: %w", err)),
|
|
fmt.Errorf("google_scholar: parse arguments: %w", err)
|
|
}
|
|
if strings.TrimSpace(p.Query) == "" {
|
|
return googleScholarErrJSON(fmt.Errorf("query is required")),
|
|
fmt.Errorf("google_scholar: query is required")
|
|
}
|
|
|
|
endpoint := buildGoogleScholarURL(p.Query, p.MaxResults)
|
|
headers := map[string]string{
|
|
// Scholar blocks obviously non-browser UAs.
|
|
"User-Agent": "Mozilla/5.0 (compatible; ragflow/1.0)",
|
|
"Accept": "text/html,application/xhtml+xml",
|
|
}
|
|
|
|
resp, err := g.helper.Do(ctx, http.MethodGet, endpoint, "", "", headers)
|
|
if err != nil {
|
|
return googleScholarErrJSON(err), err
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
|
return googleScholarErrJSON(fmt.Errorf("google_scholar: upstream returned %d", resp.StatusCode)),
|
|
fmt.Errorf("google_scholar: upstream returned %d", resp.StatusCode)
|
|
}
|
|
|
|
results, err := parseGoogleScholarHTML(resp.Body, p.MaxResults)
|
|
if err != nil {
|
|
return googleScholarErrJSON(fmt.Errorf("google_scholar: parse html: %w", err)),
|
|
fmt.Errorf("google_scholar: parse html: %w", err)
|
|
}
|
|
return googleScholarJSON(googleScholarEnvelope{Results: results}), nil
|
|
}
|
|
|
|
// parseGoogleScholarHTML walks the Scholar search-results HTML and
|
|
// extracts the conventional .gs_rt / .gs_a / .gs_rs fields. We
|
|
// deliberately stay defensive: Scholar's markup changes without
|
|
// notice, so we tolerate missing fields and silently skip articles
|
|
// that are missing the title.
|
|
func parseGoogleScholarHTML(body interface {
|
|
Read(p []byte) (int, error)
|
|
}, maxResults int) ([]googleScholarResult, error) {
|
|
if maxResults <= 0 {
|
|
maxResults = 5
|
|
}
|
|
doc, err := html.Parse(body)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
var results []googleScholarResult
|
|
var walk func(*html.Node)
|
|
walk = func(n *html.Node) {
|
|
if len(results) >= maxResults {
|
|
return
|
|
}
|
|
if n.Type == html.ElementNode {
|
|
for _, a := range n.Attr {
|
|
if a.Key == "class" && strings.Contains(a.Val, "gs_ri") {
|
|
// gs_ri wraps one Scholar result card
|
|
if r, ok := extractScholarResult(n); ok {
|
|
results = append(results, r)
|
|
return
|
|
}
|
|
}
|
|
}
|
|
}
|
|
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
|
walk(c)
|
|
}
|
|
}
|
|
walk(doc)
|
|
return results, nil
|
|
}
|
|
|
|
// extractScholarResult pulls title/link, snippet, and authors/year
|
|
// from a single .gs_ri node. Returns ok=false when the title anchor
|
|
// is missing (e.g. PDF / citation links the search layout omits).
|
|
func extractScholarResult(card *html.Node) (googleScholarResult, bool) {
|
|
res := googleScholarResult{}
|
|
|
|
// Title + link live inside .gs_rt > a
|
|
title, link := findFirstAnchorInClassedAncestor(card, "gs_rt")
|
|
if title == "" {
|
|
return res, false
|
|
}
|
|
res.Title = strings.TrimSpace(title)
|
|
res.Link = link
|
|
|
|
// Authors + year live in .gs_a (a single line)
|
|
if t := findTextWithClass(card, "gs_a"); t != "" {
|
|
authors, year := splitScholarAuthorsYear(t)
|
|
res.Authors = authors
|
|
res.Year = year
|
|
}
|
|
|
|
// Snippet lives in .gs_rs
|
|
if t := findTextWithClass(card, "gs_rs"); t != "" {
|
|
res.Snippet = strings.TrimSpace(t)
|
|
}
|
|
|
|
return res, true
|
|
}
|
|
|
|
// findFirstAnchorInClassedAncestor returns the text and href of the
|
|
// first <a> descendant of n whose ancestor chain contains an element
|
|
// with `want` in its class list. The `want` argument lets callers
|
|
// pin the search to a specific Scholar sub-element (e.g. .gs_rt).
|
|
func findFirstAnchorInClassedAncestor(n *html.Node, want string) (string, string) {
|
|
var text, href string
|
|
var found bool
|
|
var walk func(*html.Node, bool)
|
|
walk = func(node *html.Node, inTarget bool) {
|
|
if found {
|
|
return
|
|
}
|
|
here := inTarget
|
|
if node.Type == html.ElementNode {
|
|
for _, a := range node.Attr {
|
|
if a.Key == "class" && strings.Contains(a.Val, want) {
|
|
here = true
|
|
break
|
|
}
|
|
}
|
|
if here && node.Data == "a" {
|
|
for _, a := range node.Attr {
|
|
if a.Key == "href" {
|
|
href = a.Val
|
|
}
|
|
}
|
|
text = collectText(node)
|
|
found = true
|
|
return
|
|
}
|
|
}
|
|
for c := node.FirstChild; c != nil; c = c.NextSibling {
|
|
walk(c, here)
|
|
}
|
|
}
|
|
walk(n, false)
|
|
return text, href
|
|
}
|
|
|
|
// findTextWithClass returns the concatenated text of the first
|
|
// descendant element that has `want` in its class list. If the
|
|
// matched element is empty, the search continues into its subtree.
|
|
func findTextWithClass(n *html.Node, want string) string {
|
|
var found string
|
|
var walk func(*html.Node)
|
|
walk = func(node *html.Node) {
|
|
if found != "" {
|
|
return
|
|
}
|
|
if node.Type == html.ElementNode {
|
|
for _, a := range node.Attr {
|
|
if a.Key == "class" && strings.Contains(a.Val, want) {
|
|
found = collectText(node)
|
|
return
|
|
}
|
|
}
|
|
}
|
|
for c := node.FirstChild; c != nil; c = c.NextSibling {
|
|
walk(c)
|
|
}
|
|
}
|
|
walk(n)
|
|
return found
|
|
}
|
|
|
|
// collectText concatenates all text nodes under n (trimmed of
|
|
// surrounding whitespace per node).
|
|
func collectText(n *html.Node) string {
|
|
var b strings.Builder
|
|
var walk func(*html.Node)
|
|
walk = func(node *html.Node) {
|
|
if node.Type == html.TextNode {
|
|
b.WriteString(node.Data)
|
|
}
|
|
for c := node.FirstChild; c != nil; c = c.NextSibling {
|
|
walk(c)
|
|
}
|
|
}
|
|
walk(n)
|
|
return b.String()
|
|
}
|
|
|
|
// splitScholarAuthorsYear parses the .gs_a line, which has the form
// "<authors> - <journal>, <year>", "<authors> - <year>", or either of
// those followed by " - <publisher>".
//
// Everything before the first separator is returned as the author list
// (trimmed). The year is the first standalone 4-digit year found after
// that separator. When no separator is present, the whole cleaned line is
// returned as authors so the model can still see it, and the year is
// searched for in the whole line.
//
// Non-breaking spaces are folded to plain spaces before splitting, so the
// returned authors string never contains U+00A0.
func splitScholarAuthorsYear(line string) (authors, year string) {
	// NOTE(review): Scholar is believed to emit a non-breaking space
	// before the first hyphen ("Authors\u00a0- Venue, 2019 - site").
	// Without this normalization the split would land on the second
	// separator and the year would be lost. Confirm against live markup.
	cleaned := strings.TrimSpace(strings.ReplaceAll(line, "\u00a0", " "))

	// Accept the ASCII hyphen as well as the en/em dash, and split at
	// whichever separator occurs first.
	cut, sepLen := -1, 0
	for _, sep := range []string{" - ", " \u2013 ", " \u2014 "} {
		if i := strings.Index(cleaned, sep); i >= 0 && (cut < 0 || i < cut) {
			cut, sepLen = i, len(sep)
		}
	}
	if cut < 0 {
		return cleaned, firstFourDigitYear(cleaned)
	}

	authors = strings.TrimSpace(cleaned[:cut])
	year = firstFourDigitYear(cleaned[cut+sepLen:])
	return authors, year
}

// firstFourDigitYear returns the first 4-digit year in s, or "" if none
// is found. Years 1900-2099 are recognized.
//
// Only digit runs that are exactly four digits long are considered, so
// fragments of longer numbers (page or volume numbers such as "120345")
// are not mistaken for a year.
func firstFourDigitYear(s string) string {
	isDigit := func(c byte) bool { return '0' <= c && c <= '9' }

	for i := 0; i < len(s); {
		if !isDigit(s[i]) {
			i++
			continue
		}
		// Measure the full run of consecutive digits starting at i.
		end := i
		for end < len(s) && isDigit(s[end]) {
			end++
		}
		if end-i == 4 {
			candidate := s[i:end]
			if n, err := strconv.Atoi(candidate); err == nil && n >= 1900 && n <= 2099 {
				return candidate
			}
		}
		i = end
	}
	return ""
}
|
|
|
|
func googleScholarJSON(env googleScholarEnvelope) string {
|
|
b, err := json.Marshal(env)
|
|
if err != nil {
|
|
return fmt.Sprintf(`{"_ERROR":"google_scholar: marshal result: %s"}`, err)
|
|
}
|
|
return string(b)
|
|
}
|
|
|
|
func googleScholarErrJSON(err error) string {
|
|
return googleScholarJSON(googleScholarEnvelope{Error: err.Error()})
|
|
}
|