Files
ragflow/internal/agent/tool/crawler.go
Zhichang Yu e45659868a feat(agent): ship the Go agent canvas port — eino interrupt/resume + Redis check-pointing (#16035)
Replaces the Python agent canvas runtime with a Go implementation that
runs inside `cmd/server_main`.

The canvas compiles into an eino Workflow that pauses on wait-for-user
via native Interrupt/Resume (no sentinel flag) and resumes from a
Redis-backed CheckPointStore.

All 21 Python agent components and ~35 tools are ported with functional
parity.

Sandbox providers now read their JSON config from the admin-panel
system_settings table with env fallback.

234 files / +35,413 / -6,111. All Go files are gofmt-clean (CI gate
added); drops the v2 DSL E2E step and the gap-analysis plan (both
redundant after the port ships).

## Type of change

- [x] Refactoring
- [x] New feature
- [x] Bug fix

🤖 Generated with [Claude Code](https://claude.com/claude-code)

---------

Co-authored-by: Claude <noreply@anthropic.com>
2026-06-17 13:24:03 +08:00

307 lines
10 KiB
Go

//
// Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
package tool
import (
"context"
"encoding/json"
"errors"
"fmt"
"io"
"net"
"net/http"
"strings"
"github.com/cloudwego/eino/components/tool"
"github.com/cloudwego/eino/schema"
"golang.org/x/net/html"
)
const crawlerToolName = "crawler"
const crawlerToolDescription = "Fetches a web page and returns its extracted text content and links."
// crawlerArgs is the JSON shape the model sends in. max_depth and
// max_pages are accepted for API symmetry with the Python tool, but
// the current implementation only supports depth=0 (single page
// fetch).
type crawlerArgs struct {
URL string `json:"url"`
MaxDepth int `json:"max_depth,omitempty"`
MaxPages int `json:"max_pages,omitempty"`
}
// crawlerResult is the JSON envelope returned to the model. The shape
// mirrors the Python tool's `content` / `links` output.
type crawlerResult struct {
URL string `json:"url"`
Title string `json:"title,omitempty"`
Content string `json:"content,omitempty"`
Links []string `json:"links,omitempty"`
Status int `json:"status,omitempty"`
Error string `json:"_ERROR,omitempty"`
}
// Resolver validates a URL and returns the pinned IP for the host. The
// returned IP is dialed directly by HTTPHelper.DoPinned which defeats
// DNS rebinding: an attacker cannot swap a public record for a private
// one between the resolver's lookup and the actual connect, because
// the connect is pinned at the *http.Transport dialer layer (see
// pinnedDialer in http_helper.go) and never re-resolves the hostname.
// The request URL host is preserved so TLS SNI and cert verification
// continue to target the validated hostname.
//
// The default production resolver is ResolveAndValidate (ssrf.go),
// which rejects loopback / link-local / private / metadata targets and
// returns the first safe A/AAAA record.
type Resolver func(rawURL string) (host string, ip net.IP, err error)
// CrawlerTool is the Crawler tool. It fetches a single page
// (max_depth=0) via HTTPHelper and extracts text + links with
// golang.org/x/net/html.
type CrawlerTool struct {
helper *HTTPHelper
// resolve is the URL resolver used to block internal / metadata
// targets AND to pin the host to a known-safe IP. It is a function
// field (rather than a hard call to ResolveAndValidate) so unit tests
// that use httptest.NewServer (which binds to 127.0.0.1) can swap in
// a no-op that returns the literal IP. Production construction
// always uses ResolveAndValidate.
resolve Resolver
}
// NewCrawlerTool returns a CrawlerTool using the default HTTPHelper.
// Pass NewCrawlerToolWith(helper) to inject a custom HTTPHelper (e.g.
// with a test transport).
func NewCrawlerTool() *CrawlerTool {
return NewCrawlerToolWith(NewHTTPHelper())
}
// NewCrawlerToolWith returns a CrawlerTool that uses the provided
// HTTPHelper. Useful for tests and for sharing a single helper across
// multiple tool instances.
func NewCrawlerToolWith(h *HTTPHelper) *CrawlerTool {
if h == nil {
h = NewHTTPHelper()
}
return &CrawlerTool{helper: h, resolve: ResolveAndValidate}
}
// WithResolver replaces the URL resolver (which performs the SSRF
// check and supplies the pinned IP) with a custom function. The default
// is ResolveAndValidate; tests that point the crawler at an
// httptest.NewServer (127.0.0.1) can pass a no-op that returns the
// literal host. Returns the same receiver for fluent use.
func (c *CrawlerTool) WithResolver(fn Resolver) *CrawlerTool {
if fn != nil {
c.resolve = fn
}
return c
}
// Info returns the tool's metadata for the chat model.
func (c *CrawlerTool) Info(_ context.Context) (*schema.ToolInfo, error) {
return &schema.ToolInfo{
Name: crawlerToolName,
Desc: crawlerToolDescription,
ParamsOneOf: schema.NewParamsOneOfByParams(map[string]*schema.ParameterInfo{
"url": {
Type: schema.String,
Desc: "The URL to fetch. Must be http or https.",
Required: true,
},
"max_depth": {
Type: schema.Integer,
Desc: "Recursion depth. 0 only; >0 returns an error.",
Required: false,
},
"max_pages": {
Type: schema.Integer,
Desc: "Maximum number of pages to fetch. Ignored (single page).",
Required: false,
},
}),
}, nil
}
// ErrCrawlerDepthUnsupported is returned when the caller asks for
// max_depth>0. Multi-page crawling is out of scope.
var ErrCrawlerDepthUnsupported = errors.New(
"crawler: max_depth > 0 is not supported; " +
"use a single-page fetch (max_depth=0)",
)
// InvokableRun fetches a single page and returns extracted text + links.
// max_depth>0 is rejected; multi-page crawling is deferred to a later
// batch.
func (c *CrawlerTool) InvokableRun(ctx context.Context, argumentsInJSON string, _ ...tool.Option) (string, error) {
var args crawlerArgs
if argumentsInJSON == "" {
return crawlerStubResult(crawlerResult{Error: "arguments are required"}),
errors.New("crawler: empty arguments")
}
if err := json.Unmarshal([]byte(argumentsInJSON), &args); err != nil {
return crawlerStubResult(crawlerResult{Error: "invalid JSON: " + err.Error()}),
fmt.Errorf("crawler: parse arguments: %w", err)
}
if strings.TrimSpace(args.URL) == "" {
return crawlerStubResult(crawlerResult{Error: "url is required"}),
errors.New("crawler: empty url")
}
if !strings.HasPrefix(args.URL, "http://") && !strings.HasPrefix(args.URL, "https://") {
return crawlerStubResult(crawlerResult{URL: args.URL, Error: "url must be http or https"}),
fmt.Errorf("crawler: unsupported url scheme: %s", args.URL)
}
// Reject max_depth > 0 BEFORE the SSRF guard: the guard performs a
// DNS lookup that may be slow / fail in CI, and a depth-0 caller
// asking for max_depth=10 should be rejected on a structural
// problem first.
if args.MaxDepth > 0 {
return crawlerStubResult(crawlerResult{URL: args.URL, Error: ErrCrawlerDepthUnsupported.Error()}),
ErrCrawlerDepthUnsupported
}
// SSRF guard + DNS-rebinding pinning. c.resolve validates the URL
// and returns the IP we should dial directly. DoPinned installs a
// transport-level pinned dialer that connects to that IP, while the
// request URL host stays as the original hostname — so an attacker
// who flips the A record to a private address after this point
// still cannot redirect the request (the connect is pinned to the
// IP we resolved here) AND TLS SNI / cert verification continue to
// target the validated hostname. Rewriting the URL host to the IP
// would have broken HTTPS, so the pinning happens in the dialer.
host, pinnedIP, resolveErr := c.resolve(args.URL)
if resolveErr != nil {
return crawlerStubResult(crawlerResult{URL: args.URL, Error: resolveErr.Error()}), resolveErr
}
resp, err := c.helper.DoPinned(ctx, http.MethodGet, args.URL, "", "", nil, host, pinnedIP)
if err != nil {
return crawlerStubResult(crawlerResult{URL: args.URL, Error: err.Error()}), err
}
defer resp.Body.Close()
body, err := io.ReadAll(resp.Body)
if err != nil {
return crawlerStubResult(crawlerResult{URL: args.URL, Status: resp.StatusCode, Error: "read body: " + err.Error()}),
fmt.Errorf("crawler: read body: %w", err)
}
page, err := extractPage(body)
if err != nil {
return crawlerStubResult(crawlerResult{URL: args.URL, Status: resp.StatusCode, Error: err.Error()}),
fmt.Errorf("crawler: extract: %w", err)
}
page.URL = args.URL
page.Status = resp.StatusCode
return crawlerJSON(page)
}
// extractPage parses the HTML body and returns its title, plain text
// content, and absolute links. It uses golang.org/x/net/html per plan
// §2.11.4 (T2: HTTP + golang.org/x/net/html).
func extractPage(body []byte) (crawlerResult, error) {
doc, err := html.Parse(strings.NewReader(string(body)))
if err != nil {
return crawlerResult{}, fmt.Errorf("parse html: %w", err)
}
var out crawlerResult
var titleNodes []*html.Node
var text strings.Builder
var walk func(n *html.Node)
walk = func(n *html.Node) {
if n.Type == html.ElementNode {
switch strings.ToLower(n.Data) {
case "script", "style", "noscript", "template", "svg":
// skip non-content subtrees entirely
return
case "head":
// recurse into head only to capture <title>; skip text
// (so meta tags etc. don't pollute the body text)
for c := n.FirstChild; c != nil; c = c.NextSibling {
if c.Type == html.ElementNode && strings.EqualFold(c.Data, "title") {
titleNodes = append(titleNodes, c)
}
}
return
case "title":
titleNodes = append(titleNodes, n)
case "a":
for _, a := range n.Attr {
if strings.EqualFold(a.Key, "href") {
href := strings.TrimSpace(a.Val)
if href != "" && !strings.HasPrefix(href, "#") {
out.Links = append(out.Links, href)
}
break
}
}
}
}
if n.Type == html.TextNode {
t := strings.TrimSpace(n.Data)
if t != "" {
if text.Len() > 0 {
text.WriteByte(' ')
}
text.WriteString(t)
}
}
for c := n.FirstChild; c != nil; c = c.NextSibling {
walk(c)
}
}
walk(doc)
// Extract title text (concatenate text nodes inside <title>).
for _, tn := range titleNodes {
var t strings.Builder
for c := tn.FirstChild; c != nil; c = c.NextSibling {
if c.Type == html.TextNode {
t.WriteString(c.Data)
}
}
title := strings.TrimSpace(t.String())
if title != "" {
out.Title = title
break
}
}
out.Content = strings.Join(strings.Fields(text.String()), " ")
return out, nil
}
func crawlerStubResult(r crawlerResult) string {
b, err := json.Marshal(r)
if err != nil {
return fmt.Sprintf(`{"_ERROR":"crawler: marshal: %s"}`, err)
}
return string(b)
}
func crawlerJSON(r crawlerResult) (string, error) {
b, err := json.Marshal(r)
if err != nil {
return "", fmt.Errorf("crawler: marshal result: %w", err)
}
return string(b), nil
}