mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-07-04 18:45:38 +08:00
Ports the agent canvas subsystem from Python to Go.
## What's included
### Canvas Engine (Phase 0/1)
- State engine, scheduler, variable resolver, Redis checkpoint store,
cancel protocol
- **209 tests** across canvas / component / io packages
### 22 Components (P0–P4)
| Tier | Components |
|---|---|
| P0 T1+T2+T3 | LLM, Agent, ExitLoop, Switch, Categorize, Begin,
Message, Invoke |
| P1 T3 | VariableAggregator, VariableAssigner, StringTransform,
ListOperations, DataOperations |
| P2 T3 | Iteration, IterationItem, Loop, LoopItem |
| P3 T3 | UserFillUp, Fillup |
| P4 T5 | Browser, ExcelProcessor, DocsGenerator |
### DSL v2 Schema (Phase 2.5)
- Typed v2 in-memory model with v1-to-v2 auto-detect converter
- v1 legacy field stripping per plan §2.11.7
### HTTP Endpoints & Bug Fixes (Plans PR1–PR3)
- **DELETE SQL bug fix**: gorm v2 `Where("id = ?", id).Delete(...)`
pattern
- **CreateAgent validation**: title/DSL required, duplicate check, 103
envelope
- **13 new endpoints**: templates, prompts, tags, sessions CRUD,
chat/completions (SSE + non-stream stubs), rerun, test_db_connection,
logs, webhook/logs
- **756 Go unit tests** (745 → 756, +18)
- **17 → 0 Python integration test failures** (test_agents.py +
test_session_management/)
### Tools
21 eino tools: HTTPHelper, search tools, financial/data tools, mandatory
stubs
### Infrastructure
OTel observability, NATS message queue, DeepDoc gRPC client, SSRF
guards, IDOR mitigation
254 lines
7.3 KiB
Go
254 lines
7.3 KiB
Go
//
|
|
// Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
//
|
|
|
|
package tool
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"encoding/json"
|
|
"encoding/xml"
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"net/url"
|
|
"strings"
|
|
|
|
"github.com/cloudwego/eino/components/tool"
|
|
"github.com/cloudwego/eino/schema"
|
|
)
|
|
|
|
// Identity strings that Info reports to the chat model for this tool.
const (
	// arxivToolName is the value Info returns as the tool's Name.
	arxivToolName = "arxiv"

	// arxivToolDescription is the value Info returns as the tool's Desc;
	// it also spells out the keys of each returned result object.
	arxivToolDescription = "Search arXiv and return matching preprints as {title, authors, summary, pdf_url, entry_id}."
)
|
|
|
|
// arxivParams mirrors the JSON arguments object that the model passes
// to InvokableRun.
type arxivParams struct {
	// Query is the search text. InvokableRun rejects an empty value.
	Query string `json:"query"`

	// MaxResults is the requested number of results. Values <= 0 are
	// replaced by the default of 5 in buildArxivURL.
	MaxResults int `json:"max_results"`
}
|
|
|
|
// arxivResult describes a single preprint in the result list that is
// serialised back to the model.
type arxivResult struct {
	// Title is the entry title after whitespace normalisation.
	Title string `json:"title"`

	// Authors lists the non-empty author names in feed order.
	Authors []string `json:"authors"`

	// Summary is the entry summary after whitespace normalisation.
	Summary string `json:"summary"`

	// PDFURL is the link selected by pickArxivPDF.
	PDFURL string `json:"pdf_url"`

	// EntryID is the Atom <id> of the entry, copied verbatim.
	EntryID string `json:"entry_id"`
}
|
|
|
|
// arxivEnvelope is the JSON shape the model sees.
|
|
type arxivEnvelope struct {
|
|
Results []arxivResult `json:"results"`
|
|
Error string `json:"_ERROR,omitempty"`
|
|
}
|
|
|
|
// arxivAtom is the upstream Atom 1.0 envelope. We only model the fields
|
|
// we care about. Authors are stored as a flat list of <name> elements;
|
|
// pdf_url is derived from the first <link> with rel="related" and
|
|
// type="application/pdf", falling back to the entry id.
|
|
type arxivAtom struct {
|
|
XMLName xml.Name `xml:"http://www.w3.org/2005/Atom feed"`
|
|
Title string `xml:"title"`
|
|
Entries []arxivEntry `xml:"entry"`
|
|
}
|
|
|
|
type arxivEntry struct {
|
|
Title string `xml:"title"`
|
|
ID string `xml:"id"`
|
|
Summary string `xml:"summary"`
|
|
Authors []arxivAuthor `xml:"author"`
|
|
Links []arxivLink `xml:"link"`
|
|
}
|
|
|
|
// arxivAuthor is an <author> element of an entry; only its <name>
// child is decoded.
type arxivAuthor struct {
	// Name is the text of the <name> child.
	Name string `xml:"name"`
}
|
|
|
|
// arxivLink captures the attributes of an entry's <link> element that
// pickArxivPDF inspects.
type arxivLink struct {
	// Href is the link target (the href attribute).
	Href string `xml:"href,attr"`

	// Rel is the rel attribute.
	Rel string `xml:"rel,attr"`

	// Type is the type attribute, compared against "application/pdf"
	// by pickArxivPDF.
	Type string `xml:"type,attr"`
}
|
|
|
|
// ArxivTool is the Phase 3 batch 2 implementation of the ArXiv academic
|
|
// search tool (plan §2.11.4 row 2, §5 Phase 3 第 2 批). It performs a GET
|
|
// against the public ArXiv API and parses the Atom XML response using
|
|
// the stdlib encoding/xml package.
|
|
type ArxivTool struct {
|
|
helper *HTTPHelper
|
|
}
|
|
|
|
// NewArxivTool returns an ArxivTool using the default HTTPHelper.
|
|
func NewArxivTool() *ArxivTool {
|
|
return NewArxivToolWith(NewHTTPHelper())
|
|
}
|
|
|
|
// NewArxivToolWith returns an ArxivTool that uses the provided
|
|
// HTTPHelper. Useful for tests.
|
|
func NewArxivToolWith(h *HTTPHelper) *ArxivTool {
|
|
if h == nil {
|
|
h = NewHTTPHelper()
|
|
}
|
|
return &ArxivTool{helper: h}
|
|
}
|
|
|
|
// Info returns the tool's metadata for the chat model.
|
|
func (a *ArxivTool) Info(_ context.Context) (*schema.ToolInfo, error) {
|
|
return &schema.ToolInfo{
|
|
Name: arxivToolName,
|
|
Desc: arxivToolDescription,
|
|
ParamsOneOf: schema.NewParamsOneOfByParams(map[string]*schema.ParameterInfo{
|
|
"query": {
|
|
Type: schema.String,
|
|
Desc: "Search query (matches the arXiv `all:` field).",
|
|
Required: true,
|
|
},
|
|
"max_results": {
|
|
Type: schema.Integer,
|
|
Desc: "Maximum number of results to return. Defaults to 5.",
|
|
Required: false,
|
|
},
|
|
}),
|
|
}, nil
|
|
}
|
|
|
|
// buildArxivURL assembles the request URL for the arXiv /api/query
// endpoint. The query is sent as search_query=all:<query> and
// maxResults as max_results; a maxResults of zero or less is replaced
// by 5. Both values are query-escaped.
func buildArxivURL(query string, maxResults int) string {
	limit := maxResults
	if limit <= 0 {
		limit = 5
	}

	params := url.Values{
		"search_query": {"all:" + query},
		"max_results":  {fmt.Sprintf("%d", limit)},
	}
	return "http://export.arxiv.org/api/query?" + params.Encode()
}
|
|
|
|
// parseArxivAtom decodes the upstream Atom XML and returns the model-
|
|
// facing results. Exposed at package scope for unit testing with canned
|
|
// XML.
|
|
func parseArxivAtom(body []byte) ([]arxivResult, error) {
|
|
var feed arxivAtom
|
|
if err := xml.NewDecoder(bytes.NewReader(body)).Decode(&feed); err != nil {
|
|
return nil, fmt.Errorf("arxiv: decode atom: %w", err)
|
|
}
|
|
results := make([]arxivResult, 0, len(feed.Entries))
|
|
for _, e := range feed.Entries {
|
|
authors := make([]string, 0, len(e.Authors))
|
|
for _, a := range e.Authors {
|
|
if a.Name != "" {
|
|
authors = append(authors, a.Name)
|
|
}
|
|
}
|
|
pdfURL := pickArxivPDF(e)
|
|
results = append(results, arxivResult{
|
|
Title: normalizeArxivWhitespace(e.Title),
|
|
Authors: authors,
|
|
Summary: normalizeArxivWhitespace(e.Summary),
|
|
PDFURL: pdfURL,
|
|
EntryID: e.ID,
|
|
})
|
|
}
|
|
return results, nil
|
|
}
|
|
|
|
// pickArxivPDF returns the entry's PDF link, preferring the explicit
|
|
// rel="related" type="application/pdf" link and falling back to the
|
|
// canonical abs/.../pdf URL derived from the entry id.
|
|
func pickArxivPDF(e arxivEntry) string {
|
|
for _, l := range e.Links {
|
|
if l.Rel == "related" && l.Type == "application/pdf" && l.Href != "" {
|
|
return l.Href
|
|
}
|
|
}
|
|
// Fallback: derive from the abs id. The arXiv API returns ids like
|
|
// "http://arxiv.org/abs/2501.12345v1"; we transform the abs path
|
|
// to a /pdf/ path.
|
|
id := e.ID
|
|
if id == "" {
|
|
return ""
|
|
}
|
|
u, err := url.Parse(id)
|
|
if err != nil || u.Path == "" {
|
|
return id
|
|
}
|
|
newPath := u.Path
|
|
if i := strings.LastIndex(newPath, "abs"); i >= 0 {
|
|
newPath = newPath[:i] + "pdf" + newPath[i+len("abs"):]
|
|
} else {
|
|
newPath = "/pdf" + newPath
|
|
}
|
|
u2 := *u
|
|
u2.Path = newPath
|
|
return u2.String()
|
|
}
|
|
|
|
// normalizeArxivWhitespace rewrites s so that every run of whitespace
// (spaces, tabs, newlines) becomes a single space and nothing leads or
// trails the text. Input with no non-whitespace characters yields "".
func normalizeArxivWhitespace(s string) string {
	var b strings.Builder
	for i, word := range strings.Fields(s) {
		if i > 0 {
			b.WriteByte(' ')
		}
		b.WriteString(word)
	}
	return b.String()
}
|
|
|
|
// InvokableRun performs the ArXiv search.
|
|
func (a *ArxivTool) InvokableRun(ctx context.Context, argsJSON string, _ ...tool.Option) (string, error) {
|
|
var p arxivParams
|
|
if err := json.Unmarshal([]byte(argsJSON), &p); err != nil {
|
|
return arxivErrJSON(fmt.Errorf("arxiv: parse arguments: %w", err)),
|
|
fmt.Errorf("arxiv: parse arguments: %w", err)
|
|
}
|
|
if p.Query == "" {
|
|
return arxivErrJSON(fmt.Errorf("query is required")),
|
|
fmt.Errorf("arxiv: query is required")
|
|
}
|
|
|
|
endpoint := buildArxivURL(p.Query, p.MaxResults)
|
|
resp, err := a.helper.Do(ctx, http.MethodGet, endpoint, "", "", nil)
|
|
if err != nil {
|
|
return arxivErrJSON(err), err
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
|
return arxivErrJSON(fmt.Errorf("arxiv: upstream returned %d", resp.StatusCode)),
|
|
fmt.Errorf("arxiv: upstream returned %d", resp.StatusCode)
|
|
}
|
|
|
|
body, err := io.ReadAll(resp.Body)
|
|
if err != nil {
|
|
return arxivErrJSON(fmt.Errorf("arxiv: read body: %w", err)),
|
|
fmt.Errorf("arxiv: read body: %w", err)
|
|
}
|
|
|
|
results, err := parseArxivAtom(body)
|
|
if err != nil {
|
|
return arxivErrJSON(err), err
|
|
}
|
|
return arxivJSON(arxivEnvelope{Results: results}), nil
|
|
}
|
|
|
|
func arxivJSON(env arxivEnvelope) string {
|
|
b, err := json.Marshal(env)
|
|
if err != nil {
|
|
return fmt.Sprintf(`{"_ERROR":"arxiv: marshal result: %s"}`, err)
|
|
}
|
|
return string(b)
|
|
}
|
|
|
|
func arxivErrJSON(err error) string {
|
|
return arxivJSON(arxivEnvelope{Error: err.Error()})
|
|
}
|