Files
ragflow/internal/agent/runtime/selector.go
Zhichang Yu e45659868a feat(agent): ship the Go agent canvas port — eino interrupt/resume + Redis check-pointing (#16035)
Replaces the Python agent canvas runtime with a Go implementation that
runs inside `cmd/server_main`.

The canvas compiles into an eino Workflow that pauses on wait-for-user
via native Interrupt/Resume (no sentinel flag) and resumes from a
Redis-backed CheckPointStore.

All 21 Python agent components and ~35 tools are ported with functional
parity.

Sandbox providers now read their JSON config from the admin-panel
system_settings table with env fallback.

234 files / +35,413 / -6,111. All Go files are gofmt-clean (CI gate
added); drops the v2 DSL E2E step and the gap-analysis plan (both
redundant after the port ships).

## Type of change

- [x] Refactoring
- [x] New feature
- [x] Bug fix

🤖 Generated with [Claude Code](https://claude.com/claude-code)

---------

Co-authored-by: Claude <noreply@anthropic.com>
2026-06-17 13:24:03 +08:00

169 lines
5.7 KiB
Go

//
// Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Package runtime implements per-tenant runtime selection for the
// agent canvas port.
//
// Two pieces live in this package:
//
// - Selector (this file): reads/writes the per-tenant runtime
// override in Redis. The default is RuntimeGo; per-tenant
// overrides still let operators force a tenant back to Python
// during the agent_api.py deprecation window.
// - Metrics (metrics.go): Prometheus counter + histogram for
// per-run observation, keyed by runtime mode.
package runtime
import (
"context"
"fmt"
"os"
"sync"
"github.com/redis/go-redis/v9"
"go.uber.org/zap"
)
// RuntimeMode identifies which agent-canvas runtime implementation
// serves a given tenant. Supports "go" and "python"; "auto" is
// reserved for future adaptive policies.
type RuntimeMode string
const (
// RuntimeGo routes the tenant to the Go-side eino
// implementation. This is the process-wide default.
RuntimeGo RuntimeMode = "go"
// RuntimePython routes the tenant to the legacy Python
// agent_api.py implementation. Retained for the 1-release
// deprecation window; per-tenant overrides via Selector.Set
// can still force a tenant to this mode.
RuntimePython RuntimeMode = "python"
// RuntimeAuto defers to the per-tenant override, then to the
// process-wide Default(). It exists as a sentinel for clients that
// want explicit "I don't care, pick for me" semantics.
RuntimeAuto RuntimeMode = "auto"
)
// defaultEnvKey is the environment variable consulted by Default() when no
// override is registered for a tenant.
const defaultEnvKey = "RAGFLOW_CANVAS_DEFAULT_RUNTIME"
// overrideKeyPrefix is the Redis key namespace for per-tenant runtime
// overrides. Final keys look like "tenant_canvas_runtime:<tenantID>".
const overrideKeyPrefix = "tenant_canvas_runtime:"
var (
defaultOnce sync.Once
defaultMode RuntimeMode
)
// Default returns the process-wide default runtime mode.
//
// The default is Go. The per-tenant override (via Selector.Set)
// can still force a tenant back to Python for the 1-release
// deprecation window of agent_api.py.
//
// The value is read once from the RAGFLOW_CANVAS_DEFAULT_RUNTIME env var;
// subsequent calls return the cached result. Unknown env values fall back
// to RuntimeGo (the new default) so a misconfig still lands on the Go path.
func Default() RuntimeMode {
defaultOnce.Do(func() {
raw := os.Getenv(defaultEnvKey)
switch RuntimeMode(raw) {
case RuntimeGo, RuntimePython, RuntimeAuto:
defaultMode = RuntimeMode(raw)
default:
defaultMode = RuntimeGo
}
})
return defaultMode
}
// ResetDefaultCache clears the cached default-mode value. Test-only helper.
func ResetDefaultCache() {
defaultOnce = sync.Once{}
defaultMode = ""
}
// Selector resolves the runtime mode for a tenant at request time. It is
// safe for concurrent use.
type Selector struct {
redis *redis.Client
logger *zap.Logger
}
// NewSelector constructs a Selector backed by the supplied Redis client. A
// nil logger is replaced with zap.NewNop() so callers in tests can omit it.
func NewSelector(rdb *redis.Client, logger *zap.Logger) *Selector {
if logger == nil {
logger = zap.NewNop()
}
return &Selector{redis: rdb, logger: logger}
}
// overrideKey returns the Redis key for a tenant's runtime override.
func overrideKey(tenantID string) string {
return overrideKeyPrefix + tenantID
}
// Select returns the runtime mode registered for tenantID. The lookup
// order is:
//
// 1. The Redis key "tenant_canvas_runtime:<tenantID>" if present.
// 2. The process-wide Default() (env RAGFLOW_CANVAS_DEFAULT_RUNTIME,
// falling back to RuntimeGo).
//
// A nil Redis client short-circuits to the default and never errors.
func (s *Selector) Select(ctx context.Context, tenantID string) (RuntimeMode, error) {
if s == nil || s.redis == nil {
return Default(), nil
}
raw, err := s.redis.Get(ctx, overrideKey(tenantID)).Result()
if err == redis.Nil {
return Default(), nil
}
if err != nil {
s.logger.Warn("runtime selector: redis get failed, falling back to default",
zap.String("tenant_id", tenantID), zap.Error(err))
return Default(), err
}
mode := RuntimeMode(raw)
switch mode {
case RuntimeGo, RuntimePython, RuntimeAuto:
return mode, nil
default:
s.logger.Warn("runtime selector: unrecognized value, falling back to default",
zap.String("tenant_id", tenantID), zap.String("value", raw))
return Default(), fmt.Errorf("unrecognized runtime mode %q for tenant %q", raw, tenantID)
}
}
// Set overrides the runtime mode for a tenant. The override has no TTL
// (it is permanent until explicitly changed) so the operator does not have
// to remember to re-set it after a Redis flush of short-lived keys. Used
// by the admin runtime endpoint and tests.
func (s *Selector) Set(ctx context.Context, tenantID string, mode RuntimeMode) error {
if s == nil || s.redis == nil {
return fmt.Errorf("runtime selector: no redis client configured")
}
switch mode {
case RuntimeGo, RuntimePython, RuntimeAuto:
default:
return fmt.Errorf("runtime selector: refusing to set invalid mode %q", mode)
}
return s.redis.Set(ctx, overrideKey(tenantID), string(mode), 0).Err()
}