Files
ragflow/internal/agent/workflowx/parallel.go
Zhichang Yu 3fa15c0e2f feat(agent): Go port — canvas engine, 22 components, DSL v2, 13 endpoints (#15952)
Ports the agent canvas subsystem from Python to Go.

## What's included

### Canvas Engine (Phase 0/1)
- State engine, scheduler, variable resolver, Redis checkpoint store,
cancel protocol
- **209 tests** across canvas / component / io packages

### 22 Components (P0–P4)
| Tier | Components |
|---|---|
| P0 T1+T2+T3 | LLM, Agent, ExitLoop, Switch, Categorize, Begin,
Message, Invoke |
| P1 T3 | VariableAggregator, VariableAssigner, StringTransform,
ListOperations, DataOperations |
| P2 T3 | Iteration, IterationItem, Loop, LoopItem |
| P3 T3 | UserFillUp, Fillup |
| P4 T5 | Browser, ExcelProcessor, DocsGenerator |

### DSL v2 Schema (Phase 2.5)
- Typed v2 in-memory model with v1-to-v2 auto-detect converter
- v1 legacy field stripping per plan §2.11.7

### HTTP Endpoints & Bug Fixes (Plans PR1–PR3)
- **DELETE SQL bug fix**: gorm v2 `Where("id = ?", id).Delete(...)`
pattern
- **CreateAgent validation**: title/DSL required, duplicate check, 103
envelope
- **13 new endpoints**: templates, prompts, tags, sessions CRUD,
chat/completions (SSE + non-stream stubs), rerun, test_db_connection,
logs, webhook/logs
- **756 Go unit tests** (745 → 756, +18)
- **17 → 0 Python integration test failures** (test_agents.py +
test_session_management/)

### Tools
21 eino tools: HTTPHelper, search tools, financial/data tools, mandatory
stubs

### Infrastructure
OTel observability, NATS message queue, DeepDoc gRPC client, SSRF
guards, IDOR mitigation
2026-06-12 22:58:28 +08:00

796 lines
27 KiB
Go

//
// Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Package workflowx parallel extension.
//
// AddParallelNode is a zero-intrusion helper that runs a sub-workflow
// once per input item, with bounded concurrency, and supports
// per-item interrupt / resume. The shape mirrors AddLoopNode:
// the outer workflow sees a single node; the fan-out is entirely
// inside the lambda body.
//
// The first release is invoke-only on the outer lambda; the inner
// per-item sub-workflow is invoked via runner.Invoke.
//
// See .claude/plans/eino-workflow-parallel.md (and
// .omc/autopilot/spec.md) for the design rationale.
package workflowx
import (
"context"
"encoding/json"
"errors"
"fmt"
"strconv"
"sync"
"github.com/cloudwego/eino/compose"
"github.com/cloudwego/eino/schema"
)
// ParallelAddressSegment is the per-item address segment type used when
// addressing interrupts. It mirrors the batch node's
// AddressSegmentBatchProcess. The fan-out appends one segment of this
// type per item and uses the item's decimal index as the segment value.
const ParallelAddressSegment compose.AddressSegmentType = "workflowx-parallel"
// Sentinel errors for the parallel extension. Tests use errors.Is
// to assert these.
var (
// ErrParallelCompileFailed marks a compile-time failure of the
// inner sub-workflow. AddParallelNode returns an error that
// matches it under errors.Is and whose message carries the node
// key and the text of the sub.Compile failure.
ErrParallelCompileFailed = errors.New("workflowx: parallel sub-workflow compile failed")
// ErrParallelResumeStateInvalid is returned when a resume is
// requested but the persisted state cannot be used: it fails to
// decode, has a negative total count, its completed/interrupted
// indices are out of range, overlapping or incomplete, or a
// completed result cannot be converted back to the output type.
ErrParallelResumeStateInvalid = errors.New("workflowx: parallel resume state invalid")
)
// ParallelOption configures AddParallelNode. Follows the
// functional-options pattern: each option mutates the private
// parallelOptions value that getParallelOptions builds.
type ParallelOption func(*parallelOptions)
// parallelOptions is the resolved configuration of one parallel node.
// It is built by getParallelOptions and read by AddParallelNode and
// the fan-out helpers.
type parallelOptions struct {
// maxConcurrency bounds concurrent per-item invocations; values
// <= 1 select the sequential path.
maxConcurrency int
// compileOpts are forwarded to the sub-workflow's Compile call.
compileOpts []compose.GraphCompileOption
// runOpts are forwarded to every per-item Invoke call.
runOpts []compose.Option
// checkpointBuilder derives the per-item checkpoint ID from the
// node key and the item index.
checkpointBuilder func(nodeKey string, index int) string
// enableSubCheckpoint gates both the bridge CheckPointStore at
// compile time and the per-item checkpoint ID at invoke time.
enableSubCheckpoint bool
}
// WithParallelMaxConcurrency sets the upper bound on per-item
// sub-workflow invocations that may be in flight at the same time.
//
//	n <= 1 — items are processed one after another on the calling
//	         goroutine; no goroutines are spawned.
//	n > 1  — bounded fan-out through a semaphore of size n; the first
//	         item still runs on the calling goroutine.
//
// Negative values are ignored and leave the current setting untouched.
// The default is 0 (sequential).
func WithParallelMaxConcurrency(n int) ParallelOption {
	if n < 0 {
		// Out-of-range request: hand back an option that changes nothing.
		return func(*parallelOptions) {}
	}
	return func(cfg *parallelOptions) {
		cfg.maxConcurrency = n
	}
}
// WithParallelCompileOptions adds compile options to the inner
// sub-workflow's Compile call, for example to wire a Serializer on
// the inner sub-graph. Options accumulate across repeated uses.
//
// When sub-checkpointing is enabled, AddParallelNode appends its own
// bridge CheckPointStore option after the options supplied here.
// NOTE(review): a CheckPointStore passed through this option is
// therefore followed by the bridge store — confirm how eino resolves
// two WithCheckPointStore options before relying on a custom store.
func WithParallelCompileOptions(opts ...compose.GraphCompileOption) ParallelOption {
	extend := func(cfg *parallelOptions) {
		cfg.compileOpts = append(cfg.compileOpts, opts...)
	}
	return extend
}
// WithParallelRunOptions adds run options that are forwarded to every
// per-item sub-workflow Invoke call, such as per-item callbacks.
// Options accumulate across repeated uses.
func WithParallelRunOptions(opts ...compose.Option) ParallelOption {
	extend := func(cfg *parallelOptions) {
		cfg.runOpts = append(cfg.runOpts, opts...)
	}
	return extend
}
// WithParallelCheckpointIDBuilder installs the function that derives
// the checkpoint ID of each per-item sub-workflow invocation. The
// extension cannot derive child IDs from the outer checkpoint ID
// itself, so the caller may supply the scheme.
//
// Without this option the builder is the reserved-namespace default
// (workflowx-parallel:<nodeKey>:<index>), which is deterministic and
// stable across resumes. Callers that need a shared prefix can
// capture it in the closure.
//
// The builder is called on the first run and again on resume with
// the same (nodeKey, index) arguments. Returning "" makes the
// extension skip the per-item WithCheckPointID option for that item.
// A nil builder is ignored and keeps the current one.
func WithParallelCheckpointIDBuilder(b func(nodeKey string, index int) string) ParallelOption {
	if b == nil {
		// Nothing to install: hand back an option that changes nothing.
		return func(*parallelOptions) {}
	}
	return func(cfg *parallelOptions) {
		cfg.checkpointBuilder = b
	}
}
// WithParallelEnableSubCheckpoint controls whether the parallel node
// wires an internal bridge store into the sub-workflow and passes
// compose.WithCheckPointID(...) on every per-item Invoke call.
//
// Sub-checkpointing is on by default; pass false only when the
// caller explicitly wants the smaller, no-sub-checkpoint behavior.
func WithParallelEnableSubCheckpoint(enable bool) ParallelOption {
	toggle := func(cfg *parallelOptions) {
		cfg.enableSubCheckpoint = enable
	}
	return toggle
}
// defaultParallelCheckpointBuilder is the builder used when the caller
// does not install one. The ID depends only on the node key and the
// item index, so the same item receives the same ID on the first run
// and on every resume; no random component is needed.
func defaultParallelCheckpointBuilder(nodeKey string, index int) string {
	const layout = "workflowx-parallel:%s:%d"
	return fmt.Sprintf(layout, nodeKey, index)
}
// getParallelOptions resolves the caller's options on top of the
// defaults: the reserved-namespace checkpoint ID builder and
// sub-checkpointing enabled. Options are applied in argument order,
// so later options win over earlier ones.
func getParallelOptions(opts []ParallelOption) *parallelOptions {
	cfg := new(parallelOptions)
	cfg.checkpointBuilder = defaultParallelCheckpointBuilder
	cfg.enableSubCheckpoint = true
	for i := range opts {
		opts[i](cfg)
	}
	return cfg
}
// ParallelInterruptState is the parallel-local checkpoint payload.
// It is JSON-encoded and persisted as the state argument of
// compose.CompositeInterrupt so a resumed run can continue from the
// interrupted items rather than restart.
//
// The struct mirrors the reference batch node's NodeInterruptState,
// with one important adaptation: the original inputs are stored as a
// JSON byte slice (not []any) so the parallel extension can
// re-decode them with the original Go types on resume. JSON's
// default behaviour of decoding numbers into float64 would
// otherwise break integer and other typed inputs.
type ParallelInterruptState struct {
// OriginalInputsJSON is the JSON encoding of the input slice
// as seen by the parallel lambda on first run. On resume
// the lambda input is replaced by a zero value by eino's
// rerun mechanism; this byte slice is the source of truth.
OriginalInputsJSON []byte `json:"original_inputs_json"`
// CompletedResults maps an item index to the output of an item
// that was recorded as completed when the state was written.
// Values are stored as any, so after a JSON round-trip they are
// generic JSON values and are converted back to the output type
// on resume.
CompletedResults map[int]any `json:"completed_results"`
// InterruptedIndices is the list of indices that were not
// durably confirmed completed at the interrupt boundary.
// In the common case this equals "the items whose sub-workflow
// Invoke returned an interrupt". Under concurrent execution,
// however, any item that is not present in CompletedResults is
// treated conservatively as needing replay / resume, because its
// precise execution state may be unknown when the outer node
// returns a CompositeInterrupt.
InterruptedIndices []int `json:"interrupted_indices"`
// TotalCount is the size of the input slice. It is the source
// of truth for the output slice length on resume, and the
// resume validation requires CompletedResults and
// InterruptedIndices to partition [0, TotalCount).
TotalCount int `json:"total_count"`
// ItemCheckpoints is the per-item bridge-store payload captured
// at interrupt time. Keys are the per-item child checkpoint
// IDs (whatever the configured builder produced).
ItemCheckpoints map[string][]byte `json:"item_checkpoints,omitempty"`
}
// Compilable is the input type accepted by AddParallelNode. Both
// *compose.Graph[I, O] and *compose.Workflow[I, O] satisfy it.
// AddParallelNode calls Compile exactly once, at node-construction
// time, and reuses the resulting Runnable for every item.
type Compilable[I, O any] interface {
Compile(ctx context.Context, opts ...compose.GraphCompileOption) (compose.Runnable[I, O], error)
}
// AddParallelNode appends a parallel-fanout node to the outer
// workflow. The fan-out is inside the lambda body; the outer graph
// sees one node that maps []I to []O.
//
// The lambda is invoke-only in v1; its Stream handler returns
// ErrParallelOuterStreamUnsupported. Callers that need outer-stream
// parallelism should treat that as a future v2 plan.
//
// AddParallelNode compiles the sub-workflow immediately. A compile
// failure is returned as an error and the outer workflow is not
// modified. That error matches ErrParallelCompileFailed under
// errors.Is, and the original sub.Compile error is reachable through
// errors.Unwrap / errors.Is / errors.As.
//
// Parameters:
//   - ctx: passed to sub.Compile.
//   - wf: the outer workflow; must not be nil.
//   - key: the node key, also used for the default checkpoint IDs.
//   - sub: the per-item sub-workflow; must not be nil.
//   - opts: optional configuration, see the WithParallel* options.
//
// NOTE(review): the bridge state created here lives as long as the
// node and is used by every fresh (non-resume) run. Together with the
// deterministic default checkpoint IDs this means checkpoints written
// by one run stay visible to later runs of the same node — confirm
// whether a per-run store is required.
func AddParallelNode[I, O any](
	ctx context.Context,
	wf *compose.Workflow[[]I, []O],
	key string,
	sub Compilable[I, O],
	opts ...ParallelOption,
) (*compose.WorkflowNode, error) {
	if wf == nil {
		return nil, errors.New("workflowx: outer workflow is nil")
	}
	if sub == nil {
		return nil, errors.New("workflowx: sub workflow is nil")
	}
	options := getParallelOptions(opts)

	// The bridge state is captured by the lambda below; on resume the
	// invoke handler replaces it with one rehydrated from the
	// persisted ItemCheckpoints.
	bridgeState := newParallelBridgeState(nil)

	// Copy so the caller's option slice is never appended to in place.
	compileOpts := append([]compose.GraphCompileOption{}, options.compileOpts...)
	if options.enableSubCheckpoint {
		compileOpts = append(compileOpts, compose.WithCheckPointStore(bridgeState.store()))
	}

	compiled, err := sub.Compile(ctx, compileOpts...)
	if err != nil {
		// Keep both the sentinel and the compile error on the chain.
		return nil, &parallelCompileError{key: key, cause: err}
	}

	lambda, err := compose.AnyLambda[[]I, []O, struct{}](
		func(ctx context.Context, items []I, _ ...struct{}) ([]O, error) {
			return runParallelInvoke(ctx, key, compiled, items, options, bridgeState)
		},
		func(ctx context.Context, items []I, _ ...struct{}) (*schema.StreamReader[[]O], error) {
			return nil, errParallelOuterStreamUnsupported
		},
		nil,
		nil,
	)
	if err != nil {
		return nil, fmt.Errorf("workflowx: build parallel lambda: %w", err)
	}
	return wf.AddLambdaNode(key, lambda), nil
}

// parallelCompileError reports a sub-workflow compile failure for one
// parallel node. It matches ErrParallelCompileFailed under errors.Is
// and unwraps to the error returned by sub.Compile.
type parallelCompileError struct {
	key   string
	cause error
}

// Error renders "<sentinel>: <node key>: <compile error>".
func (e *parallelCompileError) Error() string {
	return fmt.Sprintf("%v: %s: %v", ErrParallelCompileFailed, e.key, e.cause)
}

// Is lets errors.Is(err, ErrParallelCompileFailed) succeed.
func (e *parallelCompileError) Is(target error) bool {
	return target == ErrParallelCompileFailed
}

// Unwrap exposes the original compile error.
func (e *parallelCompileError) Unwrap() error {
	return e.cause
}
// errParallelOuterStreamUnsupported is the documented v1 error
// returned from the outer Stream handler. Surfaced as a sentinel
// for tests to assert against via errors.Is.
var errParallelOuterStreamUnsupported = errors.New("workflowx: parallel node does not support outer stream in v1")
// ErrParallelOuterStreamUnsupported is exported so external tests
// can assert on it. It is the same error value as
// errParallelOuterStreamUnsupported, which the lambda's Stream
// handler returns directly.
var ErrParallelOuterStreamUnsupported = errParallelOuterStreamUnsupported
// runParallelInvoke is the body of the parallel lambda's Invoke
// handler. It implements the documented state machine:
//
//   - On a fresh run: process every item 0..len(items)-1 with an
//     empty completed set.
//   - On a resume: process exactly prev.InterruptedIndices, with
//     prev.CompletedResults pre-populated into the output slice and
//     carried forward into the completed set. On resume the lambda's
//     items input is replaced by a zero value by eino's rerun
//     mechanism; the canonical inputs are decoded from
//     prev.OriginalInputsJSON.
//   - If any items interrupt, return a single CompositeInterrupt
//     carrying all per-item interrupt errors and a state that lets
//     a resumed run re-enter deterministically.
//   - On a non-interrupt error, return the first one (wrapped per
//     "item %d: %w") and discard the other items' results.
//
// Parameters:
//   - nodeKey: the node key, forwarded to the checkpoint ID builder.
//   - sub: the compiled per-item sub-workflow.
//   - items: the lambda input; ignored on resume.
//   - options: the resolved node configuration.
//   - defaultBridge: the bridge state used on a fresh run; a resume
//     uses a state rebuilt from the persisted ItemCheckpoints.
//
// It returns one output per input in input order, or an error that is
// a resume-state error, the first item error, or the composite
// interrupt.
func runParallelInvoke[I, O any](
	ctx context.Context,
	nodeKey string,
	sub compose.Runnable[I, O],
	items []I,
	options *parallelOptions,
	defaultBridge *parallelBridgeState,
) ([]O, error) {
	prev, isResume, resumeErr := loadParallelSnapshot(ctx)
	if resumeErr != nil {
		return nil, resumeErr
	}
	resuming := isResume && prev != nil

	// On a resume the canonical inputs come from the persisted state;
	// on a fresh run they are the caller's items.
	effectiveItems := items
	if resuming {
		var restored []I
		if rErr := json.Unmarshal(prev.OriginalInputsJSON, &restored); rErr != nil {
			return nil, fmt.Errorf("%w: decode original_inputs_json: %v", ErrParallelResumeStateInvalid, rErr)
		}
		// The index sets were validated against TotalCount, so the
		// restored inputs must have exactly that many entries or the
		// indices below would not address real items.
		if len(restored) != prev.TotalCount {
			return nil, fmt.Errorf("%w: original inputs length %d does not match total_count %d",
				ErrParallelResumeStateInvalid, len(restored), prev.TotalCount)
		}
		effectiveItems = restored
	}
	if len(effectiveItems) == 0 {
		return []O{}, nil
	}

	totalCount := len(effectiveItems)
	outputs := make([]O, totalCount)
	completedResults := make(map[int]any, totalCount)
	bridgeState := defaultBridge

	var indicesToProcess []int
	if resuming {
		// Replay completed results into their output slots. The
		// persisted values went through a JSON round-trip, so they are
		// converted back to O first.
		for idx, v := range prev.CompletedResults {
			if idx < 0 || idx >= totalCount {
				return nil, fmt.Errorf("%w: completed index %d out of range", ErrParallelResumeStateInvalid, idx)
			}
			typed, ok := coerceAnyToO[O](v)
			if !ok {
				return nil, fmt.Errorf("%w: cannot coerce completed result at index %d (type %T) to target type", ErrParallelResumeStateInvalid, idx, v)
			}
			outputs[idx] = typed
			// Carry the result forward: if this run interrupts again,
			// the item must stay completed instead of being replayed.
			completedResults[idx] = typed
		}
		// Only re-invoke the previously-interrupted indices.
		indicesToProcess = append([]int(nil), prev.InterruptedIndices...)
		// Rehydrate the bridge store from persisted ItemCheckpoints.
		bridgeState = newParallelBridgeState(prev.ItemCheckpoints)
	} else {
		indicesToProcess = make([]int, totalCount)
		for i := range indicesToProcess {
			indicesToProcess[i] = i
		}
	}

	// Run the selected items; the concurrency policy lives in
	// runParallelFanout.
	results := runParallelFanout(ctx, nodeKey, sub, effectiveItems, indicesToProcess, options, bridgeState)

	// Drain the result channel, categorising each entry.
	var normalErr error
	var interruptErrs []error
	for r := range results {
		if r.err == nil {
			if r.index >= 0 && r.index < len(outputs) {
				if typed, ok := r.output.(O); ok {
					outputs[r.index] = typed
				}
			}
			completedResults[r.index] = r.output
			continue
		}
		if isInterruptError(r.err) {
			interruptErrs = append(interruptErrs, r.err)
			continue
		}
		// First non-interrupt error wins; keep draining so the
		// channel is fully consumed.
		if normalErr == nil {
			normalErr = fmt.Errorf("item %d: %w", r.index, r.err)
		}
	}

	// Non-interrupt error: discard every other result. No state is
	// persisted.
	if normalErr != nil {
		return nil, normalErr
	}

	// Interrupt case: persist state and rethrow via CompositeInterrupt.
	// Every non-completed index is stored, not only those that
	// explicitly surfaced an interrupt, so the completed and pending
	// sets always partition [0, totalCount).
	if len(interruptErrs) > 0 {
		inputsJSON, jErr := json.Marshal(effectiveItems)
		if jErr != nil {
			return nil, fmt.Errorf("workflowx: marshal original inputs: %w", jErr)
		}
		state, sErr := encodeParallelState(ParallelInterruptState{
			OriginalInputsJSON: inputsJSON,
			CompletedResults:   completedResults,
			InterruptedIndices: buildPendingIndices(totalCount, completedResults),
			TotalCount:         totalCount,
			ItemCheckpoints:    bridgeState.snapshot(),
		})
		if sErr != nil {
			return nil, fmt.Errorf("workflowx: encode parallel interrupt state: %w", sErr)
		}
		return nil, compose.CompositeInterrupt(ctx, nil, state, interruptErrs...)
	}
	return outputs, nil
}
// coerceAnyToO adapts a JSON-roundtripped any to the typed output O.
//
// Conversion is attempted in three steps:
//
//  1. A direct type assertion, which covers values that never went
//     through JSON and JSON-native types (string, bool, float64, any).
//  2. For a float64 value and a common numeric O, a plain Go numeric
//     conversion (JSON decodes every number into float64). The
//     fractional part is truncated, as with any float-to-int
//     conversion.
//  3. Otherwise the value is re-encoded to JSON and decoded into O.
//     This restores structs, slices, maps, named types and the
//     remaining numeric types, which arrive as map[string]any, []any
//     or float64 after a round-trip.
//
// It returns (zero, false) when v is nil or no step succeeds.
func coerceAnyToO[O any](v any) (O, bool) {
	var zero O
	if v == nil {
		return zero, false
	}
	if typed, ok := v.(O); ok {
		return typed, true
	}

	// float64 -> common numeric O. O == float64 was already handled by
	// the direct assertion above.
	if f, isFloat := v.(float64); isFloat {
		switch any(zero).(type) {
		case int:
			return any(int(f)).(O), true
		case int64:
			return any(int64(f)).(O), true
		case int32:
			return any(int32(f)).(O), true
		case float32:
			return any(float32(f)).(O), true
		case uint:
			return any(uint(f)).(O), true
		case uint64:
			return any(uint64(f)).(O), true
		case uint32:
			return any(uint32(f)).(O), true
		}
	}

	// Generic fallback: let encoding/json rebuild O from the decoded
	// value. A shape mismatch surfaces as a decode error.
	raw, err := json.Marshal(v)
	if err != nil {
		return zero, false
	}
	var out O
	if err := json.Unmarshal(raw, &out); err != nil {
		return zero, false
	}
	return out, true
}
// parallelResumeBackdoorKey is a context key used by unit tests
// to drive the resume path without going through eino's
// framework-managed checkpoint store. The production resume
// path uses compose.GetInterruptState; this is a test-only
// backdoor. Set via context.WithValue(ctx,
// parallelResumeBackdoorKey{}, payload) where payload is the
// JSON-encoded ParallelInterruptState as a non-empty []byte; any
// other value type, or an empty slice, is ignored by the loader.
type parallelResumeBackdoorKey struct{}
// loadParallelSnapshot returns the persisted parallel state when the
// current invocation is a resume, and (nil, false, nil) on a fresh
// run.
//
// Two sources are consulted, in order:
//
//  1. A non-empty []byte stored under parallelResumeBackdoorKey, which
//     unit tests inject to exercise the resume path directly.
//  2. The interrupt state reported by compose.GetInterruptState, which
//     is the production path.
//
// Whichever source supplies the payload, it is JSON-decoded, rejected
// if TotalCount is negative, and checked by validateParallelSnapshot.
// Every failure matches ErrParallelResumeStateInvalid.
func loadParallelSnapshot(ctx context.Context) (*ParallelInterruptState, bool, error) {
	// decode is the shared tail of both sources.
	decode := func(raw []byte) (*ParallelInterruptState, bool, error) {
		snapshot := new(ParallelInterruptState)
		if err := json.Unmarshal(raw, snapshot); err != nil {
			return nil, false, fmt.Errorf("%w: decode state: %v", ErrParallelResumeStateInvalid, err)
		}
		if snapshot.TotalCount < 0 {
			return nil, false, fmt.Errorf("%w: negative total_count", ErrParallelResumeStateInvalid)
		}
		if err := validateParallelSnapshot(snapshot); err != nil {
			return nil, false, err
		}
		return snapshot, true, nil
	}

	// Test backdoor first; production code never sets this key.
	if injected, ok := ctx.Value(parallelResumeBackdoorKey{}).([]byte); ok && len(injected) > 0 {
		return decode(injected)
	}

	wasInterrupted, hasState, payload := compose.GetInterruptState[[]byte](ctx)
	if !(wasInterrupted && hasState) {
		return nil, false, nil
	}
	return decode(payload)
}
// encodeParallelState serialises the parallel state to the JSON form
// that is persisted with the interrupt. encoding/json writes the
// integer keys of CompletedResults as JSON object string keys and
// parses them back into ints on decode, so the index keys survive the
// round-trip. The values are typed any and come back as generic JSON
// values.
func encodeParallelState(s ParallelInterruptState) ([]byte, error) {
	encoded, err := json.Marshal(s)
	return encoded, err
}
// buildPendingIndices returns the resume set for an interrupted run:
// every index in [0,totalCount) that has no entry in
// completedResults, in ascending order. The result is intentionally
// the full non-completed complement: under concurrent execution, an
// item whose state is unknown at the interrupt boundary is treated as
// needing replay.
//
// It returns nil when totalCount <= 0 and an empty, non-nil slice
// when every index is completed. Keys of completedResults outside
// [0,totalCount) are ignored.
func buildPendingIndices(totalCount int, completedResults map[int]any) []int {
	if totalCount <= 0 {
		return nil
	}
	// The map may hold more entries than totalCount (out-of-range
	// keys); clamp so the capacity hint can never be negative, which
	// would make the allocation panic.
	capacity := totalCount - len(completedResults)
	if capacity < 0 {
		capacity = 0
	}
	pending := make([]int, 0, capacity)
	for idx := 0; idx < totalCount; idx++ {
		if _, done := completedResults[idx]; !done {
			pending = append(pending, idx)
		}
	}
	return pending
}
// validateParallelSnapshot enforces the resume invariant:
// CompletedResults and InterruptedIndices must form a partition of
// [0,TotalCount). Any hole means the resumed run cannot know whether
// the missing item never started, partially ran, or already caused
// side effects, so the state is rejected as invalid.
//
// A nil snapshot is accepted. A negative TotalCount, an index outside
// [0,TotalCount), an index present in both sets (or twice in
// InterruptedIndices), and an index present in neither set are each
// reported as an error matching ErrParallelResumeStateInvalid.
func validateParallelSnapshot(st *ParallelInterruptState) error {
	if st == nil {
		return nil
	}
	// Guard here as well as in the callers: allocating a slice with a
	// negative length would panic.
	if st.TotalCount < 0 {
		return fmt.Errorf("%w: negative total_count", ErrParallelResumeStateInvalid)
	}

	covered := make([]bool, st.TotalCount)
	// claim marks idx as accounted for; set names the originating set
	// in the out-of-range message.
	claim := func(idx int, set string) error {
		if idx < 0 || idx >= st.TotalCount {
			return fmt.Errorf("%w: %s index %d out of range", ErrParallelResumeStateInvalid, set, idx)
		}
		if covered[idx] {
			return fmt.Errorf("%w: duplicate index %d across completed/interrupted sets", ErrParallelResumeStateInvalid, idx)
		}
		covered[idx] = true
		return nil
	}

	for idx := range st.CompletedResults {
		if err := claim(idx, "completed"); err != nil {
			return err
		}
	}
	for _, idx := range st.InterruptedIndices {
		if err := claim(idx, "interrupted"); err != nil {
			return err
		}
	}
	for idx, seen := range covered {
		if !seen {
			return fmt.Errorf("%w: missing index %d from completed/interrupted partition", ErrParallelResumeStateInvalid, idx)
		}
	}
	return nil
}
// parallelTaskResult is the per-item outcome that the fan-out
// sends back to the main loop over the result channel. `output` is
// any so the fan-out helper can be shared by runParallelInvoke
// callers of arbitrary I, O; the consumer type-asserts back to O when
// filling the output slice.
type parallelTaskResult struct {
// index is the item's position in the input slice.
index int
// output is the sub-workflow's return value for the item.
output any
// err is the Invoke error (or a recovered panic), nil on success.
err error
}
// runParallelFanout executes the per-item sub-workflow calls
// according to the configured concurrency policy and returns a
// channel of results. The channel is buffered for one result per
// index, so sends never block, and it is closed once every item has
// reported (success, interrupt, or error).
//
// Concurrency policy:
//   - maxConcurrency <= 1: strictly sequential, no goroutines
//     spawned; every item has finished when the function returns.
//   - maxConcurrency > 1: bounded fan-out via a buffered channel
//     semaphore of size maxConcurrency. Worker goroutines are started
//     for every item but the first, then the first item runs on the
//     calling goroutine. Every item, the first included, holds a
//     semaphore slot while it is invoked, so at most maxConcurrency
//     invocations are in flight. The function returns once the first
//     item has finished; the remaining results arrive on the channel.
//
// Per-item panics are recovered and surfaced as a normal error
// ("item %d panic: ...") so the outer lambda never crashes.
//
// Parameters:
//   - nodeKey: forwarded to the checkpoint ID builder.
//   - sub: the compiled per-item sub-workflow.
//   - items: the full input slice, addressed through indices.
//   - indices: the item indices to run, in dispatch order.
//   - options: the resolved node configuration.
//   - bridgeState: the checkpoint bridge state exposed to each item
//     through its context.
func runParallelFanout[I, O any](
	ctx context.Context,
	nodeKey string,
	sub compose.Runnable[I, O],
	items []I,
	indices []int,
	options *parallelOptions,
	bridgeState *parallelBridgeState,
) <-chan parallelTaskResult {
	resultCh := make(chan parallelTaskResult, len(indices))
	if len(indices) == 0 {
		close(resultCh)
		return resultCh
	}

	// runOne invokes the sub-workflow for a single item and reports
	// the outcome on resultCh.
	runOne := func(idx int) {
		// The builder is called on the first run and on resume; an
		// empty ID means "no per-item checkpoint ID".
		var cpID string
		if options.enableSubCheckpoint {
			cpID = options.checkpointBuilder(nodeKey, idx)
		}

		// Per-item address segment plus the bridge state the
		// checkpoint store looks up.
		subCtx := compose.AppendAddressSegment(ctx, ParallelAddressSegment, strconv.Itoa(idx))
		subCtx = withParallelBridgeState(subCtx, bridgeState)

		invokeOpts := make([]compose.Option, 0, len(options.runOpts)+1)
		if options.enableSubCheckpoint && cpID != "" {
			invokeOpts = append(invokeOpts, compose.WithCheckPointID(cpID))
		}
		invokeOpts = append(invokeOpts, options.runOpts...)

		var out O
		var err error
		func() {
			defer func() {
				if r := recover(); r != nil {
					err = fmt.Errorf("item %d panic: %v", idx, r)
				}
			}()
			// items[idx] is evaluated inside the recover scope, so a
			// bad index is reported as an item error too.
			out, err = sub.Invoke(subCtx, items[idx], invokeOpts...)
		}()
		resultCh <- parallelTaskResult{index: idx, output: out, err: err}
	}

	// Strictly sequential path: no goroutines, regardless of input
	// length.
	if options.maxConcurrency <= 1 {
		for _, idx := range indices {
			runOne(idx)
		}
		close(resultCh)
		return resultCh
	}

	// Concurrent path. Start the workers before running the first
	// item, otherwise the first item would hold up every other one.
	sem := make(chan struct{}, options.maxConcurrency)
	var wg sync.WaitGroup
	for _, idx := range indices[1:] {
		wg.Add(1)
		go func(idx int) {
			defer wg.Done()
			sem <- struct{}{}
			defer func() { <-sem }()
			runOne(idx)
		}(idx)
	}

	// First task runs on the calling goroutine and takes a slot like
	// any worker, so the concurrency bound holds for it as well.
	sem <- struct{}{}
	runOne(indices[0])
	<-sem

	// Close the channel once every worker has reported.
	go func() {
		wg.Wait()
		close(resultCh)
	}()
	return resultCh
}
// parallelBridgeStoreKey is the context key under which
// withParallelBridgeState stores the *parallelBridgeState and under
// which parallelBridgeStore's Get and Set look it up.
type parallelBridgeStoreKey struct{}
// parallelBridgeState is the in-memory map backing the per-item
// child checkpoints. It is created by AddParallelNode (or rebuilt
// from persisted checkpoints on resume) and passed through ctx so
// the parallelBridgeStore can find it.
type parallelBridgeState struct {
// mu guards data.
mu sync.RWMutex
// data maps a checkpoint ID to its serialized payload.
data map[string][]byte
}
// newParallelBridgeState builds a bridge state seeded with a copy of
// data, as produced by cloneCheckpointMap. When that copy is nil the
// state starts with an empty, ready-to-write map.
func newParallelBridgeState(data map[string][]byte) *parallelBridgeState {
	state := &parallelBridgeState{data: cloneCheckpointMap(data)}
	if state.data == nil {
		state.data = make(map[string][]byte)
	}
	return state
}
// get returns a private copy of the payload stored under id, and
// whether such a payload exists. It takes the read lock.
func (s *parallelBridgeState) get(id string) ([]byte, bool) {
	s.mu.RLock()
	defer s.mu.RUnlock()

	if stored, found := s.data[id]; found {
		// Copy so the caller cannot modify the stored bytes.
		out := make([]byte, len(stored))
		copy(out, stored)
		return out, true
	}
	return nil, false
}
// set stores a private copy of payload under id, replacing any
// previous entry. It takes the write lock and allocates the map if
// the state was created without one.
func (s *parallelBridgeState) set(id string, payload []byte) {
	// Copy first so later changes to the caller's slice do not reach
	// the stored bytes.
	owned := make([]byte, len(payload))
	copy(owned, payload)

	s.mu.Lock()
	defer s.mu.Unlock()
	if s.data == nil {
		s.data = make(map[string][]byte)
	}
	s.data[id] = owned
}
// snapshot returns the result of cloneCheckpointMap over the current
// checkpoint map, taken under the read lock. The interrupt path
// persists it as ItemCheckpoints.
func (s *parallelBridgeState) snapshot() map[string][]byte {
	s.mu.RLock()
	defer s.mu.RUnlock()

	copied := cloneCheckpointMap(s.data)
	return copied
}
// parallelBridgeStore is the CheckPointStore implementation that
// reads/writes the parallel bridge state from ctx. It is registered
// on the inner sub-workflow's Compile call (when
// WithParallelEnableSubCheckpoint(true) is in effect). The store
// itself holds no data; every call resolves the state from the
// context it is given.
type parallelBridgeStore struct{}
// newParallelBridgeStore returns a bridge store. The store is
// stateless, so all instances behave the same.
func newParallelBridgeStore() *parallelBridgeStore {
	return new(parallelBridgeStore)
}
// Get looks up checkPointID in the bridge state carried by ctx. It
// reports "not found" without an error when ctx carries no bridge
// state or the ID is unknown; it never returns a non-nil error.
func (s *parallelBridgeStore) Get(ctx context.Context, checkPointID string) ([]byte, bool, error) {
	// A failed assertion leaves bridge nil, same as an explicit nil.
	bridge, _ := ctx.Value(parallelBridgeStoreKey{}).(*parallelBridgeState)
	if bridge == nil {
		return nil, false, nil
	}
	data, hit := bridge.get(checkPointID)
	return data, hit, nil
}
// Set records checkPoint under checkPointID in the bridge state
// carried by ctx. When ctx carries no bridge state the payload is
// dropped and nil is still returned; it never returns a non-nil
// error.
func (s *parallelBridgeStore) Set(ctx context.Context, checkPointID string, checkPoint []byte) error {
	// A failed assertion leaves bridge nil, same as an explicit nil.
	bridge, _ := ctx.Value(parallelBridgeStoreKey{}).(*parallelBridgeState)
	if bridge != nil {
		bridge.set(checkPointID, checkPoint)
	}
	return nil
}
// withParallelBridgeState returns a child of ctx that carries state
// under parallelBridgeStoreKey, where the bridge store's Get and Set
// look it up.
func withParallelBridgeState(ctx context.Context, state *parallelBridgeState) context.Context {
	var key parallelBridgeStoreKey
	return context.WithValue(ctx, key, state)
}
// store returns the CheckPointStore that the inner sub-workflow
// should be compiled with. The returned store does not reference the
// receiver: it resolves the bridge state from the context of each
// Get/Set call.
func (s *parallelBridgeState) store() *parallelBridgeStore {
	bridge := newParallelBridgeStore()
	return bridge
}