mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 23:41:12 +08:00
Ports the agent canvas subsystem from Python to Go.
## What's included
### Canvas Engine (Phase 0/1)
- State engine, scheduler, variable resolver, Redis checkpoint store,
cancel protocol
- **209 tests** across canvas / component / io packages
### 22 Components (P0–P4)
| Tier | Components |
|---|---|
| P0 T1+T2+T3 | LLM, Agent, ExitLoop, Switch, Categorize, Begin,
Message, Invoke |
| P1 T3 | VariableAggregator, VariableAssigner, StringTransform,
ListOperations, DataOperations |
| P2 T3 | Iteration, IterationItem, Loop, LoopItem |
| P3 T3 | UserFillUp, Fillup |
| P4 T5 | Browser, ExcelProcessor, DocsGenerator |
### DSL v2 Schema (Phase 2.5)
- Typed v2 in-memory model with v1-to-v2 auto-detect converter
- v1 legacy field stripping per plan §2.11.7
### HTTP Endpoints & Bug Fixes (Plans PR1–PR3)
- **DELETE SQL bug fix**: gorm v2 `Where("id = ?", id).Delete(...)`
pattern
- **CreateAgent validation**: title/DSL required, duplicate check, 103
envelope
- **13 new endpoints**: templates, prompts, tags, sessions CRUD,
chat/completions (SSE + non-stream stubs), rerun, test_db_connection,
logs, webhook/logs
- **756 Go unit tests** (745 → 756, +18)
- **17 → 0 Python integration test failures** (test_agents.py +
test_session_management/)
### Tools
21 eino tools: HTTPHelper, search tools, financial/data tools, mandatory
stubs
### Infrastructure
OTel observability, NATS message queue, DeepDoc gRPC client, SSRF
guards, IDOR mitigation
184 lines
5.9 KiB
Go
184 lines
5.9 KiB
Go
//
|
|
// Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
//
|
|
|
|
package deepdoc
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"mime/multipart"
|
|
"net/url"
|
|
"strings"
|
|
)
|
|
|
|
// BBox is a 4-tuple [left, top, right, bottom] in the image's
|
|
// native pixel coordinates. Float to preserve sub-pixel accuracy
|
|
// from the upstream server (the Python client lowercases+indexes
|
|
// without rounding).
|
|
type BBox [4]float64
|
|
|
|
// DLAResult is one detected layout region. Type is the normalized
|
|
// class name (lowercased, per Python `dla_cli.py:43`); TypeIdx is
|
|
// the raw class index into DLAClasses (preserved so callers can
|
|
// disambiguate the documented duplicate class slots).
|
|
type DLAResult struct {
|
|
Type string `json:"type"`
|
|
Score float64 `json:"score"`
|
|
BBox BBox `json:"bbox"`
|
|
TypeIdx int `json:"type_idx"`
|
|
}
|
|
|
|
// DLAClasses is the 10-entry class taxonomy from
|
|
// deepdoc/vision/dla_cli.py:10-21. Order is significant — TypeIdx
|
|
// in the wire payload is an index into this slice. The duplicates
|
|
// at indices 4/6/7/9 are kept verbatim for backward compatibility
|
|
// with existing inference servers.
|
|
var DLAClasses = []string{
|
|
"title", // 0
|
|
"text", // 1
|
|
"reference", // 2
|
|
"figure", // 3
|
|
"figure caption", // 4
|
|
"table", // 5
|
|
"table caption", // 6
|
|
"table caption", // 7 duplicate
|
|
"equation", // 8
|
|
"figure caption", // 9 duplicate
|
|
}
|
|
|
|
// rawDLA is the wire format the DLA server returns
|
|
// (docs/agent-port/deepdoc-endpoints.md §2.3).
|
|
type rawDLA struct {
|
|
BBoxes [][]float64 `json:"bboxes"`
|
|
}
|
|
|
|
// DLA calls the remote DLA service for layout analysis of one or
|
|
// more JPEG-encoded images. The Python contract
|
|
// (dla_cli.py:25-50) is replicated:
|
|
//
|
|
// - one HTTP POST per image
|
|
// - 3 attempts per image, 18s per attempt, 200ms initial backoff
|
|
// - failed images return an empty DLAResult (caller does not
|
|
// have to handle per-image errors — the Python
|
|
// `layout_recognizer.py:74-76` is happy with empty results)
|
|
//
|
|
// When no DEEPDOC_URL is set, returns ErrNoURL without any network
|
|
// call. When the base URL is set but the service is unreachable
|
|
// after 3 attempts, the failed image's slot is an empty DLAResult
|
|
// and the rest still process (matches Python's "len(res) == i"
|
|
// append-empty pattern).
|
|
func (c *Client) DLA(ctx context.Context, images [][]byte) ([]DLAResult, error) {
|
|
if !c.Enabled() {
|
|
return nil, ErrNoURL
|
|
}
|
|
if len(images) == 0 {
|
|
return []DLAResult{}, nil
|
|
}
|
|
predictURL, err := c.predictURL()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
out := make([]DLAResult, 0, len(images))
|
|
for _, img := range images {
|
|
res := c.predictOne(ctx, predictURL, img)
|
|
// Per Python: a failed image yields an empty slot rather
|
|
// than aborting the whole batch. Surface the first hard
|
|
// error at the end if the user wants it.
|
|
if len(res) == 0 {
|
|
out = append(out, DLAResult{})
|
|
} else {
|
|
out = append(out, res...)
|
|
}
|
|
}
|
|
return out, nil
|
|
}
|
|
|
|
// predictURL resolves the DLA endpoint URL from the configured base.
|
|
// Trims trailing slash to avoid `//predict` on `http://host/`.
|
|
func (c *Client) predictURL() (string, error) {
|
|
base := strings.TrimRight(c.baseURL, "/")
|
|
u, err := url.Parse(base + predictPath)
|
|
if err != nil {
|
|
return "", fmt.Errorf("deepdoc: parse predict url: %w", err)
|
|
}
|
|
return u.String(), nil
|
|
}
|
|
|
|
// predictOne runs the retry loop for a single image. Returns the
|
|
// list of bboxes the server returned, or an empty slice if all
|
|
// attempts failed. Errors are NOT returned for retry exhaustion —
|
|
// the caller maps "empty slice" to "no detections" per the Python
|
|
// contract; a hard error (4xx, bad URL) is returned immediately.
|
|
func (c *Client) predictOne(ctx context.Context, predictURL string, image []byte) []DLAResult {
|
|
buildBody := func() (io.Reader, string) {
|
|
// Each retry needs a fresh multipart body — multipart.Writer
|
|
// consumes its underlying buffer on Close. CreatePart lets
|
|
// us set both a filename (so Go's net/http server-side
|
|
// parser routes the part to MultipartForm.File) and the
|
|
// image/jpeg Content-Type the DLA server expects (matches
|
|
// the Python `files={'request': ('image.jpg', ...)}`
|
|
// contract from dla_cli.py:35).
|
|
buf := &bytes.Buffer{}
|
|
w := multipart.NewWriter(buf)
|
|
fw, _ := w.CreatePart(map[string][]string{
|
|
"Content-Disposition": {`form-data; name="request"; filename="image.jpg"`},
|
|
"Content-Type": {"image/jpeg"},
|
|
})
|
|
_, _ = fw.Write(image)
|
|
_ = w.Close()
|
|
return buf, w.FormDataContentType()
|
|
}
|
|
validate := func(data []byte) error {
|
|
var r rawDLA
|
|
if err := json.Unmarshal(data, &r); err != nil {
|
|
return fmt.Errorf("%w: %v", ErrInvalidResponse, err)
|
|
}
|
|
if r.BBoxes == nil {
|
|
return fmt.Errorf("%w: missing bboxes key", ErrInvalidResponse)
|
|
}
|
|
return nil
|
|
}
|
|
data, err := c.doPost(ctx, predictURL, buildBody, validate)
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
var r rawDLA
|
|
_ = json.Unmarshal(data, &r) // already validated above
|
|
results := make([]DLAResult, 0, len(r.BBoxes))
|
|
for _, b := range r.BBoxes {
|
|
if len(b) < 6 {
|
|
continue
|
|
}
|
|
// [l, t, r, b, score, type_idx] per docs/agent-port/deepdoc-endpoints.md §2.3.
|
|
bbox := BBox{b[0], b[1], b[2], b[3]}
|
|
idx := int(b[5])
|
|
cls := ""
|
|
if idx >= 0 && idx < len(DLAClasses) {
|
|
cls = DLAClasses[idx]
|
|
}
|
|
results = append(results, DLAResult{
|
|
Type: cls,
|
|
Score: b[4],
|
|
BBox: bbox,
|
|
TypeIdx: idx,
|
|
})
|
|
}
|
|
return results
|
|
}
|