Files
ragflow/internal/deepdoc/dla.go
Zhichang Yu 3fa15c0e2f feat(agent): Go port — canvas engine, 22 components, DSL v2, 13 endpoints (#15952)
Ports the agent canvas subsystem from Python to Go.

## What's included

### Canvas Engine (Phase 0/1)
- State engine, scheduler, variable resolver, Redis checkpoint store,
cancel protocol
- **209 tests** across canvas / component / io packages

### 22 Components (P0–P4)
| Tier | Components |
|---|---|
| P0 T1+T2+T3 | LLM, Agent, ExitLoop, Switch, Categorize, Begin, Message, Invoke |
| P1 T3 | VariableAggregator, VariableAssigner, StringTransform, ListOperations, DataOperations |
| P2 T3 | Iteration, IterationItem, Loop, LoopItem |
| P3 T3 | UserFillUp, Fillup |
| P4 T5 | Browser, ExcelProcessor, DocsGenerator |

### DSL v2 Schema (Phase 2.5)
- Typed v2 in-memory model with v1-to-v2 auto-detect converter
- v1 legacy field stripping per plan §2.11.7

### HTTP Endpoints & Bug Fixes (Plans PR1–PR3)
- **DELETE SQL bug fix**: gorm v2 `Where("id = ?", id).Delete(...)`
pattern
- **CreateAgent validation**: title/DSL required, duplicate check, 103
envelope
- **13 new endpoints**: templates, prompts, tags, sessions CRUD,
chat/completions (SSE + non-stream stubs), rerun, test_db_connection,
logs, webhook/logs
- **756 Go unit tests** (745 → 756, +18)
- **17 → 0 Python integration test failures** (test_agents.py +
test_session_management/)

### Tools
21 eino tools: HTTPHelper, search tools, financial/data tools, mandatory
stubs

### Infrastructure
OTel observability, NATS message queue, DeepDoc gRPC client, SSRF
guards, IDOR mitigation
2026-06-12 22:58:28 +08:00

184 lines
5.9 KiB
Go

//
// Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
package deepdoc
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"mime/multipart"
"net/url"
"strings"
)
// BBox is a bounding box stored as the 4-tuple [left, top, right, bottom],
// expressed in the image's native pixel coordinates. The components are
// float64 so that sub-pixel values reported by the upstream server are kept
// without rounding.
type BBox [4]float64
// DLAResult is one detected layout region.
//
// A zero-value DLAResult is also what DLA appends as the placeholder for an
// image that yielded no detections.
type DLAResult struct {
// Type is the class name looked up in DLAClasses by TypeIdx. It is the
// empty string when TypeIdx falls outside that table.
Type string `json:"type"`
// Score is the score value the server reported for this region.
Score float64 `json:"score"`
// BBox is the region's [left, top, right, bottom] box.
BBox BBox `json:"bbox"`
// TypeIdx is the raw class index from the wire payload, preserved so
// callers can disambiguate the duplicated class names in DLAClasses.
TypeIdx int `json:"type_idx"`
}
// DLAClasses lists the ten layout class names, mirroring
// deepdoc/vision/dla_cli.py:10-21. Position matters: the type_idx value in
// the wire payload is an index into this slice. Two names occur twice —
// "figure caption" at 4 and 9, "table caption" at 6 and 7 — and the repeated
// slots are kept verbatim for backward compatibility with existing
// inference servers.
var DLAClasses = []string{
	0: "title",
	1: "text",
	2: "reference",
	3: "figure",
	4: "figure caption",
	5: "table",
	6: "table caption",
	7: "table caption", // same name as index 6
	8: "equation",
	9: "figure caption", // same name as index 4
}
// rawDLA is the wire format the DLA server returns
// (docs/agent-port/deepdoc-endpoints.md §2.3).
type rawDLA struct {
// BBoxes holds one row per detection, laid out as
// [left, top, right, bottom, score, type_idx]. It stays nil when the
// "bboxes" key is absent or null, which predictOne rejects as an
// invalid response.
BBoxes [][]float64 `json:"bboxes"`
}
// DLA calls the remote DLA service for layout analysis of one or more
// JPEG-encoded images, replicating the Python contract (dla_cli.py:25-50):
//
//   - one HTTP POST per image, issued sequentially in input order
//   - per-image retries (documented as 3 attempts, 18s per attempt, 200ms
//     initial backoff) happen below this function, inside predictOne
//   - an image whose request fails, or that has no detections, contributes a
//     single zero-value DLAResult instead of aborting the batch (the Python
//     `layout_recognizer.py:74-76` is happy with empty results)
//
// The returned slice is flattened: every detection of every image is
// appended in order, so it is index-aligned with images only when each image
// yields at most one detection.
//
// Errors:
//   - ErrNoURL when the client is not enabled (no DEEPDOC_URL configured);
//     no network call is made.
//   - the error from predictURL when the endpoint URL cannot be built.
//   - ctx.Err() when ctx is cancelled or its deadline passes. Without this
//     check a dead context would be indistinguishable from a batch of images
//     with no detections.
//
// An empty images slice returns an empty, non-nil result and a nil error.
func (c *Client) DLA(ctx context.Context, images [][]byte) ([]DLAResult, error) {
	if !c.Enabled() {
		return nil, ErrNoURL
	}
	if len(images) == 0 {
		return []DLAResult{}, nil
	}
	predictURL, err := c.predictURL()
	if err != nil {
		return nil, err
	}
	out := make([]DLAResult, 0, len(images))
	for _, img := range images {
		// Stop before starting work that can no longer complete.
		if err := ctx.Err(); err != nil {
			return nil, err
		}
		res := c.predictOne(ctx, predictURL, img)
		if len(res) > 0 {
			out = append(out, res...)
			continue
		}
		// predictOne hides its failure reason; an empty result combined
		// with a dead context is a cancellation, not "no detections".
		if err := ctx.Err(); err != nil {
			return nil, err
		}
		// Per Python: a failed or empty image yields one empty slot
		// rather than aborting the whole batch.
		out = append(out, DLAResult{})
	}
	return out, nil
}
// predictURL builds the DLA endpoint URL by appending predictPath to the
// configured base URL. Trailing slashes on the base are stripped first so
// the join does not produce a doubled slash for a base like `http://host/`
// (this presumes predictPath itself starts with "/" — it is defined
// elsewhere). The joined string is passed through url.Parse; a parse
// failure is returned wrapped with context.
func (c *Client) predictURL() (string, error) {
	endpoint := strings.TrimRight(c.baseURL, "/") + predictPath
	parsed, err := url.Parse(endpoint)
	if err != nil {
		return "", fmt.Errorf("deepdoc: parse predict url: %w", err)
	}
	return parsed.String(), nil
}
// predictOne submits a single image to the DLA endpoint and converts the
// response rows into DLAResult values.
//
// The image is sent as one multipart/form-data part named "request" with
// filename "image.jpg" and Content-Type image/jpeg, matching the Python
// `files={'request': ('image.jpg', ...)}` contract from dla_cli.py:35.
// Sending (and, per the surrounding docs, retrying) is delegated to
// c.doPost, which receives a body factory and a response validator.
//
// predictOne has no error return. Any failure — encoding the request, an
// error from c.doPost, or an undecodable response — yields a nil slice,
// which the caller maps to "no detections" per the Python contract. A
// successful response returns a non-nil slice, possibly empty. Rows with
// fewer than six values are skipped.
func (c *Client) predictOne(ctx context.Context, predictURL string, image []byte) []DLAResult {
	// Encode the multipart payload once. The request reader is consumed by
	// each send, so every attempt needs a fresh reader — but the encoded
	// bytes are identical, so the image does not have to be copied again
	// per retry.
	var payload bytes.Buffer
	mw := multipart.NewWriter(&payload)
	// CreatePart (rather than CreateFormFile) lets us set both a filename,
	// so Go's server-side parser routes the part to MultipartForm.File, and
	// the image/jpeg Content-Type the DLA server expects.
	part, err := mw.CreatePart(map[string][]string{
		"Content-Disposition": {`form-data; name="request"; filename="image.jpg"`},
		"Content-Type":        {"image/jpeg"},
	})
	if err != nil {
		return nil
	}
	if _, err := part.Write(image); err != nil {
		return nil
	}
	if err := mw.Close(); err != nil {
		return nil
	}
	encoded := payload.Bytes()
	contentType := mw.FormDataContentType()
	buildBody := func() (io.Reader, string) {
		return bytes.NewReader(encoded), contentType
	}

	// validate rejects bodies that are not JSON or that lack the "bboxes"
	// key (a nil slice after decoding).
	validate := func(data []byte) error {
		var r rawDLA
		if err := json.Unmarshal(data, &r); err != nil {
			return fmt.Errorf("%w: %v", ErrInvalidResponse, err)
		}
		if r.BBoxes == nil {
			return fmt.Errorf("%w: missing bboxes key", ErrInvalidResponse)
		}
		return nil
	}

	data, err := c.doPost(ctx, predictURL, buildBody, validate)
	if err != nil {
		return nil
	}
	var resp rawDLA
	// The body is expected to have passed validate already; treat a decode
	// failure here like any other failed request rather than ignoring it.
	if err := json.Unmarshal(data, &resp); err != nil {
		return nil
	}

	results := make([]DLAResult, 0, len(resp.BBoxes))
	for _, row := range resp.BBoxes {
		if len(row) < 6 {
			continue
		}
		// [l, t, r, b, score, type_idx] per docs/agent-port/deepdoc-endpoints.md §2.3.
		idx := int(row[5])
		// An index outside the taxonomy keeps its raw TypeIdx but gets an
		// empty class name; the bounds check also guards the table lookup.
		cls := ""
		if idx >= 0 && idx < len(DLAClasses) {
			cls = DLAClasses[idx]
		}
		results = append(results, DLAResult{
			Type:    cls,
			Score:   row[4],
			BBox:    BBox{row[0], row[1], row[2], row[3]},
			TypeIdx: idx,
		})
	}
	return results
}