Files
ragflow/internal/ingestion/chunk/expression.go
Jin Hai 115b730d07 Go: parse ingestion DSL (#15938)
PR #15938

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
2026-06-12 17:58:36 +08:00

650 lines
14 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
//
// Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
package chunk
import (
"fmt"
"math"
"regexp"
"strconv"
"unicode"
)
// ---------------------------------------------------------------------------
// Token types
// ---------------------------------------------------------------------------
// tokenType identifies the lexical category of a token produced by the lexer.
type tokenType int
// Token categories. The values are assigned by iota in declaration order, so
// tokenEOF is the zero value of tokenType.
const (
tokenEOF tokenType = iota // end of input
tokenIdentifier // variable name; also emitted for any unrecognised character
tokenString // single-quoted string literal
tokenNumber // numeric literal, optionally negative
tokenTrue // boolean literal true
tokenFalse // boolean literal false
tokenEq // =
tokenNeq // !=
tokenGt // >
tokenLt // <
tokenGte // >=
tokenLte // <=
tokenAnd // AND
tokenOr // OR
tokenNot // NOT
tokenLParen // (
tokenRParen // )
)
// keywords maps reserved words to their token types. The lexer looks words up
// with an exact, case-sensitive match: the logical operators are only listed
// in upper case, while the boolean literals are listed in all-lower and
// all-upper case. Any other spelling is lexed as an ordinary identifier.
var keywords = map[string]tokenType{
"AND": tokenAnd,
"OR": tokenOr,
"NOT": tokenNot,
"true": tokenTrue,
"false": tokenFalse,
"TRUE": tokenTrue,
"FALSE": tokenFalse,
}
// token is a single lexical unit produced by the lexer.
type token struct {
typ tokenType // lexical category
raw string // source text; for string literals, the content without the surrounding quotes
}
// ---------------------------------------------------------------------------
// Lexer
// ---------------------------------------------------------------------------
// lexer splits an expression string into tokens, one call to next at a time.
type lexer struct {
input []rune // expression text, held as runes so positions index whole characters
pos int // index into input of the next unread rune
}
// newLexer returns a lexer positioned at the first rune of input.
func newLexer(input string) *lexer {
	lx := new(lexer)
	lx.input = []rune(input)
	return lx
}
// skipWhitespace advances pos past any run of Unicode whitespace, stopping at
// the first non-space rune or at the end of the input.
func (l *lexer) skipWhitespace() {
	for ; l.pos < len(l.input); l.pos++ {
		if !unicode.IsSpace(l.input[l.pos]) {
			return
		}
	}
}
// next consumes and returns the next token from the input.
//
// Leading whitespace is skipped first. Recognition is attempted in this
// order: end of input, single-quoted string, two-character operators
// (>=, <=, !=), single-character operators and parentheses, numbers,
// identifiers/keywords. Any other rune is consumed on its own and returned as
// a tokenIdentifier whose raw text is that rune.
func (l *lexer) next() token {
	l.skipWhitespace()
	size := len(l.input)
	if l.pos >= size {
		return token{typ: tokenEOF, raw: ""}
	}
	first := l.input[l.pos]

	// String literal: there is no escape syntax, and a literal with no
	// closing quote simply runs to the end of the input.
	if first == '\'' {
		begin := l.pos + 1
		end := begin
		for end < size && l.input[end] != '\'' {
			end++
		}
		l.pos = end
		if end < size {
			l.pos = end + 1 // step over the closing quote
		}
		return token{typ: tokenString, raw: string(l.input[begin:end])}
	}

	// Two-character operators must be tried before their one-character
	// prefixes so that ">=" is not split into ">" and "=".
	if l.pos+1 < size && l.input[l.pos+1] == '=' {
		switch first {
		case '>':
			l.pos += 2
			return token{typ: tokenGte, raw: ">="}
		case '<':
			l.pos += 2
			return token{typ: tokenLte, raw: "<="}
		case '!':
			l.pos += 2
			return token{typ: tokenNeq, raw: "!="}
		}
	}

	// Single-character operators and parentheses.
	var opType tokenType
	isOp := true
	switch first {
	case '=':
		opType = tokenEq
	case '>':
		opType = tokenGt
	case '<':
		opType = tokenLt
	case '(':
		opType = tokenLParen
	case ')':
		opType = tokenRParen
	default:
		isOp = false
	}
	if isOp {
		l.pos++
		return token{typ: opType, raw: string(first)}
	}

	// Number: an optional leading '-' (only when directly followed by a
	// digit), then any run of digits and '.' characters. The run is not
	// validated here, so text such as "1.2.3" is returned as one number token.
	negative := first == '-' && l.pos+1 < size && unicode.IsDigit(l.input[l.pos+1])
	if unicode.IsDigit(first) || negative {
		begin := l.pos
		if negative {
			l.pos++
		}
		for l.pos < size {
			r := l.input[l.pos]
			if !unicode.IsDigit(r) && r != '.' {
				break
			}
			l.pos++
		}
		return token{typ: tokenNumber, raw: string(l.input[begin:l.pos])}
	}

	// Identifier or keyword: starts with a letter or '_', continues with
	// letters, digits and '_'.
	if unicode.IsLetter(first) || first == '_' {
		isWordRune := func(r rune) bool {
			return r == '_' || unicode.IsLetter(r) || unicode.IsDigit(r)
		}
		begin := l.pos
		for l.pos < size && isWordRune(l.input[l.pos]) {
			l.pos++
		}
		word := string(l.input[begin:l.pos])
		typ, reserved := keywords[word]
		if !reserved {
			typ = tokenIdentifier
		}
		return token{typ: typ, raw: word}
	}

	// Anything else is passed through as a one-rune identifier.
	l.pos++
	return token{typ: tokenIdentifier, raw: string(first)}
}
// peek returns the token that next would return, then restores the read
// position so the token is not consumed.
func (l *lexer) peek() token {
	saved := l.pos
	defer func() { l.pos = saved }()
	return l.next()
}
// ---------------------------------------------------------------------------
// AST nodes
// ---------------------------------------------------------------------------
// Expr is a node of a parsed expression tree.
type Expr interface {
// String returns a textual rendering of the node.
String() string
}
// binaryExpr is a node with two operands joined by a comparison or logical
// operator.
type binaryExpr struct {
	left  Expr
	op    tokenType
	right Expr
}

// String renders the node as "(left OP right)". An operator that is not one
// of the comparison or logical token types renders as an empty string.
func (e binaryExpr) String() string {
	var symbol string
	switch e.op {
	case tokenEq:
		symbol = "="
	case tokenNeq:
		symbol = "!="
	case tokenGt:
		symbol = ">"
	case tokenLt:
		symbol = "<"
	case tokenGte:
		symbol = ">="
	case tokenLte:
		symbol = "<="
	case tokenAnd:
		symbol = "AND"
	case tokenOr:
		symbol = "OR"
	}
	return fmt.Sprintf("(%s %s %s)", e.left, symbol, e.right)
}
// unaryExpr is a node that applies a prefix operator to a single operand.
type unaryExpr struct {
	op    tokenType
	right Expr
}

// String renders the node as "(NOT operand)"; the op field is not consulted.
func (e unaryExpr) String() string {
	operand := e.right
	return fmt.Sprintf("(NOT %s)", operand)
}
// identifierExpr is a leaf node that refers to a variable by name.
type identifierExpr struct {
	name string
}

// String returns the variable name unchanged.
func (e identifierExpr) String() string {
	rendered := e.name
	return rendered
}
// stringExpr is a leaf node holding a string literal.
type stringExpr struct {
	value string
}

// String returns the value wrapped in single quotes. The value is emitted
// as-is; no escaping is applied.
func (e stringExpr) String() string {
	quoted := make([]byte, 0, len(e.value)+2)
	quoted = append(quoted, '\'')
	quoted = append(quoted, e.value...)
	quoted = append(quoted, '\'')
	return string(quoted)
}
// numberExpr is a leaf node holding a numeric literal.
type numberExpr struct {
	value float64
}

// String formats the value in plain decimal notation (no exponent) using the
// smallest number of digits that represents it exactly.
func (e numberExpr) String() string {
	digits := strconv.AppendFloat(nil, e.value, 'f', -1, 64)
	return string(digits)
}
// boolExpr is a leaf node holding a boolean literal.
type boolExpr struct {
	value bool
}

// String returns "true" or "false".
func (e boolExpr) String() string {
	text := strconv.AppendBool(nil, e.value)
	return string(text)
}
// ---------------------------------------------------------------------------
// Recursive-descent parser
// ---------------------------------------------------------------------------
// parser is a recursive-descent parser that reads tokens from a lexer.
type parser struct {
lex *lexer // token source
cur token // token currently being examined
peeked bool // when true, the next advance keeps cur instead of reading a new token
}
// newParser builds a parser for input and loads the first token into cur.
func newParser(input string) *parser {
	p := new(parser)
	p.lex = newLexer(input)
	p.advance()
	return p
}
// advance moves to the next token. If the peeked flag is set, the flag is
// cleared and cur is left unchanged; otherwise cur is replaced with the next
// token from the lexer.
func (p *parser) advance() {
	if !p.peeked {
		p.cur = p.lex.next()
		return
	}
	p.peeked = false
}
// peek returns the token that follows cur without consuming it.
//
// The lookahead is delegated to the lexer, which restores its own position,
// so neither cur nor the peeked flag is modified. Previously this method
// stored the lookahead token in cur, which discarded the current token for
// any caller that inspected cur after peeking. A peek followed by advance
// still leaves cur on the peeked token, exactly as before.
func (p *parser) peek() token {
	return p.lex.peek()
}
// expect consumes the current token and returns it, provided its type is typ.
// On a mismatch it panics with a message naming the expected and actual token
// types and the raw text of the offending token.
func (p *parser) expect(typ tokenType) token {
	got := p.cur
	if got.typ == typ {
		p.advance()
		return got
	}
	panic(fmt.Sprintf("expected token %d but got %d (%q)", typ, got.typ, got.raw))
}
// parse parses the whole input as a single expression and returns its AST.
//
// The expression must span the entire input. Any token left over after a
// complete expression is a syntax error and is reported by panicking, the
// same way expect reports errors. Without this check, trailing text was
// silently dropped: for example "a = 1 and b = 2" (lower-case "and" is an
// identifier, not the AND keyword) was parsed as just "a = 1".
func (p *parser) parse() Expr {
	e := p.parseOr()
	if p.cur.typ != tokenEOF {
		panic(fmt.Sprintf("unexpected token %q after end of expression", p.cur.raw))
	}
	return e
}
// parseOr parses the lowest-precedence level:
//
//	or_expr → and_expr ("OR" and_expr)*
//
// Repeated ORs associate to the left.
func (p *parser) parseOr() Expr {
	result := p.parseAnd()
	for {
		if p.cur.typ != tokenOr {
			return result
		}
		p.advance()
		rhs := p.parseAnd()
		result = binaryExpr{left: result, op: tokenOr, right: rhs}
	}
}
// parseAnd parses a conjunction:
//
//	and_expr → not_expr ("AND" not_expr)*
//
// Repeated ANDs associate to the left.
func (p *parser) parseAnd() Expr {
	result := p.parseNot()
	for {
		if p.cur.typ != tokenAnd {
			return result
		}
		p.advance()
		rhs := p.parseNot()
		result = binaryExpr{left: result, op: tokenAnd, right: rhs}
	}
}
// parseNot parses an optional chain of negations:
//
//	not_expr → "NOT" not_expr | primary
func (p *parser) parseNot() Expr {
	if p.cur.typ != tokenNot {
		return p.parsePrimary()
	}
	p.advance()
	operand := p.parseNot()
	return unaryExpr{op: tokenNot, right: operand}
}
// parsePrimary parses either a parenthesised expression or a comparison:
//
//	primary → comparison | "(" expression ")"
//
// A missing closing parenthesis makes expect panic.
func (p *parser) parsePrimary() Expr {
	if p.cur.typ != tokenLParen {
		return p.parseComparison()
	}
	p.advance()
	inner := p.parseOr()
	p.expect(tokenRParen)
	return inner
}
// parseComparison parses a comparison or a bare value:
//
//	comparison → IDENTIFIER OP value | IDENTIFIER | value
//
// where OP is one of =, !=, >, <, >=, <=. An identifier that is not followed
// by an operator is treated as a boolean check and is rewritten to
// "IDENTIFIER = true". Anything that does not start with an identifier is
// handed to parseValue.
func (p *parser) parseComparison() Expr {
	if p.cur.typ != tokenIdentifier {
		return p.parseValue()
	}
	lhs := identifierExpr{name: p.cur.raw}
	p.advance()

	switch op := p.cur.typ; op {
	case tokenEq, tokenNeq, tokenGt, tokenLt, tokenGte, tokenLte:
		p.advance()
		rhs := p.parseValue()
		return binaryExpr{left: lhs, op: op, right: rhs}
	}

	// No operator follows: interpret the lone identifier as "identifier = true".
	return binaryExpr{left: lhs, op: tokenEq, right: boolExpr{value: true}}
}
// parseValue parses a literal or a bare variable reference:
//
//	value → STRING | NUMBER | BOOLEAN | IDENTIFIER
//
// A number token whose text is not a valid float (the lexer accepts any run
// of digits and dots, e.g. "1.2.3") is a syntax error and is reported by
// panicking, the same way expect reports errors. Previously the conversion
// error was discarded and the literal silently became 0.
//
// Any token that is not a string, number or boolean is consumed and returned
// as an identifier named after its raw text; this includes end of input,
// which yields an identifier with an empty name.
func (p *parser) parseValue() Expr {
	tok := p.cur
	switch tok.typ {
	case tokenString:
		p.advance()
		return stringExpr{value: tok.raw}
	case tokenNumber:
		f, err := strconv.ParseFloat(tok.raw, 64)
		if err != nil {
			panic(fmt.Sprintf("invalid number literal %q: %v", tok.raw, err))
		}
		p.advance()
		return numberExpr{value: f}
	case tokenTrue:
		p.advance()
		return boolExpr{value: true}
	case tokenFalse:
		p.advance()
		return boolExpr{value: false}
	default:
		// Treat as identifier (e.g. bare variable reference).
		p.advance()
		return identifierExpr{name: tok.raw}
	}
}
// ---------------------------------------------------------------------------
// Evaluator
// ---------------------------------------------------------------------------
// URL detectors, all case-insensitive and restricted to the http and https
// schemes. The extension-based patterns require a listed extension to appear
// somewhere in the non-whitespace run after the scheme; they do not require
// the URL to end there.

// reMediaURL matches a URL containing an image, video or audio file extension.
var reMediaURL = regexp.MustCompile(`(?i)https?://[^\s]*\.(jpg|jpeg|png|gif|bmp|webp|svg|mp4|avi|mov|wmv|flv|mkv|m4v|mp3|wav|ogg|aac)`)
// reImageURL matches a URL containing an image file extension.
var reImageURL = regexp.MustCompile(`(?i)https?://[^\s]*\.(jpg|jpeg|png|gif|bmp|webp|svg)`)
// reVideoURL matches a URL containing a video file extension.
var reVideoURL = regexp.MustCompile(`(?i)https?://[^\s]*\.(mp4|avi|mov|wmv|flv|mkv|m4v)`)
// reAnyURL matches any http or https URL with at least one character after the scheme.
var reAnyURL = regexp.MustCompile(`(?i)https?://[^\s]+`)
// buildExprContext builds the variable map used to evaluate expressions
// against a chunk.
//
// Every metadata entry is copied in first. The following derived variables
// are then set, replacing any metadata entry with the same key:
//
//	has_media_url, has_image_url, has_video_url, has_url: bool, whether the
//	    content matches the corresponding URL pattern
//	length: int, number of runes in the content
func buildExprContext(chunk ContentProvider, metadata map[string]interface{}) map[string]interface{} {
	vars := make(map[string]interface{})
	content := chunk.GetContent()

	for key := range metadata {
		vars[key] = metadata[key]
	}

	detectors := []struct {
		key     string
		pattern *regexp.Regexp
	}{
		{"has_media_url", reMediaURL},
		{"has_image_url", reImageURL},
		{"has_video_url", reVideoURL},
		{"has_url", reAnyURL},
	}
	for _, d := range detectors {
		vars[d.key] = d.pattern.MatchString(content)
	}

	runes := []rune(content)
	vars["length"] = len(runes)
	return vars
}
// ContentProvider is implemented by any type that can supply the text content
// expressions are evaluated against.
type ContentProvider interface {
// GetContent returns the text content.
GetContent() string
}
// Evaluate parses exprStr and evaluates it against vars.
//
// It returns the boolean result of the expression. An error is returned when
// the expression is syntactically invalid, when evaluation fails (for example
// an undefined variable), or when the final result cannot be interpreted as a
// boolean.
//
// The parser reports syntax errors by panicking; those panics are recovered
// here and returned as errors so that a malformed expression can never crash
// the caller.
func Evaluate(exprStr string, vars map[string]interface{}) (result bool, err error) {
	defer func() {
		if r := recover(); r != nil {
			result = false
			err = fmt.Errorf("evaluate %q: %v", exprStr, r)
		}
	}()

	p := newParser(exprStr)
	ast := p.parse()
	res, err := eval(ast, vars)
	if err != nil {
		return false, fmt.Errorf("evaluate %q: %w", exprStr, err)
	}
	b, ok := toBool(res)
	if !ok {
		return false, fmt.Errorf("evaluate %q: result %v (%T) is not a boolean", exprStr, res, res)
	}
	return b, nil
}
// CompileExpression parses exprStr into a reusable AST.
//
// The parser reports syntax errors by panicking. Those panics are recovered
// here and returned as an error together with a nil AST. Previously the
// recover handler re-panicked, so the error result was never populated and a
// malformed expression crashed the caller.
func CompileExpression(exprStr string) (ast Expr, err error) {
	defer func() {
		if r := recover(); r != nil {
			ast = nil
			err = fmt.Errorf("compile expression %q: %v", exprStr, r)
		}
	}()
	p := newParser(exprStr)
	return p.parse(), nil
}
// EvalCompiled evaluates a previously compiled AST against vars.
//
// ast must hold a value implementing Expr; any other dynamic type yields an
// error. Evaluation errors are returned unchanged, and a result that cannot
// be interpreted as a boolean is reported as an error.
func EvalCompiled(ast interface{}, vars map[string]interface{}) (bool, error) {
	expr, isExpr := ast.(Expr)
	if !isExpr {
		return false, fmt.Errorf("invalid AST type: %T", ast)
	}

	value, err := eval(expr, vars)
	if err != nil {
		return false, err
	}

	if verdict, isBool := toBool(value); isBool {
		return verdict, nil
	}
	return false, fmt.Errorf("result %v (%T) is not boolean", value, value)
}
// eval computes the value of an AST node.
//
// Literal nodes yield their stored value, identifiers are looked up in vars
// (a missing name is an error), and operator nodes are delegated to
// evalUnary and evalBinary. A node of any other concrete type is an error.
func eval(e Expr, vars map[string]interface{}) (interface{}, error) {
	switch node := e.(type) {
	case stringExpr:
		return node.value, nil
	case numberExpr:
		return node.value, nil
	case boolExpr:
		return node.value, nil
	case identifierExpr:
		if val, found := vars[node.name]; found {
			return val, nil
		}
		return nil, fmt.Errorf("undefined variable: %s", node.name)
	case unaryExpr:
		return evalUnary(node, vars)
	case binaryExpr:
		return evalBinary(node, vars)
	}
	return nil, fmt.Errorf("unknown expression type: %T", e)
}
// evalBinary evaluates a binary node.
//
// AND and OR short-circuit: the right operand is evaluated only when the left
// operand does not already decide the result. Previously both operands were
// evaluated up front, so an error in the right operand (such as an undefined
// variable) failed the whole expression even when the left operand alone
// determined the answer.
//
// Comparison operators evaluate both operands and delegate to compareEq and
// compareOrder. An unrecognised operator is an error.
func evalBinary(e binaryExpr, vars map[string]interface{}) (interface{}, error) {
	left, err := eval(e.left, vars)
	if err != nil {
		return nil, err
	}

	switch e.op {
	case tokenAnd:
		l, ok := toBool(left)
		if !ok {
			return false, fmt.Errorf("AND requires boolean left operand")
		}
		if !l {
			return false, nil // false AND x is false regardless of x
		}
		return evalLogicalRight(e.right, vars, "AND")
	case tokenOr:
		l, ok := toBool(left)
		if !ok {
			return false, fmt.Errorf("OR requires boolean left operand")
		}
		if l {
			return true, nil // true OR x is true regardless of x
		}
		return evalLogicalRight(e.right, vars, "OR")
	}

	right, err := eval(e.right, vars)
	if err != nil {
		return nil, err
	}
	switch e.op {
	case tokenEq:
		return compareEq(left, right), nil
	case tokenNeq:
		return !compareEq(left, right), nil
	case tokenGt, tokenLt, tokenGte, tokenLte:
		return compareOrder(left, right, e.op)
	default:
		return false, fmt.Errorf("unknown binary op %d", e.op)
	}
}

// evalLogicalRight evaluates the right operand of the logical operator named
// opName and converts it to a boolean, which is then the operator's result.
func evalLogicalRight(right Expr, vars map[string]interface{}, opName string) (interface{}, error) {
	v, err := eval(right, vars)
	if err != nil {
		return nil, err
	}
	b, ok := toBool(v)
	if !ok {
		return false, fmt.Errorf("%s requires boolean right operand", opName)
	}
	return b, nil
}
// evalUnary evaluates a NOT node: the operand is evaluated, converted to a
// boolean and negated. An operand that cannot be converted is an error.
func evalUnary(e unaryExpr, vars map[string]interface{}) (interface{}, error) {
	operand, err := eval(e.right, vars)
	if err != nil {
		return nil, err
	}
	if truth, isBool := toBool(operand); isBool {
		return !truth, nil
	}
	return false, fmt.Errorf("NOT requires boolean operand")
}
// toBool interprets v as a boolean. The second result reports whether v has a
// supported type.
//
//	bool:    the value itself
//	string:  true for "true", "TRUE" and "1"; false for every other string
//	float64: true when non-zero
//	int:     true when non-zero
//
// Any other type (including nil) yields (false, false).
func toBool(v interface{}) (bool, bool) {
	if b, isBool := v.(bool); isBool {
		return b, true
	}
	if s, isString := v.(string); isString {
		switch s {
		case "true", "TRUE", "1":
			return true, true
		}
		return false, true
	}
	if f, isFloat := v.(float64); isFloat {
		return f != 0, true
	}
	if i, isInt := v.(int); isInt {
		return i != 0, true
	}
	return false, false
}
// compareEq reports whether a and b are equal. When both operands convert to
// a number via toFloat they are compared numerically; otherwise their default
// "%v" renderings are compared as strings.
func compareEq(a, b interface{}) bool {
	if x, ok := toFloat(a); ok {
		if y, ok := toFloat(b); ok {
			return x == y
		}
	}
	lhs := fmt.Sprintf("%v", a)
	rhs := fmt.Sprintf("%v", b)
	return lhs == rhs
}
// toFloat converts v to a float64. The second result reports whether v is
// numeric.
//
// All built-in integer and floating-point types are accepted. Previously only
// float64 and int were, so a value such as int64(10) was compared as the
// string "10" and ordered lexically (making "10" sort below "9").
//
// Strings are accepted when they parse as a finite number. The words that
// strconv.ParseFloat also accepts ("nan", "inf", "infinity", in any case) are
// deliberately rejected so that such text is compared as a string instead of
// as a special float value.
func toFloat(v interface{}) (float64, bool) {
	switch n := v.(type) {
	case float64:
		return n, true
	case float32:
		return float64(n), true
	case int:
		return float64(n), true
	case int8:
		return float64(n), true
	case int16:
		return float64(n), true
	case int32:
		return float64(n), true
	case int64:
		return float64(n), true
	case uint:
		return float64(n), true
	case uint8:
		return float64(n), true
	case uint16:
		return float64(n), true
	case uint32:
		return float64(n), true
	case uint64:
		return float64(n), true
	case string:
		f, err := strconv.ParseFloat(n, 64)
		if err != nil || math.IsNaN(f) || math.IsInf(f, 0) {
			return 0, false
		}
		return f, true
	}
	return 0, false
}
// compareOrder evaluates an ordering comparison (>, <, >=, <=) between a and
// b. When both operands convert to a number via toFloat they are compared
// numerically; otherwise their default "%v" renderings are compared as
// strings. An op that is not one of the four ordering operators is an error.
func compareOrder(a, b interface{}, op tokenType) (bool, error) {
	if x, ok := toFloat(a); ok {
		if y, ok := toFloat(b); ok {
			switch op {
			case tokenGt:
				return x > y, nil
			case tokenLt:
				return x < y, nil
			case tokenGte:
				return x >= y, nil
			case tokenLte:
				return x <= y, nil
			}
		}
	}

	// At least one side is not numeric: fall back to comparing the textual forms.
	lhs, rhs := fmt.Sprintf("%v", a), fmt.Sprintf("%v", b)
	switch op {
	case tokenGt:
		return lhs > rhs, nil
	case tokenLt:
		return lhs < rhs, nil
	case tokenGte:
		return lhs >= rhs, nil
	case tokenLte:
		return lhs <= rhs, nil
	}
	return false, fmt.Errorf("unsupported comparison op %d between %T and %T", op, a, b)
}
// ---------------------------------------------------------------------------
// Language heuristics
// ---------------------------------------------------------------------------
// DetectLanguage returns a best-effort language code for text: "zh" when Han
// characters make up more than 30% of the letters in text, and "en" in every
// other case, including text that contains no letters at all.
func DetectLanguage(text string) string {
	var han, letters int
	for _, r := range text {
		if unicode.IsLetter(r) {
			letters++
		}
		if unicode.In(r, unicode.Han) {
			han++
		}
	}
	if letters == 0 {
		return "en"
	}
	if ratio := float64(han) / float64(letters); ratio > 0.3 {
		return "zh"
	}
	return "en"
}
// RuneCount returns the number of runes (not bytes) in text.
func RuneCount(text string) int {
	count := 0
	for range text {
		count++
	}
	return count
}
// Keep a reference to the math package so that its import compiles even when
// no other declaration in this file uses it.
var _ = math.NaN