//
// Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
|
|
|
|
|
|
|
|
|
|
package elasticsearch
|
|
|
|
|
|
|
|
|
|
import (
|
|
|
|
|
"bytes"
|
|
|
|
|
"context"
|
|
|
|
|
"encoding/json"
|
|
|
|
|
"errors"
|
|
|
|
|
"fmt"
|
|
|
|
|
"io"
|
|
|
|
|
"net"
|
|
|
|
|
"regexp"
|
|
|
|
|
"strings"
|
|
|
|
|
"time"
|
|
|
|
|
|
|
|
|
|
"ragflow/internal/common"
|
|
|
|
|
"ragflow/internal/tokenizer"
|
|
|
|
|
|
|
|
|
|
"github.com/elastic/go-elasticsearch/v8/esapi"
|
|
|
|
|
"go.uber.org/zap"
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
// Tunables for requests sent to the Elasticsearch SQL endpoint.
const (
	// esSQLRequestTimeout bounds a single SQL request attempt.
	esSQLRequestTimeout = 2 * time.Second
	// esSQLFetchSize is the "fetch_size" value sent with every SQL request.
	esSQLFetchSize = 128
	// esSQLRetryAttempts is the total number of tries made for one query
	// when the failures are timeouts.
	esSQLRetryAttempts = 2
	// esSQLRetryDelay is the pause between two consecutive tries.
	esSQLRetryDelay = 3 * time.Second
)

// Patterns used while normalizing SQL text.
var (
	// whitespaceRe matches a run of one or more spaces and/or backticks.
	whitespaceRe = regexp.MustCompile("[ `]+")
	// lktksMatchRe matches ` <field> like '<value>'` or ` <field> = '<value>'`
	// where <field> ends in "_tks" or "_ltks". Capture groups: 1 = field,
	// 2 = operator (with its surrounding spaces), 3 = quoted value.
	lktksMatchRe = regexp.MustCompile(` ([a-z_]+_l?tks)( like | ?= ?)'([^']+)'`)
)
|
|
|
|
|
|
|
|
|
|
// Preprocess normalizes SQL for ES: collapses whitespace/backticks,
|
|
|
|
|
// strips '%', and rewrites `<field>_l?tks like/= 'value'` into a
|
|
|
|
|
// tokenized MATCH() call.
|
|
|
|
|
func Preprocess(sql string) string {
|
|
|
|
|
sql = whitespaceRe.ReplaceAllString(sql, " ")
|
|
|
|
|
sql = strings.ReplaceAll(sql, "%", "")
|
|
|
|
|
|
|
|
|
|
// Collect replacements so we don't re-scan tokens we've already rewritten
|
|
|
|
|
type replacement struct {
|
|
|
|
|
old, new string
|
|
|
|
|
}
|
|
|
|
|
var replaces []replacement
|
|
|
|
|
for _, m := range lktksMatchRe.FindAllStringSubmatchIndex(sql, -1) {
|
|
|
|
|
match := sql[m[0]:m[1]]
|
|
|
|
|
fld := sql[m[2]:m[3]]
|
|
|
|
|
val := sql[m[6]:m[7]]
|
|
|
|
|
tokenized, err := tokenizer.Tokenize(val)
|
|
|
|
|
if err != nil {
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
fine, err := tokenizer.FineGrainedTokenize(tokenized)
|
|
|
|
|
if err != nil {
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
replaces = append(replaces, replacement{
|
|
|
|
|
old: match,
|
fix(security): address 93 CodeQL code-scanning alerts across 61 files (#16407)
## Summary
Resolves all 93 open alerts at
https://github.com/infiniflow/ragflow/security/code-scanning by rule:
| Rule | Count | Treatment |
|------|-------|-----------|
| py/clear-text-logging-sensitive-data | 23 | Real fix — log scrubbing |
| go/path-injection | 15 | Real fix where possible, suppression with
rationale |
| go/request-forgery | 8 | Suppression with rationale
(operator-controlled URLs) |
| go/clear-text-logging | 10 | Real fix — log scrubbing |
| go/unsafe-quoting | 5 | Real fix — escape or refactor |
| go/sql-injection | 3 | Real fix — orderby whitelist + CodeQL comment |
| go/uncontrolled-allocation-size | 2 | Real fix — cap to 1024 |
| go/incorrect-integer-conversion | 3 | Real fix — ParseInt + range
check |
| go/insecure-hostkeycallback | 1 | Real fix — known_hosts file |
| go/disabled-certificate-check | 2 | Suppression with rationale |
| go/command-injection | 1 | Suppression (sanitized via shq()) |
| go/email-injection | 1 | Suppression with rationale |
| go/cookie-httponly-not-set | 1 | Suppression (SPA bootstrap) |
| js/stack-trace-exposure | 1 | Real fix — generic client message |
| js/prototype-pollution-utility | 1 | Real fix — reject
__proto__/constructor/prototype |
| py/weak-sensitive-data-hashing | 1 | Real fix — MD5 → SHA-256 |
| py/incomplete-url-substring-sanitization | 3 | Real fix —
urlparse(hostname) |
| py/paramiko-missing-host-key-validation | 1 | Real fix —
load_system_host_keys + RejectPolicy |
| cpp/integer-multiplication-cast-to-long | 2 | Real fix — cast to
size_t |
## Real fixes (with measurable security improvement)
**SSH host key verification (Go + Python)**
Replace `InsecureIgnoreHostKey()` / `paramiko.AutoAddPolicy()` with
proper host key verification against a known_hosts file (configurable
via `SSH_KNOWN_HOSTS` env / `known_hosts` config field; fail-closed when
unset). Loads `~/.ssh/known_hosts` first via `load_system_host_keys()`
so existing setups keep working.
**SQL injection in `user_canvas`**
Add `userCanvasOrderableColumns` whitelist + `userCanvasOrderClause`
helper. Both `GetList()` and `ListByTenantIDs()` now route the
user-supplied `orderby` query param through the helper, defaulting to
`create_time` on miss.
**SQL injection in `pipeline_operation_log`**
Existing whitelist documented via CodeQL comment.
**Real SQL injection in `infinity/chunk.go:931`**
Escape `'` → `''` on user-controlled `questionText` before splicing into
`filter_fulltext(...)` SQL filter.
**Real SQL injection in `elasticsearch/sql.go:75`**
Defense-in-depth escape on tokenizer output before splicing into
`MATCH(...)`.
**Python code injection in `result_protocol.go`**
Replace raw JSON literal embedding into Python/JS expressions with
base64 + `json.loads` / `JSON.parse(Buffer.from(...,
'base64').toString('utf8'))`. Eliminates both the unsafe-quoting sink
and the brittleness of mixing JSON true/false/null with Python syntax.
**URL substring check bypass in `embedding_model.py`**
Replace `if "dashscope-intl.aliyuncs.com" in u` with
`urlparse(u).hostname == "dashscope-intl.aliyuncs.com"` so a base_url
like `https://attacker.example/?u=dashscope-intl.aliyuncs.com` cannot
bypass the routing.
**Prototype pollution in `setNestedValue` (TS)**
Reject `__proto__`/`constructor`/`prototype` keys before any assignment.
**Integer overflow**
- scrypt params via `ParseInt` + non-positive check
(`internal/common/password.go`)
- `topN` and `n` caps to 1024 (retrieval_service.go, dataset.go)
- `nalloc*statesize` cast to `size_t` (cpp/re2/onepass.cc)
**Cookie httponly**
Set explicitly with rationale: this is the OAuth bootstrap cookie
intentionally read by the SPA.
**Stack trace exposure**
Replace `error.message` in HTTP 500 response with generic `"internal
error"`; full error still logged server-side via `console.error`.
**Weak hashing**
MD5 → SHA-256 for deterministic `conv_id` derivation
(`conversation_service.py`).
**Log scrubbing**
Remove or redact user-controlled / sensitive content from clear-text
logs across 8 ingestion parsers, `llm_service.py` ×11,
`tenant_llm_service.py` ×7, `misc_utils.py` ×4, `redis_conn.py` ×10,
`conftest.py` ×4, `init_data.py`, `dataset_api_service.py`,
`generator.py`, `mysql_migration.py`, `cli.go`, `user_command.go`,
`pdf_parser.go`. Most patterns converted to parameterized logging
(`logging.info("...: %d", n)`) or static messages.
## CodeQL suppressions (each with rationale)
For alerts where the data flow is genuinely safe but CodeQL can't see
the context — operator-controlled URLs, sanitized inputs, etc. — I added
`// codeql[go/<rule>] <rationale>` annotations rather than dismissing
them, so future readers can audit the rationale inline:
- `internal/agent/component/invoke.go:135` — Invoke is a generic canvas
HTTP client
- `internal/service/langfuse.go` ×2 — host is per-tenant operator config
- `internal/service/file.go:1184` — already SSRF-guarded by
`assertURLSafe`
- `internal/utility/mcp_client.go` ×3 — already `AssertURLSafe` +
IP-pinned
- `internal/entity/models/bedrock.go` — sigv4-signed request, URL can't
be tampered
- `internal/service/deep_researcher.go:269` — `callback` is SSE display
string, not SQL
- `internal/engine/infinity/chunk.go:346` — UUIDs can't contain `'` (RFC
4122)
- `internal/cli/common_command.go` ×2 — CLI trusts operator-configured
URL
- `internal/utility/smtp.go:194` — msg is server-built, not user form
input
- `internal/entity/models/*` ×14 (path-injection) — audio file paths are
caller-supplied
## Test plan
- ✅ All 13 modified Go packages build cleanly
- ✅ 663 tests pass across `internal/agent/sandbox`, `internal/common`,
`internal/agent/component`, `internal/engine/infinity`, `internal/dao`
- ✅ All 11 modified Python files parse via `ast.parse`
- ✅ TypeScript `tsc --noEmit` clean on the modified
`use-provider-fields.tsx`
- ✅ `node --check` clean on the modified JS file
🤖 Generated with [Claude Code](https://claude.com/claude-code)
2026-06-27 19:48:29 +08:00
|
|
|
// fine comes from tokenizer.FineGrainedTokenize, which strips
|
|
|
|
|
// non-alphanumerics; defense-in-depth escape just in case a
|
|
|
|
|
// future tokenizer change reintroduces a quote.
|
|
|
|
|
new: fmt.Sprintf(" MATCH(%s, '%s', 'operator=OR;minimum_should_match=30%%') ", fld, strings.ReplaceAll(fine, "'", "''")),
|
2026-06-18 18:07:27 +08:00
|
|
|
})
|
|
|
|
|
}
|
|
|
|
|
for _, r := range replaces {
|
|
|
|
|
sql = strings.Replace(sql, r.old, r.new, 1)
|
|
|
|
|
}
|
|
|
|
|
return sql
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// RunSQL posts SQL to `/_sql`, translates the response into chunk-shaped maps.
|
|
|
|
|
// Returns (nil, nil) on empty rows; (nil, error) when retries exhausted.
|
|
|
|
|
func (e *elasticsearchEngine) RunSQL(ctx context.Context, tableName string, sqlText string, kbIDs []string, format string) ([]map[string]interface{}, error) {
|
|
|
|
|
if e == nil || e.client == nil {
|
|
|
|
|
return nil, fmt.Errorf("Elasticsearch RunSQL: client not initialized")
|
|
|
|
|
}
|
|
|
|
|
if sqlText == "" {
|
|
|
|
|
return nil, fmt.Errorf("Elasticsearch RunSQL: empty SQL")
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
common.Debug("ESConnection.sql get sql", zap.String("sql", sqlText))
|
|
|
|
|
sqlText = Preprocess(sqlText)
|
|
|
|
|
common.Debug("ESConnection.sql to es", zap.String("sql", sqlText))
|
|
|
|
|
|
|
|
|
|
var lastErr error
|
|
|
|
|
for attempt := 0; attempt < esSQLRetryAttempts; attempt++ {
|
|
|
|
|
rows, err := e.runSQLOnce(ctx, sqlText, format)
|
|
|
|
|
if err == nil {
|
|
|
|
|
return rows, nil
|
|
|
|
|
}
|
|
|
|
|
lastErr = err
|
|
|
|
|
if !isTimeoutError(err) {
|
|
|
|
|
common.Warn("ESConnection.sql got exception",
|
|
|
|
|
zap.String("sql", sqlText),
|
|
|
|
|
zap.Error(err))
|
|
|
|
|
return nil, fmt.Errorf("SQL error: %w\n\nSQL: %s", err, sqlText)
|
|
|
|
|
}
|
|
|
|
|
common.Warn("ES request timeout",
|
|
|
|
|
zap.String("sql", sqlText),
|
|
|
|
|
zap.Int("attempt", attempt+1),
|
|
|
|
|
zap.Error(err))
|
|
|
|
|
if attempt < esSQLRetryAttempts-1 {
|
|
|
|
|
select {
|
|
|
|
|
case <-ctx.Done():
|
|
|
|
|
return nil, ctx.Err()
|
|
|
|
|
case <-time.After(esSQLRetryDelay):
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
common.Error(fmt.Sprintf("ESConnection.sql timeout after %d attempts. SQL: %s", esSQLRetryAttempts, sqlText), lastErr)
|
|
|
|
|
return nil, fmt.Errorf("Elasticsearch RunSQL: timeout after %d attempts: %w", esSQLRetryAttempts, lastErr)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (e *elasticsearchEngine) runSQLOnce(ctx context.Context, sqlText string, format string) ([]map[string]interface{}, error) {
|
|
|
|
|
ctx, cancel := context.WithTimeout(ctx, esSQLRequestTimeout)
|
|
|
|
|
defer cancel()
|
|
|
|
|
|
|
|
|
|
body := map[string]interface{}{
|
|
|
|
|
"query": sqlText,
|
|
|
|
|
"fetch_size": esSQLFetchSize,
|
|
|
|
|
}
|
|
|
|
|
buf, err := json.Marshal(body)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return nil, fmt.Errorf("marshal body: %w", err)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
req := esapi.SQLQueryRequest{
|
|
|
|
|
Body: bytes.NewReader(buf),
|
|
|
|
|
Format: format,
|
|
|
|
|
}
|
|
|
|
|
res, err := req.Do(ctx, e.client)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return nil, fmt.Errorf("request failed: %w", err)
|
|
|
|
|
}
|
|
|
|
|
defer res.Body.Close()
|
|
|
|
|
|
|
|
|
|
if res.IsError() {
|
|
|
|
|
errBody, _ := io.ReadAll(res.Body)
|
|
|
|
|
return nil, fmt.Errorf("status=%d body=%s", res.StatusCode, string(errBody))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Parse the SQL response.
|
|
|
|
|
var resp struct {
|
|
|
|
|
Columns []struct {
|
|
|
|
|
Name string `json:"name"`
|
|
|
|
|
Type string `json:"type"`
|
|
|
|
|
} `json:"columns"`
|
|
|
|
|
Rows [][]interface{} `json:"rows"`
|
|
|
|
|
}
|
|
|
|
|
if err := json.NewDecoder(res.Body).Decode(&resp); err != nil {
|
|
|
|
|
return nil, fmt.Errorf("decode response: %w", err)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if len(resp.Rows) == 0 {
|
|
|
|
|
return nil, nil
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Convert to chunk-shaped maps. Column names map 1:1 to JSON keys.
|
|
|
|
|
out := make([]map[string]interface{}, 0, len(resp.Rows))
|
|
|
|
|
for _, row := range resp.Rows {
|
|
|
|
|
cm := make(map[string]interface{}, len(resp.Columns))
|
|
|
|
|
for i, col := range resp.Columns {
|
|
|
|
|
if i < len(row) {
|
|
|
|
|
cm[col.Name] = row[i]
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
out = append(out, cm)
|
|
|
|
|
}
|
|
|
|
|
return out, nil
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// isTimeoutError reports whether err looks like a timeout.
//
// A nil error is never a timeout. Otherwise err counts as a timeout when
// any of the following holds:
//   - it wraps context.DeadlineExceeded;
//   - it wraps a net.Error whose Timeout method returns true;
//   - its message contains one of a few well-known timeout phrases
//     (case-sensitive), for errors that were flattened into plain strings.
func isTimeoutError(err error) bool {
	if err == nil {
		return false
	}

	var ne net.Error
	switch {
	case errors.Is(err, context.DeadlineExceeded):
		return true
	case errors.As(err, &ne) && ne.Timeout():
		return true
	}

	// Fall back to inspecting the rendered message.
	text := err.Error()
	return strings.Contains(text, "i/o timeout") ||
		strings.Contains(text, "deadline exceeded") ||
		strings.Contains(text, "connection timeout") ||
		strings.Contains(text, "context deadline")
}
|