2024-08-15 09:17:36 +08:00
|
|
|
|
#
|
|
|
|
|
|
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
|
|
|
|
|
#
|
|
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
|
|
#
|
|
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
|
#
|
|
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
|
|
# limitations under the License.
|
|
|
|
|
|
#
|
2025-12-11 17:38:17 +08:00
|
|
|
|
import asyncio
|
2024-11-14 17:13:48 +08:00
|
|
|
|
import logging
|
2024-08-15 09:17:36 +08:00
|
|
|
|
import json
|
|
|
|
|
|
import os
|
|
|
|
|
|
import time
|
|
|
|
|
|
import uuid
|
|
|
|
|
|
|
2026-02-27 12:55:51 +01:00
|
|
|
|
from peewee import IntegrityError
|
|
|
|
|
|
|
2025-11-05 08:01:39 +08:00
|
|
|
|
from api.db import UserTenantRole
|
2026-06-18 16:38:32 +08:00
|
|
|
|
from api.db.db_models import init_database_tables as init_web_db
|
2024-08-15 09:17:36 +08:00
|
|
|
|
from api.db.services import UserService
|
|
|
|
|
|
from api.db.services.canvas_service import CanvasTemplateService
|
|
|
|
|
|
from api.db.services.document_service import DocumentService
|
|
|
|
|
|
from api.db.services.knowledgebase_service import KnowledgebaseService
|
2026-06-18 16:38:32 +08:00
|
|
|
|
from api.db.services.llm_service import LLMBundle
|
2024-08-15 09:17:36 +08:00
|
|
|
|
from api.db.services.user_service import TenantService, UserTenantService
|
2026-01-04 14:21:39 +08:00
|
|
|
|
from api.db.services.system_settings_service import SystemSettingsService
|
2026-04-13 09:26:30 -03:00
|
|
|
|
from api.db.template_utils import normalize_canvas_template_categories
|
2026-01-23 16:56:03 +08:00
|
|
|
|
from api.db.joint_services.memory_message_service import init_message_id_sequence, init_memory_size_cache, fix_missing_tokenized_memory
|
2026-03-05 17:27:17 +08:00
|
|
|
|
from api.db.joint_services.tenant_model_service import get_tenant_default_model_by_type
|
2025-11-05 08:01:39 +08:00
|
|
|
|
from common.constants import LLMType
|
2025-11-02 21:05:28 +08:00
|
|
|
|
from common.file_utils import get_project_base_directory
|
2025-11-06 09:36:38 +08:00
|
|
|
|
from common import settings
|
2025-09-25 23:37:50 +08:00
|
|
|
|
from api.common.base64 import encode_to_base64
|
2024-08-15 09:17:36 +08:00
|
|
|
|
|
2025-11-24 19:02:08 +08:00
|
|
|
|
# Credentials used for the bootstrap superuser. Each value can be overridden
# through an environment variable of the same name; the literals below are the
# fallbacks used when the variable is not set.
DEFAULT_SUPERUSER_NICKNAME = os.environ.get("DEFAULT_SUPERUSER_NICKNAME", "admin")
DEFAULT_SUPERUSER_EMAIL = os.environ.get("DEFAULT_SUPERUSER_EMAIL", "admin@ragflow.io")
DEFAULT_SUPERUSER_PASSWORD = os.environ.get("DEFAULT_SUPERUSER_PASSWORD", "admin")
|
2024-09-25 18:30:27 +08:00
|
|
|
|
|
2025-11-24 19:02:08 +08:00
|
|
|
|
def init_superuser(nickname=DEFAULT_SUPERUSER_NICKNAME, email=DEFAULT_SUPERUSER_EMAIL, password=DEFAULT_SUPERUSER_PASSWORD, role=UserTenantRole.OWNER):
    """Create the superuser account together with its tenant and membership.

    Does nothing (beyond logging) when a user with ``email`` already exists.
    After the records are written, the chat and embedding models referenced by
    the new tenant are each exercised once with a "Hello!" request and a
    failure is logged; the function itself returns ``None`` in every case.

    Args:
        nickname: Nickname stored on the user; also used for the tenant name.
        email: Email of the account; used for the existence check.
        password: Password, stored after ``encode_to_base64``.
        role: Role written to the user/tenant membership row.
    """
    if UserService.query(email=email):
        logging.info("User with email %s already exists, skipping initialization.", email)
        return

    # The same generated id is reused for the user, the tenant, and the
    # membership row that links them.
    superuser_id = uuid.uuid1().hex
    account = {
        "id": superuser_id,
        "password": encode_to_base64(password),
        "nickname": nickname,
        "is_superuser": True,
        "email": email,
        "creator": "system",
        "status": "1",
    }
    tenant_record = {
        "id": superuser_id,
        "name": nickname + "‘s Kingdom",
        "llm_id": settings.CHAT_MDL,
        "embd_id": settings.EMBEDDING_MDL,
        "asr_id": settings.ASR_MDL,
        "parser_ids": settings.PARSERS,
        "img2txt_id": settings.IMAGE2TEXT_MDL,
        "rerank_id": settings.RERANK_MDL,
    }
    membership = {
        "tenant_id": superuser_id,
        "user_id": superuser_id,
        "invited_by": superuser_id,
        "role": role,
    }

    try:
        saved = UserService.save(**account)
    except IntegrityError:
        # The row appeared between the existence check above and the save.
        logging.info("User with email %s already exists, skipping.", email)
        return
    if not saved:
        logging.error("can't init admin.")
        return

    TenantService.insert(**tenant_record)
    UserTenantService.insert(**membership)
    logging.info(
        f"Super user initialized. email: {email},A default password has been set; changing the password after login is strongly recommended.")

    # Sanity-check the chat model: a reply starting with "ERROR: " is logged
    # as a failure.
    if tenant_record["llm_id"]:
        chat_config = get_tenant_default_model_by_type(superuser_id, LLMType.CHAT)
        chat_bundle = LLMBundle(superuser_id, chat_config)
        reply = asyncio.run(
            chat_bundle.async_chat(
                system="",
                history=[{"role": "user", "content": "Hello!"}],
                gen_conf={},
            )
        )
        if reply.startswith("ERROR: "):
            logging.error("'{}' doesn't work. {}".format(tenant_record["llm_id"], reply))

    # Sanity-check the embedding model: a zero second element from encode()
    # is logged as a failure.
    if tenant_record["embd_id"]:
        embd_config = get_tenant_default_model_by_type(superuser_id, LLMType.EMBEDDING)
        embd_bundle = LLMBundle(superuser_id, embd_config)
        _, count = embd_bundle.encode(["Hello!"])
        if count == 0:
            # The model identifier is deliberately left out of this message:
            # CodeQL flags strings sourced from tenant config as potentially
            # sensitive data logged in clear text, even though the ID itself
            # is non-sensitive.
            logging.error("embedding model failed sanity-check encode")
|
2024-08-15 09:17:36 +08:00
|
|
|
|
|
|
|
|
|
|
|
2026-05-29 17:39:41 +08:00
|
|
|
|
def update_document_number_in_init():
    """Write the current document count onto every knowledge base.

    Counts come from ``DocumentService.get_all_kb_doc_count()``; a knowledge
    base that has no entry in that mapping is updated with a count of 0.
    """
    counts_by_kb = DocumentService.get_all_kb_doc_count()
    for kb_id in KnowledgebaseService.get_all_ids():
        doc_num = counts_by_kb.get(kb_id, 0)
        KnowledgebaseService.update_document_number_in_init(kb_id=kb_id, doc_num=doc_num)
|
2025-01-09 17:07:21 +08:00
|
|
|
|
|
2024-08-15 09:17:36 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def add_graph_templates():
    """Reload the canvas templates from ``<project base>/agent/templates``.

    Existing templates are deleted first. Each ``*.json`` file in the
    directory is then loaded in sorted filename order, normalized, and saved;
    if saving raises, the template is updated by its ``id`` instead. A failure
    on one file is logged and does not stop the remaining files.
    """
    templates_dir = os.path.join(get_project_base_directory(), "agent", "templates")

    # Unconditional filter: removes every stored template, and does so before
    # the directory check below.
    CanvasTemplateService.filter_delete([True])

    if not os.path.exists(templates_dir):
        logging.warning("Missing agent templates!")
        return

    for file_name in sorted(os.listdir(templates_dir)):
        if not file_name.endswith(".json"):
            logging.debug("Skipping non-json template file in %s: %s", templates_dir, file_name)
            continue

        template_path = os.path.join(templates_dir, file_name)
        try:
            with open(template_path, "r", encoding="utf-8") as fh:
                raw_template = json.load(fh)
                template = normalize_canvas_template_categories(raw_template)
            logging.info("Loaded and normalized template file: %s", template_path)

            # Insert first; fall back to an in-place update when the save
            # raises for any reason.
            try:
                CanvasTemplateService.save(**template)
            except Exception:
                CanvasTemplateService.update_by_id(template["id"], template)
        except Exception as err:
            logging.exception("Add agent templates error for %s: %s", template_path, err)
|
2024-08-15 09:17:36 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def init_web_data():
    """Run the data-initialization steps in order and log the elapsed time.

    Steps: system settings table, knowledge-base document counts, agent
    templates, then the memory/message helpers.
    """
    started_at = time.time()

    init_table()

    # init_llm_factory()
    update_document_number_in_init()
    # if not UserService.get_all().count():
    #    init_superuser()

    add_graph_templates()

    memory_steps = (
        init_message_id_sequence,
        init_memory_size_cache,
        fix_missing_tokenized_memory,
    )
    for step in memory_steps:
        step()

    elapsed = time.time() - started_at
    logging.info("init web data success:{}".format(elapsed))
|
2024-08-15 09:17:36 +08:00
|
|
|
|
|
2026-01-04 14:21:39 +08:00
|
|
|
|
def init_table():
    """Insert system settings that exist in the config file but not in the DB.

    Reads the ``"system_settings"`` list from
    ``<project base>/conf/system_settings.json`` and bulk-inserts every record
    whose ``"name"`` is not already returned by
    ``SystemSettingsService.get_all()``. Records that already exist are left
    untouched.

    Raises:
        Exception: Any error raised by the bulk insert is logged and
            re-raised. Errors from opening or parsing the file, or a missing
            ``"system_settings"`` / ``"name"`` key, propagate unchanged.
    """
    settings_path = os.path.join(get_project_base_directory(), "conf", "system_settings.json")
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    with open(settings_path, "r", encoding="utf-8") as f:
        records_from_file = json.load(f)["system_settings"]

    # Only membership by name is needed, so a set is sufficient.
    existing_names = {record.name for record in SystemSettingsService.get_all()}
    to_save = [record for record in records_from_file if record["name"] not in existing_names]

    if not to_save:
        return

    # Some settings are missing from the database: insert them in one batch.
    try:
        SystemSettingsService.insert_many(to_save, len(to_save))
    except Exception as e:
        logging.exception("System settings init error: {}".format(e))
        raise
|
|
|
|
|
|
|
2024-08-15 09:17:36 +08:00
|
|
|
|
|
|
|
|
|
|
# Script entry point: create the database tables, then populate initial data.
if __name__ == "__main__":
    init_web_db()
    init_web_data()
|