2026-04-03 20:01:37 +08:00
#
# Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
MySQL Data Migration Script
This script provides a flexible MySQL data migration tool that supports:
1. MySQL configuration via config file or command line arguments
2. Direct peewee operations without importing api.db.services
3. Configurable migration stages via command line
4. Migration logging with table names, row counts, and duration
"""
import argparse
2026-06-02 13:08:58 +08:00
import json
2026-04-03 20:01:37 +08:00
import logging
import os
import sys
import time
import uuid
2026-06-03 11:51:42 +08:00
from packaging . version import InvalidVersion , Version
2026-04-03 20:01:37 +08:00
from peewee import (
CharField ,
IntegerField ,
BigIntegerField ,
DateTimeField ,
MySQLDatabase ,
Model ,
PrimaryKeyField ,
TextField ,
)
from playhouse . migrate import MySQLMigrator
# Add project root to path for imports
# The project root sits three directory levels above this file; it is placed
# first on sys.path so project modules can be imported from this script.
_this_file = os.path.abspath(__file__)
PROJECT_BASE = os.path.dirname(os.path.dirname(os.path.dirname(_this_file)))
sys.path.insert(0, PROJECT_BASE)

# Root logging configuration plus the module-level logger used throughout.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

# Name of the system_settings row that records the migrated database version.
MIGRATION_DB_VERSION_MARKER = "mysql_migration.database.version"
2026-04-03 20:01:37 +08:00
class MigrationConfig:
    """Connection settings for the MySQL database being migrated.

    Attributes:
        host: MySQL server host name.
        port: MySQL server port.
        user: Login user.
        password: Login password.
        database: Name of the database (schema) to operate on.
    """

    def __init__(self, host: str = "localhost", port: int = 3306, user: str = "root", password: str = "", database: str = "rag_flow"):
        self.host = host
        self.port = port
        self.user = user
        self.password = password
        self.database = database

    @classmethod
    def _from_mapping(cls, config) -> "MigrationConfig":
        """Build a config from an already-parsed YAML document.

        The connection section is read from the ``database`` key, falling back
        to ``mysql``. Anything that is not a mapping (an empty document, a
        null section, a list, ...) is treated as "no settings", so the
        defaults apply instead of an attribute error.
        """
        if not isinstance(config, dict):
            config = {}
        db_config = config.get("database", config.get("mysql", {}))
        if not isinstance(db_config, dict):
            db_config = {}
        return cls(
            host=db_config.get("host", "localhost"),
            port=db_config.get("port", 3306),
            user=db_config.get("user", "root"),
            password=db_config.get("password", ""),
            # The schema name may be stored under "name" or "database".
            database=db_config.get("name", db_config.get("database", "rag_flow")),
        )

    @classmethod
    def from_config_file(cls, config_path: str) -> "MigrationConfig":
        """Load configuration from a YAML config file.

        Loading is best-effort: any failure (missing file, missing YAML
        library, malformed content) is logged as a warning and the default
        configuration is returned.
        """
        try:
            from ruamel.yaml import YAML

            yaml = YAML(typ="safe", pure=True)
            # Explicit encoding so the result does not depend on the platform default.
            with open(config_path, "r", encoding="utf-8") as f:
                config = yaml.load(f)
            return cls._from_mapping(config)
        except Exception as e:
            logger.warning(f"Failed to load config file: {e}, using defaults")
            return cls()
class MigrationStats:
    """Track migration statistics across stages and log a final summary."""

    def __init__(self):
        # Every table name reported by any stage; names may repeat.
        self.tables_operated = []
        # Sum of the row counts reported by all stages.
        self.rows_processed = 0
        # Wall-clock bounds of the whole run (time.time() values, or None).
        self.start_time = None
        self.end_time = None
        # One dict per stage: {"stage", "tables", "rows", "duration"}.
        self.stage_stats = []

    def start(self):
        """Record the start time of the run."""
        self.start_time = time.time()

    def end(self):
        """Record the end time of the run."""
        self.end_time = time.time()

    def add_stage_stats(self, stage_name: str, tables: list, rows: int, duration: float):
        """Record the outcome of one stage and fold it into the totals.

        Args:
            stage_name: Name of the stage that ran.
            tables: Tables the stage operated on.
            rows: Number of rows the stage processed.
            duration: Stage duration in seconds.
        """
        # Copy so later mutation of the caller's list cannot alter recorded stats.
        tables = list(tables)
        self.stage_stats.append({"stage": stage_name, "tables": tables, "rows": rows, "duration": duration})
        self.tables_operated.extend(tables)
        self.rows_processed += rows

    def _unique_tables(self) -> list:
        """Return operated tables without duplicates, in first-seen order."""
        return list(dict.fromkeys(self.tables_operated))

    def print_summary(self):
        """Log totals followed by one line per recorded stage."""
        if self.start_time is not None and self.end_time is not None:
            duration = self.end_time - self.start_time
        else:
            # The run was never started or never finished.
            duration = 0
        logger.info("=" * 60)
        logger.info("Migration Summary")
        logger.info("=" * 60)
        logger.info(f"Total Duration: {duration:.2f}s")
        logger.info(f"Total Rows Processed: {self.rows_processed}")
        # Ordered de-duplication keeps the table list stable between runs.
        logger.info(f"Tables Operated: {', '.join(self._unique_tables())}")
        logger.info("-" * 60)
        logger.info("Stage Details:")
        for stat in self.stage_stats:
            logger.info(f"  [{stat['stage']}] Tables: {', '.join(stat['tables'])}, Rows: {stat['rows']}, Duration: {stat['duration']:.2f}s")
        logger.info("=" * 60)
class MigrationDatabase:
    """Thin wrapper around a peewee MySQL connection used by the migration stages."""

    def __init__(self, config: MigrationConfig):
        self.config = config
        self.db = MySQLDatabase(
            config.database,
            host=config.host,
            port=config.port,
            user=config.user,
            password=config.password,
            charset="utf8mb4",
        )
        self.migrator = MySQLMigrator(self.db)

    def connect(self):
        """Open the connection and log the database name."""
        self.db.connect()
        logger.info(f"Connected to MySQL database: {self.config.database}")

    def close(self):
        """Close the connection if it is open; otherwise do nothing."""
        if self.db.is_closed():
            return
        self.db.close()
        logger.info("Database connection closed")

    def execute_sql(self, sql: str, params=None):
        """Run a raw SQL statement and return the resulting cursor."""
        return self.db.execute_sql(sql, params)

    def _has_rows(self, count_sql: str, params: tuple) -> bool:
        """Run a COUNT(*) query and report whether the count is positive."""
        found = self.execute_sql(count_sql, params).fetchone()[0]
        return found > 0

    def table_exists(self, table_name: str) -> bool:
        """Return True when ``table_name`` exists in the configured schema."""
        return self._has_rows(
            "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = %s AND table_name = %s",
            (self.config.database, table_name),
        )

    def column_exists(self, table_name: str, column_name: str) -> bool:
        """Return True when ``column_name`` exists on ``table_name`` in the configured schema."""
        return self._has_rows(
            "SELECT COUNT(*) FROM information_schema.columns WHERE table_schema = %s AND table_name = %s AND column_name = %s",
            (self.config.database, table_name, column_name),
        )

    def get_system_setting_value(self, name: str) -> str | None:
        """Return the ``value`` of the system_settings row called ``name``.

        Returns None when the table is missing or no such row exists.
        """
        if not self.table_exists("system_settings"):
            logger.info("Table 'system_settings' does not exist, migration marker is unavailable")
            return None
        result = self.execute_sql(
            "SELECT `value` FROM `system_settings` WHERE `name` = %s",
            (name,),
        )
        match = result.fetchone()
        if match:
            return match[0]
        return None

    def upsert_system_setting(self, name: str, value: str, source: str = "migration", data_type: str = "string"):
        """Insert or update a system_settings row.

        On a key conflict the source, data type, value and update timestamps
        are overwritten. Nothing is written (a warning is logged) when the
        table does not exist.
        """
        if not self.table_exists("system_settings"):
            logger.warning("Table 'system_settings' does not exist, migration marker was not saved")
            return
        now_s = int(time.time())
        # *_time columns receive milliseconds; *_date columns are derived from seconds.
        now_ms = now_s * 1000
        row = (name, source, data_type, value, now_ms, now_s, now_ms, now_s)
        self.execute_sql(
            """
            INSERT INTO `system_settings`
            (`name`, `source`, `data_type`, `value`, `create_time`, `create_date`, `update_time`, `update_date`)
            VALUES (%s, %s, %s, %s, %s, FROM_UNIXTIME(%s), %s, FROM_UNIXTIME(%s))
            ON DUPLICATE KEY UPDATE
            `source` = VALUES(`source`),
            `data_type` = VALUES(`data_type`),
            `value` = VALUES(`value`),
            `update_time` = VALUES(`update_time`),
            `update_date` = VALUES(`update_date`)
            """,
            row,
        )

    def get_database_version(self) -> str | None:
        """Return the stored migration version marker, if any."""
        return self.get_system_setting_value(MIGRATION_DB_VERSION_MARKER)

    def set_database_version(self, version: str):
        """Store ``version`` as the migration version marker."""
        self.upsert_system_setting(MIGRATION_DB_VERSION_MARKER, version)
def parse_migration_version(version: str | None) -> Version | None:
    """Parse a version string, tolerating a leading ``v``/``V``.

    Returns None for an empty/None input, and None (after logging a warning)
    when the text is not a valid version.
    """
    if not version:
        return None
    candidate = version.strip()
    # Drop a single leading "v"/"V" so tags such as "v0.20.0" are accepted.
    if candidate[:1] in ("v", "V"):
        candidate = candidate[1:]
    try:
        parsed = Version(candidate)
    except InvalidVersion:
        logger.warning("Invalid migration version format: %s", version)
        return None
    return parsed
def should_skip_migration(current_db_version: str | None, target_version: str) -> bool:
    """Tell whether the database is already at or beyond ``target_version``.

    Both versions are parsed first; if either cannot be parsed the migration
    is not skipped.
    """
    current = parse_migration_version(current_db_version)
    target = parse_migration_version(target_version)
    both_known = current is not None and target is not None
    if not both_known:
        return False
    return current >= target
2026-04-03 20:01:37 +08:00
# Define model classes for migration (not importing from api.db.db_models)
class BaseModel(Model):
    """Common base for the migration's table models.

    Supplies the four nullable, indexed audit columns shared by the models
    declared in this script.
    """

    # Creation timestamp stored as an integer, with its DATETIME counterpart.
    create_time = BigIntegerField(null=True, index=True)
    create_date = DateTimeField(null=True, index=True)
    # Last-update timestamp stored as an integer, with its DATETIME counterpart.
    update_time = BigIntegerField(null=True, index=True)
    update_date = DateTimeField(null=True, index=True)

    class Meta:
        database = None  # Will be set dynamically
class TenantLLM(BaseModel):
    """Model of the ``tenant_llm`` table, the source of the migration."""

    id = PrimaryKeyField()
    # Owning tenant and the provider ("factory") the model belongs to.
    tenant_id = CharField(max_length=32, null=False, index=True)
    llm_factory = CharField(max_length=128, null=False, index=True)
    model_type = CharField(max_length=128, null=True, index=True)
    llm_name = CharField(max_length=128, null=True, default="", index=True)
    # Credential text and endpoint; both columns are nullable.
    api_key = TextField(null=True)
    api_base = CharField(max_length=255, null=True)
    max_tokens = IntegerField(default=8192, index=True)
    used_tokens = IntegerField(default=0, index=True)
    # Single-character status flag; defaults to "1".
    status = CharField(max_length=1, null=False, default="1", index=True)

    class Meta:
        table_name = "tenant_llm"
        database = None
class TenantModelProvider(BaseModel):
    """Model of the ``tenant_model_provider`` table, a migration target."""

    # 32-character string primary key.
    id = CharField(max_length=32, primary_key=True)
    provider_name = CharField(max_length=128, null=False, index=True)
    tenant_id = CharField(max_length=32, null=False, index=True)

    class Meta:
        table_name = "tenant_model_provider"
        database = None
class MigrationStage:
    """Abstract base for a single migration stage.

    Subclasses override ``check`` and ``execute`` (and optionally
    ``create_target_table``) and set the class-level descriptors below.
    """

    name = "base_stage"
    description = "Base migration stage"
    source_tables = []
    target_tables = []

    def __init__(self, db: "MigrationDatabase", dry_run: bool = True, create_table_only: bool = False):
        # Flag a stage raises when finding nothing to do means the migration is complete.
        self._noop_completes_migration = False
        self.create_table_only = create_table_only
        self.dry_run = dry_run
        self.db = db

    def check(self) -> bool:
        """Report whether this stage has work to do; subclasses must implement."""
        raise NotImplementedError

    def execute(self) -> tuple[int, list]:
        """Run the stage and return (rows_affected, tables_operated); subclasses must implement."""
        raise NotImplementedError

    def create_target_table(self):
        """Create the stage's target table; the base implementation does nothing."""
        return None

    def mark_noop_completes_migration(self):
        """Record that a no-op result counts as a completed migration."""
        self._noop_completes_migration = True

    def noop_completes_migration(self) -> bool:
        """Return the flag set by ``mark_noop_completes_migration``."""
        return self._noop_completes_migration
2026-04-03 20:01:37 +08:00
class TenantModelProviderStage(MigrationStage):
    """Stage that fills tenant_model_provider from tenant_llm.

    One provider row is created for every distinct (tenant_id, llm_factory)
    pair in tenant_llm that has no matching (tenant_id, provider_name) row yet.
    """

    name = "tenant_model_provider"
    description = "Migrate tenant_llm.llm_factory to tenant_model_provider.provider_name"
    source_tables = ["tenant_llm"]
    target_tables = ["tenant_model_provider"]

    def current_timestamp(self) -> int:
        """Return the current Unix time in whole seconds."""
        return int(time.time())

    def generate_uuid(self) -> str:
        """Return a UUID1 rendered as 32 hexadecimal characters."""
        return uuid.uuid1().hex

    def check(self) -> bool:
        """Decide whether this stage has anything to do.

        Returns False when the source table is missing, when the target table
        is missing in dry-run mode, or when no tenant_llm row lacks a provider
        (in which case the no-op-completes flag is also set). Returns True
        when the target must be created or rows are pending.
        """
        if not self.db.table_exists("tenant_llm"):
            logger.warning("Source table 'tenant_llm' does not exist")
            return False

        target_present = self.db.table_exists("tenant_model_provider")
        if not target_present and self.dry_run:
            logger.info("[DRY RUN] Target table 'tenant_model_provider' does not exist. Use --execute to create and populate the table.")
            return False
        if not target_present:
            logger.info("Target table 'tenant_model_provider' does not exist, will create")
            return True

        # Count source rows whose (tenant, factory) pair has no provider row yet.
        pending_cursor = self.db.execute_sql(
            "SELECT COUNT(*) FROM tenant_llm t1 WHERE NOT EXISTS (SELECT 1 FROM tenant_model_provider t2 WHERE t2.tenant_id = t1.tenant_id AND t2.provider_name = t1.llm_factory)"
        )
        pending = pending_cursor.fetchone()[0]

        if pending != 0:
            logger.info(f"Found {pending} rows to migrate from tenant_llm to tenant_model_provider")
            return True

        self.mark_noop_completes_migration()
        logger.info("No new data to migrate from tenant_llm to tenant_model_provider")
        return False

    def execute(self) -> tuple[int, list]:
        """Create the target table if needed and insert the missing provider rows.

        Returns:
            (rows, tables): rows inserted (or that would be inserted in
            dry-run mode) and the tables operated on.
        """
        now_s = self.current_timestamp()

        if not self.db.table_exists("tenant_model_provider"):
            if self.dry_run:
                logger.info("[DRY RUN] Target table 'tenant_model_provider' does not exist. Use --execute to create and populate the table.")
                return 0, []
            logger.info("Target table 'tenant_model_provider' does not exist, will create")
            self.create_target_table()

        if self.create_table_only:
            # Table-only mode stops once the table is known to exist.
            logger.info("[CREATE TABLE ONLY] Target table created/verified, skipping data migration")
            return 0, self.target_tables

        # Distinct (tenant_id, llm_factory) pairs that are absent from the target.
        pairs = self.db.execute_sql(
            "SELECT DISTINCT tenant_id, llm_factory FROM tenant_llm t1 "
            "WHERE NOT EXISTS ("
            "SELECT 1 FROM tenant_model_provider t2 "
            "WHERE t2.tenant_id = t1.tenant_id AND t2.provider_name = t1.llm_factory"
            ")"
        ).fetchall()

        if not pairs:
            logger.info("No records to migrate")
            return 0, []

        total = len(pairs)
        logger.info(f"Migrating {total} unique tenant_id/llm_factory pairs...")

        if self.dry_run:
            logger.info(f"[DRY RUN] Would insert {total} records")
            return total, self.target_tables

        # Parameterized multi-row inserts, 100 rows per statement.
        now_ms = now_s * 1000
        row_sql = "(%s, %s, %s, %s, FROM_UNIXTIME(%s), %s, FROM_UNIXTIME(%s))"
        chunk_size = 100
        inserted = 0
        for batch_no, offset in enumerate(range(0, total, chunk_size), start=1):
            chunk = pairs[offset : offset + chunk_size]
            params = []
            for tenant_id, llm_factory in chunk:
                params += [self.generate_uuid(), llm_factory, tenant_id, now_ms, now_s, now_ms, now_s]
            value_rows = [row_sql] * len(chunk)
            insert_sql = f"""
                INSERT INTO tenant_model_provider
                (id, provider_name, tenant_id, create_time, create_date, update_time, update_date)
                VALUES {", ".join(value_rows)}
            """
            self.db.execute_sql(insert_sql, params)
            inserted += len(chunk)
            logger.info(f"Inserted batch {batch_no}: {len(chunk)} records")

        return inserted, self.target_tables

    def create_target_table(self):
        """Create the tenant_model_provider table if it does not exist."""
        ddl = """
            CREATE TABLE IF NOT EXISTS tenant_model_provider (
                id VARCHAR(32) NOT NULL PRIMARY KEY,
                provider_name VARCHAR(128) NOT NULL,
                tenant_id VARCHAR(32) NOT NULL,
                create_time BIGINT,
                create_date DATETIME,
                update_time BIGINT,
                update_date DATETIME,
                INDEX idx_provider_name (provider_name),
                INDEX idx_tenant_id (tenant_id),
                UNIQUE INDEX idx_tenant_provider_unique (tenant_id, provider_name)
            ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
        """
        self.db.execute_sql(ddl)
        logger.info("Created tenant_model_provider table")
2026-04-09 11:03:39 +08:00
class TenantModelInstanceStage(MigrationStage):
    """Migrate tenant_llm to tenant_model_instance.

    For every (tenant_id, llm_factory, api_key) combination in tenant_llm that
    has a provider row but no instance row yet, one tenant_model_instance row
    named "default" is created under that provider.
    """

    name = "tenant_model_instance"
    description = "Migrate tenant_llm to tenant_model_instance with provider_id lookup"
    source_tables = ["tenant_llm", "tenant_model_provider"]
    target_tables = ["tenant_model_instance"]

    # FROM/WHERE/GROUP BY tail shared by check() and execute() so both always
    # select the same pending rows. execute() stores a NULL source api_key as
    # '' in the target, so the comparison uses COALESCE: a plain
    # "tmi.api_key = tl.api_key" is never true for NULL, which made such rows
    # look pending (and get re-inserted) on every run.
    # NOTE(review): rows dropped by the is_tools dedup in execute() still look
    # pending to this SQL on a later run, because their raw api_key differs
    # from the stored one — confirm whether re-runs need a Python-side check.
    _PENDING_ROWS_SQL = (
        "FROM tenant_llm tl "
        "INNER JOIN tenant_model_provider tmp ON tmp.tenant_id = tl.tenant_id AND tmp.provider_name = tl.llm_factory "
        "WHERE NOT EXISTS ("
        "SELECT 1 FROM tenant_model_instance tmi "
        "WHERE tmi.provider_id = tmp.id AND tmi.api_key = COALESCE(tl.api_key, '')"
        ") "
        "GROUP BY tl.tenant_id, tl.llm_factory, tl.api_key, tmp.id"
    )

    def current_timestamp(self) -> int:
        """Return the current Unix time in whole seconds."""
        return int(time.time())

    def generate_uuid(self) -> str:
        """Generate 32-character UUID1"""
        return uuid.uuid1().hex

    def check(self) -> bool:
        """Check if migration is needed.

        Returns False when the source or the tenant_model_provider dependency
        table is missing, when the target table is missing in dry-run mode, or
        when no rows are pending (the no-op-completes flag is then set).
        Returns True when the target table must be created or rows are pending.
        """
        # Check if source table exists
        if not self.db.table_exists("tenant_llm"):
            logger.warning("Source table 'tenant_llm' does not exist")
            return False

        # Check if tenant_model_provider exists (dependency)
        if not self.db.table_exists("tenant_model_provider"):
            if self.dry_run:
                logger.info("[DRY RUN] Dependency table 'tenant_model_provider' does not exist. Run 'tenant_model_provider' stage first or use --execute.")
            else:
                logger.warning("Dependency table 'tenant_model_provider' does not exist. Please run 'tenant_model_provider' stage first.")
            return False

        # Check if target table exists
        if not self.db.table_exists("tenant_model_instance"):
            if self.dry_run:
                logger.info("[DRY RUN] Target table 'tenant_model_instance' does not exist. Use --execute to create and populate the table.")
                return False
            logger.info("Target table 'tenant_model_instance' does not exist, will create")
            return True

        # Check if there's data to migrate (distinct by tenant_id, llm_factory, api_key)
        cursor = self.db.execute_sql(
            "SELECT COUNT(*) FROM ("
            "SELECT tl.tenant_id, tl.llm_factory, tl.api_key, tmp.id AS provider_id "
            + self._PENDING_ROWS_SQL
            + ") AS distinct_records"
        )
        count = cursor.fetchone()[0]
        if count == 0:
            self.mark_noop_completes_migration()
            logger.info("No new data to migrate from tenant_llm to tenant_model_instance")
            return False

        logger.info(f"Found {count} rows to migrate from tenant_llm to tenant_model_instance")
        return True

    def execute(self) -> tuple[int, list]:
        """Execute migration.

        Returns:
            (rows, tables): rows inserted (or that would be inserted in
            dry-run mode) and the tables operated on.
        """
        current_ts = self.current_timestamp()
        rows_inserted = 0

        # Check if tenant_model_provider exists (dependency)
        if not self.db.table_exists("tenant_model_provider"):
            logger.error("Dependency table 'tenant_model_provider' does not exist. Please run 'tenant_model_provider' stage first.")
            return 0, []

        # Check if target table exists
        if not self.db.table_exists("tenant_model_instance"):
            if self.dry_run:
                logger.info("[DRY RUN] Target table 'tenant_model_instance' does not exist. Use --execute to create and populate the table.")
                return 0, []
            logger.info("Target table 'tenant_model_instance' does not exist, will create")
            self.create_target_table()

        # If create_table_only mode, skip data migration
        if self.create_table_only:
            logger.info("[CREATE TABLE ONLY] Target table created/verified, skipping data migration")
            return 0, self.target_tables

        # Get records from tenant_llm with provider_id lookup, grouped by
        # tenant_id, llm_factory, api_key to get distinct records.
        cursor = self.db.execute_sql(
            "SELECT tl.tenant_id, tl.llm_factory, tl.api_key, MAX(tl.status) AS status, tmp.id AS provider_id "
            + self._PENDING_ROWS_SQL
        )
        records = cursor.fetchall()
        if not records:
            logger.info("No records to migrate")
            return 0, []

        # Deduplicate records where api_keys differ only by is_tools encoding.
        # When a plain api_key is wrapped into {"api_key": "...", "is_tools": true/false},
        # multiple tenant_llm rows for the same provider can have logically identical
        # api_keys that only differ in the is_tools field.
        records = self._dedup_api_key_records(records)

        logger.info(f"Migrating {len(records)} tenant_model_instance records...")

        if self.dry_run:
            logger.info(f"[DRY RUN] Would insert {len(records)} records")
            # Preview at most five rows; the api_key is never printed.
            for *_, provider_id in records[:5]:
                logger.info(f"  instance_name=default, provider_id={provider_id}, api_key=***")
            if len(records) > 5:
                logger.info(f"  ... and {len(records) - 5} more records")
            return len(records), self.target_tables

        # Insert records in batches with parameterized SQL, so quotes and
        # backslashes inside api_key values are stored verbatim and cannot
        # alter the statement.
        row_placeholder = "(%s, %s, %s, %s, %s, %s, FROM_UNIXTIME(%s), %s, FROM_UNIXTIME(%s))"
        current_ms = current_ts * 1000
        batch_size = 100
        for i in range(0, len(records), batch_size):
            batch = records[i : i + batch_size]
            params = []
            for _tenant_id, _llm_factory, api_key, status, provider_id in batch:
                status_val = "active" if status in ["1", "active", "enable"] else "inactive"
                params.extend(
                    [
                        self.generate_uuid(),
                        "default",  # instance_name
                        provider_id,
                        api_key or "",  # the target column is NOT NULL
                        status_val,
                        current_ms,
                        current_ts,
                        current_ms,
                        current_ts,
                    ]
                )
            insert_sql = f"""
                INSERT INTO tenant_model_instance
                (id, instance_name, provider_id, api_key, status, create_time, create_date, update_time, update_date)
                VALUES {", ".join([row_placeholder] * len(batch))}
            """
            self.db.execute_sql(insert_sql, params)
            rows_inserted += len(batch)
            logger.info(f"Inserted batch {i // batch_size + 1}: {len(batch)} records")

        return rows_inserted, self.target_tables

    @staticmethod
    def _strip_is_tools_from_api_key(api_key: str) -> str:
        """Strip is_tools from api_key for dedup comparison.

        Handles three api_key formats:
        1. Plain string (e.g. "sk-xxx" or "x") — returned as-is.
        2. JSON with only {"api_key": "...", "is_tools": true/false} — extract the inner api_key value.
        3. JSON with factory-specific fields + optional "is_tools" — remove only the "is_tools" key.

        For format 3, the factory-specific JSON structures are:
            VolcEngine: {"ark_api_key": ..., "endpoint_id": ...}
            Tencent Cloud: {"tencent_cloud_sid": ..., "tencent_cloud_sk": ...}
            Bedrock: {"auth_mode": ..., "bedrock_ak": ..., "bedrock_sk": ..., "bedrock_region": ..., "aws_role_arn": ...}
            XunFei Spark (tts): {"spark_app_id": ..., "spark_api_secret": ..., "spark_api_key": ...}
            BaiduYiyan: {"yiyan_ak": ..., "yiyan_sk": ...}
            Fish Audio: {"fish_audio_ak": ..., "fish_audio_refid": ...}
            Google Cloud: {"google_project_id": ..., "google_region": ..., "google_service_account_key": ...}
            Azure-OpenAI: {"api_key": ..., "api_version": ...}
            OpenRouter: {"api_key": ..., "provider_order": ...}
            MinerU: {"api_key": ..., "provider_order": ...}
            PaddleOCR: {"api_key": ..., "provider_order": ...}
            OpenDataLoader: {"api_key": ..., "provider_order": ...}

        Empty/None input and anything that is not a JSON object is returned
        unchanged. JSON objects are re-serialized with sorted keys so key order
        does not affect comparison.
        """
        if not api_key:
            return api_key
        try:
            parsed = json.loads(api_key)
        except (json.JSONDecodeError, TypeError, ValueError):
            return api_key
        if not isinstance(parsed, dict):
            return api_key

        # Case 2: {"api_key": "...", "is_tools": true/false} — extract inner api_key
        if set(parsed) <= {"api_key", "is_tools"}:
            inner = parsed.get("api_key", "")
            if isinstance(inner, str):
                return inner
            # A non-string inner value is serialized so the result is always a
            # hashable string usable as a dedup key.
            return json.dumps(inner, sort_keys=True)

        # Case 3 (or a dict without is_tools): drop is_tools if present, then
        # canonicalize the remaining payload.
        payload = {k: v for k, v in parsed.items() if k != "is_tools"}
        return json.dumps(payload, sort_keys=True)

    def _dedup_api_key_records(self, records: list) -> list:
        """Deduplicate records whose api_keys are logically identical after stripping is_tools.

        Records are grouped by (tenant_id, llm_factory, provider_id). Within
        each group only the first record per canonical api_key (is_tools
        removed; None treated as "") is kept, with its original api_key value.
        Output order is groups in first-seen order, then records in first-seen
        order within each group.

        NOTE(review): the kept record carries its own status; a dropped
        duplicate's status is not merged — confirm this is intended.
        """
        grouped = {}  # (tenant_id, llm_factory, provider_id) -> {canonical api_key: first record}
        dup_count = 0
        for rec in records:
            tenant_id, llm_factory, api_key, _status, provider_id = rec
            bucket = grouped.setdefault((tenant_id, llm_factory, provider_id), {})
            # None and "" are both stored as "" by execute(), so they are one key.
            canonical = self._strip_is_tools_from_api_key(api_key) or ""
            if canonical in bucket:
                dup_count += 1
                # Credentials are deliberately not logged, not even as a prefix.
                logger.debug(
                    "Dedup api_key for tenant=%s, factory=%s, provider=%s: dropping a record whose api_key differs from the kept one only by is_tools",
                    tenant_id,
                    llm_factory,
                    provider_id,
                )
            else:
                bucket[canonical] = rec

        deduped = [rec for bucket in grouped.values() for rec in bucket.values()]
        if dup_count > 0:
            logger.info(f"Deduplicated {dup_count} api_key records (is_tools-only differences)")
        return deduped

    def create_target_table(self):
        """Create tenant_model_instance table if it does not exist."""
        # NOTE(review): api_key is VARCHAR(512) here while the tenant_llm model
        # in this script declares it as a text field — confirm that long
        # credentials fit.
        create_sql = """
            CREATE TABLE IF NOT EXISTS tenant_model_instance (
                id VARCHAR(32) NOT NULL PRIMARY KEY,
                instance_name VARCHAR(128) NOT NULL,
                provider_id VARCHAR(32) NOT NULL,
                api_key VARCHAR(512) NOT NULL,
                status VARCHAR(32) DEFAULT 'active',
                extra VARCHAR(512) DEFAULT '{}',
                create_time BIGINT,
                create_date DATETIME,
                update_time BIGINT,
                update_date DATETIME,
                INDEX idx_provider_id (provider_id)
            ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
        """
        self.db.execute_sql(create_sql)
        logger.info("Created tenant_model_instance table")
class TenantModelStage(MigrationStage):
    """Migrate tenant_llm to tenant_model.

    Each eligible ``tenant_llm`` row is joined in SQL to its
    ``tenant_model_provider`` row (same ``tenant_id``, ``provider_name`` equal to
    ``llm_factory``) and then matched in Python to a ``tenant_model_instance``
    row through the canonical api_key (api_key with ``is_tools`` stripped).
    Rows without a matching instance are skipped with a warning.
    """

    name = "tenant_model"
    description = "Migrate tenant_llm to tenant_model (status='0' records, plus status='1' for empty-llm factories)"
    source_tables = ["tenant_llm", "tenant_model_provider", "tenant_model_instance"]
    target_tables = ["tenant_model"]

    @staticmethod
    def _get_empty_llm_factories() -> list[str]:
        """Load factory names whose llm field is an empty list from conf/llm_factories.json.

        Returns:
            Names of every factory entry whose ``"llm"`` value is ``[]``.
            Entries without a usable ``"name"`` are ignored.
        """
        conf_path = os.path.join(PROJECT_BASE, "conf", "llm_factories.json")
        # JSON is UTF-8 by specification; do not rely on the platform default encoding.
        with open(conf_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        factories = []
        for items in data.values():
            if not isinstance(items, list):
                continue
            for item in items:
                if not isinstance(item, dict):
                    continue
                llm = item.get("llm")
                if not (isinstance(llm, list) and not llm):
                    continue
                factory_name = item.get("name")
                # An entry without a name cannot be matched against
                # tenant_llm.llm_factory, so skip it instead of raising KeyError.
                if isinstance(factory_name, str) and factory_name:
                    factories.append(factory_name)
        return factories

    def _build_status_condition(self) -> str:
        """Build SQL WHERE condition for status filtering.

        Returns:
            A SQL boolean expression over the ``tl`` (tenant_llm) alias selecting
            status='0' rows, plus status='1' rows whose factory has an empty llm list.
        """
        empty_factories = self._get_empty_llm_factories()
        if empty_factories:
            # The names are spliced into the statement as string literals, so
            # double any single quote to keep the literal well-formed.
            quoted = ", ".join("'" + factory.replace("'", "''") + "'" for factory in empty_factories)
            return f"(tl.status = '0' OR (tl.status = '1' AND tl.llm_factory IN ({quoted})))"
        return "tl.status = '0'"

    def current_timestamp(self) -> int:
        """Return the current Unix time in whole seconds."""
        return int(time.time())

    def generate_uuid(self) -> str:
        """Generate 32-character UUID1"""
        return uuid.uuid1().hex

    def check(self) -> bool:
        """Check if migration is needed.

        Returns:
            True when the stage should run (the target table must be created, or
            there are source rows to migrate); False when a source/dependency
            table is missing, when running dry without a target table, or when
            no source rows match.
        """
        # Check if source table exists
        if not self.db.table_exists("tenant_llm"):
            logger.warning("Source table 'tenant_llm' does not exist")
            return False

        # Check if tenant_model_provider exists (dependency)
        if not self.db.table_exists("tenant_model_provider"):
            if self.dry_run:
                logger.info("[DRY RUN] Dependency table 'tenant_model_provider' does not exist. Run 'tenant_model_provider' stage first or use --execute.")
                return False
            logger.warning("Dependency table 'tenant_model_provider' does not exist. Please run 'tenant_model_provider' stage first.")
            return False

        # Check if tenant_model_instance exists (dependency)
        if not self.db.table_exists("tenant_model_instance"):
            if self.dry_run:
                logger.info("[DRY RUN] Dependency table 'tenant_model_instance' does not exist. Run 'tenant_model_instance' stage first or use --execute.")
                return False
            logger.warning("Dependency table 'tenant_model_instance' does not exist. Please run 'tenant_model_instance' stage first.")
            return False

        # Check if target table exists
        if not self.db.table_exists("tenant_model"):
            if self.dry_run:
                logger.info("[DRY RUN] Target table 'tenant_model' does not exist. Use --execute to create and populate the table.")
                return False
            logger.info("Target table 'tenant_model' does not exist, will create")
            return True

        status_condition = self._build_status_condition()

        # Check if there's data to migrate
        # We cannot JOIN tenant_model_instance on api_key directly because the instance
        # stage deduped api_keys (stripping is_tools), so a plain SQL equality won't
        # match records whose api_key was merged. Count at the provider level instead.
        cursor = self.db.execute_sql(
            "SELECT COUNT(*) FROM ("
            "SELECT tl.id "
            "FROM tenant_llm tl "
            "INNER JOIN tenant_model_provider tmp ON tmp.tenant_id = tl.tenant_id AND tmp.provider_name = tl.llm_factory "
            f"WHERE {status_condition}"
            ") AS source_records"
        )
        count = cursor.fetchone()[0]
        if count == 0:
            # NOTE(review): inherited helper; presumably records that an empty
            # stage still counts as completed - confirm against MigrationStage.
            self.mark_noop_completes_migration()
            logger.info("No new data to migrate from tenant_llm to tenant_model")
            return False

        logger.info(f"Found {count} rows to migrate from tenant_llm to tenant_model")
        return True

    def execute(self) -> tuple[int, list]:
        """Execute migration.

        Returns:
            ``(row_count, tables)``: rows inserted (or rows that would be inserted
            in dry-run mode) and the target tables operated on. ``(0, [])`` when a
            dependency is missing or nothing needs migrating.
        """
        current_ts = self.current_timestamp()
        rows_inserted = 0

        # Check if tenant_model_provider exists (dependency)
        if not self.db.table_exists("tenant_model_provider"):
            logger.error("Dependency table 'tenant_model_provider' does not exist. Please run 'tenant_model_provider' stage first.")
            return 0, []

        # Check if tenant_model_instance exists (dependency)
        if not self.db.table_exists("tenant_model_instance"):
            logger.error("Dependency table 'tenant_model_instance' does not exist. Please run 'tenant_model_instance' stage first.")
            return 0, []

        # Check if target table exists
        if not self.db.table_exists("tenant_model"):
            if self.dry_run:
                logger.info("[DRY RUN] Target table 'tenant_model' does not exist. Use --execute to create and populate the table.")
                return 0, []
            logger.info("Target table 'tenant_model' does not exist, will create")
            self.create_target_table()

        # If create_table_only mode, skip data migration
        if self.create_table_only:
            logger.info("[CREATE TABLE ONLY] Target table created/verified, skipping data migration")
            return 0, self.target_tables

        status_condition = self._build_status_condition()

        # Load all tenant_model_instance records into memory for Python-level matching.
        # We cannot JOIN on api_key in SQL because the instance stage deduped api_keys
        # (stripping is_tools), so a plain SQL equality won't match records whose
        # api_key was merged during dedup.
        instance_lookup = self._build_instance_lookup()

        # Get records from tenant_llm with provider_id lookup (no instance JOIN)
        # Migrate status='0' records, plus status='1' for empty-llm factories.
        # NOT EXISTS keeps the stage re-runnable: rows already present in
        # tenant_model for the same provider/model name are not selected again.
        cursor = self.db.execute_sql(
            "SELECT tl.id, tl.llm_name, tmp.id as provider_id, "
            "tl.model_type, tl.status, tl.api_key "
            "FROM tenant_llm tl "
            "INNER JOIN tenant_model_provider tmp ON tmp.tenant_id = tl.tenant_id AND tmp.provider_name = tl.llm_factory "
            f"WHERE {status_condition} "
            "AND NOT EXISTS ("
            "SELECT 1 FROM tenant_model tm "
            "WHERE tm.provider_id = tmp.id AND tm.model_name = tl.llm_name"
            ")"
        )
        records = cursor.fetchall()

        if not records:
            logger.info("No records to migrate")
            return 0, []

        # Resolve instance_id for each record using Python-level canonical matching
        resolved_records = self._resolve_instance_ids(records, instance_lookup)

        if not resolved_records:
            logger.info("No records with matching instance_id to migrate")
            return 0, []

        logger.info(f"Migrating {len(resolved_records)} tenant_model records...")

        if self.dry_run:
            logger.info(f"[DRY RUN] Would insert {len(resolved_records)} records")
            for _source_id, llm_name, provider_id, instance_id, model_type, _status, _api_key in resolved_records[:5]:
                logger.info(f"  model_name={llm_name}, provider_id={provider_id}, instance_id={instance_id}, model_type={model_type}")
            if len(resolved_records) > 5:
                logger.info(f"  ... and {len(resolved_records) - 5} more records")
            return len(resolved_records), self.target_tables

        # Insert records in batches. Values travel as driver parameters rather than
        # being spliced into the statement, so quotes or backslashes in model
        # names cannot break (or inject into) the INSERT.
        batch_size = 100
        row_placeholder = "(%s, %s, %s, %s, %s, %s, %s, %s, FROM_UNIXTIME(%s), %s, FROM_UNIXTIME(%s))"
        current_ts_ms = current_ts * 1000  # *_time columns hold milliseconds, *_date are derived from seconds

        for i in range(0, len(resolved_records), batch_size):
            batch = resolved_records[i : i + batch_size]
            params = []

            for _source_id, llm_name, provider_id, instance_id, model_type, status, api_key in batch:
                status_val = "active" if status in ["1", "active", "enable"] else "inactive"
                # Extract is_tools from api_key JSON and put it in extra
                extra = self._extract_extra_from_api_key(api_key) or "{}"
                params.extend(
                    (
                        self.generate_uuid(),
                        llm_name or "",
                        provider_id,
                        instance_id,
                        model_type or "",
                        status_val,
                        extra,
                        current_ts_ms,
                        current_ts,
                        current_ts_ms,
                        current_ts,
                    )
                )

            insert_sql = (
                "INSERT INTO tenant_model "
                "(id, model_name, provider_id, instance_id, model_type, status, extra, "
                "create_time, create_date, update_time, update_date) "
                "VALUES " + ", ".join([row_placeholder] * len(batch))
            )
            self.db.execute_sql(insert_sql, tuple(params))
            rows_inserted += len(batch)
            logger.info(f"  Inserted batch {i // batch_size + 1}: {len(batch)} records")

        return rows_inserted, self.target_tables

    def _build_instance_lookup(self) -> dict:
        """Load all tenant_model_instance records, indexed by (provider_id, canonical_api_key).

        The canonical_api_key is computed by stripping is_tools from the stored api_key,
        matching the dedup logic used during the instance migration stage.

        Returns:
            dict mapping (provider_id, canonical_api_key) -> instance_id
        """
        cursor = self.db.execute_sql("SELECT id, provider_id, api_key FROM tenant_model_instance")
        lookup = {}
        for instance_id, provider_id, api_key in cursor.fetchall():
            canonical = TenantModelInstanceStage._strip_is_tools_from_api_key(api_key)
            lookup[(provider_id, canonical)] = instance_id
        logger.info(f"Loaded {len(lookup)} instance records for lookup")
        return lookup

    @staticmethod
    def _resolve_instance_ids(records: list, instance_lookup: dict) -> list:
        """Resolve instance_id for each tenant_llm record using canonical api_key matching.

        Args:
            records: list of tuples (source_id, llm_name, provider_id, model_type, status, api_key)
            instance_lookup: dict mapping (provider_id, canonical_api_key) -> instance_id

        Returns:
            list of tuples (source_id, llm_name, provider_id, instance_id, model_type, status, api_key)
            Only records with a matching instance_id are included.
        """
        resolved = []
        skipped = 0
        for source_id, llm_name, provider_id, model_type, status, api_key in records:
            canonical = TenantModelInstanceStage._strip_is_tools_from_api_key(api_key)
            instance_id = instance_lookup.get((provider_id, canonical))
            if instance_id:
                resolved.append((source_id, llm_name, provider_id, instance_id, model_type, status, api_key))
            else:
                skipped += 1
                # Don't include the API key (even truncated) in the log:
                # CodeQL flags this as clear-text-logging-sensitive-data,
                # and the first 30 chars of an API key often carry enough
                # entropy to be useful to an attacker who reads the log.
                logger.warning(
                    "No matching instance for tenant_llm id=%s provider_id=%s llm_name=%s",
                    source_id,
                    provider_id,
                    llm_name,
                )
        if skipped > 0:
            logger.warning(f"Skipped {skipped} records with no matching instance_id")
        return resolved

    @staticmethod
    def _extract_extra_from_api_key(api_key: str) -> str:
        """Extract is_tools from api_key JSON and return an extra JSON string for tenant_model.

        If api_key is a JSON dict containing "is_tools": true, return '{"is_tools": true}'.
        Otherwise return '{}' (empty dict).
        """
        if not api_key:
            return "{}"
        try:
            parsed = json.loads(api_key)
        except (json.JSONDecodeError, TypeError, ValueError):
            # Plain (non-JSON) api keys carry no extra flags.
            return "{}"
        if not isinstance(parsed, dict):
            return "{}"
        if parsed.get("is_tools") is True:
            return json.dumps({"is_tools": True})
        return "{}"

    def create_target_table(self):
        """Create tenant_model table (no-op if it already exists)."""
        create_sql = """
            CREATE TABLE IF NOT EXISTS tenant_model (
                id VARCHAR(32) NOT NULL PRIMARY KEY,
                model_name VARCHAR(128),
                provider_id VARCHAR(32) NOT NULL,
                instance_id VARCHAR(32) NOT NULL,
                model_type VARCHAR(32) NOT NULL,
                status VARCHAR(32) DEFAULT 'active',
                extra VARCHAR(1024) DEFAULT '{}',
                create_time BIGINT,
                create_date DATETIME,
                update_time BIGINT,
                update_date DATETIME,
                INDEX idx_instance_id (instance_id)
            ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
        """
        self.db.execute_sql(create_sql)
        logger.info("Created tenant_model table")
2026-06-02 13:08:58 +08:00
class ModelIdConfigStage(MigrationStage):
    """Normalize stored model IDs from model@provider to model@default@provider.

    Two kinds of columns are rewritten: plain string columns that hold a single
    model ID (``string_columns``) and JSON columns whose nested keys may hold
    model IDs (``json_columns``).
    """

    name = "model_id_config"
    description = "Normalize stored model IDs in config columns to model@default@provider"
    source_tables = [
        "tenant",
        "knowledgebase",
        "document",
        "dialog",
        "memory",
        "search",
        "user_canvas",
        "canvas_template",
        "user_canvas_version",
        "api_4_conversation",
        "pipeline_operation_log",
        "connector",
        "evaluation_runs",
    ]
    target_tables = source_tables

    # JSON keys whose string value is treated as a model ID wherever they occur.
    model_id_fields = {
        "llm_id",
        "embd_id",
        "embedding_model",
        "rerank_id",
        "asr_id",
        "img2txt_id",
        "tts_id",
        "ocr_id",
    }
    # JSON keys treated as a model ID only somewhere below a "search_config" path element.
    search_config_model_id_fields = {"chat_id"}
    # Number of rows pulled from the cursor per fetchmany() call.
    scan_batch_size = 500

    # table -> columns that directly store one model ID string
    string_columns = {
        "tenant": ("llm_id", "embd_id", "asr_id", "img2txt_id", "rerank_id", "tts_id", "ocr_id"),
        "knowledgebase": ("embd_id",),
        "dialog": ("llm_id", "rerank_id"),
        "memory": ("embd_id", "llm_id"),
    }
    # table -> columns that store JSON documents to be walked recursively
    json_columns = {
        "knowledgebase": ("parser_config",),
        "document": ("parser_config",),
        "search": ("search_config",),
        "user_canvas": ("dsl",),
        "canvas_template": ("dsl",),
        "user_canvas_version": ("dsl",),
        "api_4_conversation": ("dsl",),
        "pipeline_operation_log": ("dsl",),
        "connector": ("config",),
        "evaluation_runs": ("config_snapshot",),
    }

    def normalize_model_id(self, value):
        """Rewrite ``model@provider`` as ``model@default@provider``.

        Returns:
            ``(new_value, changed)``. Anything that is not a non-empty string made
            of exactly two non-empty ``@``-separated parts is returned untouched
            with ``changed`` False (so already-normalized IDs are left alone).
        """
        if not (isinstance(value, str) and value):
            return value, False
        pieces = value.split("@")
        if len(pieces) != 2 or not all(pieces):
            return value, False
        model_part, provider_part = pieces
        return f"{model_part}@default@{provider_part}", True

    def normalize_config(self, value, path=None):
        """Recursively normalize model IDs inside a decoded JSON value.

        Args:
            value: dict, list or scalar taken from a JSON document.
            path: tuple of keys/indices leading to ``value``; used to detect
                whether we are below a ``search_config`` element.

        Returns:
            ``(normalized_copy, changed)``; scalars are returned as-is.
        """
        trail = path or ()

        if isinstance(value, dict):
            below_search_config = "search_config" in trail
            rebuilt = {}
            dirty = False
            for key, item in value.items():
                is_model_id_key = key in self.model_id_fields or (
                    below_search_config and key in self.search_config_model_id_fields
                )
                if is_model_id_key:
                    new_item, touched = self.normalize_model_id(item)
                else:
                    new_item, touched = self.normalize_config(item, trail + (str(key),))
                rebuilt[key] = new_item
                dirty = dirty or touched
            return rebuilt, dirty

        if isinstance(value, list):
            results = [
                self.normalize_config(item, trail + (str(position),))
                for position, item in enumerate(value)
            ]
            return [new_item for new_item, _ in results], any(touched for _, touched in results)

        return value, False

    def existing_columns(self, table_columns):
        """Yield ``(table, column)`` pairs from ``table_columns`` that exist in the database.

        Missing tables and missing columns are logged and skipped.
        """
        for table_name, columns in table_columns.items():
            if self.db.table_exists(table_name):
                for column_name in columns:
                    if self.db.column_exists(table_name, column_name):
                        yield table_name, column_name
                    else:
                        logger.info("Column '%s.%s' does not exist, skipping", table_name, column_name)
            else:
                logger.info("Table '%s' does not exist, skipping", table_name)

    def load_json_value(self, raw_value, table_name, column_name, row_id):
        """Decode a stored JSON column value.

        Returns:
            ``(value, loaded)``. ``loaded`` is False for NULL/empty values and for
            values that fail to parse (the latter is logged with its location).
            Values that are already a dict or list are passed through.
        """
        if isinstance(raw_value, (dict, list)):
            return raw_value, True
        if raw_value in (None, ""):
            return None, False
        try:
            decoded = json.loads(raw_value)
        except (TypeError, json.JSONDecodeError):
            logger.warning(
                "Failed to parse JSON in %s.%s id=%s, skipping",
                table_name,
                column_name,
                row_id,
            )
            return None, False
        return decoded, True

    def _scan_rows(self, table_name, column_name):
        """Yield ``(id, value)`` rows whose column is non-empty and contains an ``@``.

        Rows are pulled from the cursor ``scan_batch_size`` at a time.
        """
        cursor = self.db.execute_sql(
            f"SELECT id, `{column_name}` FROM `{table_name}` WHERE `{column_name}` IS NOT NULL AND `{column_name}` != '' AND `{column_name}` LIKE %s",
            ("%@%",),
        )
        while batch := cursor.fetchmany(self.scan_batch_size):
            yield from batch

    def _write_value(self, table_name, column_name, row_id, new_value):
        """Store ``new_value`` in ``table_name.column_name`` for the row with ``row_id``."""
        self.db.execute_sql(
            f"UPDATE `{table_name}` SET `{column_name}` = %s WHERE id = %s",
            (new_value, row_id),
        )

    def iter_string_changes(self):
        """Yield ``(table, column, row_id, new_value)`` for string columns needing normalization."""
        for table_name, column_name in self.existing_columns(self.string_columns):
            for row_id, value in self._scan_rows(table_name, column_name):
                normalized, changed = self.normalize_model_id(value)
                if changed:
                    yield table_name, column_name, row_id, normalized

    def iter_json_changes(self):
        """Yield ``(table, column, row_id, new_json)`` for JSON columns needing normalization.

        ``new_json`` is the re-serialized document (compact separators, non-ASCII kept).
        """
        for table_name, column_name in self.existing_columns(self.json_columns):
            for row_id, raw_value in self._scan_rows(table_name, column_name):
                config, loaded = self.load_json_value(raw_value, table_name, column_name, row_id)
                if not loaded:
                    continue
                normalized, changed = self.normalize_config(config, (column_name,))
                if not changed:
                    continue
                yield (
                    table_name,
                    column_name,
                    row_id,
                    json.dumps(normalized, ensure_ascii=False, separators=(",", ":")),
                )

    def count_changes(self) -> tuple[int, set]:
        """Count pending changes.

        Returns:
            ``(rows, tables)``: number of column values that would change and the
            set of tables they belong to.
        """
        rows = 0
        tables = set()
        for produce_changes in (self.iter_string_changes, self.iter_json_changes):
            for table_name, *_ in produce_changes():
                rows += 1
                tables.add(table_name)
        return rows, tables

    def check(self) -> bool:
        """Return True when at least one stored model ID needs normalization."""
        rows, tables = self.count_changes()
        if rows:
            logger.info(
                "Found %s rows to normalize across tables: %s",
                rows,
                ", ".join(sorted(tables)),
            )
            return True
        self.mark_noop_completes_migration()
        logger.info("No stored model IDs need normalization")
        return False

    def execute(self) -> tuple[int, list]:
        """Apply (or, in dry-run mode, only report) every pending normalization.

        Returns:
            ``(rows_updated, tables)`` with ``tables`` sorted by name; ``(0, [])``
            in create-table-only mode, since this stage creates no tables.
        """
        if self.create_table_only:
            logger.info("[CREATE TABLE ONLY] No tables are created for this data migration")
            return 0, []

        preview_limit = 10  # only the first few changes are logged individually
        action = "[DRY RUN] Would update" if self.dry_run else "Updating"
        touched_tables = set()
        total = 0

        for table_name, column_name, row_id, new_value in self.iter_string_changes():
            touched_tables.add(table_name)
            total += 1
            if total <= preview_limit:
                logger.info(
                    "%s %s.%s id=%s -> %s",
                    action,
                    table_name,
                    column_name,
                    row_id,
                    new_value,
                )
            if not self.dry_run:
                self._write_value(table_name, column_name, row_id, new_value)

        for table_name, column_name, row_id, new_json in self.iter_json_changes():
            touched_tables.add(table_name)
            total += 1
            if total <= preview_limit:
                # The JSON payload itself is not logged, only its location.
                logger.info(
                    "%s %s.%s id=%s",
                    action,
                    table_name,
                    column_name,
                    row_id,
                )
            if not self.dry_run:
                self._write_value(table_name, column_name, row_id, new_json)

        if total > preview_limit:
            logger.info("... and %s more row updates", total - preview_limit)

        summary = "[DRY RUN] Would update %s rows" if self.dry_run else "Updated %s rows"
        logger.info(summary, total)
        return total, sorted(touched_tables)
2026-04-03 20:01:37 +08:00
# Registry of available migration stages.
# Maps each stage name to the MigrationStage subclass implementing it;
# list_available_stages() iterates this dict to print every stage.
# The entries are listed with the provider and instance stages before
# tenant_model, which refuses to run until their tables exist.
# NOTE(review): whether the run order is derived from this dict's order is
# not visible here - confirm against the caller that selects stages.
MIGRATION_STAGES = {
    "tenant_model_provider": TenantModelProviderStage,
    "tenant_model_instance": TenantModelInstanceStage,
    "tenant_model": TenantModelStage,
    "model_id_config": ModelIdConfigStage,
}
def list_available_stages():
    """Log each registered migration stage with its description and tables.

    For every entry in ``MIGRATION_STAGES`` this emits three INFO lines: the
    stage name with the class's ``description``, then its ``source_tables``
    and its ``target_tables``.
    """
    logger.info(" Available migration stages: ")
    for stage_name in MIGRATION_STAGES:
        registered_cls = MIGRATION_STAGES[stage_name]
        logger.info(f" - {stage_name} : {registered_cls.description} ")
        logger.info(f" Source tables: {registered_cls.source_tables} ")
        logger.info(f" Target tables: {registered_cls.target_tables} ")
2026-06-03 11:51:42 +08:00
def run_migration(
    config: MigrationConfig,
    stages: list,
    dry_run: bool = True,
    create_table_only: bool = False,
    database_version: str | None = None,
    mark_database_version_on_success: bool = False,
):
    """Run the requested migration stages in order on one database connection.

    Args:
        config: Connection settings handed to ``MigrationDatabase``.
        stages: Stage names to run, in order; each is looked up in
            ``MIGRATION_STAGES``.
        dry_run: Passed through to every stage.
        create_table_only: Passed through to every stage. When set, the
            stage's ``check()`` is skipped and ``execute()`` is always called.
        database_version: Optional target version. When given and
            ``should_skip_migration`` reports that the stored version already
            satisfies it, no stage is run.
        mark_database_version_on_success: When set, ``database_version`` is
            written back after the loop, but only for a real run (neither
            dry-run nor create-table-only) in which every requested stage
            name was known.

    An unknown stage name is logged as an error and recorded with zero rows;
    the remaining stages still run. The connection is always closed.
    """
    stats = MigrationStats()
    stats.start()

    db = MigrationDatabase(config)

    try:
        db.connect()

        if database_version:
            current_db_version = db.get_database_version()
            if should_skip_migration(current_db_version, database_version):
                logger.info(
                    " Database migration version is %s , target version is %s , skipping all stages ",
                    current_db_version,
                    database_version,
                )
                # Early exit: the ``finally`` below still closes the
                # connection, but no stats summary is printed on this path.
                return
            if current_db_version:
                logger.info(
                    " Current database migration version is %s , target version is %s ",
                    current_db_version,
                    database_version,
                )
            else:
                logger.info(
                    " Database migration version marker is not set, target version is %s ",
                    database_version,
                )

        total_stages = len(stages)
        all_stages_completed = True
        banner = f" {' = ' * 60} "

        for idx, stage_name in enumerate(stages, 1):
            logger.info(banner)
            logger.info(f" Stage [ {idx} / {total_stages} ]: {stage_name} ")
            logger.info(banner)

            stage_cls = MIGRATION_STAGES.get(stage_name)
            if stage_cls is None:
                logger.error(f" Unknown stage: {stage_name} ")
                stats.add_stage_stats(stage_name, [], 0, 0)
                # A requested stage that never ran must block version marking.
                all_stages_completed = False
                continue

            stage = stage_cls(db, dry_run=dry_run, create_table_only=create_table_only)
            stage_start = time.time()

            if create_table_only:
                # Create-table-only mode skips the check and always executes.
                logger.info(" [CREATE TABLE ONLY] Skipping check, will create/verify target table ")
            elif not stage.check():
                logger.info(f" Stage ' {stage_name} ' check: no migration needed ")
                stats.add_stage_stats(stage_name, [], 0, time.time() - stage_start)
                continue

            rows, tables = stage.execute()
            stage_duration = time.time() - stage_start

            stats.add_stage_stats(stage_name, tables, rows, stage_duration)
            # The format spec must be exactly ".2f": surrounding whitespace in
            # the spec makes float formatting raise ValueError.
            logger.info(f" Stage ' {stage_name} ' completed: {rows} rows in {stage_duration:.2f} s ")

        should_mark_version = (
            mark_database_version_on_success
            and not dry_run
            and not create_table_only
            and database_version
            and all_stages_completed
        )
        if should_mark_version:
            db.set_database_version(database_version)
            logger.info(" Marked database migration version as %s ", database_version)
    finally:
        db.close()

    stats.end()
    stats.print_summary()
2026-06-03 11:51:42 +08:00
def check_database_version(config: MigrationConfig, target_version: str) -> int:
    """Report whether the database still needs migrating to ``target_version``.

    Connects, reads the stored migration version, and logs how it compares
    with the target. The connection is closed before returning.

    Returns:
        0 when ``should_skip_migration`` reports that no migration is needed,
        1 otherwise. ``main()`` uses the value as the process exit code.
    """
    database = MigrationDatabase(config)
    try:
        database.connect()
        stored_version = database.get_database_version()

        if should_skip_migration(stored_version, target_version):
            logger.info(
                " Database migration version is %s , target version is %s , migration is not needed ",
                stored_version,
                target_version,
            )
            return 0

        # Migration is needed; the message depends on whether a version
        # marker was stored at all.
        if not stored_version:
            logger.info(
                " Database migration version marker is not set, target version is %s , migration is needed ",
                target_version,
            )
        else:
            logger.info(
                " Database migration version is %s , target version is %s , migration is needed ",
                stored_version,
                target_version,
            )
        return 1
    finally:
        database.close()
def mark_database_version(config: MigrationConfig, version: str) -> None:
    """Store ``version`` as the database migration version marker.

    Opens its own connection for the write; the connection is closed even if
    connecting or writing raises.
    """
    database = MigrationDatabase(config)
    try:
        database.connect()
        database.set_database_version(version)
        logger.info(" Marked database migration version as %s ", version)
    finally:
        database.close()
2026-04-03 20:01:37 +08:00
# Connection settings used when an option is given neither on the command
# line nor through a config file.
_DEFAULT_CONNECTION = {
    "host": "localhost",
    "port": 3306,
    "user": "root",
    "password": "",
    "database": "rag_flow",
}

_CLI_EXAMPLES = """
Examples:
  # List available stages
  python mysql_migration.py --list-stages

  # Check whether migration is needed for a target version
  python mysql_migration.py --check-database-version --database-version v0.26.3 --config /path/to/config.yaml

  # Mark database version separately
  python mysql_migration.py --mark-database-version --database-version v0.26.3 --config /path/to/config.yaml

  # Dry run (default - check only, no write) with config file
  python mysql_migration.py --stages tenant_model_provider --config /path/to/config.yaml

  # Dry run with command line MySQL connection
  python mysql_migration.py --stages tenant_model_provider --host localhost --port 3306 --user root --password secret

  # Create target tables only (no data migration)
  python mysql_migration.py --stages tenant_model_provider --config /path/to/config.yaml --create-table-only

  # Execute full migration (create tables and migrate data)
  python mysql_migration.py --stages tenant_model_provider --config /path/to/config.yaml --execute

  # Execute migration only when database version is lower than v0.26.3
  python mysql_migration.py --stages tenant_model_provider --config /path/to/config.yaml --execute --database-version v0.26.3

  # Execute migration and mark the database version when all stages succeed
  python mysql_migration.py --stages tenant_model_provider,tenant_model_instance,tenant_model,model_id_config --config /path/to/config.yaml --execute --database-version v0.26.3 --mark-database-version-on-success

  # Normalize legacy model IDs in stored configs
  python mysql_migration.py --stages model_id_config --config /path/to/config.yaml --execute

  # Run multiple stages
  python mysql_migration.py --stages stage1,stage2,stage3 --config /path/to/config.yaml --execute
"""


def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the migration tool.

    The connection options default to ``None`` rather than to their built-in
    values, so the caller can tell "not given" apart from "explicitly set to
    the default value".
    """
    parser = argparse.ArgumentParser(
        description=" MySQL Data Migration Tool ",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=_CLI_EXAMPLES,
    )

    # MySQL connection options
    parser.add_argument("--host", type=str, default=None, help=" MySQL host (default: localhost) ")
    parser.add_argument("--port", type=int, default=None, help=" MySQL port (default: 3306) ")
    parser.add_argument("--user", type=str, default=None, help=" MySQL user (default: root) ")
    parser.add_argument("--password", type=str, default=None, help=" MySQL password (default: empty) ")
    parser.add_argument("--database", type=str, default=None, help=" MySQL database name (default: rag_flow) ")

    # Configuration options
    parser.add_argument("--config", "-c", type=str, help=" Path to YAML config file ")

    # Migration options
    parser.add_argument("--stages", "-s", type=str, help=" Comma-separated list of stages to run ")
    parser.add_argument("--list-stages", "-l", action="store_true", help=" List available stages ")
    parser.add_argument("--check-database-version", action="store_true", help=" Check whether migration is needed for the target database version ")
    parser.add_argument("--mark-database-version", action="store_true", help=" Write the database migration version marker and exit ")
    parser.add_argument("--database-version", type=str, metavar="VERSION", help=" Database migration version used by check/mark commands and as the migration threshold for --stages ")
    parser.add_argument("--mark-database-version-on-success", action="store_true", help=" When used with --stages and --execute, write --database-version after all stages succeed ")
    parser.add_argument("--execute", "-e", action="store_true", help=" Execute full migration: create tables and migrate data ")
    parser.add_argument("--create-table-only", action="store_true", help=" Only create target tables, skip data migration ")
    return parser


def _resolve_config(args: argparse.Namespace) -> MigrationConfig:
    """Build the connection config; command-line options take precedence.

    With ``--config`` the file is loaded first and every connection option
    that was actually given on the command line overwrites the file's value.
    Without it, options that were not given fall back to
    ``_DEFAULT_CONNECTION``.
    """
    cli_overrides = {
        field: getattr(args, field)
        for field in _DEFAULT_CONNECTION
        if getattr(args, field) is not None
    }

    if args.config:
        config = MigrationConfig.from_config_file(args.config)
        for field, value in cli_overrides.items():
            setattr(config, field, value)
        return config

    return MigrationConfig(**{**_DEFAULT_CONNECTION, **cli_overrides})


def main():
    """Command-line entry point.

    Handles, in order: ``--list-stages``; loading the connection config;
    the stand-alone ``--check-database-version`` and
    ``--mark-database-version`` commands; and finally running ``--stages``
    in dry-run (default), ``--create-table-only`` or ``--execute`` mode.

    Invalid flag combinations are logged and exit with status 1.
    ``--check-database-version`` exits with the status returned by
    ``check_database_version``.
    """
    args = _build_arg_parser().parse_args()

    # List stages and exit
    if args.list_stages:
        list_available_stages()
        return

    config = _resolve_config(args)
    logger.info(f" MySQL Configuration: host= {config.host} , port= {config.port} , user= {config.user} , database= {config.database} ")

    if args.check_database_version and args.mark_database_version:
        logger.error(" --check-database-version and --mark-database-version are mutually exclusive ")
        sys.exit(1)

    if args.check_database_version:
        if not args.database_version:
            logger.error(" --check-database-version requires --database-version ")
            sys.exit(1)
        sys.exit(check_database_version(config, args.database_version))

    if args.mark_database_version:
        if not args.database_version:
            logger.error(" --mark-database-version requires --database-version ")
            sys.exit(1)
        mark_database_version(config, args.database_version)
        return

    if args.mark_database_version_on_success and not args.database_version:
        logger.error(" --mark-database-version-on-success requires --database-version ")
        sys.exit(1)

    # Three mutually exclusive modes: dry-run (default), create-table-only, execute
    if args.execute and args.create_table_only:
        logger.error(" --execute and --create-table-only are mutually exclusive ")
        sys.exit(1)

    if not args.stages:
        logger.error(" No stages specified. Use --stages to specify stages or --list-stages to see available stages. ")
        sys.exit(1)

    # Split on the bare comma so "a,b" and "a, b" both work; whitespace
    # around each name is stripped.
    stages = [name.strip() for name in args.stages.split(",")]

    dry_run = True
    create_table_only = False

    if args.create_table_only:
        logger.info(" Running in CREATE TABLE ONLY mode (create tables, no data migration) ")
        dry_run = False
        create_table_only = True
    elif args.execute:
        logger.info(" Running in EXECUTE mode (create tables and migrate data) ")
        dry_run = False
    else:
        logger.info(" Running in DRY-RUN mode (check only, no write). Use --create-table-only to create tables, or --execute for full migration. ")

    run_migration(
        config=config,
        stages=stages,
        dry_run=dry_run,
        create_table_only=create_table_only,
        database_version=args.database_version,
        mark_database_version_on_success=args.mark_database_version_on_success,
    )
2026-07-03 12:53:39 +08:00
# Run the CLI only when executed as a script. The comparison must be against
# the exact string "__main__"; any padding makes it never match.
if __name__ == "__main__":
    main()