MySQL集群配置与高可用架构实战
Orion K Lv6

MySQL集群配置与高可用架构实战

在现代企业级应用中,MySQL集群和高可用架构是保障业务连续性的关键技术。本文将详细介绍MySQL集群的配置、管理和优化实践。

集群架构设计

主从复制架构

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
-- 主库配置
-- 1. 修改主库配置文件 /etc/mysql/mysql.conf.d/mysqld.cnf
[mysqld]
server-id = 1
log-bin = mysql-bin
binlog-format = ROW
binlog-do-db = production_db
binlog-ignore-db = mysql,information_schema,performance_schema

-- 2. 创建复制用户
CREATE USER 'repl'@'%' IDENTIFIED BY 'repl123';
GRANT REPLICATION SLAVE ON *.* TO 'repl'@'%';
FLUSH PRIVILEGES;

-- 3. 查看主库状态
SHOW MASTER STATUS;
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
-- 从库配置
-- 1. 修改从库配置文件
[mysqld]
server-id = 2
relay-log = mysql-relay-bin
read-only = 1
log-slave-updates = 1

-- 2. 配置主从复制
CHANGE MASTER TO
MASTER_HOST='192.168.1.100',
MASTER_USER='repl',
MASTER_PASSWORD='repl123',
MASTER_LOG_FILE='mysql-bin.000001',
MASTER_LOG_POS=154;

-- 3. 启动从库复制
START SLAVE;

-- 4. 检查复制状态
SHOW SLAVE STATUS\G

复制状态监控

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
-- 创建复制监控存储过程
DELIMITER //
CREATE PROCEDURE CheckReplicationStatus()
BEGIN
DECLARE slave_io_running VARCHAR(10);
DECLARE slave_sql_running VARCHAR(10);
DECLARE seconds_behind_master INT;
DECLARE last_error TEXT;

-- 获取复制状态
SELECT
Slave_IO_Running,
Slave_SQL_Running,
Seconds_Behind_Master,
Last_Error
INTO
slave_io_running,
slave_sql_running,
seconds_behind_master,
last_error
FROM information_schema.REPLICA_HOST_STATUS
WHERE Channel_Name = '';

-- 输出状态信息
SELECT
slave_io_running AS 'IO线程状态',
slave_sql_running AS 'SQL线程状态',
seconds_behind_master AS '延迟秒数',
CASE
WHEN slave_io_running = 'Yes' AND slave_sql_running = 'Yes' THEN '正常'
ELSE '异常'
END AS '复制状态',
last_error AS '最后错误';

-- 记录监控日志
INSERT INTO replication_monitor_log (
check_time,
io_running,
sql_running,
lag_seconds,
status,
error_message
) VALUES (
NOW(),
slave_io_running,
slave_sql_running,
seconds_behind_master,
CASE
WHEN slave_io_running = 'Yes' AND slave_sql_running = 'Yes' THEN 'OK'
ELSE 'ERROR'
END,
last_error
);
END //
DELIMITER ;

-- 创建监控日志表
CREATE TABLE replication_monitor_log (
id INT AUTO_INCREMENT PRIMARY KEY,
check_time DATETIME,
io_running VARCHAR(10),
sql_running VARCHAR(10),
lag_seconds INT,
status VARCHAR(10),
error_message TEXT,
INDEX idx_check_time (check_time)
);

读写分离配置

ProxySQL配置

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
-- ProxySQL配置脚本
-- 1. 添加MySQL服务器
INSERT INTO mysql_servers(hostgroup_id, hostname, port, weight) VALUES
(0, '192.168.1.100', 3306, 1000), -- 主库(写)
(1, '192.168.1.101', 3306, 900), -- 从库1(读)
(1, '192.168.1.102', 3306, 900); -- 从库2(读)

-- 2. 配置用户
INSERT INTO mysql_users(username, password, default_hostgroup) VALUES
('app_user', 'app_pass', 0);

-- 3. 配置查询路由规则
INSERT INTO mysql_query_rules(rule_id, active, match_pattern, destination_hostgroup, apply) VALUES
(1, 1, '^SELECT.*', 1, 1), -- 读查询路由到从库
(2, 1, '^INSERT.*', 0, 1), -- 写查询路由到主库
(3, 1, '^UPDATE.*', 0, 1),
(4, 1, '^DELETE.*', 0, 1);

-- 4. 加载配置
LOAD MYSQL SERVERS TO RUNTIME;
LOAD MYSQL USERS TO RUNTIME;
LOAD MYSQL QUERY RULES TO RUNTIME;

-- 5. 保存配置
SAVE MYSQL SERVERS TO DISK;
SAVE MYSQL USERS TO DISK;
SAVE MYSQL QUERY RULES TO DISK;

读写分离监控

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
-- 创建读写分离性能监控视图
CREATE VIEW rw_split_stats AS
SELECT
hostgroup,
srv_host,
srv_port,
status,
ConnUsed,
ConnFree,
ConnOK,
ConnERR,
Queries,
Bytes_data_sent,
Bytes_data_recv
FROM stats_mysql_connection_pool
ORDER BY hostgroup, srv_host;

-- 查询路由统计
SELECT
rule_id,
hits,
destination_hostgroup,
match_pattern
FROM stats_mysql_query_rules
ORDER BY hits DESC;

-- 连接池状态监控
SELECT
hostgroup,
srv_host,
status,
ConnUsed,
ConnFree,
ConnOK + ConnERR as total_connections,
ROUND(ConnOK / (ConnOK + ConnERR) * 100, 2) as success_rate
FROM stats_mysql_connection_pool;

高可用架构

MHA配置

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
# MHA Manager配置文件 /etc/mha/app1.cnf
[server default]
manager_log=/var/log/mha/app1/manager.log
manager_workdir=/var/log/mha/app1
master_binlog_dir=/var/lib/mysql
user=mha
password=mha123
ping_interval=3
repl_user=repl
repl_password=repl123
ssh_user=root

[server1]
hostname=192.168.1.100
port=3306
candidate_master=1

[server2]
hostname=192.168.1.101
port=3306
candidate_master=1
check_repl_delay=0

[server3]
hostname=192.168.1.102
port=3306

Keepalived高可用

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# Keepalived配置文件 /etc/keepalived/keepalived.conf
# 主库配置
vrrp_script chk_mysql {
script "/etc/keepalived/check_mysql.sh"
interval 3
weight -2
fall 3
rise 2
}

vrrp_instance VI_1 {
state MASTER
interface eth0
virtual_router_id 51
priority 100
advert_int 1
authentication {
auth_type PASS
auth_pass mysql123
}
virtual_ipaddress {
192.168.1.200
}
track_script {
chk_mysql
}
}

# MySQL健康检查脚本 /etc/keepalived/check_mysql.sh
#!/bin/bash
mysql -uroot -proot123 -e "SELECT 1" > /dev/null 2>&1
if [ $? -eq 0 ]; then
exit 0
else
exit 1
fi

自动故障切换

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
-- 创建故障切换日志表
CREATE TABLE failover_log (
id INT AUTO_INCREMENT PRIMARY KEY,
event_time DATETIME,
event_type ENUM('FAILOVER_START', 'FAILOVER_END', 'MASTER_CHANGE'),
old_master VARCHAR(100),
new_master VARCHAR(100),
description TEXT,
INDEX idx_event_time (event_time)
);

-- 故障切换触发器
DELIMITER //
CREATE TRIGGER log_master_change
AFTER INSERT ON mysql.slave_master_info
FOR EACH ROW
BEGIN
INSERT INTO failover_log (
event_time,
event_type,
new_master,
description
) VALUES (
NOW(),
'MASTER_CHANGE',
NEW.Host,
CONCAT('Master changed to ', NEW.Host, ':', NEW.Port)
);
END //
DELIMITER ;

集群管理工具

管理仪表板

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
-- 创建集群状态仪表板视图
CREATE VIEW cluster_dashboard AS
SELECT
'Cluster Overview' as section,
(SELECT COUNT(*) FROM information_schema.PROCESSLIST WHERE COMMAND != 'Sleep') as active_connections,
(SELECT COUNT(*) FROM mysql_servers WHERE status = 'ONLINE') as online_servers,
(SELECT AVG(Seconds_Behind_Master) FROM information_schema.REPLICA_HOST_STATUS) as avg_replication_lag,
(SELECT COUNT(*) FROM replication_monitor_log WHERE status = 'ERROR' AND check_time > DATE_SUB(NOW(), INTERVAL 1 HOUR)) as recent_errors;

-- 服务器性能统计
CREATE VIEW server_performance AS
SELECT
@@hostname as server_name,
@@server_id as server_id,
(SELECT VARIABLE_VALUE FROM information_schema.GLOBAL_STATUS WHERE VARIABLE_NAME = 'Uptime') as uptime_seconds,
(SELECT VARIABLE_VALUE FROM information_schema.GLOBAL_STATUS WHERE VARIABLE_NAME = 'Queries') as total_queries,
(SELECT VARIABLE_VALUE FROM information_schema.GLOBAL_STATUS WHERE VARIABLE_NAME = 'Threads_connected') as current_connections,
(SELECT VARIABLE_VALUE FROM information_schema.GLOBAL_STATUS WHERE VARIABLE_NAME = 'Innodb_buffer_pool_read_requests') as buffer_pool_reads,
(SELECT VARIABLE_VALUE FROM information_schema.GLOBAL_STATUS WHERE VARIABLE_NAME = 'Innodb_buffer_pool_reads') as disk_reads,
ROUND(
(SELECT VARIABLE_VALUE FROM information_schema.GLOBAL_STATUS WHERE VARIABLE_NAME = 'Innodb_buffer_pool_read_requests') /
(SELECT VARIABLE_VALUE FROM information_schema.GLOBAL_STATUS WHERE VARIABLE_NAME = 'Innodb_buffer_pool_reads') * 100, 2
) as buffer_pool_hit_rate;

集群监控系统

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
-- 创建综合监控存储过程
DELIMITER //
CREATE PROCEDURE ClusterHealthCheck()
BEGIN
DECLARE done INT DEFAULT FALSE;
DECLARE server_host VARCHAR(100);
DECLARE server_port INT;
DECLARE server_status VARCHAR(20);

-- 声明游标
DECLARE server_cursor CURSOR FOR
SELECT hostname, port, status FROM mysql_servers;
DECLARE CONTINUE HANDLER FOR NOT FOUND SET done = TRUE;

-- 创建临时结果表
DROP TEMPORARY TABLE IF EXISTS temp_cluster_status;
CREATE TEMPORARY TABLE temp_cluster_status (
server_host VARCHAR(100),
server_port INT,
server_status VARCHAR(20),
connection_status VARCHAR(20),
replication_status VARCHAR(20),
lag_seconds INT
);

-- 遍历所有服务器
OPEN server_cursor;
read_loop: LOOP
FETCH server_cursor INTO server_host, server_port, server_status;
IF done THEN
LEAVE read_loop;
END IF;

-- 检查连接状态
SET @connection_check = CONCAT('SELECT 1 FROM DUAL');

-- 插入检查结果
INSERT INTO temp_cluster_status (
server_host,
server_port,
server_status,
connection_status,
replication_status,
lag_seconds
) VALUES (
server_host,
server_port,
server_status,
'OK',
'CHECKING',
0
);
END LOOP;
CLOSE server_cursor;

-- 返回结果
SELECT * FROM temp_cluster_status;

-- 记录检查日志
INSERT INTO cluster_health_log (
check_time,
total_servers,
online_servers,
offline_servers
) SELECT
NOW(),
COUNT(*),
SUM(CASE WHEN server_status = 'ONLINE' THEN 1 ELSE 0 END),
SUM(CASE WHEN server_status != 'ONLINE' THEN 1 ELSE 0 END)
FROM temp_cluster_status;

END //
DELIMITER ;

-- 创建健康检查日志表
CREATE TABLE cluster_health_log (
id INT AUTO_INCREMENT PRIMARY KEY,
check_time DATETIME,
total_servers INT,
online_servers INT,
offline_servers INT,
INDEX idx_check_time (check_time)
);

集群管理脚本

集群状态检查脚本

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
#!/bin/bash
# cluster_health_check.sh - MySQL集群健康检查脚本

# 配置信息
MASTER_HOST="192.168.1.100"
SLAVE_HOSTS=("192.168.1.101" "192.168.1.102")
MYSQL_USER="monitor"
MYSQL_PASS="monitor123"

# 检查主库状态
check_master_status() {
echo "=== 检查主库状态 ==="
mysql -h$MASTER_HOST -u$MYSQL_USER -p$MYSQL_PASS -e "
SHOW MASTER STATUS;
SHOW PROCESSLIST;
SELECT COUNT(*) as Active_Connections FROM information_schema.PROCESSLIST WHERE COMMAND != 'Sleep';
"
}

# 检查从库状态
check_slave_status() {
for slave in "${SLAVE_HOSTS[@]}"; do
echo "=== 检查从库 $slave 状态 ==="
mysql -h$slave -u$MYSQL_USER -p$MYSQL_PASS -e "
SHOW SLAVE STATUS\G
" | grep -E "(Slave_IO_Running|Slave_SQL_Running|Seconds_Behind_Master|Last_Error)"
done
}

# 主函数
main() {
check_master_status
check_slave_status
}

main

自动故障切换脚本

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#!/bin/bash
# auto_failover.sh - MySQL自动故障切换脚本

MASTER_HOST="192.168.1.100"
NEW_MASTER="192.168.1.101"
SLAVE_HOSTS=("192.168.1.102")
MYSQL_USER="admin"
MYSQL_PASS="admin123"
VIP="192.168.1.200"

# 检查主库是否可用
check_master_alive() {
mysqladmin -h$MASTER_HOST -u$MYSQL_USER -p$MYSQL_PASS ping > /dev/null 2>&1
return $?
}

# 提升从库为主库
promote_slave_to_master() {
echo "提升 $NEW_MASTER 为新主库..."

# 停止从库复制
mysql -h$NEW_MASTER -u$MYSQL_USER -p$MYSQL_PASS -e "
STOP SLAVE;
RESET SLAVE ALL;
"

# 启用二进制日志
mysql -h$NEW_MASTER -u$MYSQL_USER -p$MYSQL_PASS -e "
SET GLOBAL read_only = OFF;
FLUSH LOGS;
"

echo "新主库 $NEW_MASTER 已就绪"
}

# 重新配置其他从库
reconfigure_slaves() {
for slave in "${SLAVE_HOSTS[@]}"; do
echo "重新配置从库 $slave..."

# 获取新主库的二进制日志信息
MASTER_LOG_FILE=$(mysql -h$NEW_MASTER -u$MYSQL_USER -p$MYSQL_PASS -e "SHOW MASTER STATUS" | awk 'NR==2{print $1}')
MASTER_LOG_POS=$(mysql -h$NEW_MASTER -u$MYSQL_USER -p$MYSQL_PASS -e "SHOW MASTER STATUS" | awk 'NR==2{print $2}')

# 重新配置从库
mysql -h$slave -u$MYSQL_USER -p$MYSQL_PASS -e "
STOP SLAVE;
CHANGE MASTER TO
MASTER_HOST='$NEW_MASTER',
MASTER_USER='repl',
MASTER_PASSWORD='repl123',
MASTER_LOG_FILE='$MASTER_LOG_FILE',
MASTER_LOG_POS=$MASTER_LOG_POS;
START SLAVE;
"
done
}

# 切换VIP
switch_vip() {
echo "切换VIP到新主库..."
# 这里需要根据实际环境配置VIP切换逻辑
# 例如使用keepalived或其他高可用工具
ssh root@$NEW_MASTER "ip addr add $VIP/24 dev eth0"
echo "VIP已切换到 $NEW_MASTER"
}

# 主函数
main() {
if ! check_master_alive; then
echo "主库 $MASTER_HOST 不可用,开始故障切换..."
promote_slave_to_master
reconfigure_slaves
switch_vip
echo "故障切换完成!"
else
echo "主库 $MASTER_HOST 正常运行"
fi
}

main

监控和告警

集群监控脚本

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#!/bin/bash
# cluster_monitor.sh - MySQL集群监控脚本

# 配置文件
CONFIG_FILE="/etc/mysql/cluster_monitor.conf"
LOG_FILE="/var/log/mysql_cluster_monitor.log"

# 读取配置
source $CONFIG_FILE

# 日志函数
log_message() {
echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" >> $LOG_FILE
}

# 发送告警
send_alert() {
local message="$1"
local level="$2"

# 发送邮件告警
echo "$message" | mail -s "MySQL集群告警 - $level" $ALERT_EMAIL

# 发送钉钉告警(如果配置了)
if [ ! -z "$DINGTALK_WEBHOOK" ]; then
curl -X POST "$DINGTALK_WEBHOOK" \
-H 'Content-Type: application/json' \
-d "{\"msgtype\": \"text\", \"text\": {\"content\": \"$message\"}}"
fi

log_message "ALERT: $message"
}

# 检查复制延迟
check_replication_lag() {
for slave in "${SLAVE_HOSTS[@]}"; do
lag=$(mysql -h$slave -u$MONITOR_USER -p$MONITOR_PASS -e "SHOW SLAVE STATUS\G" | grep "Seconds_Behind_Master" | awk '{print $2}')

if [ "$lag" != "NULL" ] && [ "$lag" -gt "$MAX_LAG_SECONDS" ]; then
send_alert "从库 $slave 复制延迟过高: ${lag}秒" "WARNING"
fi
done
}

# 检查连接数
check_connections() {
for host in "${ALL_HOSTS[@]}"; do
connections=$(mysql -h$host -u$MONITOR_USER -p$MONITOR_PASS -e "SHOW STATUS LIKE 'Threads_connected'" | awk 'NR==2{print $2}')
max_connections=$(mysql -h$host -u$MONITOR_USER -p$MONITOR_PASS -e "SHOW VARIABLES LIKE 'max_connections'" | awk 'NR==2{print $2}')

usage_percent=$((connections * 100 / max_connections))

if [ $usage_percent -gt 80 ]; then
send_alert "服务器 $host 连接数使用率过高: ${usage_percent}%" "WARNING"
fi
done
}

# 检查磁盘空间
check_disk_space() {
for host in "${ALL_HOSTS[@]}"; do
disk_usage=$(ssh root@$host "df -h /var/lib/mysql | awk 'NR==2{print \$5}' | sed 's/%//'")

if [ $disk_usage -gt 85 ]; then
send_alert "服务器 $host 磁盘空间不足: ${disk_usage}%" "CRITICAL"
fi
done
}

# 主监控循环
main() {
while true; do
check_replication_lag
check_connections
check_disk_space

sleep $MONITOR_INTERVAL
done
}

# 配置文件示例
cat > /etc/mysql/cluster_monitor.conf << 'EOF'
# MySQL集群监控配置

# 服务器列表
MASTER_HOST="192.168.1.100"
SLAVE_HOSTS=("192.168.1.101" "192.168.1.102")
ALL_HOSTS=("192.168.1.100" "192.168.1.101" "192.168.1.102")

# 监控用户
MONITOR_USER="monitor"
MONITOR_PASS="monitor123"

# 告警配置
ALERT_EMAIL="admin@company.com"
DINGTALK_WEBHOOK="https://oapi.dingtalk.com/robot/send?access_token=xxx"

# 阈值配置
MAX_LAG_SECONDS=30
MONITOR_INTERVAL=60
EOF

main

性能优化

集群性能调优

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
-- 主库优化配置
-- /etc/mysql/mysql.conf.d/mysqld.cnf

[mysqld]
# 基础配置
server-id = 1
log-bin = mysql-bin
binlog-format = ROW
sync_binlog = 1
innodb_flush_log_at_trx_commit = 1

# 性能优化
innodb_buffer_pool_size = 8G
innodb_log_file_size = 1G
innodb_log_buffer_size = 64M
innodb_flush_method = O_DIRECT
innodb_file_per_table = 1

# 复制优化
slave_parallel_workers = 8
slave_parallel_type = LOGICAL_CLOCK
binlog_group_commit_sync_delay = 1000
binlog_group_commit_sync_no_delay_count = 100

# 连接优化
max_connections = 2000
thread_cache_size = 100
table_open_cache = 4000
table_definition_cache = 2000

# 查询缓存(MySQL 5.7及以下)
query_cache_type = 1
query_cache_size = 256M

# 临时表优化
tmp_table_size = 256M
max_heap_table_size = 256M

读写分离性能监控

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
-- 创建性能监控视图
CREATE VIEW cluster_performance_summary AS
SELECT
'Master' as server_role,
@@hostname as hostname,
(SELECT COUNT(*) FROM information_schema.PROCESSLIST WHERE COMMAND != 'Sleep') as active_connections,
(SELECT VARIABLE_VALUE FROM information_schema.GLOBAL_STATUS WHERE VARIABLE_NAME = 'Queries') as total_queries,
(SELECT VARIABLE_VALUE FROM information_schema.GLOBAL_STATUS WHERE VARIABLE_NAME = 'Com_select') as select_queries,
(SELECT VARIABLE_VALUE FROM information_schema.GLOBAL_STATUS WHERE VARIABLE_NAME = 'Com_insert') as insert_queries,
(SELECT VARIABLE_VALUE FROM information_schema.GLOBAL_STATUS WHERE VARIABLE_NAME = 'Com_update') as update_queries,
(SELECT VARIABLE_VALUE FROM information_schema.GLOBAL_STATUS WHERE VARIABLE_NAME = 'Com_delete') as delete_queries,
(SELECT VARIABLE_VALUE FROM information_schema.GLOBAL_STATUS WHERE VARIABLE_NAME = 'Innodb_buffer_pool_read_requests') as buffer_pool_reads,
(SELECT VARIABLE_VALUE FROM information_schema.GLOBAL_STATUS WHERE VARIABLE_NAME = 'Innodb_buffer_pool_reads') as disk_reads,
ROUND(
(SELECT VARIABLE_VALUE FROM information_schema.GLOBAL_STATUS WHERE VARIABLE_NAME = 'Innodb_buffer_pool_read_requests') /
(SELECT VARIABLE_VALUE FROM information_schema.GLOBAL_STATUS WHERE VARIABLE_NAME = 'Innodb_buffer_pool_reads') * 100, 2
) as buffer_pool_hit_rate,
NOW() as check_time;

-- 查询性能统计
SELECT
schemaname,
tablename,
attname,
n_distinct,
correlation
FROM pg_stats
WHERE schemaname = 'your_database'
ORDER BY n_distinct DESC;

-- 慢查询分析
SELECT
query_time,
lock_time,
rows_sent,
rows_examined,
sql_text
FROM mysql.slow_log
WHERE start_time >= DATE_SUB(NOW(), INTERVAL 1 HOUR)
ORDER BY query_time DESC
LIMIT 20;

备份和恢复策略

集群备份脚本

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/bin/bash
# cluster_backup.sh - MySQL集群备份脚本

BACKUP_DIR="/backup/mysql"
DATE=$(date +%Y%m%d_%H%M%S)
MASTER_HOST="192.168.1.100"
SLAVE_HOST="192.168.1.101"
MYSQL_USER="backup"
MYSQL_PASS="backup123"

# 创建备份目录
mkdir -p $BACKUP_DIR/$DATE

# 主库备份(使用从库进行备份以减少主库负载)
backup_from_slave() {
echo "开始从从库备份..."

# 记录主库位置
mysql -h$MASTER_HOST -u$MYSQL_USER -p$MYSQL_PASS -e "SHOW MASTER STATUS" > $BACKUP_DIR/$DATE/master_status.txt

# 从从库备份
mysqldump -h$SLAVE_HOST -u$MYSQL_USER -p$MYSQL_PASS \
--single-transaction \
--routines \
--triggers \
--all-databases \
--master-data=2 \
--flush-logs \
--hex-blob > $BACKUP_DIR/$DATE/full_backup.sql

# 压缩备份文件
gzip $BACKUP_DIR/$DATE/full_backup.sql

echo "备份完成: $BACKUP_DIR/$DATE/full_backup.sql.gz"
}

# 增量备份
incremental_backup() {
echo "开始增量备份..."

# 获取最新的二进制日志
LATEST_LOG=$(mysql -h$MASTER_HOST -u$MYSQL_USER -p$MYSQL_PASS -e "SHOW MASTER STATUS" | awk 'NR==2{print $1}')

# 复制二进制日志文件
scp root@$MASTER_HOST:/var/lib/mysql/mysql-bin.* $BACKUP_DIR/$DATE/

echo "增量备份完成"
}

# 备份验证
verify_backup() {
echo "验证备份文件..."

if [ -f "$BACKUP_DIR/$DATE/full_backup.sql.gz" ]; then
# 检查备份文件完整性
gunzip -t $BACKUP_DIR/$DATE/full_backup.sql.gz
if [ $? -eq 0 ]; then
echo "备份文件验证成功"
else
echo "备份文件损坏!"
exit 1
fi
else
echo "备份文件不存在!"
exit 1
fi
}

# 清理旧备份
cleanup_old_backups() {
echo "清理7天前的备份..."
find $BACKUP_DIR -type d -mtime +7 -exec rm -rf {} \;
}

# 主函数
main() {
backup_from_slave
incremental_backup
verify_backup
cleanup_old_backups

echo "集群备份任务完成!"
}

main

集群恢复脚本

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#!/bin/bash
# cluster_restore.sh - MySQL集群恢复脚本

BACKUP_DIR="/backup/mysql"
RESTORE_DATE="$1"
MASTER_HOST="192.168.1.100"
SLAVE_HOSTS=("192.168.1.101" "192.168.1.102")
MYSQL_USER="root"
MYSQL_PASS="root123"

if [ -z "$RESTORE_DATE" ]; then
echo "用法: $0 <备份日期> (格式: YYYYMMDD_HHMMSS)"
exit 1
fi

# 停止所有MySQL服务
stop_all_mysql() {
echo "停止所有MySQL服务..."

for host in $MASTER_HOST "${SLAVE_HOSTS[@]}"; do
ssh root@$host "systemctl stop mysql"
done
}

# 恢复主库
restore_master() {
echo "恢复主库数据..."

# 启动主库MySQL服务
ssh root@$MASTER_HOST "systemctl start mysql"

# 恢复数据
gunzip -c $BACKUP_DIR/$RESTORE_DATE/full_backup.sql.gz | \
mysql -h$MASTER_HOST -u$MYSQL_USER -p$MYSQL_PASS

# 重置主库状态
mysql -h$MASTER_HOST -u$MYSQL_USER -p$MYSQL_PASS -e "
RESET MASTER;
FLUSH LOGS;
"
}

# 重建从库
rebuild_slaves() {
echo "重建从库..."

# 获取主库状态
MASTER_LOG_FILE=$(mysql -h$MASTER_HOST -u$MYSQL_USER -p$MYSQL_PASS -e "SHOW MASTER STATUS" | awk 'NR==2{print $1}')
MASTER_LOG_POS=$(mysql -h$MASTER_HOST -u$MYSQL_USER -p$MYSQL_PASS -e "SHOW MASTER STATUS" | awk 'NR==2{print $2}')

for slave in "${SLAVE_HOSTS[@]}"; do
echo "重建从库 $slave..."

# 启动从库MySQL服务
ssh root@$slave "systemctl start mysql"

# 恢复数据
gunzip -c $BACKUP_DIR/$RESTORE_DATE/full_backup.sql.gz | \
mysql -h$slave -u$MYSQL_USER -p$MYSQL_PASS

# 配置复制
mysql -h$slave -u$MYSQL_USER -p$MYSQL_PASS -e "
RESET SLAVE ALL;
CHANGE MASTER TO
MASTER_HOST='$MASTER_HOST',
MASTER_USER='repl',
MASTER_PASSWORD='repl123',
MASTER_LOG_FILE='$MASTER_LOG_FILE',
MASTER_LOG_POS=$MASTER_LOG_POS;
START SLAVE;
"
done
}

# 验证恢复结果
verify_restore() {
echo "验证恢复结果..."

# 检查主库状态
mysql -h$MASTER_HOST -u$MYSQL_USER -p$MYSQL_PASS -e "SHOW MASTER STATUS;"

# 检查从库状态
for slave in "${SLAVE_HOSTS[@]}"; do
echo "检查从库 $slave 状态..."
mysql -h$slave -u$MYSQL_USER -p$MYSQL_PASS -e "SHOW SLAVE STATUS\G" | \
grep -E "(Slave_IO_Running|Slave_SQL_Running|Seconds_Behind_Master)"
done
}

# 主函数
main() {
if [ ! -d "$BACKUP_DIR/$RESTORE_DATE" ]; then
echo "备份目录不存在: $BACKUP_DIR/$RESTORE_DATE"
exit 1
fi

echo "开始恢复MySQL集群..."
echo "备份日期: $RESTORE_DATE"

read -p "确认要恢复集群吗?这将覆盖现有数据 (y/N): " confirm
if [ "$confirm" != "y" ]; then
echo "取消恢复操作"
exit 0
fi

stop_all_mysql
restore_master
rebuild_slaves
verify_restore

echo "集群恢复完成!"
}

main

总结

MySQL集群配置与高可用架构是企业级应用的重要组成部分。通过本文的实战指南,我们学习了:

核心要点

  1. 架构设计:合理的主从复制架构是高可用的基础
  2. 监控告警:完善的监控体系能及时发现和解决问题
  3. 自动化运维:通过脚本实现自动化管理和故障切换
  4. 性能优化:针对集群特点进行专门的性能调优
  5. 备份恢复:完整的备份恢复策略保障数据安全

最佳实践

  • 定期检查复制状态和性能指标
  • 建立完善的监控和告警机制
  • 制定详细的故障切换预案
  • 定期进行备份恢复演练
  • 持续优化集群性能和稳定性

通过这些实战经验和工具脚本,可以构建一个稳定、高效的MySQL集群环境,为业务系统提供可靠的数据服务支撑。

本站由 提供部署服务