| Bug #97707 | group replication lost transaction when init by ansible | ||
|---|---|---|---|
| Submitted: | 20 Nov 2019 12:39 | Modified: | 12 Sep 2023 10:49 |
| Reporter: | Haixing Weng (OCA) | Email Updates: | |
| Status: | Can't repeat | Impact on me: | |
| Category: | MySQL Server: Group Replication | Severity: | S1 (Critical) |
| Version: | all version with MGR | OS: | Any |
| Assigned to: | CPU Architecture: | Any | |
| Tags: | group replication; ansible | ||
[20 Nov 2019 12:40]
Haixing Weng
binlog & gtid_executed is Inconsistent in MGR
Attachment: status_in_mgr.txt (text/plain), 11.01 KiB.
[20 Nov 2019 12:49]
Haixing Weng
cat make-group-replication.sh
#!/bin/bash
#set -o nounset
set -o pipefail
usage()
{
echo
echo "Usage:"
echo " make-group-replication.sh node"
echo " node:"
echo " primary"
echo " secondary"
echo
echo "Help:"
echo " zhengsilong@unionpay.com"
echo
}
basic_single_escape() {
echo "$1" | sed 's/\(['"'"'\]\)/\\\1/g'
}
process_path()
{
path=$1
tmp_path1=`echo $path | sed -e "s=\(\.\.\/\)=???=g" | sed -e "s=[^?]==g" |sed -e "s=???=\.\.\/=g"`
if [[ ! -z $tmp_path1 ]]; then
tmp_path1=$(cd $tmp_path1;pwd)
tmp_path2=`echo $path | sed -e "s=\(\.\.\/\)==g"`
path=$tmp_path1/$tmp_path2
#echo $path
fi
echo $path
}
read_config() {
scriptdir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" # 脚本目录
cd $scriptdir
conf_file="upsql-deploy.cfg"
linux_user=`sed '/^linux_user=/!d;s/.*=//' $conf_file`
linux_group=`sed '/^linux_group=/!d;s/.*=//' $conf_file`
linux_user_pwd=`sed '/^linux_user_pwd=/!d;s/.*=//' $conf_file`
tarfile=`sed '/^tar_file=/!d;s/.*=//' $conf_file`
# instance_name
instance_name=`sed '/^instance_name=/!d;s/.*=//' $conf_file`
port=`sed '/^port=/!d;s/.*=//' $conf_file`
basedir=`sed '/^base_dir=/!d;s/.*=//' $conf_file`
conf_dir=`sed '/^conf_dir=/!d;s/.*=//' $conf_file`
sock_dir=`sed '/^sock_dir=/!d;s/.*=//' $conf_file`
datadir=`sed '/^data_dir=/!d;s/.*=//' $conf_file`
binlogdir=`sed '/^binlog_dir=/!d;s/.*=//' $conf_file`
relaylogdir=`sed '/^relaylog_dir=/!d;s/.*=//' $conf_file`
redologdir=`sed '/^redolog_dir=/!d;s/.*=//' $conf_file`
undologdir=`sed '/^undolog_dir=/!d;s/.*=//' $conf_file`
dba_name=`sed '/^dba_name=/!d;s/.*=//' $conf_file`
dba_pwd=`sed '/^dba_pwd=/!d;s/.*=//' $conf_file`
root_pass=$dba_pwd
rpl_user=`sed '/^rpl_user=/!d;s/.*=//' $conf_file`
master_user=$rpl_user
rpl_pwd=`sed '/^rpl_pwd=/!d;s/.*=//' $conf_file`
# master host
master_host=`sed '/^master_host=/!d;s/.*=//' $conf_file`
# master port
master_port=`sed '/^master_port=/!d;s/.*=//' $conf_file`
mode=`sed '/^mode=/!d;s/.*=//' $conf_file`
decompress=`sed '/^install=/!d;s/.*=//' $conf_file`
upsql_create=`sed '/^upsql_create=/!d;s/.*=//' $conf_file`
upsql_sec=`sed '/^upsql_sec=/!d;s/.*=//' $conf_file`
upsql_repl=`sed '/^upsql_repl=/!d;s/.*=//' $conf_file`
}
add_mgr_config(){
cat $scriptdir/group-replication.cfg >> $conf_dir/$instance_name.cnf
}
execute_sql() {
source /tmp/mybashrc
$basedir/bin/mysql -u$dba_name -p$root_pass -S $sock_dir/$instance_name.sock -BNe "$1"
}
# need one param
if [[ $# -ne 1 ]]; then
usage
exit 1
fi
node=$1
if [[ x$node != x"primary" && x$node != x"secondary" ]]; then
echo "Error: need one param of 'primary' or 'secondary'"
exit 1
fi
# root
if [[ $LOGNAME != root ]]; then
echo "Please use the root account operation."
exit 1
fi
read_config
add_mgr_config
count_rpluser=`execute_sql "select count(*) from mysql.user where user = '$rpl_user'"`
if [[ $count_rpluser -eq 0 ]]; then
execute_sql "create user $rpl_user; alter user $rpl_user identified by '$rpl_pwd'"
execute_sql "grant replication slave on *.* to $rpl_user"
fi
execute_sql "drop user if exists root@localhost"
execute_sql "create user root@localhost; alter user root@localhost identified by '$root_pass'"
execute_sql "grant all on *.* to root@localhost with grant option"
execute_sql "reset master"
execute_sql "change master to master_user = '$rpl_user', master_password = '$rpl_pwd' for channel 'group_replication_recovery'"
# user
su - $linux_user <<EOF
source /tmp/mybashrc
$scriptdir/upsqlimgm $instance_name restart -p $root_pass -u $dba_name -c $conf_dir/$instance_name.cnf -b $basedir -s $sock_dir/$instance_name.sock
EOF
if [[ $node == "primary" ]]; then
execute_sql "set global group_replication_bootstrap_group = 1"
fi
execute_sql "start group_replication"
if [[ $node == "primary" ]]; then
execute_sql "set global group_replication_bootstrap_group = 0"
fi
[21 Nov 2019 1:42]
Haixing Weng
ansible-play-book and config file template
Attachment: upsql.cnf (application/octet-stream, text), 6.57 KiB.
[26 Nov 2019 5:35]
Haixing Weng
Any ideas about this issue?
[23 Dec 2019 6:27]
Haixing Weng
The vital clue to incorrect MGR cluster is that ansible's killing signal, I found that when use ansible to initial MGR, ansible would,somehow , send SIGHUP frequently to mysqld, and if mysqld receive SIGHUP before signal_handler inited compeletly, it will shutdown in abnormal flow.Due to SIGHUP, primary's binlog will endding without a rotate event, I think it lead to an unexpected MGR established.
[16 Jan 2020 0:56]
MySQL Verification Team
Hi, Thanks for the test case, verified!
[17 Jan 2020 1:44]
Haixing Weng
Hi: Could you please tell me something about cause of defect, so that I could add some patches for database kernel or ansible playbook in our production env for temporary ?

Description: We use ansible to intergrate HA-proxy and Group replication, we found that after init group-replication, some transaction will lost in MGR, and the cluster still report everything is OK, even Gtid_executed & binlog is inconsistent between nodes (Primary Node has more GTID than Secondary Nodes) We repeat on every version of mysql which has MGR: Just like this (In Mysql 8.0) in attaching files below: How to repeat: The pseudocode of ansible is post below: - name: Install UPSQL on database nodes hosts: database roles: - { role: upsql, method: install_mysql } - name: Make group replication on primiary node hosts: primary_db vars: node: "primary" roles: - { role: upsql, method: make_group_replication } - name: Make group replication on secondary nodes hosts: secondary_db vars: node: "secondary" roles: - { role: upsql, method: make_group_replication} - name: Install xinetd service yum: name: xinetd state: latest - name: Start xinetd service service: name: xinetd state: started enabled: yes - name: Copy check scripts to destination template: src: mysql_gr_routing_check.sh.j2 dest: /usr/local/bin/mysql_gr_routing_check.sh mode: 0755 - name: Ensure 6446 and 6447 port is received in service lineinfile: path: /etc/services regexp: "{{ item.regexp }}" line: "{{ item.line }}" with_items: - { regexp: "^mysql_gr_routing_check_write" , line: "mysql_gr_routing_check_write 6446/tcp"} - { regexp: "^mysql_gr_routing_check_read" , line: "mysql_gr_routing_check_read 6447/tcp"} - name: Copy check scripts to xinetd template: src: "{{ item.name }}" dest: "{{ item.dest }}" with_items: - { name: "mysql_gr_routing_check_read", dest: "/etc/xinetd.d/"} - { name: "mysql_gr_routing_check_write", dest: "/etc/xinetd.d/"} - { name: "addition_to_sys.sql", dest: "/opt/"} - name: wait shell: source /tmp/mybashrc;/usr/local/mysql/bin/mysql -S /etc/mysock/upsql.sock -u"{{ dba_name }}" -p"{{ dba_pwd }}" -e 'select sleep(180)' when: inventory_hostname in groups['primary_db'] ignore_errors: yes - name: exec addition to system sql shell: source /tmp/mybashrc;/usr/local/mysql/bin/mysql -S /etc/mysock/upsql.sock -u"{{ dba_name }}" -p"{{ dba_pwd }}" -e 'source /opt/addition_to_sys.sql' when: inventory_hostname in groups['primary_db'] - name: Flush privilege shell: source /tmp/mybashrc;/usr/local/mysql/bin/mysql -S /etc/mysock/upsql.sock -u"{{ dba_name }}" -p"{{ dba_pwd }}" -e "grant all privileges on *.* to '"{{ dba_name }}"'@'%' identified by '"{{ dba_pwd }}"' with grant option;" when: inventory_hostname in groups['primary_db'] [root@PRONOVADB02 opt]# cat addition_to_sys.sql USE sys; DELIMITER $$ CREATE FUNCTION IFZERO(a INT, b INT) RETURNS INT DETERMINISTIC RETURN IF(a = 0, b, a)$$ CREATE FUNCTION LOCATE2(needle TEXT(10000), haystack TEXT(10000), offset INT) RETURNS INT DETERMINISTIC RETURN IFZERO(LOCATE(needle, haystack, offset), LENGTH(haystack) + 1)$$ CREATE FUNCTION GTID_NORMALIZE(g TEXT(10000)) RETURNS TEXT(10000) DETERMINISTIC RETURN GTID_SUBTRACT(g, '')$$ CREATE FUNCTION GTID_COUNT(gtid_set TEXT(10000)) RETURNS INT DETERMINISTIC BEGIN DECLARE result BIGINT DEFAULT 0; DECLARE colon_pos INT; DECLARE next_dash_pos INT; DECLARE next_colon_pos INT; DECLARE next_comma_pos INT; SET gtid_set = GTID_NORMALIZE(gtid_set); SET colon_pos = LOCATE2(':', gtid_set, 1); WHILE colon_pos != LENGTH(gtid_set) + 1 DO SET next_dash_pos = LOCATE2('-', gtid_set, colon_pos + 1); SET next_colon_pos = LOCATE2(':', gtid_set, colon_pos + 1); SET next_comma_pos = LOCATE2(',', gtid_set, colon_pos + 1); IF next_dash_pos < next_colon_pos AND next_dash_pos < next_comma_pos THEN SET result = result + SUBSTR(gtid_set, next_dash_pos + 1, LEAST(next_colon_pos, next_comma_pos) - (next_dash_pos + 1)) - SUBSTR(gtid_set, colon_pos + 1, next_dash_pos - (colon_pos + 1)) + 1; ELSE SET result = result + 1; END IF; SET colon_pos = next_colon_pos; END WHILE; RETURN result; END$$ CREATE FUNCTION gr_applier_queue_length() RETURNS INT DETERMINISTIC BEGIN RETURN (SELECT sys.gtid_count( GTID_SUBTRACT( (SELECT Received_transaction_set FROM performance_schema.replication_connection_status WHERE Channel_name = 'group_replication_applier' ), (SELECT @@global.GTID_EXECUTED) ))); END$$ CREATE FUNCTION gr_member_in_primary_partition() RETURNS VARCHAR(3) DETERMINISTIC BEGIN RETURN (SELECT IF( MEMBER_STATE='ONLINE' AND ((SELECT COUNT(*) FROM performance_schema.replication_group_members WHERE MEMBER_STATE != 'ONLINE') >= ((SELECT COUNT(*) FROM performance_schema.replication_group_members)/2) = 0), 'YES', 'NO' ) FROM performance_schema.replication_group_members JOIN performance_schema.replication_group_member_stats USING(member_id)); END$$ CREATE VIEW gr_member_routing_candidate_status AS SELECT sys.gr_member_in_primary_partition() as viable_candidate, IF( (SELECT (SELECT GROUP_CONCAT(variable_value) FROM performance_schema.global_variables WHERE variable_name IN ('read_only', 'super_read_only')) != 'OFF,OFF'), 'YES', 'NO') as read_only, sys.gr_applier_queue_length() as transactions_behind, Count_Transactions_in_queue as 'transactions_to_cert' from performance_schema.replication_group_member_stats;$$ DELIMITER ;