Bug #97707 group replication lost transaction when init by ansible
Submitted: 20 Nov 2019 12:39 Modified: 12 Sep 2023 10:49
Reporter: Haixing Weng (OCA) Email Updates:
Status: Can't repeat Impact on me:
None 
Category:MySQL Server: Group Replication Severity:S1 (Critical)
Version:all version with MGR OS:Any
Assigned to: CPU Architecture:Any
Tags: group replication; ansible

[20 Nov 2019 12:39] Haixing Weng
Description:
We use ansible to intergrate HA-proxy and Group replication, we found that after
init group-replication, some transaction will lost in MGR, and the cluster still report everything is OK, even Gtid_executed & binlog is inconsistent between nodes
(Primary Node has more GTID than Secondary Nodes)

We repeat on every version of mysql which has MGR:

Just like this (In Mysql 8.0) in attaching files below:

How to repeat:
The pseudocode of ansible is post below:

- name: Install UPSQL on database nodes
  hosts: database
  roles:
    - { role: upsql, method: install_mysql }

- name: Make group replication on primiary node
  hosts: primary_db
  vars:
     node: "primary"
  roles:
     - { role: upsql, method: make_group_replication }

- name: Make group replication on secondary nodes
  hosts: secondary_db
  vars:
     node: "secondary"
  roles:
     - { role: upsql, method: make_group_replication}

- name: Install xinetd service
  yum:
    name: xinetd
    state: latest

- name: Start xinetd service
  service:
     name: xinetd
     state: started
     enabled: yes

- name: Copy check scripts to destination
  template:
    src: mysql_gr_routing_check.sh.j2
    dest: /usr/local/bin/mysql_gr_routing_check.sh
    mode: 0755

- name: Ensure 6446 and 6447 port is received in service
  lineinfile:
     path: /etc/services
     regexp: "{{ item.regexp }}"
     line: "{{ item.line }}"
  with_items:
     - { regexp: "^mysql_gr_routing_check_write" , line: "mysql_gr_routing_check_write 6446/tcp"}
     - { regexp: "^mysql_gr_routing_check_read" , line: "mysql_gr_routing_check_read 6447/tcp"}

- name: Copy check scripts to xinetd
  template:
     src: "{{ item.name  }}"
     dest: "{{ item.dest }}"
  with_items:
     - { name: "mysql_gr_routing_check_read", dest: "/etc/xinetd.d/"}
     - { name: "mysql_gr_routing_check_write", dest: "/etc/xinetd.d/"}
     - { name: "addition_to_sys.sql", dest: "/opt/"}

- name: wait
  shell: source /tmp/mybashrc;/usr/local/mysql/bin/mysql -S /etc/mysock/upsql.sock -u"{{ dba_name }}" -p"{{ dba_pwd }}" -e 'select sleep(180)'
  when: inventory_hostname in groups['primary_db']
  ignore_errors: yes

- name: exec addition to system sql
  shell: source /tmp/mybashrc;/usr/local/mysql/bin/mysql -S /etc/mysock/upsql.sock -u"{{ dba_name }}" -p"{{ dba_pwd }}" -e 'source /opt/addition_to_sys.sql'
  when: inventory_hostname in groups['primary_db']

- name: Flush privilege
  shell: source /tmp/mybashrc;/usr/local/mysql/bin/mysql -S /etc/mysock/upsql.sock -u"{{ dba_name }}" -p"{{ dba_pwd }}" -e "grant all privileges on *.* to '"{{ dba_name }}"'@'%' identified by '"{{ dba_pwd }}"' with grant option;"
  when: inventory_hostname in groups['primary_db']

[root@PRONOVADB02 opt]# cat addition_to_sys.sql 
USE sys;

DELIMITER $$

CREATE FUNCTION IFZERO(a INT, b INT)
RETURNS INT
DETERMINISTIC
RETURN IF(a = 0, b, a)$$

CREATE FUNCTION LOCATE2(needle TEXT(10000), haystack TEXT(10000), offset INT)
RETURNS INT
DETERMINISTIC
RETURN IFZERO(LOCATE(needle, haystack, offset), LENGTH(haystack) + 1)$$

CREATE FUNCTION GTID_NORMALIZE(g TEXT(10000))
RETURNS TEXT(10000)
DETERMINISTIC
RETURN GTID_SUBTRACT(g, '')$$

CREATE FUNCTION GTID_COUNT(gtid_set TEXT(10000))
RETURNS INT
DETERMINISTIC
BEGIN
  DECLARE result BIGINT DEFAULT 0;
  DECLARE colon_pos INT;
  DECLARE next_dash_pos INT;
  DECLARE next_colon_pos INT;
  DECLARE next_comma_pos INT;
  SET gtid_set = GTID_NORMALIZE(gtid_set);
  SET colon_pos = LOCATE2(':', gtid_set, 1);
  WHILE colon_pos != LENGTH(gtid_set) + 1 DO
     SET next_dash_pos = LOCATE2('-', gtid_set, colon_pos + 1);
     SET next_colon_pos = LOCATE2(':', gtid_set, colon_pos + 1);
     SET next_comma_pos = LOCATE2(',', gtid_set, colon_pos + 1);
     IF next_dash_pos < next_colon_pos AND next_dash_pos < next_comma_pos THEN
       SET result = result +
         SUBSTR(gtid_set, next_dash_pos + 1,
                LEAST(next_colon_pos, next_comma_pos) - (next_dash_pos + 1)) -
         SUBSTR(gtid_set, colon_pos + 1, next_dash_pos - (colon_pos + 1)) + 1;
     ELSE
       SET result = result + 1;
     END IF;
     SET colon_pos = next_colon_pos;
  END WHILE;
  RETURN result;
END$$

CREATE FUNCTION gr_applier_queue_length()
RETURNS INT
DETERMINISTIC
BEGIN
    RETURN (SELECT sys.gtid_count( GTID_SUBTRACT( (SELECT
  Received_transaction_set FROM performance_schema.replication_connection_status
  WHERE Channel_name = 'group_replication_applier' ), (SELECT
@@global.GTID_EXECUTED) )));
END$$

CREATE FUNCTION gr_member_in_primary_partition()
RETURNS VARCHAR(3)
DETERMINISTIC
BEGIN
  RETURN (SELECT IF( MEMBER_STATE='ONLINE' AND ((SELECT COUNT(*) FROM
performance_schema.replication_group_members WHERE MEMBER_STATE != 'ONLINE') >=
((SELECT COUNT(*) FROM performance_schema.replication_group_members)/2) = 0),
'YES', 'NO' ) FROM performance_schema.replication_group_members JOIN
performance_schema.replication_group_member_stats USING(member_id));
END$$

CREATE VIEW gr_member_routing_candidate_status AS SELECT
sys.gr_member_in_primary_partition() as viable_candidate,
IF( (SELECT (SELECT GROUP_CONCAT(variable_value) FROM
performance_schema.global_variables WHERE variable_name IN ('read_only',
'super_read_only')) != 'OFF,OFF'), 'YES', 'NO') as read_only,
sys.gr_applier_queue_length() as transactions_behind, Count_Transactions_in_queue as 'transactions_to_cert' from performance_schema.replication_group_member_stats;$$

DELIMITER ;
[20 Nov 2019 12:40] Haixing Weng
binlog & gtid_executed is Inconsistent in MGR

Attachment: status_in_mgr.txt (text/plain), 11.01 KiB.

[20 Nov 2019 12:49] Haixing Weng
cat make-group-replication.sh

#!/bin/bash

#set -o nounset
set -o pipefail

usage()
{
    echo
    echo "Usage:"
    echo "  make-group-replication.sh node"
    echo "  node:"
    echo "       primary"
    echo "       secondary"
    echo
    echo "Help:"
    echo "  zhengsilong@unionpay.com"
    echo
}

basic_single_escape() {
    echo "$1" | sed 's/\(['"'"'\]\)/\\\1/g'
}

process_path()
{
    path=$1
    tmp_path1=`echo $path | sed -e "s=\(\.\.\/\)=???=g" | sed -e "s=[^?]==g" |sed -e "s=???=\.\.\/=g"`
    if [[ ! -z $tmp_path1 ]]; then
        tmp_path1=$(cd $tmp_path1;pwd)
        tmp_path2=`echo $path | sed -e "s=\(\.\.\/\)==g"`
        path=$tmp_path1/$tmp_path2
        #echo $path
    fi
    echo $path
}

read_config() {
    scriptdir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"             # 脚本目录
    cd $scriptdir

    conf_file="upsql-deploy.cfg"

    linux_user=`sed '/^linux_user=/!d;s/.*=//' $conf_file`

    linux_group=`sed '/^linux_group=/!d;s/.*=//' $conf_file`

    linux_user_pwd=`sed '/^linux_user_pwd=/!d;s/.*=//' $conf_file`

    tarfile=`sed '/^tar_file=/!d;s/.*=//' $conf_file`

    # instance_name
    instance_name=`sed '/^instance_name=/!d;s/.*=//' $conf_file`

    port=`sed '/^port=/!d;s/.*=//' $conf_file`

    basedir=`sed '/^base_dir=/!d;s/.*=//' $conf_file`

    conf_dir=`sed '/^conf_dir=/!d;s/.*=//' $conf_file`

    sock_dir=`sed '/^sock_dir=/!d;s/.*=//' $conf_file`

    datadir=`sed '/^data_dir=/!d;s/.*=//' $conf_file`

    binlogdir=`sed '/^binlog_dir=/!d;s/.*=//' $conf_file`

    relaylogdir=`sed '/^relaylog_dir=/!d;s/.*=//' $conf_file`

    redologdir=`sed '/^redolog_dir=/!d;s/.*=//' $conf_file`

    undologdir=`sed '/^undolog_dir=/!d;s/.*=//' $conf_file`

    dba_name=`sed '/^dba_name=/!d;s/.*=//' $conf_file`

    dba_pwd=`sed '/^dba_pwd=/!d;s/.*=//' $conf_file`
    root_pass=$dba_pwd

    rpl_user=`sed '/^rpl_user=/!d;s/.*=//' $conf_file`
    master_user=$rpl_user

    rpl_pwd=`sed '/^rpl_pwd=/!d;s/.*=//' $conf_file`

   # master host
    master_host=`sed '/^master_host=/!d;s/.*=//' $conf_file`

    # master port
    master_port=`sed '/^master_port=/!d;s/.*=//' $conf_file`

    mode=`sed '/^mode=/!d;s/.*=//' $conf_file`

    decompress=`sed '/^install=/!d;s/.*=//' $conf_file`

    upsql_create=`sed '/^upsql_create=/!d;s/.*=//' $conf_file`

    upsql_sec=`sed '/^upsql_sec=/!d;s/.*=//' $conf_file`

    upsql_repl=`sed '/^upsql_repl=/!d;s/.*=//' $conf_file`

}

add_mgr_config(){
    cat $scriptdir/group-replication.cfg >> $conf_dir/$instance_name.cnf 
}

execute_sql() {
    source /tmp/mybashrc
    $basedir/bin/mysql -u$dba_name -p$root_pass -S $sock_dir/$instance_name.sock -BNe "$1"
}

# need one param
if [[ $# -ne 1 ]]; then
    usage
    exit 1
fi

node=$1

if [[ x$node != x"primary" && x$node != x"secondary" ]]; then
    echo "Error: need one param of 'primary' or 'secondary'"
    exit 1
fi

# root
if [[ $LOGNAME != root ]]; then
    echo "Please use the root account operation."
    exit 1
fi

read_config

add_mgr_config

count_rpluser=`execute_sql "select count(*) from mysql.user where user = '$rpl_user'"`
if [[ $count_rpluser -eq 0 ]]; then
    execute_sql "create user $rpl_user; alter user $rpl_user identified by '$rpl_pwd'"
    execute_sql "grant replication slave on *.* to $rpl_user"
fi

execute_sql "drop user if exists root@localhost"
execute_sql "create user root@localhost; alter user root@localhost identified by '$root_pass'"
execute_sql "grant all on *.* to root@localhost with grant option"
execute_sql "reset master"
execute_sql "change master to master_user = '$rpl_user', master_password = '$rpl_pwd' for channel 'group_replication_recovery'"

# user
su - $linux_user <<EOF
    source /tmp/mybashrc
    $scriptdir/upsqlimgm $instance_name restart -p $root_pass -u $dba_name -c $conf_dir/$instance_name.cnf -b $basedir -s $sock_dir/$instance_name.sock
EOF

if [[ $node == "primary" ]]; then
    execute_sql "set global group_replication_bootstrap_group = 1"
fi

execute_sql "start group_replication"

if [[ $node == "primary" ]]; then
    execute_sql "set global group_replication_bootstrap_group = 0"
fi
[21 Nov 2019 1:42] Haixing Weng
ansible-play-book and config file template

Attachment: upsql.cnf (application/octet-stream, text), 6.57 KiB.

[26 Nov 2019 5:35] Haixing Weng
Any ideas about this issue?
[23 Dec 2019 6:27] Haixing Weng
The vital clue to incorrect MGR cluster is that ansible's killing signal, I found that when  use ansible to initial MGR, ansible would,somehow , send SIGHUP frequently to mysqld, and if mysqld receive SIGHUP before signal_handler inited compeletly, it will shutdown in abnormal flow.Due to SIGHUP, primary's binlog will endding without a rotate event, I think it lead to an unexpected MGR established.
[16 Jan 2020 0:56] MySQL Verification Team
Hi,

Thanks for the test case, verified!
[17 Jan 2020 1:44] Haixing Weng
Hi:

Could you please tell me something about cause of defect, so that I could add some patches for database kernel or ansible playbook in our production env for temporary ?