Description:
We have a single primary replication group managed via MySQL shell spanning 3 datacenters, with 2 members per datacenter to be able to sustain losing a DC:
MySQL mydb-6005.dc03.prod.example.com:3306 ssl Py > c=dba.get_cluster()
MySQL mydb-6005.dc03.prod.example.com:3306 ssl Py > c.status()
{
"clusterName": "gr__db__1",
"defaultReplicaSet": {
"name": "default",
"primary": "mydb-1004.dc01.prod.example.com:3306",
"ssl": "REQUIRED",
"status": "OK",
"statusText": "Cluster is ONLINE and can tolerate up to 2 failures.",
"topology": {
"mydb-1001.dc01.prod.example.com:3306": {
"address": "mydb-1001.dc01.prod.example.com:3306",
"memberRole": "SECONDARY",
"mode": "R/O",
"readReplicas": {},
"replicationLag": null,
"role": "HA",
"status": "ONLINE",
"version": "8.0.27"
},
"mydb-1004.dc01.prod.example.com:3306": {
"address": "mydb-1004.dc01.prod.example.com:3306",
"memberRole": "PRIMARY",
"mode": "R/W",
"readReplicas": {},
"replicationLag": null,
"role": "HA",
"status": "ONLINE",
"version": "8.0.27"
},
"mydb-2001.dc02.prod.example.com:3306": {
"address": "mydb-2001.dc02.prod.example.com:3306",
"memberRole": "SECONDARY",
"mode": "R/O",
"readReplicas": {},
"replicationLag": null,
"role": "HA",
"status": "ONLINE",
"version": "8.0.27"
},
"mydb-2003.dc02.prod.example.com:3306": {
"address": "mydb-2003.dc02.prod.example.com:3306",
"memberRole": "SECONDARY",
"mode": "R/O",
"readReplicas": {},
"replicationLag": null,
"role": "HA",
"status": "ONLINE",
},
"mydb-6001.dc03.prod.example.com:3306": {
"address": "mydb-6001.dc03.prod.example.com:3306",
"memberRole": "SECONDARY",
"mode": "R/O",
"readReplicas": {},
"replicationLag": null,
"role": "HA",
"status": "ONLINE",
"version": "8.0.28"
},
"mydb-6005.dc03.prod.example.com:3306": {
"address": "mydb-6005.dc03.prod.example.com:3306",
"memberRole": "SECONDARY",
"mode": "R/O",
"readReplicas": {},
"replicationLag": null,
"role": "HA",
"status": "ONLINE",
"version": "8.0.27"
}
},
"topologyMode": "Single-Primary"
},
"groupInformationSourceMember": "mydb-1004.dc01.prod.example.com:3306"
}
We have noticed that when a host is out of the group (e.g. because it was being upgraded) and it is trying to rejoin, writes to the group (which go to a different host - the current primary) can stall for a short while. Given that group replication is a high availability solution for MySQL, and that the change in question does not directly involve the group primary, it is a bit counterintuitive to find that adding a host to the group (i.e. adding more redundancy to the group) causes writes to stall. Some apps may have tight deadlines to complete writes (e.g. a web application that is expected to provide a response to the visitor in less than a certain amount of time). Is this known/intended? Are there any solutions/workarounds?
How to repeat:
To reproduce it, we wrote a tiny go program that tries to write once per second, and times out writes if they take more than one second to complete. Code below:
package main
import (
"context"
"database/sql"
"fmt"
_ "github.com/go-sql-driver/mysql"
"os"
"time"
)
// WriteProbeQuery upserts a single fixed-id heartbeat row so each probe
// performs exactly one small write. It records the primary's current UTC
// timestamp (microsecond precision), the same instant in centiseconds, the
// probe's identity, and the server_id of the member that executed the write.
const WriteProbeQuery = `
INSERT INTO meta.heartbeat
(id, master_ts, master_csec, update_by, master_id)
VALUES
(1, UTC_TIMESTAMP(6), ROUND(100 * @@timestamp), 'mysql_availability_collector', @@global.server_id)
ON DUPLICATE KEY UPDATE
master_ts=UTC_TIMESTAMP(6),
master_csec=ROUND(100 * @@timestamp),
update_by=VALUES(update_by),
master_id=@@global.server_id
;`
// main runs a 1 Hz write-availability probe against the MySQL endpoint
// described by the MYSQL_USER/PASSWORD/HOSTNAME/PORT/DATABASE environment
// variables. Each probe write is given a one-second deadline; writes that
// miss the deadline (or fail outright) are reported on stderr.
func main() {
	mysqlUser := os.Getenv("MYSQL_USER")
	mysqlPassword := os.Getenv("MYSQL_PASSWORD")
	mysqlHostname := os.Getenv("MYSQL_HOSTNAME")
	mysqlPort := os.Getenv("MYSQL_PORT")
	mysqlDatabase := os.Getenv("MYSQL_DATABASE")
	connString := fmt.Sprintf("%s:%s@tcp(%s:%s)/%s", mysqlUser, mysqlPassword, mysqlHostname, mysqlPort, mysqlDatabase)

	// sql.Open only validates the DSN; the first connection is made lazily
	// on the first ExecContext below.
	db, err := sql.Open("mysql", connString)
	if err != nil {
		panic(fmt.Sprintf("Failed to open DB connections: %+v", err))
	}
	defer db.Close()

	// Probe forever: one write per second, each timed out after one second.
	for {
		func() {
			ctx, cancel := context.WithTimeout(context.Background(), time.Second)
			defer cancel() // release the deadline timer as soon as this probe ends
			// Scope err to the probe so each iteration's result is independent.
			if _, err := db.ExecContext(ctx, WriteProbeQuery); err != nil {
				fmt.Fprintf(os.Stderr, "Got error: %v\n", err)
			}
		}()
		time.Sleep(1 * time.Second)
	}
}
We set the program to run, and, at the same time, we do this in a secondary group member:
root@mydb-6005 [(none)]> stop group_replication; select sleep(60); start group_replication;
When the group replication is stopped, there seems to be no impact. However, after the 60 seconds sleep, when the host is rejoining the group, the test program prints the following:
[ 129 ] user@host-101 ~ % ➜ ./iodku;
Got error: context deadline exceeded
Got error: context deadline exceeded
Which means that the write did not complete on time. We have seen it printed up to 4 times, which means that writes have been stalling by up to 8 seconds (1 second wait time for each write + 1 second sleep time between writes, all of this times 4 messages).