Description:
When an NDB storage daemon with a lower node id starts (here id=3, while id=4 is already running as master), all mysqld API nodes are shown as disconnected from the cluster by ndb_mgm. MySQL client access remains "down" for a few minutes, until the starting NDB storage node has come up. In the following case, the mysqld API nodes refused to service any MySQL clients until NDB storage node id=3 had finished starting.
This is a 3-node NDBCLUSTER setup. Two of the nodes act as NDB storage nodes and also run mysqld API daemons. 10.117.0.0/24 is the cluster interconnect on a separate private network. The cluster configuration and a console session showing the problem follow.
[NDBD DEFAULT]
NoOfReplicas=2 # Number of replicas (max 4)
DataMemory=155M # ndb_mgm> all dump 1000
IndexMemory=15M
MaxNoOfTables=800 # max 1600
MaxNoOfOrderedIndexes=1024 # max 2048
MaxNoOfUniqueHashIndexes=1024 # max 2048
MaxNoOfTriggers=384 # max 768
MaxNoOfAttributes=8000 # max 16000
# Help guarantee the cluster's real-time characteristics
#LockPagesInMainMemory # Boolean
StartFailureTimeout=600000 # milliseconds (600 seconds)
HeartbeatIntervalDbDb=20000 # milliseconds (20 seconds); set greater than the CLMS timeout
MaxNoOfOpenFiles=80 # Default:40 Min:20
[TCP DEFAULT]
[NDB_MGMD]
Id=1
hostname=10.0.0.1 # Hostname or IP address of MGM node
datadir=/var/lib/mysql-cluster # Directory for MGM node logfiles, process output files, and the daemon's pid file
ArbitrationRank=1 # 0 never, 1 high, 2 low
[MYSQLD]
Id=5
hostname=10.0.0.13 # Hostname or IP address
# ArbitrationRank 2
[MYSQLD]
Id=6
hostname=10.0.0.7 # Hostname or IP address
# ArbitrationRank 2
[MYSQLD]
Id=7 # Backup/Restore
hostname=10.0.0.13 # Hostname or IP address
# ArbitrationRank 0
[MYSQLD]
Id=8 # Backup/Restore
hostname=10.0.0.7 # Hostname or IP address
# ArbitrationRank 0
[NDBD]
Id=3
hostname=10.0.0.13
datadir=/cluster/node1/var/lib/mysql-cluster/data
[NDBD]
Id=4
hostname=10.0.0.7
datadir=/cluster/node2/var/lib/mysql-cluster/data
[TCP]
NodeId1=3
NodeId2=4
Hostname1=10.117.0.1
Hostname2=10.117.0.2
# SendBufferMemory 768KB
[root@node2 sysconfig]# ndb_mgm -e show
Connected to Management Server at: 10.0.0.1:1186
Cluster Configuration
---------------------
[ndbd(NDB)] 2 node(s)
id=3 (not connected, accepting connect from 10.0.0.13)
id=4 @10.0.0.7 (Version: 4.1.14, Nodegroup: 0, Master)
[ndb_mgmd(MGM)] 1 node(s)
id=1 @10.0.0.1 (Version: 4.1.14)
[mysqld(API)] 4 node(s)
id=5 @10.0.0.13 (Version: 4.1.14)
id=6 @10.0.0.7 (Version: 4.1.14)
id=7 (not connected, accepting connect from 10.0.0.13)
id=8 (not connected, accepting connect from 10.0.0.7)
[root@node2 sysconfig]# onnode 1 ndbd
Error handler restarting system
Error handler shutdown completed - exiting
[root@node2 sysconfig]# ndb_mgm -e "purge stale sessions"
Connected to Management Server at: 10.0.0.1:1186
Purged sessions with node id's: 3
[root@node2 sysconfig]# onnode 1 ndbd
[root@node2 sysconfig]# ndb_mgm -e show
Connected to Management Server at: 10.0.0.1:1186
Cluster Configuration
---------------------
[ndbd(NDB)] 2 node(s)
id=3 @10.0.0.13 (Version: 4.1.14, starting, Nodegroup: 0)
id=4 @10.0.0.7 (Version: 4.1.14, Nodegroup: 0, Master)
[ndb_mgmd(MGM)] 1 node(s)
id=1 @10.0.0.1 (Version: 4.1.14)
[mysqld(API)] 4 node(s)
id=5 (not connected, accepting connect from 10.0.0.13)
id=6 (not connected, accepting connect from 10.0.0.7)
id=7 (not connected, accepting connect from 10.0.0.13)
id=8 (not connected, accepting connect from 10.0.0.7)
========== after a few minutes ==========
[root@node2 sysconfig]# ndb_mgm -e show
Connected to Management Server at: 10.0.0.1:1186
Cluster Configuration
---------------------
[ndbd(NDB)] 2 node(s)
id=3 @10.0.0.13 (Version: 4.1.14, Nodegroup: 0)
id=4 @10.0.0.7 (Version: 4.1.14, Nodegroup: 0, Master)
[ndb_mgmd(MGM)] 1 node(s)
id=1 @10.0.0.1 (Version: 4.1.14)
[mysqld(API)] 4 node(s)
id=5 @10.0.0.13 (Version: 4.1.14)
id=6 @10.0.0.7 (Version: 4.1.14)
id=7 (not connected, accepting connect from 10.0.0.13)
id=8 (not connected, accepting connect from 10.0.0.7)
How to repeat:
See description.
Suggested fix:
No workaround available.