Bug #70897 NDB Datanode crash after running query.
Submitted: 13 Nov 2013 16:17 Modified: 13 Nov 2014 14:35
Reporter: Martin van Wilderen Email Updates:
Status: Duplicate Impact on me:
None 
Category:MySQL Cluster: Cluster (NDB) storage engine Severity:S1 (Critical)
Version:mysql-5.6.11 ndb-7.3.2, 7.3.3 OS:Linux (CentOS release 6.4 (Final) / 2.6.32-358.6.2.el6.x86_64)
Assigned to: CPU Architecture:Any
Tags: ndbd

[13 Nov 2013 16:17] Martin van Wilderen
Description:
While running a query, one datanode will crash.
Running the query again crashes the other datanode.
We are running mySQL cluster with 2 data nodes.

How to repeat:
Use this query:
SELECT l.id, l.CreationDate AS Datum, re.EAN, re.HubID, re.PortalID, re.PortalUserID, kr.NAME AS Naam, c.CreationDate AS Claimdatum, c.role AS Claimrol, c.EndDate AS Einddatum,
                                                                              l.Statuscode, l.Errormessage, "" AS Acties, kr.UserHash, l.AttributenOK, l.Doelgroep, l.Licentiegeldig, l.ClientIP, c.ID AS ClaimID, l.Request_ID,
                                                                              COALESCE(l.KeyringLookupTime, l.VerifyClaimTime, l.CreateClaimTime) AS hasSubgrid
                                               FROM loguserflow l
                                                                              LEFT JOIN request re ON (l.Request_ID = re.id)
                                                                              LEFT JOIN `key` k ON (k.HubID = re.HubID AND k.PortalID = re.PortalID AND k.PortalUserID = re.PortalUserID)
                                                                              LEFT JOIN keyring kr ON (kr.id = k.Keyring_ID)
                                                                              LEFT JOIN claim c ON (c.Request_ID = re.ID)
                                                                            
WHERE 1 = 1
and re.HubID = "Kennisnet"
and  l.CreationDate >= "2013-10-01 00:30"
and  l.CreationDate <= "2013-11-11 00:45"
and kr.name like '%Bargboer%’
order by  l.id DESC limit 0, 100

CREATE TABLE `loguserflow` (
  `ID` int(11) NOT NULL AUTO_INCREMENT,
  `Request_ID` int(11) DEFAULT NULL,
  `Key_ID` int(11) DEFAULT NULL,
  `StatusCode` int(11) DEFAULT NULL,
  `ErrorMessage` text,
  `AttributenOK` tinyint(1) DEFAULT NULL,
  `Doelgroep` tinyint(1) DEFAULT NULL,
  `EanKomtVoorInPortaal` tinyint(1) DEFAULT NULL,
  `SamlAuthenticated` tinyint(1) DEFAULT NULL,
  `SignatureAuthenticated` tinyint(1) DEFAULT NULL,
  `ERKeyringOphalenGelukt` tinyint(1) DEFAULT NULL,
  `BinnenkomendeKeyType` enum('SCHOOL_PORTAL','DISTRIBUTOR_PORTAL','DOWNLOAD') DEFAULT NULL,
  `KeyInXSKeyring` enum('Ja','Nee','Aanvullen') DEFAULT NULL,
  `LicentieGeldig` tinyint(1) DEFAULT NULL,
  `TegoedType` enum('EBF','IBF') DEFAULT NULL,
  `Afgeboekt` enum('Tegoed','ServiceTegoed') DEFAULT NULL,
  `RedirectProtocol` enum('DTDL','Sessie','Downloadportaal','Servicetools') DEFAULT NULL,
  `RedirectURL` varchar(4096) DEFAULT NULL,
  `VerifyClaimTime` int(11) DEFAULT NULL,
  `CreateClaimTime` int(11) DEFAULT NULL,
  `KeyringLookupTime` int(11) DEFAULT NULL,
  `CreationDate` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
  `ClientIP` int(11) unsigned DEFAULT NULL,
  PRIMARY KEY (`ID`),
  KEY `FK_loguserflow_Request_ID` (`Request_ID`),
  KEY `FK_loguserflow_Key_ID` (`Key_ID`),
  KEY `IX_StatusCode` (`StatusCode`),
  KEY `IX_CreationDate` (`CreationDate`),
  KEY `IX_Statuscode_CreationDate` (`StatusCode`,`CreationDate`)
) ENGINE=ndbcluster DEFAULT CHARSET=latin1
 
CREATE TABLE `request` (
  `ID` int(11) NOT NULL AUTO_INCREMENT,
  `Dafsession_ID` int(10) unsigned DEFAULT NULL,
  `EAN` varchar(15) NOT NULL,
  `HubID` varchar(10) NOT NULL,
  `PortalID` varchar(50) DEFAULT NULL,
  `PortalUserID` varchar(50) DEFAULT NULL,
  `Attributes` varchar(4096) DEFAULT NULL,
  `CreationDate` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
  PRIMARY KEY (`ID`),
  KEY `FK_request_Dafsession_ID` (`Dafsession_ID`),
  KEY `IX_request_HubID_PortalID_PortalUserID` (`HubID`,`PortalID`,`PortalUserID`),
  KEY `IX_request_EAN` (`EAN`),
  KEY `IX_HubID_PortalID` (`HubID`,`PortalID`),
  KEY `IX_HubID_PortalID_EAN` (`HubID`,`PortalID`,`EAN`),
  KEY `IX_EAN` (`EAN`),
  KEY `IX_CreationDate` (`CreationDate`)
) ENGINE=ndbcluster  DEFAULT CHARSET=latin1
 
CREATE TABLE `key` (
  `ID` int(11) NOT NULL AUTO_INCREMENT,
  `Keyring_ID` int(11) NOT NULL,
  `HubID` varchar(50) NOT NULL,
  `PortalID` varchar(50) NOT NULL,
  `PortalUserID` varchar(50) NOT NULL,
  `CreationDate` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
  PRIMARY KEY (`ID`),
  UNIQUE KEY `IX_key_HubID_PortalID_PortalUserID` (`HubID`,`PortalID`,`PortalUserID`),
  KEY `FK_Key_Keyring_ID` (`Keyring_ID`),
  KEY `IX_PortalUserID` (`PortalUserID`)
) ENGINE=ndbcluster  DEFAULT CHARSET=latin1
 
CREATE TABLE `keyring` (
  `ID` int(11) NOT NULL AUTO_INCREMENT,
  `UserHash` varchar(64) NOT NULL,
  `Aanvullen` tinyint(1) DEFAULT NULL,
  `Name` varchar(100) DEFAULT NULL,
  `Email` varchar(254) DEFAULT NULL,
  `Role` varchar(255) DEFAULT NULL,
  `KeyringGUID` varchar(38) DEFAULT NULL COMMENT 'Meeste varianten van GUID zijn tussen 32 en 38 karakters',
  `UniekeID` varchar(255) DEFAULT NULL,
  `CreationDate` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
  PRIMARY KEY (`ID`),
  UNIQUE KEY `IX_keyring_UserHash` (`UserHash`),
  KEY `IX_KeyringGUID` (`KeyringGUID`),
  KEY `IX_Name` (`Name`),
  KEY `IX_Email` (`Email`)
) ENGINE=ndbcluster  DEFAULT CHARSET=latin1
 
CREATE TABLE `claim` (
  `ID` int(11) NOT NULL AUTO_INCREMENT,
  `Request_ID` int(11) NOT NULL,
  `Creditused_ID` int(11) NOT NULL,
  `Key_ID` int(11) NOT NULL,
  `Role` varchar(255) DEFAULT NULL,
  `EndDate` datetime DEFAULT NULL,
  `CreationDate` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
  PRIMARY KEY (`ID`),
  KEY `FK_claim_request_ID` (`Request_ID`),
  KEY `FK_claim_creditused_ID` (`Creditused_ID`),
  KEY `FK_claim_key_ID` (`Key_ID`)
) ENGINE=ndbcluster  DEFAULT CHARSET=latin1

Crash report in datanode log:
 
Time: Monday 11 November 2013 - 20:15:38
Status: Temporary error, restart node
Message: Internal program error (failed ndbrequire) (Internal error, programming error or missing error message, please report a bug)
Error: 2341
Error data: DbaccMain.cpp
Error object: DBACC (Line: 1348) 0x00000002
Program: ndbd
Pid: 18653
Version: mysql-5.6.11 ndb-7.3.2
Trace: /var/lib/mysql/ndb_3_trace.log.12 [t1..t1]
***EOM***
 
Query error:
Error Code : 1297
Got temporary error 4010 'Node failure caused abort of transaction' from NDBCLUSTER

Suggested fix:
Run query without crashing a ndb datanode.
[19 Nov 2013 10:47] MySQL Verification Team
Hello Martin,

Thank you for the report.
I can not repeat described behavior with dummy data.
Could you please provide repeatable data? Please mark as private when you upload the data.

Also, Could you please attach the cluster logs? Preferably using the ndb_error_reporter utility:

  http://dev.mysql.com/doc/refman/5.5/en/mysql-cluster-programs-ndb-error-reporter.html

Thanks,
Umesh
[19 Nov 2013 11:03] Martin van Wilderen
Hi Umesh,

> Could you please provide repeatable data?
Would you like a full dump of the database? This is a 5.7 GB SQL dump.

> Also, Could you please attach the cluster logs? Preferably using the ndb_error_reporter utility
Done
[29 Nov 2013 20:00] MySQL Verification Team
Hello Martin,

Thank you for the test case.
I've hit this issue only 1/20 times so for on 7.3.3 when simulating a load with mysqlslap ( just running the query never triggered any data node crash).

// How to repeat

- setup cluster(1mgm, 2 data, 1 api node) and import provided data

- Simulate load with mysqlslap

bin/mysqlslap --no-defaults --create-schema=test --user=root --delimiter=";"  --query=/tmp/query.sql --concurrency=50 --iterations=200

// query used

[root@cluster-repo mysql-cluster-gpl-7.3.3]# more /tmp/query.sql
SELECT l.id, l.CreationDate AS Datum, re.EAN, re.HubID, re.PortalID, re.PortalUserID, kr.NAME AS Naam, c.CreationDate AS Claimdatum, c.role AS Claimrol, c.EndDate AS Einddatum,l.Statuscode, l.Errormessage, "" AS Acties, kr.UserHash, l.AttributenOK, l.Doelgroep, l.Licentiegeldig, l.ClientIP, c.ID AS ClaimID, l.Request_ID,COALESCE(l.KeyringLookupTime, l.VerifyClaimTime, l.CreateClaimTime) AS hasSubgrid FROM loguserflow l LEFT JOIN request re ON (l.Request_ID = re.id) LEFT JOIN `key` k ON (k.HubID = re.HubID AND k.PortalID = re.PortalID AND k.PortalUserID = re.PortalUserID) LEFT JOIN keyring kr ON (kr.id = k.Keyring_ID) LEFT JOIN claim c ON (c.Request_ID = re.ID) WHERE 1 = 1 and re.HubID = "Kennisnet" and  l.CreationDate >= "2013-10-01 00:30" and  l.CreationDate <= "2013-11-11 00:45" and kr.name like '%Bargboer%' order by  l.id DESC limit 0, 100;

Thanks,
Umesh
[20 Feb 2014 9:07] Martin van Wilderen
Any updates on the fix of this bug?
[13 Nov 2014 14:01] Mikael Ronström
It crashes due to an error in the function xfrm_key. So probably the information received in the signal has some problems in it. The signal comes from DBSPJ, so this means that it is very likely that something has
occurred with the signal data while in transit. Quite likely an error in the SPJ part.
[13 Nov 2014 14:34] Ole John Aske
Posted by developer:
 
This seems to be a variant of bug#17845161 

ERROR 1296: GOT ERROR 290 'CORRUPT KEY IN TC, UNABLE TO XFRM'; NDB_JOIN_PUSHDOWN

This issue has been fixed in 7.2.15, 7.3.4 and 7.4.x