Description:
I have reproduced this twice now.
With a four data node cluster, I should be able to lose one data node and continue to process: with NoOfReplicas=2 and four data nodes there are two node groups, so losing a single data node still leaves one live replica in each node group. Instead, the node elected to take over as master (I believe) crashes.
cluster.log
2005-05-31 23:26:39 [MgmSrvr] INFO -- Node 4: Communication to Node 3 closed
2005-05-31 23:26:39 [MgmSrvr] ALERT -- Node 2: Arbitration check won - node group majority
2005-05-31 23:26:39 [MgmSrvr] INFO -- Node 2: President restarts arbitration thread [state=6]
2005-05-31 23:26:48 [MgmSrvr] ALERT -- Node 5: Node 2 Disconnected
2005-05-31 23:26:48 [MgmSrvr] ALERT -- Node 4: Node 2 Disconnected
2005-05-31 23:26:48 [MgmSrvr] INFO -- Node 4: Communication to Node 2 closed
2005-05-31 23:26:48 [MgmSrvr] INFO -- Node 5: Communication to Node 2 closed
2005-05-31 23:26:48 [MgmSrvr] INFO -- Node 1: Node 2 Connected
2005-05-31 23:26:49 [MgmSrvr] INFO -- Node 1: Node 4 Connected
2005-05-31 23:26:50 [MgmSrvr] INFO -- Node 1: Node 5 Connected
2005-05-31 23:27:24 [MgmSrvr] INFO -- Mgmt server state: nodeid 13 freed, m_reserved_nodes 00000000000011c2.
2005-05-31 23:27:24 [MgmSrvr] INFO -- Mgmt server state: nodeid 12 freed, m_reserved_nodes 00000000000001c2.
Node 2 error log
Date/Time: Tuesday 31 May 2005 - 23:26:48
Type of error: assert
Message: Assertion, probably a programming error
Fault ID: 2301
Problem data: ArrayPool<T>::getPtr
Object of reference: ../../../../../../storage/ndb/src/kernel/vm/ArrayPool.hpp line: 350 (block: SUMA)
ProgramName: ../libexec/ndbd
ProcessID: 11762
TraceFile: ./ndb_2_trace.log.2
Version 5.1.0 (rowrepl_drop1)
***EOM***
Stack trace #1 (core.11762, Node 2)
#0 0x40106da1 in kill () from /lib/libc.so.6
#1 0x40037f4a in pthread_kill () from /lib/libpthread.so.0
#2 0x400382f9 in raise () from /lib/libpthread.so.0
#3 0x40106b7a in raise () from /lib/libc.so.6
#4 0x40107d95 in abort () from /lib/libc.so.6
#5 0x081e749f in NdbShutdown (type=136503103, restartType=NRT_Default) at Emulator.cpp:233
#6 0x081ec4bd in ErrorReporter::handleAssert(char const*, char const*, int) (
message=0x8229a8f "ArrayPool<T>::getPtr",
file=0x8225fe0 "../../../../../../storage/ndb/src/kernel/vm/ArrayPool.hpp", line=350)
at ErrorReporter.cpp:173
#7 0x081cf525 in Suma::resend_bucket(Signal*, unsigned, unsigned, unsigned, unsigned) (
this=0x8484cd8, signal=0x826d280, buck=1, min_gci=5264, pos=126082, last_gci=5264)
at ArrayPool.hpp:350
#8 0x081c6065 in Suma::execCONTINUEB(Signal*) (this=0x8484cd8, signal=0x6) at Suma.cpp:449
#9 0x080c170b in SimulatedBlock::executeFunction(unsigned short, Signal*) (this=0x8484cd8,
gsn=164, signal=0x826d280) at SimulatedBlock.hpp:530
#10 0x081e45a2 in FastScheduler::doJob() (this=0x8276b60) at FastScheduler.cpp:138
#11 0x081e58cb in ThreadConfig::ipControlLoop() (this=0x827e0f0) at ThreadConfig.cpp:175
#12 0x0809e8fc in main (argc=3, argv=0x827e0d8) at main.cpp:213
#13 0x400f562d in __libc_start_main () from /lib/libc.so.6
(gdb) frame 9
#9 0x080c170b in SimulatedBlock::executeFunction(unsigned short, Signal*) (this=0x8484cd8,
gsn=164, signal=0x826d280) at SimulatedBlock.hpp:530
530 SimulatedBlock.hpp: No such file or directory.
in SimulatedBlock.hpp
(gdb) down
#8 0x081c6065 in Suma::execCONTINUEB(Signal*) (this=0x8484cd8, signal=0x6) at Suma.cpp:449
449 resend_bucket(signal,
(gdb) l
444 switch(type){
445 case SumaContinueB::RELEASE_GCI:
446 release_gci(signal, signal->theData[1], signal->theData[2]);
447 return;
448 case SumaContinueB::RESEND_BUCKET:
449 resend_bucket(signal,
450 signal->theData[1],
451 signal->theData[2],
452 signal->theData[3],
453 signal->theData[4]);
(gdb) down
#7 0x081cf525 in Suma::resend_bucket(Signal*, unsigned, unsigned, unsigned, unsigned) (
this=0x8484cd8, signal=0x826d280, buck=1, min_gci=5264, pos=126082, last_gci=5264)
at ArrayPool.hpp:350
350 ArrayPool.hpp: No such file or directory.
in ArrayPool.hpp
(gdb) l
345 in ArrayPool.hpp
(gdb) down
#6 0x081ec4bd in ErrorReporter::handleAssert(char const*, char const*, int) (
message=0x8229a8f "ArrayPool<T>::getPtr",
file=0x8225fe0 "../../../../../../storage/ndb/src/kernel/vm/ArrayPool.hpp", line=350)
at ErrorReporter.cpp:173
173 NdbShutdown(NST_ErrorHandler);
(gdb) l
168 file, line, blockName);
169 #endif
170 WriteMessage(assert, ERR_ERROR_PRGERR, message, refMessage,
171 theEmulatedJamIndex, theEmulatedJam);
172
173 NdbShutdown(NST_ErrorHandler);
174 }
175
176 void
177 ErrorReporter::handleThreadAssert(const char* message,
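For context on frame #7: the node goes down because Suma::resend_bucket calls ArrayPool<T>::getPtr with an out-of-range index (the pos=126082 argument in the frame is the likely suspect), and the assert handler escalates this to a full node shutdown. The snippet below is only an illustrative sketch of that kind of bounds check, written for this report; it is not the actual ArrayPool code in storage/ndb/src/kernel/vm.

// Illustrative sketch only, not the actual NDB source: the general shape of
// a pooled-object accessor whose bounds check aborts the process, which is
// what the "ArrayPool<T>::getPtr" assertion (fault ID 2301) corresponds to.
#include <cassert>
#include <cstdint>
#include <vector>

template <typename T>
struct PoolSketch {
  std::vector<T> slots;                 // stand-in for the preallocated pool

  T* getPtr(uint32_t i) {
    // A bad index is treated as a programming error and is fatal for the
    // whole data node, matching "Assertion, probably a programming error".
    assert(i < slots.size() && "ArrayPool<T>::getPtr");
    return &slots[i];
  }
};

int main() {
  PoolSketch<int> pool;
  pool.slots.resize(16);
  pool.getPtr(1);         // in range: fine
  // pool.getPtr(126082); // out of range: would abort, as in frame #7 above
  return 0;
}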
Stack trace #2 (core.11798, Node 4)
#0 0x40106da1 in kill () from /lib/libc.so.6
#1 0x40037f4a in pthread_kill () from /lib/libpthread.so.0
#2 0x400382f9 in raise () from /lib/libpthread.so.0
#3 0x40106b7a in raise () from /lib/libc.so.6
#4 0x40107d95 in abort () from /lib/libc.so.6
#5 0x081e749f in NdbShutdown (type=136503103, restartType=NRT_Default) at Emulator.cpp:233
#6 0x081e0d5f in SimulatedBlock::progError(int, int, char const*) const (this=0x1,
line=3788, err_code=2305, extra=0x822c0c0 "Arbitrator decided to shutdown this node")
at SimulatedBlock.cpp:730
#7 0x081a4e6e in Qmgr::stateArbitCrash(Signal*) (this=0x8455b40, signal=0x826d280)
at QmgrMain.cpp:3788
#8 0x081a38d1 in Qmgr::runArbitThread(Signal*) (this=0x84572ac, signal=0x8455b40)
at QmgrMain.cpp:3321
#9 0x081a3368 in Qmgr::handleArbitCheck(Signal*) (this=0x8455b40, signal=0x826d280)
at QmgrMain.cpp:3262
#10 0x081a0b6d in Qmgr::execPREP_FAILCONF(Signal*) (this=0x8455b40, signal=0x826d280)
at QmgrMain.cpp:2324
#11 0x080c170b in SimulatedBlock::executeFunction(unsigned short, Signal*) (this=0x8455b40,
gsn=0, signal=0x826d280) at SimulatedBlock.hpp:530
#12 0x081e45a2 in FastScheduler::doJob() (this=0x8276b60) at FastScheduler.cpp:138
#13 0x081e58cb in ThreadConfig::ipControlLoop() (this=0x827e0f0) at ThreadConfig.cpp:175
#14 0x0809e8fc in main (argc=3, argv=0x827e0d8) at main.cpp:213
#15 0x400f562d in __libc_start_main () from /lib/libc.so.6
(gdb) frame 11
#11 0x080c170b in SimulatedBlock::executeFunction(unsigned short, Signal*) (this=0x8455b40,
gsn=0, signal=0x826d280) at SimulatedBlock.hpp:530
530 SimulatedBlock.hpp: No such file or directory.
in SimulatedBlock.hpp
(gdb) l
525 in SimulatedBlock.hpp
(gdb) down
#10 0x081a0b6d in Qmgr::execPREP_FAILCONF(Signal*) (this=0x8455b40, signal=0x826d280)
at QmgrMain.cpp:2324
2324 handleArbitCheck(signal);
(gdb) l
2319 * We're performing a system shutdown,
2320 * don't let artibtrator shut us down
2321 */
2322 return;
2323 }
2324 handleArbitCheck(signal);
2325 return;
2326 }//Qmgr::execPREP_FAILCONF()
2327
2328 void
(gdb) down
#9 0x081a3368 in Qmgr::handleArbitCheck(Signal*) (this=0x8455b40, signal=0x826d280)
at QmgrMain.cpp:3262
3262 runArbitThread(signal);
(gdb) l
3257 jam();
3258 ndbrequire(cpresident == getOwnNodeId());
3259 arbitRec.code = ArbitCode::ThreadStart;
3260 reportArbitEvent(signal, NDB_LE_ArbitState);
3261 signal->theData[1] = ++arbitRec.thread;
3262 runArbitThread(signal);
3263 }
3264
3265 /**
3266 * Handle arbitration thread. The initial thread normally ends
(gdb) down
#8 0x081a38d1 in Qmgr::runArbitThread(Signal*) (this=0x84572ac, signal=0x8455b40)
at QmgrMain.cpp:3321
3321 stateArbitCrash(signal);
(gdb) l
3316 jam();
3317 stateArbitChoose(signal);
3318 break;
3319 case ARBIT_CRASH:
3320 jam();
3321 stateArbitCrash(signal);
3322 break;
3323 default:
3324 ndbrequire(false);
3325 break;
(gdb)
#7 0x081a4e6e in Qmgr::stateArbitCrash(Signal*) (this=0x8455b40, signal=0x826d280)
at QmgrMain.cpp:3788
3788 progError(__LINE__, ERR_ARBIT_SHUTDOWN, "Arbitrator decided to shutdown this node");
(gdb) l
3783 }
3784 #ifdef ndb_arbit_crash_wait_for_event_report_to_get_out
3785 if (! (arbitRec.getTimediff() > getArbitTimeout()))
3786 return;
3787 #endif
3788 progError(__LINE__, ERR_ARBIT_SHUTDOWN, "Arbitrator decided to shutdown this node");
3789 }
3790
3791 /**
3792 * Arbitrator may inform us that it will exit. This lets us
Stack trace #3 (core.4466, Node 5)
Same as stack trace #2.
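For reference, the arbitration outcome in the log ("Arbitration check won - node group majority" for node 2, followed by "Arbitrator decided to shutdown this node" on nodes 4 and 5) follows the documented node-group survival rule. The sketch below is a simplified illustration of that rule written for this report, not the actual Qmgr code, and it assumes node groups {2,3} and {4,5} for this configuration.

// Simplified illustration of the documented survival rule: a partition of
// data nodes needs at least one live node in every node group; a partition
// that holds a complete node group wins outright ("node group majority");
// otherwise the arbitrator decides.  Not the actual Qmgr implementation.
#include <cstdint>
#include <map>
#include <set>

enum class Outcome { WinOutright, AskArbitrator, MustShutdown };

Outcome arbitrationCheck(const std::map<uint32_t, uint32_t>& nodeToGroup,
                         const std::set<uint32_t>& aliveNodes)
{
  std::map<uint32_t, uint32_t> total, alive;   // node counts per node group
  for (const auto& [node, group] : nodeToGroup) {
    ++total[group];
    if (aliveNodes.count(node))
      ++alive[group];
  }
  bool everyGroupRepresented = true;
  bool someGroupComplete = false;
  for (const auto& [group, n] : total) {
    if (alive[group] == 0) everyGroupRepresented = false;
    if (alive[group] == n) someGroupComplete = true;
  }
  if (!everyGroupRepresented) return Outcome::MustShutdown;
  if (someGroupComplete)      return Outcome::WinOutright;
  return Outcome::AskArbitrator;
}

int main() {
  // Assumed layout for this config.ini: group 0 = {2,3}, group 1 = {4,5}.
  std::map<uint32_t, uint32_t> groups = {{2,0}, {3,0}, {4,1}, {5,1}};
  arbitrationCheck(groups, {2, 4, 5});  // node 3 killed: group 1 complete -> WinOutright
  arbitrationCheck(groups, {4, 5});     // 2 and 3 both gone: group 0 empty -> MustShutdown
  return 0;
}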
How to repeat:
Set up a cluster as follows:
System1: ndb_mgmd, ndbd, ndbd, api, api, api
System2: ndbd, ndbd
System3: mysqld, mysqld (I was using the new instance manager to start and stop them)
Start the bank program running against a single database and let it go for a little bit (I have not tried this without letting the program run for a while). Find which data node is the master (TC) node (I have not tried this with any of the other nodes). Run kill -9 on that data node's ndbd process and the cluster will come down. (A minimal stand-in for the load generator is sketched below.)
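For the load, any NDB API client that keeps committing transactions while the node is killed should do; what follows is a minimal, hypothetical stand-in sketched for this report, not the bank program itself. The connectstring, database name, table name "account", and column names "id"/"balance" are assumptions for illustration.

// Hypothetical stand-in for the bank load: keep committing small update
// transactions so traffic is in flight when the master data node is killed.
// Connectstring, database, table, and column names are assumptions.
#include <NdbApi.hpp>
#include <cstdio>

int main()
{
  ndb_init();
  Ndb_cluster_connection conn("ndb08:14000");
  if (conn.connect(4, 5, 1) != 0)          // 4 retries, 5 s apart, verbose
    return 1;
  conn.wait_until_ready(30, 0);

  Ndb ndb(&conn, "test");
  ndb.init();
  const NdbDictionary::Table* tab = ndb.getDictionary()->getTable("account");
  if (tab == 0)
    return 1;

  for (;;) {
    NdbTransaction* trans = ndb.startTransaction();
    if (trans == 0)
      continue;                            // temporary error, just retry
    NdbOperation* op = trans->getNdbOperation(tab);
    if (op == 0) {
      ndb.closeTransaction(trans);
      continue;
    }
    op->updateTuple();
    op->equal("id", 1);
    op->setValue("balance", 100);
    if (trans->execute(NdbTransaction::Commit) != 0)
      printf("execute failed: %s\n", trans->getNdbError().message);
    ndb.closeTransaction(trans);
  }
  ndb_end(0);
  return 0;
}

While this runs, the ndb_mgm client's SHOW command lists the data nodes and marks the current master, whose ndbd process can then be killed with kill -9.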
config.ini file:
[DB DEFAULT]
NoOfReplicas: 2
IndexMemory: 100M
DataMemory: 300M
BackupMemory: 64M
MaxNoOfConcurrentScans: 100
DataDir: .
FileSystemPath: /home/ndbdev/jmiller/builds/run
[MGM DEFAULT]
PortNumber: 14000
ArbitrationRank: 1
DataDir: .
[ndb_mgmd]
Id: 1
HostName: ndb08
[ndbd]
Id: 2
HostName: ndb08
[ndbd]
Id: 3
HostName: ndb09
[ndbd]
Id: 4
HostName: ndb08
[ndbd]
Id: 5
HostName: ndb09
[api]
Id: 6
HostName: ndb08
[api]
Id: 7
HostName: ndb08
[api]
Id: 8
HostName: ndb08
[api]
Id: 9
HostName: ndb09
[api]
Id: 10
HostName: ndb09
[api]
Id: 11
HostName: ndb09
[mysqld]
Id: 12
HostName: ndb10
[mysqld]
Id: 13
HostName: ndb10
[api]
Id: 14
HostName: ndb10
[api]
Id: 15
HostName: ndb10
Suggested fix:
The cluster should handle the loss of one node.