Description:
There is a potential race between a thread receiving data with TransporterRegistry::performReceive, and another thread (async) initiating disconnect of the transporter by calling Transporter::doDisconnect(). The later method will call TCP_Transporter::disconnectImpl() which
clears the receive buffer. There are no concurrency logic which protects against this happening while another thread is doing ::performReceive() on the same transporter, and it might then see an inconsistent receivebuffer.
We have seen this happening with the following call stacks:
#0 0x00007ffff6c968a5 in raise () from /lib64/libc.so.6
#1 0x00007ffff6c98085 in abort () from /lib64/libc.so.6
#2 0x00007ffff6c8fa1e in __assert_fail_base () from /lib64/libc.so.6
#3 0x00007ffff6c8fae0 in __assert_fail () from /lib64/libc.so.6
#4 0x0000000000478d9c in TCP_Transporter::updateReceiveDataPtr (this=0xbfbb50, bytesRead=80)
at /net/fimafeng09/export/home/tmp/oleja/mysql/mysql-5.6-cluster-7.4/storage/ndb/src/common/transporter/TCP_Transporter.hpp:167
#5 0x0000000000471e17 in TransporterRegistry::performReceive (this=0xbd7c50, recvdata=...)
at /net/fimafeng09/export/home/tmp/oleja/mysql/mysql-5.6-cluster-7.4/storage/ndb/src/common/transporter/TransporterRegistry.cpp:1412
#6 0x0000000000462a1d in TransporterRegistry::performReceive (this=0xbd7c50)
at /net/fimafeng09/export/home/tmp/oleja/mysql/mysql-5.6-cluster-7.4/storage/ndb/include/transporter/TransporterRegistry.hpp:634
#7 0x000000000045f031 in TransporterFacade::external_poll (this=0xc33050, wait_time=1000)
at /net/fimafeng09/export/home/tmp/oleja/mysql/mysql-5.6-cluster-7.4/storage/ndb/src/ndbapi/TransporterFacade.cpp:926
#8 0x000000000045f12a in TransporterFacade::do_poll (this=0xc33050, clnt=0xbeac00, wait_time=1000, is_poll_owner=false,
stay_poll_owner=false)
at /net/fimafeng09/export/home/tmp/oleja/mysql/mysql-5.6-cluster-7.4/storage/ndb/src/ndbapi/TransporterFacade.cpp:2371
#9 0x00000000004642d8 in trp_client::do_poll (this=0xbeac00, to=1000)
at /net/fimafeng09/export/home/tmp/oleja/mysql/mysql-5.6-cluster-7.4/storage/ndb/src/ndbapi/trp_client.cpp:125
#10 0x0000000000467a37 in SignalSender::waitFor<WaitForAny> (this=0xbeac00, timeOutMillis=1000, t=...)
at /net/fimafeng09/export/home/tmp/oleja/mysql/mysql-5.6-cluster-7.4/storage/ndb/src/ndbapi/SignalSender.cpp:260
#11 0x0000000000466080 in SignalSender::waitFor (this=0xbeac00, timeOutMillis=1000)
at /net/fimafeng09/export/home/tmp/oleja/mysql/mysql-5.6-cluster-7.4/storage/ndb/src/ndbapi/SignalSender.cpp:298
#12 0x0000000000439fb0 in ConfigManager::run (this=0xbbe5b0)
at /net/fimafeng09/export/home/tmp/oleja/mysql/mysql-5.6-cluster-7.4/storage/ndb/src/mgmsrv/ConfigManager.cpp:1922
#13 0x000000000041ed54 in MgmtThread::run_C (t=0xbbe5b0)
at /net/fimafeng09/export/home/tmp/oleja/mysql/mysql-5.6-cluster-7.4/storage/ndb/src/mgmsrv/MgmtThread.hpp:32
#14 0x000000000049b5dc in ndb_thread_wrapper (_ss=0xc49ab0)
at /net/fimafeng09/export/home/tmp/oleja/mysql/mysql-5.6-cluster-7.4/storage/ndb/src/common/portlib/NdbThread.c:202
#15 0x00007ffff7bc7851 in start_thread () from /lib64/libpthread.so.0
#16 0x00007ffff6d4b67d in clone () from /lib64/libc.so.6
========== and ===============
0 0x00007ffff6c968a5 in raise () from /lib64/libc.so.6
#1 0x00007ffff6c98085 in abort () from /lib64/libc.so.6
#2 0x00007ffff6c8fa1e in __assert_fail_base () from /lib64/libc.so.6
#3 0x00007ffff6c8fae0 in __assert_fail () from /lib64/libc.so.6
#4 0x0000000000478e60 in TCP_Transporter::updateReceiveDataPtr (this=0xc91c40, bytesRead=276)
at /net/fimafeng09/export/home/tmp/oleja/mysql/mysql-5.6-cluster-7.4/storage/ndb/src/common/transporter/TCP_Transporter.hpp:167
#5 0x0000000000471ea3 in TransporterRegistry::performReceive (this=0xbd7c50, recvdata=...)
at /net/fimafeng09/export/home/tmp/oleja/mysql/mysql-5.6-cluster-7.4/storage/ndb/src/common/transporter/TransporterRegistry.cpp:1416
#6 0x0000000000462a1d in TransporterRegistry::performReceive (this=0xbd7c50)
at /net/fimafeng09/export/home/tmp/oleja/mysql/mysql-5.6-cluster-7.4/storage/ndb/include/transporter/TransporterRegistry.hpp:634
#7 0x000000000045f031 in TransporterFacade::external_poll (this=0xc33050, wait_time=0)
at /net/fimafeng09/export/home/tmp/oleja/mysql/mysql-5.6-cluster-7.4/storage/ndb/src/ndbapi/TransporterFacade.cpp:926
#8 0x000000000045f12a in TransporterFacade::do_poll (this=0xc33050, clnt=0x7ffff7e54320, wait_time=0, is_poll_owner=false,
stay_poll_owner=false)
at /net/fimafeng09/export/home/tmp/oleja/mysql/mysql-5.6-cluster-7.4/storage/ndb/src/ndbapi/TransporterFacade.cpp:2371
#9 0x00000000004642d8 in trp_client::do_poll (this=0x7ffff7e54320, to=0)
at /net/fimafeng09/export/home/tmp/oleja/mysql/mysql-5.6-cluster-7.4/storage/ndb/src/ndbapi/trp_client.cpp:125
#10 0x000000000045e012 in TransporterFacade::close_clnt (this=0xc33050, clnt=0x7ffff7e54320)
at /net/fimafeng09/export/home/tmp/oleja/mysql/mysql-5.6-cluster-7.4/storage/ndb/src/ndbapi/TransporterFacade.cpp:1291
#11 0x0000000000464696 in trp_client::close (this=0x7ffff7e54320)
at /net/fimafeng09/export/home/tmp/oleja/mysql/mysql-5.6-cluster-7.4/storage/ndb/src/ndbapi/trp_client.cpp:101
#12 0x000000000046627a in SignalSender::~SignalSender (this=0x7ffff7e54320, __in_chrg=<value optimized out>)
at /net/fimafeng09/export/home/tmp/oleja/mysql/mysql-5.6-cluster-7.4/storage/ndb/src/ndbapi/SignalSender.cpp:139
#13 0x0000000000419619 in MgmtSrvr::sendVersionReq (this=0xc13710, v_nodeId=1, version=@0x7ffff7e5480c,
mysql_version=@0x7ffff7e54808, address=0x7ffff7e54948)
at /net/fimafeng09/export/home/tmp/oleja/mysql/mysql-5.6-cluster-7.4/storage/ndb/src/mgmsrv/MgmtSrvr.cpp:1151
#14 0x000000000041971c in MgmtSrvr::status_mgmd (this=0xc13710, node_id=1, node_status=@0x7ffff7e54970, version=@0x7ffff7e54968,
mysql_version=@0x7ffff7e54964, address=0x7ffff7e54948)
at /net/fimafeng09/export/home/tmp/oleja/mysql/mysql-5.6-cluster-7.4/storage/ndb/src/mgmsrv/MgmtSrvr.cpp:2093
#15 0x0000000000419bfd in MgmtSrvr::status (this=0xc13710, nodeId=1, _status=0x7ffff7e54970, version=0x7ffff7e54968,
mysql_version=0x7ffff7e54964, _phase=0x7ffff7e5496c, _system=0x7ffff7e54957, dynamic=0x7ffff7e54960,
nodegroup=0x7ffff7e5495c, connectCount=0x7ffff7e54958, address=0x7ffff7e54948)
at /net/fimafeng09/export/home/tmp/oleja/mysql/mysql-5.6-cluster-7.4/storage/ndb/src/mgmsrv/MgmtSrvr.cpp:2163
#16 0x000000000042d631 in printNodeStatus (output=0x7fffd40039e0, mgmsrv=..., type=NDB_MGM_NODE_TYPE_MGM)
at /net/fimafeng09/export/home/tmp/oleja/mysql/mysql-5.6-cluster-7.4/storage/ndb/src/mgmsrv/Services.cpp:1009
#17 0x000000000042dab6 in MgmApiSession::getStatus (this=0x7fffd4003530, args=...)
at /net/fimafeng09/export/home/tmp/oleja/mysql/mysql-5.6-cluster-7.4/storage/ndb/src/mgmsrv/Services.cpp:1066
#18 0x00000000004333dd in Parser<MgmApiSession>::run (this=0x7fffd4003a30, ctx=..., t=..., stop=0x0)
at /net/fimafeng09/export/home/tmp/oleja/mysql/mysql-5.6-cluster-7.4/storage/ndb/include/util/Parser.hpp:192
#19 0x000000000042f413 in MgmApiSession::runSession (this=0x7fffd4003530)
at /net/fimafeng09/export/home/tmp/oleja/mysql/mysql-5.6-cluster-7.4/storage/ndb/src/mgmsrv/Services.cpp:387
#20 0x00000000004aced6 in sessionThread_C (_sc=0x7fffd4003530)
at /net/fimafeng09/export/home/tmp/oleja/mysql/mysql-5.6-cluster-7.4/storage/ndb/src/common/util/SocketServer.cpp:398
#21 0x000000000049b784 in ndb_thread_wrapper (_ss=0x7fffd4003ac0)
at /net/fimafeng09/export/home/tmp/oleja/mysql/mysql-5.6-cluster-7.4/storage/ndb/src/common/portlib/NdbThread.c:202
#22 0x00007ffff7bc7851 in start_thread () from /lib64/libpthread.so.0
#23 0x00007ffff6d4b67d in clone () from /lib64/libc.so.6
How to repeat:
This happens at rare occations while running the AutoTest: 'testDict -n SchemaTrans'
Suggested fix:
Move reset of the Transporter buffers away from the ::doDisconnect() call path, and into a place where we know that there cant be any concurrent ::performReceive() on the same transporter.
This should either be done as a part of ::report_disconnect(), which brings the Transporter into the DISCONNECTED state, (synched with receive), or done while the Transporter is known to already be DISCONNECTED. (Can't be any concurrent receive in this state)