Bug #51054 mysqld segfault after binlog enabled
Submitted: 10 Feb 2010 10:15 Modified: 9 Sep 2010 17:06
Reporter: Pascal Schmiel Email Updates:
Status: No Feedback Impact on me:
None 
Category:MySQL Server: Replication Severity:S1 (Critical)
Version:5.1.37, 5.5.x, next-mr-merge OS:Linux (RHEL5)
Assigned to: Assigned Account CPU Architecture:Any

[10 Feb 2010 10:15] Pascal Schmiel
Description:
We're running a high-traffic Webservice. There were no problems in the past but since we have enabled the binlog and changed some binlog related settings mysqld is segfaulting frequently.
The new settings are:

log-bin         = /var/lib/mysql/binlog/mysql-bin
server-id       = 10
sync_binlog     = 1
log_slave_updates
innodb_flush_log_at_trx_commit  = 1
binlog-format = MIXED
max_binlog_size = 500M
expire_logs_days        = 10
innodb_support_xa                       = 1
innodb_locks_unsafe_for_binlog          = 0

Here are some informations about the system:

- Linux db 2.6.18-164.11.1.el5 #1 SMP Wed Jan 6 13:26:04 EST 2010 x86_64 x86_64 x86_64 GNU/Linux

- Used filesystem for DB is ext3 with "noatime" mount option. Binary logs go to seperate ext3 on RAID1

Here is the mysqld.log output:

100210  3:02:48 - mysqld got signal 11 ;                                                                                                                                                                  
blabla                                                                                                                                                                                       

key_buffer_size=536870912
read_buffer_size=131072  
max_used_connections=450 
max_threads=500          
threads_connected=150    
It is possible that mysqld could use up to 
key_buffer_size + (read_buffer_size + sort_buffer_size)*max_threads = 1617373 K
bytes of memory                                                                
Hope that's ok; if not, decrease some variables in the equation.               

thd: 0x2ab1ccd545d0
Attempting backtrace. You can use the following information to find out
where mysqld died. If you see no messages after this, something went   
terribly wrong...                                                      
stack_bottom = 0x62cc7f20 thread_stack 0x40000                         
/usr/sbin/mysqld(my_print_stacktrace+0x2e)[0x8496fe]                   
/usr/sbin/mysqld(handle_segfault+0x322)[0x5be9e2]                      
/lib64/libpthread.so.0[0x356f80e930]                                   
/usr/sbin/mysqld(lock_sec_rec_cons_read_sees+0x42)[0x7805b2]           
/usr/sbin/mysqld(row_search_for_mysql+0x16a5)[0x7b7255]                
/usr/sbin/mysqld(_ZN11ha_innobase13general_fetchEPhjj+0x79)[0x72f539]  
/usr/sbin/mysqld(_ZN7handler21read_multi_range_nextEPP18st_key_multi_range+0x29)[0x6958f9]
/usr/sbin/mysqld(_ZN18QUICK_RANGE_SELECT8get_nextEv+0x167)[0x682567]                      
/usr/sbin/mysqld(_ZN26QUICK_ROR_INTERSECT_SELECT8get_nextEv+0x29)[0x682289]               
/usr/sbin/mysqld[0x691d01]                                                                
/usr/sbin/mysqld(_Z10sub_selectP4JOINP13st_join_tableb+0x56)[0x622446]                    
/usr/sbin/mysqld[0x628f50]                                                                
/usr/sbin/mysqld(_ZN4JOIN4execEv+0x90c)[0x63505c]                                         
/usr/sbin/mysqld(_Z12mysql_selectP3THDPPP4ItemP10TABLE_LISTjR4ListIS1_ES2_jP8st_orderSB_S2_SB_yP13select_resultP18st_select_lex_unitP13st_select_lex+0x16e)[0x636aae]
/usr/sbin/mysqld(_Z13handle_selectP3THDP6st_lexP13select_resultm+0x169)[0x6373c9]                                                                                    
/usr/sbin/mysqld[0x5c95d4]                                                                                                                                           
/usr/sbin/mysqld(_Z21mysql_execute_commandP3THD+0x4c1)[0x5cbc81]                                                                                                     
/usr/sbin/mysqld(_Z11mysql_parseP3THDPKcjPS2_+0x1f1)[0x5d1161]                                                                                                       
/usr/sbin/mysqld(_Z16dispatch_command19enum_server_commandP3THDPcj+0xfd0)[0x5d2150]                                                                                  
/usr/sbin/mysqld(_Z10do_commandP3THD+0xe4)[0x5d26e4]                                                                                                                 
/usr/sbin/mysqld(handle_one_connection+0x6f0)[0x5c5930]                                                                                                              
/lib64/libpthread.so.0[0x356f806617]                                                                                                                                 
/lib64/libc.so.6(clone+0x6d)[0x356f0d3c2d]                                                                                                                           
Trying to get some variables.                                                                                                                                        
Some pointers may be invalid and cause the dump to abort...                                                                                                          
thd->query at 0x3ad87d50 = select count(this_.id) as y0_ from im_contact this_ inner join im_profile profiledto1_ on this_.friend=profiledto1_.userId where this_.user=4040934 and this_.status=1 and this_.friend in (select s_.uid as y0_ from session_log s_ where s_.`validto`>'2010-02-10 03:02:47.0') and profiledto1_.memberStatus<>0                                                                                            
thd->thread_id=430893                                                                                                                                                                                               
thd->killed=NOT_KILLED                                                                                                                                                                                              
The manual page at http://dev.mysql.com/doc/mysql/en/crashing.html contains                                                                                                                                         
information that should help you find out what is causing the crash.                                                                                                                                                
mysqld: my_new.cc:51: int __cxa_pure_virtual(): Assertion `! "Aborted: pure virtual method called."' failed.                                                                                                        
mysqld: my_new.cc:51: int __cxa_pure_virtual(): Assertion `! "Aborted: pure virtual method called."' failed.                                                                                                        
Fatal signal 6 while backtracing                                                                                                                                                                                    
100210 03:02:51 mysqld_safe Number of processes running now: 0                                                                                                                                                      
100210 03:02:51 mysqld_safe mysqld restarted                                                                                                                                                                        

In addition to the segfaulting we have much more deadlocks since the new settings are enabled, but i don't think it's related to the segfault.

How to repeat:
N/A
[10 Feb 2010 10:32] Valeriy Kravchuk
I am not sure how crash while execuing SELECT query can be related to binlog being enabled... Anyway, please, try to repeat with a newer version, 5.1.43. In case of the same problem, please, send your my.cnf file content and EXPLAIN results for that crashing SELECT.
[10 Feb 2010 10:39] Pascal Schmiel
I'm not sure if the SELECT query is the cause of the crash because it is executed about 20000 times / hour.
nevertheless i will try an update asap.
[10 Feb 2010 16:12] MySQL Verification Team
this looks alot like bug #39022 to me!
[11 Feb 2010 10:05] Pascal Schmiel
we can now reproduce the bug. it occures when executing this query:

DELETE FROM users.session_log WHERE (validto IS NULL or validto < '2010-02-11 00:00:00' ) AND timestamp not like '2010-02-11%;

it is not reproducable in our development-system, i think because there are no deadlocks.
[1 Jun 2010 6:42] Sveta Smirnova
Thank you for the feedback.

As this really looks like bug #39022 which is fixed in 5.1.45 please upgrade to current version 5.1.47 and inform us if problem still exists.
[1 Jul 2010 23:00] Bugs System
No feedback was provided for this bug for over a month, so it is
being suspended automatically. If you are able to provide the
information that was originally requested, please do so and change
the status of the bug back to "Open".
[7 Jul 2010 14:27] Alfranio Junior
Re-opening the bug as something similar is happening in pb.

Please, take a look at http://tinyurl.com/2eaojup
[23 Jul 2010 15:28] Sveta Smirnova
Alfranio,

thank you for the feedback. Set to "Verified" as we have this crash repeated and documented on our boxes.
[30 Aug 2010 8:42] Alfranio Junior
HOW TO REPEAT
-------------
. Get mysql-next-mr.crash-safe.

. run:

#!/bin/bash

for i in `seq 1 300`; do
  ./mtr --parallel=auto --mem rpl_mixed_crash_safe rpl_stm_crash_safe rpl_row_crash_safe 2>&1 | tee output.log
  res=`grep fail output.log`
  if [ "!$res" != "!" ]; then
    exit $?
  fi
done

The test cases crash the slave periodically and verify if the safely recovery.
Sporadically, after a crash, there will be:

mysqld: /home/acorreia/workspace.sun/repository.mysql.new/bzrwork/wl-2775/mysql-next-mr-rpl-merge.crash-safe.2775/mysys/my_new.cc:51: int __cxa_pure_virtual(): Assertion `! "Aborted: pure virtual method called."' failed.
100830  4:56:00 [Note] Plugin 'FEDERATED' is disabled.
InnoDB: The InnoDB memory heap is disabled
InnoDB: Mutexes and rw_locks use GCC atomic builtins
InnoDB: Compressed tables use zlib 1.2.3.3
100830  4:56:00  InnoDB: highest supported file format is Barracuda.
InnoDB: The log sequence number in ibdata files does not match
InnoDB: the log sequence number in the ib_logfiles!
100830  4:56:00  InnoDB: Database was not shut down normally!
InnoDB: Starting crash recovery.
InnoDB: Reading tablespace information from the .ibd files...
InnoDB: Restoring possible half-written data pages from the doublewrite
InnoDB: buffer...
InnoDB: 1 transaction(s) which must be rolled back or cleaned up
InnoDB: in total 4 row operations to undo
InnoDB: Trx id counter is 2900
InnoDB: Last MySQL binlog file position 0 636, file name ./slave-bin.000018
100830  4:56:00 InnoDB 1.2.0 started; log sequence number 1703595
100830  4:56:00 [Note] Recovering after a crash using slave-bin
100830  4:56:00 [Note] Starting crash recovery...
100830  4:56:00 [Note] Crash recovery finished.
InnoDB: Starting in background the rollback of uncommitted transactions
100830  4:56:00  InnoDB: Rolling back trx with id 2713, 4 rows to undo
[9 Sep 2010 16:45] Alfranio Junior
Hi all,

After some tests, I've figured out that the new failure is not related to Innodb. We have the following stacktrace:

/export/home2/tmp/ac230274/mysql-5.6.99-m5/sql/mysqld(__cxa_pure_virtual+0x6f)[0xa3d523]
/export/home2/tmp/ac230274/mysql-5.6.99-m5/sql/mysqld(_Z23get_cached_table_accessP22st_grant_internal_infoPKcS2_+0x73)[0x54416d]
/export/home2/tmp/ac230274/mysql-5.6.99-m5/sql/mysqld(_Z11check_grantP3THDmP10TABLE_LISTbjb+0x18c)[0x546dcc]
/export/home2/tmp/ac230274/mysql-5.6.99-m5/sql/mysqld(_Z18check_table_accessP3THDmP10TABLE_LISTbjb+0x238)[0x5abc78]
/export/home2/tmp/ac230274/mysql-5.6.99-m5/sql/mysqld(_Z21mysql_execute_commandP3THD+0x740)[0x5ad8be]
/export/home2/tmp/ac230274/mysql-5.6.99-m5/sql/mysqld(_Z11mysql_parseP3THDPcjP12Parser_state+0x1c9)[0x5b4dad]
/export/home2/tmp/ac230274/mysql-5.6.99-m5/sql/mysqld(_Z16dispatch_command19enum_server_commandP3THDPcj+0x9cf)[0x5b620b]
/export/home2/tmp/ac230274/mysql-5.6.99-m5/sql/mysqld(_Z10do_commandP3THD+0x227)[0x5b7603]
/export/home2/tmp/ac230274/mysql-5.6.99-m5/sql/mysqld(_Z24do_handle_one_connectionP3THD+0x135)[0x686553]
/export/home2/tmp/ac230274/mysql-5.6.99-m5/sql/mysqld(handle_one_connection+0x2d)[0x686619]
/export/home2/tmp/ac230274/mysql-5.6.99-m5/sql/mysqld(pfs_spawn_thread+0x92)[0x9f5c2e]
/lib64/libpthread.so.0[0x33d20062f7]
/lib64/libc.so.6(clone+0x6d)[0x33d14d1b6d]
Fatal signal 11 while backtracing 

The problem stems from the fact that destructor of both ACL_internal_schema_access and ACL_internal_table_access are called while exiting the server.

Specifically, the test case explicitly crashes the server by calling abort() that calls

  handle_segfault() --> exit(1)

While processing exit(1), associated handlers/callbacks are executed and destroy
the ACL_internal_schema_access and ACL_internal_table_access. Meanwhile, another
thread is able to start processing a statement and fails because the objects
ACL_internal_schema_access and ACL_internal_table_access do not exist.
[9 Sep 2010 16:47] Alfranio Junior
I've made the following changes to the code to get the stacktrace:

int __cxa_pure_virtual()
{
DBUG_ENTER("__cxa_pure_virtual");
fprintf(stderr,"Aborted: pure virtual method called.\n");
fflush(stderr);
void *addrs[128];
char **strings= NULL;
int n= backtrace(addrs, array_elements(addrs));
strings= backtrace_symbols(addrs, n);
backtrace_symbols_fd(addrs, n, fileno(stderr));
fflush(stderr);
DBUG_RETURN(1);
}

mysql-test/suite/rpl/t/rpl_*_crash_safe-slave.opt
--skip-core-file --skip-slave-start --relay-log-info-repository=TABLE --relay-log-recovery=1