From 877ea910afabf9192d44f8e86a88c65d147aaf8b Mon Sep 17 00:00:00 2001 From: Yibo Cai Date: Mon, 4 Mar 2019 11:07:44 +0800 Subject: [PATCH] Bug #94699: Mysql deadlock and bugcheck on aarch64 Mysql stress test on aarch64 results to rwlock bug assert and deadlock. Errorlog#1: assertion --------------------- 2019-01-31 16:59:35 0xffe617a951c0 InnoDB: Assertion failure in thread 281363704533440 in file sync0rw.cc line 560 InnoDB: Failing assertion: !lock->recursive InnoDB: We intentionally generate a memory trap. InnoDB: Submit a detailed bug report to http://bugs.mysql.com. Errorlog#2: deadlock -------------------- 2019-01-30T06:23:22.614979Z 0 [Warning] InnoDB: A long semaphore wait: --Thread 281365295645120 has waited at btr0sea.ic line 90 for 241.00 seconds the semaphore: X-lock on RW-latch at 0xaaabc8bb56a8 created in file btr0sea.cc line 195 a writer (thread id 281365295104448) has reserved it in mode exclusive number of readers 0, waiters flag 1, lock_word: 0 Last time read locked in file btr0sea.ic line 128 Last time write locked in file /home/linux/mysql/0-source/mysql-server/storage/innobase/include/btr0sea.ic line 90 Bug analysis ------------ "writer_thread" and "recursive" are two members of rw_lock_t struct. lock->writer_thread MUST be updated BEFORE lock->recursive, as mentioned in comments of function rw_lock_set_writer_id_and_recursion_flag() in file sync0rw.ic. Writer memory barrier is inserted correctly in producer code by calling os_compare_and_swap_thread_id(). https://github.com/mysql/mysql-server/blob/mysql-5.7.24/storage/innobase/include/sync0rw.ic#L331-L357 local_thread = lock->writer_thread; success = os_compare_and_swap_thread_id( &lock->writer_thread, local_thread, curr_thread); <-- wmb inserted ut_a(success); lock->recursive = recursive; But the consumer code lacks necessary reader memory barrier and may suffer from memory order issue. https://github.com/mysql/mysql-server/blob/mysql-5.7.24/storage/innobase/sync/sync0rw.cc#L571-L579 if (!pass) { os_rmb; // XXX: this rmb doesn't help } /* Decrement failed: An X or SX lock is held by either this thread or another. Try to relock. */ if (!pass && lock->recursive // XXX: should insert rmb here! && os_thread_eq(lock->writer_thread, thread_id)) { In above code, lock->recursive is visited before lock->writer_thread in program order. No problems have been observed on x86 with current code. But this code is unsafe for weak memory order platforms like Arm. Without explicit rmb() between loading the two variables, we may observe obsolete lock->recursive but latest lock->writer_thread, leads to inconsistent data. In this patch we re-order lock->recursive before an existing rmb. This ensures that the read path barrier correlates correctly with the write path barrier and that the read path observes lock->recursive before lock->writer_thread. Test result ----------- Bug is reproduced steadily in 15 minutes with original code. After applying this patch, bug is not reproduced after 7 days' stress test. --- storage/innobase/sync/sync0rw.cc | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/storage/innobase/sync/sync0rw.cc b/storage/innobase/sync/sync0rw.cc index 72654dbf9e0..fee22ebb1d6 100644 --- a/storage/innobase/sync/sync0rw.cc +++ b/storage/innobase/sync/sync0rw.cc @@ -568,15 +568,18 @@ rw_lock_x_lock_low( } else { os_thread_id_t thread_id = os_thread_get_curr_id(); + bool recursive; + os_thread_id_t writer_thread; + if (!pass) { + recursive = lock->recursive; os_rmb; + writer_thread = lock->writer_thread; } /* Decrement failed: An X or SX lock is held by either this thread or another. Try to relock. */ - if (!pass - && lock->recursive - && os_thread_eq(lock->writer_thread, thread_id)) { + if (!pass && recursive && os_thread_eq(writer_thread, thread_id)) { /* Other s-locks can be allowed. If it is request x recursively while holding sx lock, this x lock should be along with the latching-order. */ @@ -647,15 +650,19 @@ rw_lock_sx_lock_low( } else { os_thread_id_t thread_id = os_thread_get_curr_id(); + bool recursive; + os_thread_id_t writer_thread; + if (!pass) { + recursive = lock->recursive; os_rmb; + writer_thread = lock->writer_thread; } /* Decrement failed: It already has an X or SX lock by this thread or another thread. If it is this thread, relock, else fail. */ - if (!pass && lock->recursive - && os_thread_eq(lock->writer_thread, thread_id)) { + if (!pass && recursive && os_thread_eq(writer_thread, thread_id)) { /* This thread owns an X or SX lock */ if (lock->sx_recursive++ == 0) { /* This thread is making first SX-lock request -- 2.17.1