Index: os/os0sync.c =================================================================== --- os/os0sync.c (revision 1984) +++ os/os0sync.c (working copy) @@ -151,7 +151,14 @@ ut_a(0 == pthread_cond_init(&(event->cond_var), NULL)); #endif event->is_set = FALSE; - event->signal_count = 0; + + /* We return this value in os_event_reset(), which can then be + be used to pass to the os_event_wait_low(). The value of zero + is reserved in os_event_wait_low() for the case when the + caller does not want to pass any signal_count value. To + distinguish between the two cases we initialize signal_couunt + to 1 here. */ + event->signal_count = 1; #endif /* __WIN__ */ /* The os_sync_mutex can be NULL because during startup an event @@ -244,13 +251,20 @@ /************************************************************** Resets an event semaphore to the nonsignaled state. Waiting threads will -stop to wait for the event. */ +stop to wait for the event. +The return value should be passed to os_even_wait_low() if it is desired +that this thread should not wait in case of an intervening call to +os_event_set() between this os_event_reset() and the +os_event_wait_low() call. See comments for os_event_wait_low(). */ -void +ib_longlong os_event_reset( /*===========*/ + /* out: current signal_count. */ os_event_t event) /* in: event to reset */ { + ib_longlong ret = 0; + #ifdef __WIN__ ut_a(event); @@ -265,9 +279,11 @@ } else { event->is_set = FALSE; } + ret = event->signal_count; os_fast_mutex_unlock(&(event->os_mutex)); #endif + return(ret); } /************************************************************** @@ -335,18 +351,38 @@ Waits for an event object until it is in the signaled state. If srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS this also exits the waiting thread when the event becomes signaled (or immediately if the -event is already in the signaled state). */ +event is already in the signaled state). +Typically, if the event has been signalled after the os_event_reset() +we'll return immediately because event->is_set == TRUE. +There are, however, situations (e.g.: sync_array code) where we may +lose this information. For example: + +thread A calls os_event_reset() +thread B calls os_event_set() [event->is_set == TRUE] +thread C calls os_event_reset() [event->is_set == FALSE] +thread A calls os_event_wait() [infinite wait!] +thread C calls os_event_wait() [infinite wait!] + +Where such a scenario is possible, to avoid infinite wait, the +value returned by os_event_reset() should be passed in as +reset_sig_count. */ + void -os_event_wait( -/*==========*/ - os_event_t event) /* in: event to wait */ +os_event_wait_low( +/*==============*/ + os_event_t event, /* in: event to wait */ + ib_longlong reset_sig_count)/* in: zero or the value + returned by previous call of + os_event_reset(). */ { #ifdef __WIN__ DWORD err; ut_a(event); + UT_NOT_USED(reset_sig_count); + /* Specify an infinite time limit for waiting */ err = WaitForSingleObject(event->handle, INFINITE); @@ -360,7 +396,11 @@ os_fast_mutex_lock(&(event->os_mutex)); - old_signal_count = event->signal_count; + if (reset_sig_count) { + old_signal_count = reset_sig_count; + } else { + old_signal_count = event->signal_count; + } for (;;) { if (event->is_set == TRUE Index: srv/srv0srv.c =================================================================== --- srv/srv0srv.c (revision 1984) +++ srv/srv0srv.c (working copy) @@ -1881,12 +1881,6 @@ os_thread_sleep(1000000); - /* In case mutex_exit is not a memory barrier, it is - theoretically possible some threads are left waiting though - the semaphore is already released. Wake up those threads: */ - - sync_arr_wake_threads_if_sema_free(); - current_time = time(NULL); time_elapsed = difftime(current_time, last_monitor_time); @@ -2083,9 +2077,15 @@ srv_refresh_innodb_monitor_stats(); } + /* In case mutex_exit is not a memory barrier, it is + theoretically possible some threads are left waiting though + the semaphore is already released. Wake up those threads: */ + + sync_arr_wake_threads_if_sema_free(); + if (sync_array_print_long_waits()) { fatal_cnt++; - if (fatal_cnt > 5) { + if (fatal_cnt > 10) { fprintf(stderr, "InnoDB: Error: semaphore wait has lasted > %lu seconds\n" @@ -2103,7 +2103,7 @@ fflush(stderr); - os_thread_sleep(2000000); + os_thread_sleep(1000000); if (srv_shutdown_state < SRV_SHUTDOWN_CLEANUP) { Index: sync/sync0arr.c =================================================================== --- sync/sync0arr.c (revision 1984) +++ sync/sync0arr.c (working copy) @@ -40,8 +40,16 @@ say 200. In NT 3.51, allocating events seems to be a quadratic algorithm, because 10 000 events are created fast, but 100 000 events takes a couple of minutes to create. -*/ +As of 5.0.30 the above mentioned design is changed. Since now +OS can handle millions of wait events efficiently, we no longer +have this concept of each cell of wait array having one event. +Instead, now the event that a thread wants to wait on is embedded +in the wait object (mutex or rw_lock). We still keep the global +wait array for the sake of diagnostics and also to avoid infinite +wait The error_monitor thread scans the global wait array to signal +any waiting threads who have missed the signal. */ + /* A cell where an individual thread may wait suspended until a resource is released. The suspending is implemented using an operating system event semaphore. */ @@ -62,6 +70,14 @@ ibool waiting; /* TRUE if the thread has already called sync_array_event_wait on this cell */ + ib_longlong signal_count; /* We capture the signal_count + of the wait_object when we + reset the event. This value is + then passed on to os_event_wait + and we wait only if the event + has not been signalled in the + period between the reset and + wait call. */ time_t reservation_time;/* time when the thread reserved the wait cell */ }; @@ -216,6 +232,7 @@ cell = sync_array_get_nth_cell(arr, i); cell->wait_object = NULL; cell->waiting = FALSE; + cell->signal_count = 0; } return(arr); @@ -282,16 +299,23 @@ /*********************************************************************** Puts the cell event in reset state. */ static -void +ib_longlong sync_cell_event_reset( /*==================*/ + /* out: value of signal_count + at the time of reset. */ ulint type, /* in: lock type mutex/rw_lock */ void* object) /* in: the rw_lock/mutex object */ { if (type == SYNC_MUTEX) { - os_event_reset(((mutex_t *) object)->event); + return(os_event_reset(((mutex_t *) object)->event)); +#ifdef __WIN__ + } else if (type == RW_LOCK_WAIT_EX) { + return(os_event_reset( + ((rw_lock_t *) object)->wait_ex_event)); +#endif } else { - os_event_reset(((rw_lock_t *) object)->event); + return(os_event_reset(((rw_lock_t *) object)->event)); } } @@ -345,8 +369,11 @@ sync_array_exit(arr); - /* Make sure the event is reset */ - sync_cell_event_reset(type, object); + /* Make sure the event is reset and also store + the value of signal_count at which the event + was reset. */ + cell->signal_count = sync_cell_event_reset(type, + object); cell->reservation_time = time(NULL); @@ -388,7 +415,14 @@ if (cell->request_type == SYNC_MUTEX) { event = ((mutex_t*) cell->wait_object)->event; - } else { +#ifdef __WIN__ + /* On windows if the thread about to wait is the one which + has set the state of the rw_lock to RW_LOCK_WAIT_EX, then + it waits on a special event i.e.: wait_ex_event. */ + } else if (cell->request_type == RW_LOCK_WAIT_EX) { + event = ((rw_lock_t*) cell->wait_object)->wait_ex_event; +#endif + } else { event = ((rw_lock_t*) cell->wait_object)->event; } @@ -413,7 +447,7 @@ #endif sync_array_exit(arr); - os_event_wait(event); + os_event_wait_low(event, cell->signal_count); sync_array_free_cell(arr, index); } @@ -457,7 +491,11 @@ #endif /* UNIV_SYNC_DEBUG */ (ulong) mutex->waiters); - } else if (type == RW_LOCK_EX || type == RW_LOCK_SHARED) { + } else if (type == RW_LOCK_EX +#ifdef __WIN__ + || type == RW_LOCK_WAIT_EX +#endif + || type == RW_LOCK_SHARED) { fputs(type == RW_LOCK_EX ? "X-lock on" : "S-lock on", file); @@ -638,7 +676,8 @@ return(FALSE); /* No deadlock */ - } else if (cell->request_type == RW_LOCK_EX) { + } else if (cell->request_type == RW_LOCK_EX + || cell->request_type == RW_LOCK_WAIT_EX) { lock = cell->wait_object; @@ -734,7 +773,8 @@ return(TRUE); } - } else if (cell->request_type == RW_LOCK_EX) { + } else if (cell->request_type == RW_LOCK_EX + || cell->request_type == RW_LOCK_WAIT_EX) { lock = cell->wait_object; @@ -783,6 +823,7 @@ cell->waiting = FALSE; cell->wait_object = NULL; + cell->signal_count = 0; ut_a(arr->n_reserved > 0); arr->n_reserved--; @@ -839,6 +880,14 @@ mutex = cell->wait_object; os_event_set(mutex->event); +#ifdef __WIN__ + } else if (cell->request_type + == RW_LOCK_WAIT_EX) { + rw_lock_t* lock; + + lock = cell->wait_object; + os_event_set(lock->wait_ex_event); +#endif } else { rw_lock_t* lock; Index: sync/sync0rw.c =================================================================== --- sync/sync0rw.c (revision 1984) +++ sync/sync0rw.c (working copy) @@ -132,6 +132,10 @@ lock->last_x_line = 0; lock->event = os_event_create(NULL); +#ifdef __WIN__ + lock->wait_ex_event = os_event_create(NULL); +#endif + mutex_enter(&rw_lock_list_mutex); if (UT_LIST_GET_LEN(rw_lock_list) > 0) { @@ -168,6 +172,10 @@ mutex_enter(&rw_lock_list_mutex); os_event_free(lock->event); +#ifdef __WIN__ + os_event_free(lock->wait_ex_event); +#endif + if (UT_LIST_GET_PREV(list, lock)) { ut_a(UT_LIST_GET_PREV(list, lock)->magic_n == RW_LOCK_MAGIC_N); } @@ -521,7 +529,15 @@ rw_x_system_call_count++; sync_array_reserve_cell(sync_primary_wait_array, - lock, RW_LOCK_EX, + lock, +#ifdef __WIN__ + /* On windows RW_LOCK_WAIT_EX signifies + that this thread should wait on the + special wait_ex_event. */ + (state == RW_LOCK_WAIT_EX) + ? RW_LOCK_WAIT_EX : +#endif + RW_LOCK_EX, file_name, line, &index); Index: sync/sync0sync.c =================================================================== --- sync/sync0sync.c (revision 1984) +++ sync/sync0sync.c (working copy) @@ -95,17 +95,47 @@ it and did not see the waiters byte set to 1, a case which would lead the other thread to an infinite wait. -LEMMA 1: After a thread resets the event of the cell it reserves for waiting -======== -for a mutex, some thread will eventually call sync_array_signal_object with -the mutex as an argument. Thus no infinite wait is possible. +LEMMA 1: After a thread resets the event of a mutex (or rw_lock), some +======= +thread will eventually call os_event_set() on that particular event. +Thus no infinite wait is possible in this case. Proof: After making the reservation the thread sets the waiters field in the mutex to 1. Then it checks that the mutex is still reserved by some thread, or it reserves the mutex for itself. In any case, some thread (which may be also some earlier thread, not necessarily the one currently holding the mutex) will set the waiters field to 0 in mutex_exit, and then call -sync_array_signal_object with the mutex as an argument. +os_event_set() with the mutex as an argument. +Q.E.D. + +LEMMA 2: If an os_event_set() call is made after some thread has called +======= +the os_event_reset() and before it starts wait on that event, the call +will not be lost to the second thread. This is true even if there is an +intervening call to os_event_reset() by another thread. +Thus no infinite wait is possible in this case. + +Proof (non-windows platforms): os_event_reset() returns a monotonically +increasing value of signal_count. This value is increased at every +call of os_event_set() If thread A has called os_event_reset() followed +by thread B calling os_event_set() and then some other thread C calling +os_event_reset(), the is_set flag of the event will be set to FALSE; +but now if thread A calls os_event_wait_low() with the signal_count +value returned from the earlier call of os_event_reset(), it will +return immediately without waiting. +Q.E.D. + +Proof (windows): If there is a writer thread which is forced to wait for +the lock, it may be able to set the state of rw_lock to RW_LOCK_WAIT_EX +The design of rw_lock ensures that there is one and only one thread +that is able to change the state to RW_LOCK_WAIT_EX and this thread is +guaranteed to acquire the lock after it is released by the current +holders and before any other waiter gets the lock. +On windows this thread waits on a separate event i.e.: wait_ex_event. +Since only one thread can wait on this event there is no chance +of this event getting reset before the writer starts wait on it. +Therefore, this thread is guaranteed to catch the os_set_event() +signalled unconditionally at the release of the lock. Q.E.D. */ ulint sync_dummy = 0; Index: include/os0sync.h =================================================================== --- include/os0sync.h (revision 1984) +++ include/os0sync.h (working copy) @@ -112,9 +112,13 @@ os_event_t event); /* in: event to set */ /************************************************************** Resets an event semaphore to the nonsignaled state. Waiting threads will -stop to wait for the event. */ +stop to wait for the event. +The return value should be passed to os_even_wait_low() if it is desired +that this thread should not wait in case of an intervening call to +os_event_set() between this os_event_reset() and the +os_event_wait_low() call. See comments for os_event_wait_low(). */ -void +ib_longlong os_event_reset( /*===========*/ os_event_t event); /* in: event to reset */ @@ -125,16 +129,38 @@ os_event_free( /*==========*/ os_event_t event); /* in: event to free */ + /************************************************************** Waits for an event object until it is in the signaled state. If srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS this also exits the waiting thread when the event becomes signaled (or immediately if the -event is already in the signaled state). */ +event is already in the signaled state). +Typically, if the event has been signalled after the os_event_reset() +we'll return immediately because event->is_set == TRUE. +There are, however, situations (e.g.: sync_array code) where we may +lose this information. For example: + +thread A calls os_event_reset() +thread B calls os_event_set() [event->is_set == TRUE] +thread C calls os_event_reset() [event->is_set == FALSE] +thread A calls os_event_wait() [infinite wait!] +thread C calls os_event_wait() [infinite wait!] + +Where such a scenario is possible, to avoid infinite wait, the +value returned by os_event_reset() should be passed in as +reset_sig_count. */ + +#define os_event_wait(event) os_event_wait_low((event), 0) + void -os_event_wait( -/*==========*/ - os_event_t event); /* in: event to wait */ +os_event_wait_low( +/*==============*/ + os_event_t event, /* in: event to wait */ + ib_longlong reset_sig_count);/* in: zero or the value + returned by previous call of + os_event_reset(). */ + /************************************************************** Waits for an event object until it is in the signaled state or a timeout is exceeded. In Unix the timeout is always infinite. */ Index: include/sync0sync.ic =================================================================== --- include/sync0sync.ic (revision 1984) +++ include/sync0sync.ic (working copy) @@ -207,7 +207,7 @@ perform the read first, which could leave a waiting thread hanging indefinitely. - Our current solution call every 10 seconds + Our current solution call every second sync_arr_wake_threads_if_sema_free() to wake up possible hanging threads if they are missed in mutex_signal_object. */ Index: include/sync0rw.h =================================================================== --- include/sync0rw.h (revision 1984) +++ include/sync0rw.h (working copy) @@ -418,6 +418,17 @@ struct rw_lock_struct { os_event_t event; /* Used by sync0arr.c for thread queueing */ + +#ifdef __WIN__ + os_event_t wait_ex_event; /* This windows specific event is + used by the thread which has set the + lock state to RW_LOCK_WAIT_EX. The + rw_lock design guarantees that this + thread will be the next one to proceed + once the current the event gets + signalled. See LEMMA 2 in sync0sync.c */ +#endif + ulint reader_count; /* Number of readers who have locked this lock in the shared mode */ ulint writer; /* This field is set to RW_LOCK_EX if there Index: include/sync0rw.ic =================================================================== --- include/sync0rw.ic (revision 1984) +++ include/sync0rw.ic (working copy) @@ -382,6 +382,9 @@ mutex_exit(mutex); if (UNIV_UNLIKELY(sg)) { +#ifdef __WIN__ + os_event_set(lock->wait_ex_event); +#endif os_event_set(lock->event); sync_array_object_signalled(sync_primary_wait_array); } @@ -463,6 +466,9 @@ mutex_exit(&(lock->mutex)); if (UNIV_UNLIKELY(sg)) { +#ifdef __WIN__ + os_event_set(lock->wait_ex_event); +#endif os_event_set(lock->event); sync_array_object_signalled(sync_primary_wait_array); }