summary refs log tree commit diff
path: root/nptl
diff options
context:
space:
mode:
Diffstat (limited to 'nptl')
-rw-r--r--  nptl/pthread_spin_init.c    |  3
-rw-r--r--  nptl/pthread_spin_lock.c    | 71
-rw-r--r--  nptl/pthread_spin_trylock.c | 54
-rw-r--r--  nptl/pthread_spin_unlock.c  |  6
4 files changed, 100 insertions(+), 34 deletions(-)
diff --git a/nptl/pthread_spin_init.c b/nptl/pthread_spin_init.c
index 01dec5eea4..fe3091377e 100644
--- a/nptl/pthread_spin_init.c
+++ b/nptl/pthread_spin_init.c
@@ -22,6 +22,7 @@
int
pthread_spin_init (pthread_spinlock_t *lock, int pshared)
{
- *lock = 0;
+ /* Relaxed MO is fine because this is an initializing store. */
+ atomic_store_relaxed (lock, 0);
return 0;
}
diff --git a/nptl/pthread_spin_lock.c b/nptl/pthread_spin_lock.c
index 4d03b7893a..682af80240 100644
--- a/nptl/pthread_spin_lock.c
+++ b/nptl/pthread_spin_lock.c
@@ -19,27 +19,35 @@
#include <atomic.h>
#include "pthreadP.h"
-/* A machine-specific version can define SPIN_LOCK_READS_BETWEEN_CMPXCHG
- to the number of plain reads that it's optimal to spin on between uses
- of atomic_compare_and_exchange_val_acq. If spinning forever is optimal
- then use -1. If no plain reads here would ever be optimal, use 0. */
-#ifndef SPIN_LOCK_READS_BETWEEN_CMPXCHG
-# warning machine-dependent file should define SPIN_LOCK_READS_BETWEEN_CMPXCHG
-# define SPIN_LOCK_READS_BETWEEN_CMPXCHG 1000
-#endif
-
int
pthread_spin_lock (pthread_spinlock_t *lock)
{
- /* atomic_exchange usually takes less instructions than
- atomic_compare_and_exchange. On the other hand,
- atomic_compare_and_exchange potentially generates less bus traffic
- when the lock is locked.
- We assume that the first try mostly will be successful, and we use
- atomic_exchange. For the subsequent tries we use
- atomic_compare_and_exchange. */
- if (atomic_exchange_acq (lock, 1) == 0)
+ int val = 0;
+
+ /* We assume that the first try mostly will be successful, thus we use
+ atomic_exchange if it is not implemented by a CAS loop (we also assume
+ that atomic_exchange can be faster if it succeeds, see
+ ATOMIC_EXCHANGE_USES_CAS). Otherwise, we use a weak CAS and not an
+ exchange so we bail out after the first failed attempt to change the
+ state. For the subsequent attempts we use atomic_compare_and_exchange
+ after we observe that the lock is not acquired.
+ See also comment in pthread_spin_trylock.
+ We use acquire MO to synchronize-with the release MO store in
+ pthread_spin_unlock, and thus ensure that prior critical sections
+ happen-before this critical section. */
+#if ! ATOMIC_EXCHANGE_USES_CAS
+ /* Try to acquire the lock with an exchange instruction as this architecture
+ has such an instruction and we assume it is faster than a CAS.
+ The acquisition succeeds if the lock is not in an acquired state. */
+ if (__glibc_likely (atomic_exchange_acquire (lock, 1) == 0))
return 0;
+#else
+ /* Try to acquire the lock with a CAS instruction as this architecture
+ has no exchange instruction. The acquisition succeeds if the lock is not
+ acquired. */
+ if (__glibc_likely (atomic_compare_exchange_weak_acquire (lock, &val, 1)))
+ return 0;
+#endif
do
{
@@ -47,23 +55,26 @@ pthread_spin_lock (pthread_spinlock_t *lock)
to cmpxchg is not a good idea on many targets as that will force
expensive memory synchronizations among processors and penalize other
running threads.
- On the other hand, we do want to update memory state on the local core
- once in a while to avoid spinning indefinitely until some event that
- will happen to update local memory as a side-effect. */
- if (SPIN_LOCK_READS_BETWEEN_CMPXCHG >= 0)
+ There is no technical reason for throwing in a CAS every now and then,
+ and so far we have no evidence that it can improve performance.
+ If that would be the case, we have to adjust other spin-waiting loops
+ elsewhere, too!
+ Thus we use relaxed MO reads until we observe the lock to not be
+ acquired anymore. */
+ do
{
- int wait = SPIN_LOCK_READS_BETWEEN_CMPXCHG;
+ /* TODO Back-off. */
- while (*lock != 0 && wait > 0)
- --wait;
- }
- else
- {
- while (*lock != 0)
- ;
+ atomic_spin_nop ();
+
+ val = atomic_load_relaxed (lock);
}
+ while (val != 0);
+
+ /* We need acquire memory order here for the same reason as mentioned
+ for the first try to lock the spinlock. */
}
- while (atomic_compare_and_exchange_val_acq (lock, 1, 0) != 0);
+ while (!atomic_compare_exchange_weak_acquire (lock, &val, 1));
return 0;
}
diff --git a/nptl/pthread_spin_trylock.c b/nptl/pthread_spin_trylock.c
index 593bba3ed8..83921b06b8 100644
--- a/nptl/pthread_spin_trylock.c
+++ b/nptl/pthread_spin_trylock.c
@@ -23,5 +23,57 @@
int
pthread_spin_trylock (pthread_spinlock_t *lock)
{
- return atomic_exchange_acq (lock, 1) ? EBUSY : 0;
+ /* For the spin try lock, we have the following possibilities:
+
+ 1) If we assume that trylock will most likely succeed in practice:
+ * We just do an exchange.
+
+ 2) If we want to bias towards cases where trylock succeeds, but don't
+ rule out contention:
+ * If exchange is not implemented by a CAS loop, and exchange is faster
+ than CAS, do an exchange.
+ * If exchange is implemented by a CAS loop, use a weak CAS and not an
+ exchange so we bail out after the first failed attempt to change the state.
+
+ 3) If we expect contention to be likely:
+ * If CAS always brings the cache line into an exclusive state even if the
+ spinlock is already acquired, then load the value first with
+ atomic_load_relaxed and test if lock is not acquired. Then do 2).
+
+ We assume that 2) is the common case, and that this won't be slower than
+ 1) in the common case.
+
+ We use acquire MO to synchronize-with the release MO store in
+ pthread_spin_unlock, and thus ensure that prior critical sections
+ happen-before this critical section. */
+#if ! ATOMIC_EXCHANGE_USES_CAS
+ /* Try to acquire the lock with an exchange instruction as this architecture
+ has such an instruction and we assume it is faster than a CAS.
+ The acquisition succeeds if the lock is not in an acquired state. */
+ if (atomic_exchange_acquire (lock, 1) == 0)
+ return 0;
+#else
+ /* Try to acquire the lock with a CAS instruction as this architecture
+ has no exchange instruction. The acquisition succeeds if the lock is not
+ acquired. */
+ do
+ {
+ int val = 0;
+ if (atomic_compare_exchange_weak_acquire (lock, &val, 1))
+ return 0;
+ }
+ /* atomic_compare_exchange_weak_acquire can fail spuriously. Whereas
+ C++11 and C11 make it clear that trylock operations can fail spuriously,
+ POSIX does not explicitly specify this; it only specifies that failing
+ synchronization operations do not need to have synchronization effects
+ themselves, but a spurious failure is something that could contradict a
+ happens-before established earlier (e.g., that we need to observe that
+ the lock is acquired). Therefore, we emulate a strong CAS by simply
+ checking with a relaxed MO load that the lock is really acquired before
+ returning EBUSY; the additional overhead this may cause is on the slow
+ path. */
+ while (atomic_load_relaxed (lock) == 0);
+#endif
+
+ return EBUSY;
}
diff --git a/nptl/pthread_spin_unlock.c b/nptl/pthread_spin_unlock.c
index 5fd73e578b..f83b69639a 100644
--- a/nptl/pthread_spin_unlock.c
+++ b/nptl/pthread_spin_unlock.c
@@ -23,7 +23,9 @@
int
pthread_spin_unlock (pthread_spinlock_t *lock)
{
- atomic_full_barrier ();
- *lock = 0;
+ /* The atomic_store_release synchronizes-with the atomic_exchange_acquire
+ or atomic_compare_exchange_weak_acquire in pthread_spin_lock /
+ pthread_spin_trylock. */
+ atomic_store_release (lock, 0);
return 0;
}