| author | Ben Gamari <ben@smart-cactus.org> | 2020-11-01 13:10:01 -0500 | 
|---|---|---|
| committer | Ben Gamari <ben@smart-cactus.org> | 2020-11-01 13:10:01 -0500 | 
| commit | b8e66e0eecdc58ec5fea0b2c9a9454d38858886c (patch) | |
| tree | 7d25f3ee8f2b714175d1b5647d9aec1fdb550cc1 | |
| parent | b4686bff56377a583f0605b81fae290d3fee4c4a (diff) | |
| parent | 3a18155331e07e53b9f3b1d987ed430066b17aa4 (diff) | |
| download | haskell-b8e66e0eecdc58ec5fea0b2c9a9454d38858886c.tar.gz | |
Merge branch 'wip/tsan/storage' into wip/tsan/all
| -rw-r--r-- | includes/rts/OSThreads.h | 3 |
| -rw-r--r-- | includes/rts/SpinLock.h | 15 |
| -rw-r--r-- | includes/rts/StablePtr.h | 6 |
| -rw-r--r-- | includes/rts/storage/GC.h | 6 |
| -rw-r--r-- | includes/stg/SMP.h | 1 |
| -rw-r--r-- | rts/Capability.h | 6 |
| -rw-r--r-- | rts/SMPClosureOps.h | 5 |
| -rw-r--r-- | rts/Schedule.c | 4 |
| -rw-r--r-- | rts/SpinLock.c | 41 |
| -rw-r--r-- | rts/StablePtr.c | 9 |
| -rw-r--r-- | rts/Updates.h | 6 |
| -rw-r--r-- | rts/Weak.c | 29 |
| -rw-r--r-- | rts/posix/OSThreads.c | 8 |
| -rw-r--r-- | rts/rts.cabal.in | 1 |
| -rw-r--r-- | rts/sm/BlockAlloc.c | 38 |
| -rw-r--r-- | rts/sm/Evac.c | 139 |
| -rw-r--r-- | rts/sm/GC.c | 137 |
| -rw-r--r-- | rts/sm/GC.h | 28 |
| -rw-r--r-- | rts/sm/GCAux.c | 2 |
| -rw-r--r-- | rts/sm/GCUtils.c | 13 |
| -rw-r--r-- | rts/sm/GCUtils.h | 4 |
| -rw-r--r-- | rts/sm/MarkWeak.c | 5 |
| -rw-r--r-- | rts/sm/NonMoving.c | 1 |
| -rw-r--r-- | rts/sm/Scav.c | 142 |
| -rw-r--r-- | rts/sm/Storage.c | 48 |
| -rw-r--r-- | rts/sm/Storage.h | 7 |
| -rw-r--r-- | rts/win32/OSThreads.c | 9 |
27 files changed, 431 insertions, 282 deletions
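This merge replaces ad-hoc `write_barrier()`/`load_load_barrier()` calls and plain shared accesses with the `RELAXED_*`, `ACQUIRE_LOAD`, `RELEASE_STORE` and `SEQ_CST_*` macros throughout the storage manager. As a rough orientation aid, the sketch below shows how such macros are conventionally expressed with the GCC/Clang `__atomic` builtins and how a release store pairs with an acquire load, the pattern used for `stable_ptr_table` later in this diff. The macro bodies here are illustrative assumptions; the authoritative `THREADED_RTS` definitions live in `includes/stg/SMP.h`, and only the non-threaded fallbacks (plain loads and stores) are visible in the `SMP.h` hunk below.

```c
/* Hedged sketch: plausible THREADED_RTS-style definitions of the ordering
 * macros used in this merge, plus a release/acquire publication example.
 * The real definitions are in includes/stg/SMP.h and may differ. */
#include <stdio.h>

#define RELAXED_LOAD(ptr)      __atomic_load_n(ptr, __ATOMIC_RELAXED)
#define RELAXED_STORE(ptr,val) __atomic_store_n(ptr, val, __ATOMIC_RELAXED)
#define RELAXED_ADD(ptr,val)   __atomic_fetch_add(ptr, val, __ATOMIC_RELAXED)
#define ACQUIRE_LOAD(ptr)      __atomic_load_n(ptr, __ATOMIC_ACQUIRE)
#define RELEASE_STORE(ptr,val) __atomic_store_n(ptr, val, __ATOMIC_RELEASE)
#define SEQ_CST_LOAD(ptr)      __atomic_load_n(ptr, __ATOMIC_SEQ_CST)
#define SEQ_CST_STORE(ptr,val) __atomic_store_n(ptr, val, __ATOMIC_SEQ_CST)
#define SEQ_CST_ADD(ptr,val)   __atomic_fetch_add(ptr, val, __ATOMIC_SEQ_CST)

/* Release/acquire pairing, as used for stable_ptr_table in this merge: the
 * writer fully initialises the new table, then publishes it with a release
 * store; a reader that observes the pointer via an acquire load is
 * guaranteed to also see the initialised contents. */
static int  storage[8];
static int *table;                        /* NULL until published */

static void publish(void)
{
    storage[0] = 42;                      /* initialise before publication */
    RELEASE_STORE(&table, storage);
}

static void consume(void)
{
    int *t = ACQUIRE_LOAD(&table);
    if (t != NULL)
        printf("slot 0 = %d\n", t[0]);    /* sees 42 if the pointer is seen */
}

int main(void)
{
    publish();
    consume();
    return 0;
}
```

In a non-threaded build the same macros degrade to plain memory accesses, which is why the `SMP.h` hunk in this diff only needs to add `#define RELAXED_ADD(ptr,val) *ptr += val` to the fallback section.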
diff --git a/includes/rts/OSThreads.h b/includes/rts/OSThreads.h index a68f1ea140..21b92950b2 100644 --- a/includes/rts/OSThreads.h +++ b/includes/rts/OSThreads.h @@ -164,7 +164,8 @@ typedef void* OSThreadProcAttr OSThreadProc(void *);  extern int  createOSThread        ( OSThreadId* tid, char *name,                                      OSThreadProc *startProc, void *param);  extern bool osThreadIsAlive       ( OSThreadId id ); -extern void interruptOSThread     (OSThreadId id); +extern void interruptOSThread     ( OSThreadId id ); +extern void joinOSThread          ( OSThreadId id );  //  // Condition Variables diff --git a/includes/rts/SpinLock.h b/includes/rts/SpinLock.h index 0ac51455dd..c1fe6c866c 100644 --- a/includes/rts/SpinLock.h +++ b/includes/rts/SpinLock.h @@ -39,19 +39,14 @@ typedef struct SpinLock_  #define IF_PROF_SPIN(x)  #endif +void acquire_spin_lock_slow_path(SpinLock * p); +  // acquire spin lock  INLINE_HEADER void ACQUIRE_SPIN_LOCK(SpinLock * p)  { -    do { -        for (uint32_t i = 0; i < SPIN_COUNT; i++) { -            StgWord32 r = cas((StgVolatilePtr)&(p->lock), 1, 0); -            if (r != 0) return; -            IF_PROF_SPIN(__atomic_fetch_add(&p->spin, 1, __ATOMIC_RELAXED)); -            busy_wait_nop(); -        } -        IF_PROF_SPIN(__atomic_fetch_add(&p->yield, 1, __ATOMIC_RELAXED)); -        yieldThread(); -    } while (1); +    StgWord32 r = cas((StgVolatilePtr)&(p->lock), 1, 0); +    if (RTS_UNLIKELY(r == 0)) +        acquire_spin_lock_slow_path(p);  }  // release spin lock diff --git a/includes/rts/StablePtr.h b/includes/rts/StablePtr.h index f42c353d2b..56113b9f81 100644 --- a/includes/rts/StablePtr.h +++ b/includes/rts/StablePtr.h @@ -31,5 +31,9 @@ extern DLL_IMPORT_RTS spEntry *stable_ptr_table;  EXTERN_INLINE  StgPtr deRefStablePtr(StgStablePtr sp)  { -    return stable_ptr_table[(StgWord)sp].addr; +    // acquire load to ensure that we see the new SPT if it has been recently +    // enlarged. +    const spEntry *spt = ACQUIRE_LOAD(&stable_ptr_table); +    // acquire load to ensure that the referenced object is visible. 
+    return ACQUIRE_LOAD(&spt[(StgWord)sp].addr);  } diff --git a/includes/rts/storage/GC.h b/includes/rts/storage/GC.h index 9f4a0dde07..e8dc05048a 100644 --- a/includes/rts/storage/GC.h +++ b/includes/rts/storage/GC.h @@ -247,9 +247,9 @@ extern bool keepCAFs;  INLINE_HEADER void initBdescr(bdescr *bd, generation *gen, generation *dest)  { -    bd->gen     = gen; -    bd->gen_no  = gen->no; -    bd->dest_no = dest->no; +    RELAXED_STORE(&bd->gen, gen); +    RELAXED_STORE(&bd->gen_no, gen->no); +    RELAXED_STORE(&bd->dest_no, dest->no);  #if !IN_STG_CODE      /* See Note [RtsFlags is a pointer in STG code] */ diff --git a/includes/stg/SMP.h b/includes/stg/SMP.h index 9390c00eb1..8eff276e60 100644 --- a/includes/stg/SMP.h +++ b/includes/stg/SMP.h @@ -467,6 +467,7 @@ EXTERN_INLINE void load_load_barrier () {} /* nothing */  // Relaxed atomic operations  #define RELAXED_LOAD(ptr) *ptr  #define RELAXED_STORE(ptr,val) *ptr = val +#define RELAXED_ADD(ptr,val) *ptr += val  // Acquire/release atomic operations  #define ACQUIRE_LOAD(ptr) *ptr diff --git a/rts/Capability.h b/rts/Capability.h index bc2e48412a..8c5b1e814e 100644 --- a/rts/Capability.h +++ b/rts/Capability.h @@ -419,14 +419,16 @@ recordMutableCap (const StgClosure *p, Capability *cap, uint32_t gen)      //    ASSERT(cap->running_task == myTask());      // NO: assertion is violated by performPendingThrowTos()      bd = cap->mut_lists[gen]; -    if (bd->free >= bd->start + BLOCK_SIZE_W) { +    if (RELAXED_LOAD(&bd->free) >= bd->start + BLOCK_SIZE_W) {          bdescr *new_bd;          new_bd = allocBlockOnNode_lock(cap->node);          new_bd->link = bd; +        new_bd->free = new_bd->start;          bd = new_bd;          cap->mut_lists[gen] = bd;      } -    *bd->free++ = (StgWord)p; +    RELAXED_STORE(bd->free, (StgWord) p); +    NONATOMIC_ADD(&bd->free, 1);  }  EXTERN_INLINE void diff --git a/rts/SMPClosureOps.h b/rts/SMPClosureOps.h index c73821a782..3191a8c600 100644 --- a/rts/SMPClosureOps.h +++ b/rts/SMPClosureOps.h @@ -119,9 +119,8 @@ tryLockClosure(StgClosure *p)  EXTERN_INLINE void unlockClosure(StgClosure *p, const StgInfoTable *info)  { -    // This is a strictly ordered write, so we need a write_barrier(): -    write_barrier(); -    p->header.info = info; +    // This is a strictly ordered write, so we need a RELEASE ordering. +    RELEASE_STORE(&p->header.info, info);  }  #endif /* CMINUSMINUS */ diff --git a/rts/Schedule.c b/rts/Schedule.c index 52d89a08fb..b97da30848 100644 --- a/rts/Schedule.c +++ b/rts/Schedule.c @@ -435,7 +435,7 @@ run_thread:      RELAXED_STORE(&cap->interrupt, false);      cap->in_haskell = true; -    cap->idle = 0; +    RELAXED_STORE(&cap->idle, false);      dirty_TSO(cap,t);      dirty_STACK(cap,t->stackobj); @@ -1793,7 +1793,7 @@ scheduleDoGC (Capability **pcap, Task *task USED_IF_THREADS,          debugTrace(DEBUG_sched, "%d idle caps", n_idle_caps);          for (i=0; i < n_capabilities; i++) { -            capabilities[i]->idle++; +            NONATOMIC_ADD(&capabilities[i]->idle, 1);          }          // For all capabilities participating in this GC, wait until diff --git a/rts/SpinLock.c b/rts/SpinLock.c new file mode 100644 index 0000000000..5289694aa7 --- /dev/null +++ b/rts/SpinLock.c @@ -0,0 +1,41 @@ +/* ---------------------------------------------------------------------------- + * + * (c) The GHC Team, 2006-2009 + * + * Spin locks + * + * These are simple spin-only locks as opposed to Mutexes which + * probably spin for a while before blocking in the kernel.  
We use + * these when we are sure that all our threads are actively running on + * a CPU, eg. in the GC. + * + * TODO: measure whether we really need these, or whether Mutexes + * would do (and be a bit safer if a CPU becomes loaded). + * + * Do not #include this file directly: #include "Rts.h" instead. + * + * To understand the structure of the RTS headers, see the wiki: + *   https://gitlab.haskell.org/ghc/ghc/wikis/commentary/source-tree/includes + * + * -------------------------------------------------------------------------- */ + +#include "PosixSource.h" +#include "Rts.h" + +#if defined(THREADED_RTS) + +void acquire_spin_lock_slow_path(SpinLock * p) +{ +    do { +        for (uint32_t i = 0; i < SPIN_COUNT; i++) { +            StgWord32 r = cas((StgVolatilePtr)&(p->lock), 1, 0); +            if (r != 0) return; +            IF_PROF_SPIN(RELAXED_ADD(&p->spin, 1)); +            busy_wait_nop(); +        } +        IF_PROF_SPIN(RELAXED_ADD(&p->yield, 1)); +        yieldThread(); +    } while (1); +} + +#endif diff --git a/rts/StablePtr.c b/rts/StablePtr.c index edcd863183..469a17a5b9 100644 --- a/rts/StablePtr.c +++ b/rts/StablePtr.c @@ -191,9 +191,10 @@ enlargeStablePtrTable(void)      /* When using the threaded RTS, the update of stable_ptr_table is assumed to       * be atomic, so that another thread simultaneously dereferencing a stable -     * pointer will always read a valid address. +     * pointer will always read a valid address. Release ordering to ensure +     * that the new table is visible to others.       */ -    stable_ptr_table = new_stable_ptr_table; +    RELEASE_STORE(&stable_ptr_table, new_stable_ptr_table);      initSpEntryFreeList(stable_ptr_table + old_SPT_size, old_SPT_size, NULL);  } @@ -247,7 +248,7 @@ exitStablePtrTable(void)  STATIC_INLINE void  freeSpEntry(spEntry *sp)  { -    sp->addr = (P_)stable_ptr_free; +    RELAXED_STORE(&sp->addr, (P_)stable_ptr_free);      stable_ptr_free = sp;  } @@ -279,7 +280,7 @@ getStablePtr(StgPtr p)    if (!stable_ptr_free) enlargeStablePtrTable();    sp = stable_ptr_free - stable_ptr_table;    stable_ptr_free  = (spEntry*)(stable_ptr_free->addr); -  stable_ptr_table[sp].addr = p; +  RELAXED_STORE(&stable_ptr_table[sp].addr, p);    stablePtrUnlock();    return (StgStablePtr)(sp);  } diff --git a/rts/Updates.h b/rts/Updates.h index 608aaff524..aa5fbe0133 100644 --- a/rts/Updates.h +++ b/rts/Updates.h @@ -76,7 +76,6 @@ INLINE_HEADER void updateWithIndirection (Capability *cap,      /* not necessarily true: ASSERT( !closure_IND(p1) ); */      /* occurs in RaiseAsync.c:raiseAsync() */      /* See Note [Heap memory barriers] in SMP.h */ -    write_barrier();      bdescr *bd = Bdescr((StgPtr)p1);      if (bd->gen_no != 0) {        IF_NONMOVING_WRITE_BARRIER_ENABLED { @@ -88,9 +87,8 @@ INLINE_HEADER void updateWithIndirection (Capability *cap,          TICK_UPD_NEW_IND();      }      OVERWRITING_CLOSURE(p1); -    ((StgInd *)p1)->indirectee = p2; -    write_barrier(); -    SET_INFO(p1, &stg_BLACKHOLE_info); +    RELEASE_STORE(&((StgInd *)p1)->indirectee, p2); +    SET_INFO_RELEASE(p1, &stg_BLACKHOLE_info);      LDV_RECORD_CREATE(p1);  } diff --git a/rts/Weak.c b/rts/Weak.c index fe4516794a..0adf5a8b92 100644 --- a/rts/Weak.c +++ b/rts/Weak.c @@ -57,8 +57,7 @@ runAllCFinalizers(StgWeak *list)          // If there's no major GC between the time that the finalizer for the          // object from the oldest generation is manually called and shutdown          // we end up running the same finalizer twice. See #7170. 
-        const StgInfoTable *winfo = w->header.info; -        load_load_barrier(); +        const StgInfoTable *winfo = ACQUIRE_LOAD(&w->header.info);          if (winfo != &stg_DEAD_WEAK_info) {              runCFinalizers((StgCFinalizerList *)w->cfinalizers);          } @@ -93,10 +92,10 @@ scheduleFinalizers(Capability *cap, StgWeak *list)      StgWord size;      uint32_t n, i; -    // This assertion does not hold with non-moving collection because -    // non-moving collector does not wait for the list to be consumed (by -    // doIdleGcWork()) before appending the list with more finalizers. -    ASSERT(RtsFlags.GcFlags.useNonmoving || n_finalizers == 0); +    // n_finalizers is not necessarily zero under non-moving collection +    // because non-moving collector does not wait for the list to be consumed +    // (by doIdleGcWork()) before appending the list with more finalizers. +    ASSERT(RtsFlags.GcFlags.useNonmoving || SEQ_CST_LOAD(&n_finalizers) == 0);      // Append finalizer_list with the new list. TODO: Perhaps cache tail of the      // list for faster append. NOTE: We can't append `list` here! Otherwise we @@ -105,7 +104,7 @@ scheduleFinalizers(Capability *cap, StgWeak *list)      while (*tl) {          tl = &(*tl)->link;      } -    *tl = list; +    SEQ_CST_STORE(tl, list);      // Traverse the list and      //  * count the number of Haskell finalizers @@ -140,7 +139,7 @@ scheduleFinalizers(Capability *cap, StgWeak *list)          SET_HDR(w, &stg_DEAD_WEAK_info, w->header.prof.ccs);      } -    n_finalizers += i; +    SEQ_CST_ADD(&n_finalizers, i);      // No Haskell finalizers to run?      if (n == 0) return; @@ -226,7 +225,7 @@ static volatile StgWord finalizer_lock = 0;  //  bool runSomeFinalizers(bool all)  { -    if (n_finalizers == 0) +    if (RELAXED_LOAD(&n_finalizers) == 0)          return false;      if (cas(&finalizer_lock, 0, 1) != 0) { @@ -252,17 +251,15 @@ bool runSomeFinalizers(bool all)          if (!all && count >= finalizer_chunk) break;      } -    finalizer_list = w; -    n_finalizers -= count; +    RELAXED_STORE(&finalizer_list, w); +    SEQ_CST_ADD(&n_finalizers, -count);      if (task != NULL) {          task->running_finalizers = false;      }      debugTrace(DEBUG_sched, "ran %d C finalizers", count); - -    write_barrier(); -    finalizer_lock = 0; - -    return n_finalizers != 0; +    bool ret = n_finalizers != 0; +    RELEASE_STORE(&finalizer_lock, 0); +    return ret;  } diff --git a/rts/posix/OSThreads.c b/rts/posix/OSThreads.c index c51ccfcafb..6347e8ce7a 100644 --- a/rts/posix/OSThreads.c +++ b/rts/posix/OSThreads.c @@ -398,6 +398,14 @@ interruptOSThread (OSThreadId id)      pthread_kill(id, SIGPIPE);  } +void +joinOSThread (OSThreadId id) +{ +    if (pthread_join(id, NULL) != 0) { +        sysErrorBelch("joinOSThread: error %d", errno); +    } +} +  KernelThreadId kernelThreadId (void)  {  #if defined(linux_HOST_OS) diff --git a/rts/rts.cabal.in b/rts/rts.cabal.in index 08ebd3d7bf..12a4d68e4a 100644 --- a/rts/rts.cabal.in +++ b/rts/rts.cabal.in @@ -462,6 +462,7 @@ library                 STM.c                 Schedule.c                 Sparks.c +               SpinLock.c                 StableName.c                 StablePtr.c                 StaticPtrTable.c diff --git a/rts/sm/BlockAlloc.c b/rts/sm/BlockAlloc.c index 2bf497197e..451c182ac3 100644 --- a/rts/sm/BlockAlloc.c +++ b/rts/sm/BlockAlloc.c @@ -787,6 +787,26 @@ free_mega_group (bdescr *mg)  } +/* Note [Data races in freeGroup] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * freeGroup commits a 
rather serious concurrency sin in its block coalescence + * logic: When freeing a block it looks at bd->free of the previous/next block + * to see whether it is allocated. However, the free'ing thread likely does not + * own the previous/next block, nor do we make any attempt to synchronize with + * the thread that *does* own it; this makes this access a data race. + * + * The original design argued that this was correct because `bd->free` will + * only take a value of -1 when the block is free and thereby owned by the + * storage manager. However, this is nevertheless unsafe under the C11 data + * model, which guarantees no particular semantics for data races. + * + * We currently assume (and hope) we won't see torn values and consequently + * we will never see `bd->free == -1` for an allocated block which we do not + * own. However, this is all extremely dodgy. + * + * This is tracked as #18913. + */ +  void  freeGroup(bdescr *p)  { @@ -796,7 +816,7 @@ freeGroup(bdescr *p)    // not true in multithreaded GC:    // ASSERT_SM_LOCK(); -  ASSERT(p->free != (P_)-1); +  ASSERT(RELAXED_LOAD(&p->free) != (P_)-1);  #if defined(DEBUG)    for (uint32_t i=0; i < p->blocks; i++) { @@ -806,9 +826,9 @@ freeGroup(bdescr *p)    node = p->node; -  p->free = (void *)-1;  /* indicates that this block is free */ -  p->gen = NULL; -  p->gen_no = 0; +  RELAXED_STORE(&p->free, (void *) -1);  /* indicates that this block is free */ +  RELAXED_STORE(&p->gen, NULL); +  RELAXED_STORE(&p->gen_no, 0);    /* fill the block group with garbage if sanity checking is on */    IF_DEBUG(zero_on_gc, memset(p->start, 0xaa, (W_)p->blocks * BLOCK_SIZE)); @@ -834,7 +854,11 @@ freeGroup(bdescr *p)    {        bdescr *next;        next = p + p->blocks; -      if (next <= LAST_BDESCR(MBLOCK_ROUND_DOWN(p)) && next->free == (P_)-1) + +      // See Note [Data races in freeGroup]. +      TSAN_ANNOTATE_BENIGN_RACE(&next->free, "freeGroup"); +      if (next <= LAST_BDESCR(MBLOCK_ROUND_DOWN(p)) +          && RELAXED_LOAD(&next->free) == (P_)-1)        {            p->blocks += next->blocks;            ln = log_2(next->blocks); @@ -855,7 +879,9 @@ freeGroup(bdescr *p)        prev = p - 1;        if (prev->blocks == 0) prev = prev->link; // find the head -      if (prev->free == (P_)-1) +      // See Note [Data races in freeGroup]. +      TSAN_ANNOTATE_BENIGN_RACE(&prev->free, "freeGroup"); +      if (RELAXED_LOAD(&prev->free) == (P_)-1)        {            ln = log_2(prev->blocks);            dbl_link_remove(prev, &free_list[node][ln]); diff --git a/rts/sm/Evac.c b/rts/sm/Evac.c index 0ece06016a..b324a59179 100644 --- a/rts/sm/Evac.c +++ b/rts/sm/Evac.c @@ -171,7 +171,11 @@ copy_tag(StgClosure **p, const StgInfoTable *info,  #endif              return evacuate(p); // does the failed_to_evac stuff          } else { -            *p = TAG_CLOSURE(tag,(StgClosure*)to); +            // This doesn't need to have RELEASE ordering since we are guaranteed +            // to scavenge the to-space object on the current core therefore +            // no-one else will follow this pointer (FIXME: Is this true in +            // light of the selector optimization?). +            RELEASE_STORE(p, TAG_CLOSURE(tag,(StgClosure*)to));          }      }  #else @@ -206,9 +210,9 @@ copy_tag_nolock(StgClosure **p, const StgInfoTable *info,      // if somebody else reads the forwarding pointer, we better make      // sure there's a closure at the end of it. 
-    write_barrier(); -    *p = TAG_CLOSURE(tag,(StgClosure*)to); -    src->header.info = (const StgInfoTable *)MK_FORWARDING_PTR(to); +    RELEASE_STORE(p, TAG_CLOSURE(tag,(StgClosure*)to)); +    RELEASE_STORE(&src->header.info, \ +                  (const StgInfoTable *)MK_FORWARDING_PTR(to));  //  if (to+size+2 < bd->start + BLOCK_SIZE_W) {  //      __builtin_prefetch(to + size + 2, 1); @@ -245,7 +249,7 @@ spin:              goto spin;          }      if (IS_FORWARDING_PTR(info)) { -        src->header.info = (const StgInfoTable *)info; +        RELEASE_STORE(&src->header.info, (const StgInfoTable *)info);          evacuate(p); // does the failed_to_evac stuff          return false;      } @@ -261,9 +265,8 @@ spin:          to[i] = from[i];      } -    write_barrier(); -    *p = (StgClosure *)to; -    src->header.info = (const StgInfoTable*)MK_FORWARDING_PTR(to); +    RELEASE_STORE(p, (StgClosure *) to); +    RELEASE_STORE(&src->header.info, (const StgInfoTable*)MK_FORWARDING_PTR(to));  #if defined(PROFILING)      // We store the size of the just evacuated object in the LDV word so that @@ -306,12 +309,12 @@ evacuate_large(StgPtr p)    gen_workspace *ws;    bd = Bdescr(p); -  gen = bd->gen; -  gen_no = bd->gen_no; +  gen = RELAXED_LOAD(&bd->gen); +  gen_no = RELAXED_LOAD(&bd->gen_no);    ACQUIRE_SPIN_LOCK(&gen->sync);    // already evacuated? -  if (bd->flags & BF_EVACUATED) { +  if (RELAXED_LOAD(&bd->flags) & BF_EVACUATED) {      /* Don't forget to set the gct->failed_to_evac flag if we didn't get       * the desired destination (see comments in evacuate()).       */ @@ -344,9 +347,9 @@ evacuate_large(StgPtr p)    ws = &gct->gens[new_gen_no];    new_gen = &generations[new_gen_no]; -  bd->flags |= BF_EVACUATED; +  __atomic_fetch_or(&bd->flags, BF_EVACUATED, __ATOMIC_ACQ_REL);    if (RTS_UNLIKELY(RtsFlags.GcFlags.useNonmoving && new_gen == oldest_gen)) { -      bd->flags |= BF_NONMOVING; +      __atomic_fetch_or(&bd->flags, BF_NONMOVING, __ATOMIC_ACQ_REL);    }    initBdescr(bd, new_gen, new_gen->to); @@ -354,7 +357,7 @@ evacuate_large(StgPtr p)    // these objects, because they aren't allowed to contain any outgoing    // pointers.  For these blocks, we skip the scavenge stage and put    // them straight on the scavenged_large_objects list. -  if (bd->flags & BF_PINNED) { +  if (RELAXED_LOAD(&bd->flags) & BF_PINNED) {        ASSERT(get_itbl((StgClosure *)p)->type == ARR_WORDS);        if (new_gen != gen) { ACQUIRE_SPIN_LOCK(&new_gen->sync); } @@ -389,7 +392,7 @@ evacuate_static_object (StgClosure **link_field, StgClosure *q)          return;      } -    StgWord link = (StgWord)*link_field; +    StgWord link = RELAXED_LOAD((StgWord*) link_field);      // See Note [STATIC_LINK fields] for how the link field bits work      if (((link & STATIC_BITS) | prev_static_flag) != 3) { @@ -435,7 +438,7 @@ evacuate_compact (StgPtr p)      bd = Bdescr((StgPtr)str);      gen_no = bd->gen_no; -    if (bd->flags & BF_NONMOVING) { +    if (RELAXED_LOAD(&bd->flags) & BF_NONMOVING) {          // We may have evacuated the block to the nonmoving generation. If so          // we need to make sure it is added to the mark queue since the only          // reference to it may be from the moving heap. 
@@ -500,7 +503,7 @@ evacuate_compact (StgPtr p)      // in the GC, and that should never see blocks other than the first)      bd->flags |= BF_EVACUATED;      if (RTS_UNLIKELY(RtsFlags.GcFlags.useNonmoving && new_gen == oldest_gen)) { -        bd->flags |= BF_NONMOVING; +      __atomic_fetch_or(&bd->flags, BF_NONMOVING, __ATOMIC_RELAXED);      }      initBdescr(bd, new_gen, new_gen->to); @@ -581,7 +584,7 @@ evacuate(StgClosure **p)    const StgInfoTable *info;    StgWord tag; -  q = *p; +  q = RELAXED_LOAD(p);  loop:    /* The tag and the pointer are split, to be merged after evacing */ @@ -638,10 +641,11 @@ loop:    bd = Bdescr((P_)q); -  if ((bd->flags & (BF_LARGE | BF_MARKED | BF_EVACUATED | BF_COMPACT | BF_NONMOVING)) != 0) { +  uint16_t flags = RELAXED_LOAD(&bd->flags); +  if ((flags & (BF_LARGE | BF_MARKED | BF_EVACUATED | BF_COMPACT | BF_NONMOVING)) != 0) {        // Pointer to non-moving heap. Non-moving heap is collected using        // mark-sweep so this object should be marked and then retained in sweep. -      if (RTS_UNLIKELY(bd->flags & BF_NONMOVING)) { +      if (RTS_UNLIKELY(RELAXED_LOAD(&bd->flags) & BF_NONMOVING)) {            // NOTE: large objects in nonmoving heap are also marked with            // BF_NONMOVING. Those are moved to scavenged_large_objects list in            // mark phase. @@ -656,11 +660,11 @@ loop:        // happen often, but allowing it makes certain things a bit        // easier; e.g. scavenging an object is idempotent, so it's OK to        // have an object on the mutable list multiple times. -      if (bd->flags & BF_EVACUATED) { +      if (flags & BF_EVACUATED) {            // We aren't copying this object, so we have to check            // whether it is already in the target generation.  (this is            // the write barrier). -          if (bd->gen_no < gct->evac_gen_no) { +          if (RELAXED_LOAD(&bd->gen_no) < gct->evac_gen_no) {                gct->failed_to_evac = true;                TICK_GC_FAILED_PROMOTION();            } @@ -671,20 +675,20 @@ loop:        // right thing for objects that are half way in the middle of the first        // block of a compact (and would be treated as large objects even though        // they are not) -      if (bd->flags & BF_COMPACT) { +      if (flags & BF_COMPACT) {            evacuate_compact((P_)q);            return;        }        /* evacuate large objects by re-linking them onto a different list.         */ -      if (bd->flags & BF_LARGE) { +      if (flags & BF_LARGE) {            evacuate_large((P_)q);            // We may have evacuated the block to the nonmoving generation. If so            // we need to make sure it is added to the mark queue since the only            // reference to it may be from the moving heap. -          if (major_gc && bd->flags & BF_NONMOVING && !deadlock_detect_gc) { +          if (major_gc && flags & BF_NONMOVING && !deadlock_detect_gc) {                markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, q);            }            return; @@ -702,7 +706,7 @@ loop:    gen_no = bd->dest_no; -  info = q->header.info; +  info = ACQUIRE_LOAD(&q->header.info);    if (IS_FORWARDING_PTR(info))    {      /* Already evacuated, just return the forwarding address. @@ -722,9 +726,12 @@ loop:       * check if gen is too low.       
*/        StgClosure *e = (StgClosure*)UN_FORWARDING_PTR(info); -      *p = TAG_CLOSURE(tag,e); +      RELAXED_STORE(p, TAG_CLOSURE(tag,e));        if (gen_no < gct->evac_gen_no) {  // optimisation -          if (Bdescr((P_)e)->gen_no < gct->evac_gen_no) { +          // The ACQUIRE here is necessary to ensure that we see gen_no if the +          // evacuted object lives in a block newly-allocated by a GC thread on +          // another core. +          if (ACQUIRE_LOAD(&Bdescr((P_)e)->gen_no) < gct->evac_gen_no) {                gct->failed_to_evac = true;                TICK_GC_FAILED_PROMOTION();            } @@ -752,15 +759,17 @@ loop:        if (info == Czh_con_info &&            // unsigned, so always true:  (StgChar)w >= MIN_CHARLIKE &&            (StgChar)w <= MAX_CHARLIKE) { -          *p =  TAG_CLOSURE(tag, -                            (StgClosure *)CHARLIKE_CLOSURE((StgChar)w) -                           ); +          RELAXED_STORE(p, \ +                        TAG_CLOSURE(tag, \ +                                    (StgClosure *)CHARLIKE_CLOSURE((StgChar)w) +                                   ));        }        else if (info == Izh_con_info &&            (StgInt)w >= MIN_INTLIKE && (StgInt)w <= MAX_INTLIKE) { -          *p = TAG_CLOSURE(tag, -                             (StgClosure *)INTLIKE_CLOSURE((StgInt)w) -                             ); +          RELAXED_STORE(p, \ +                        TAG_CLOSURE(tag, \ +                                    (StgClosure *)INTLIKE_CLOSURE((StgInt)w) +                                   ));        }        else {            copy_tag_nolock(p,info,q,sizeofW(StgHeader)+1,gen_no,tag); @@ -814,10 +823,10 @@ loop:        const StgInfoTable *i;        r = ((StgInd*)q)->indirectee;        if (GET_CLOSURE_TAG(r) == 0) { -          i = r->header.info; +          i = ACQUIRE_LOAD(&r->header.info);            if (IS_FORWARDING_PTR(i)) {                r = (StgClosure *)UN_FORWARDING_PTR(i); -              i = r->header.info; +              i = ACQUIRE_LOAD(&r->header.info);            }            if (i == &stg_TSO_info                || i == &stg_WHITEHOLE_info @@ -842,7 +851,7 @@ loop:            ASSERT(i != &stg_IND_info);        }        q = r; -      *p = r; +      RELEASE_STORE(p, r);        goto loop;    } @@ -868,8 +877,8 @@ loop:    case IND:      // follow chains of indirections, don't evacuate them -    q = ((StgInd*)q)->indirectee; -    *p = q; +    q = RELAXED_LOAD(&((StgInd*)q)->indirectee); +    RELAXED_STORE(p, q);      goto loop;    case RET_BCO: @@ -983,11 +992,12 @@ evacuate_BLACKHOLE(StgClosure **p)      ASSERT(GET_CLOSURE_TAG(q) == 0);      bd = Bdescr((P_)q); +    const uint16_t flags = RELAXED_LOAD(&bd->flags);      // blackholes can't be in a compact -    ASSERT((bd->flags & BF_COMPACT) == 0); +    ASSERT((flags & BF_COMPACT) == 0); -    if (RTS_UNLIKELY(bd->flags & BF_NONMOVING)) { +    if (RTS_UNLIKELY(RELAXED_LOAD(&bd->flags) & BF_NONMOVING)) {          if (major_gc && !deadlock_detect_gc)              markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, q);          return; @@ -996,18 +1006,18 @@ evacuate_BLACKHOLE(StgClosure **p)      // blackholes *can* be in a large object: when raiseAsync() creates an      // AP_STACK the payload might be large enough to create a large object.      // See #14497. 
-    if (bd->flags & BF_LARGE) { +    if (flags & BF_LARGE) {          evacuate_large((P_)q);          return;      } -    if (bd->flags & BF_EVACUATED) { +    if (flags & BF_EVACUATED) {          if (bd->gen_no < gct->evac_gen_no) {              gct->failed_to_evac = true;              TICK_GC_FAILED_PROMOTION();          }          return;      } -    if (bd->flags & BF_MARKED) { +    if (flags & BF_MARKED) {          if (!is_marked((P_)q,bd)) {              mark((P_)q,bd);              push_mark_stack((P_)q); @@ -1015,13 +1025,13 @@ evacuate_BLACKHOLE(StgClosure **p)          return;      }      gen_no = bd->dest_no; -    info = q->header.info; +    info = ACQUIRE_LOAD(&q->header.info);      if (IS_FORWARDING_PTR(info))      {          StgClosure *e = (StgClosure*)UN_FORWARDING_PTR(info);          *p = e;          if (gen_no < gct->evac_gen_no) {  // optimisation -            if (Bdescr((P_)e)->gen_no < gct->evac_gen_no) { +            if (ACQUIRE_LOAD(&Bdescr((P_)e)->gen_no) < gct->evac_gen_no) {                  gct->failed_to_evac = true;                  TICK_GC_FAILED_PROMOTION();              } @@ -1090,13 +1100,11 @@ unchain_thunk_selectors(StgSelector *p, StgClosure *val)              // XXX we do not have BLACKHOLEs any more; replace with              // a THUNK_SELECTOR again.  This will go into a loop if it is              // entered, and should result in a NonTermination exception. -            ((StgThunk *)p)->payload[0] = val; -            write_barrier(); -            SET_INFO((StgClosure *)p, &stg_sel_0_upd_info); +            RELAXED_STORE(&((StgThunk *)p)->payload[0], val); +            SET_INFO_RELEASE((StgClosure *)p, &stg_sel_0_upd_info);          } else { -            ((StgInd *)p)->indirectee = val; -            write_barrier(); -            SET_INFO((StgClosure *)p, &stg_IND_info); +            RELAXED_STORE(&((StgInd *)p)->indirectee, val); +            SET_INFO_RELEASE((StgClosure *)p, &stg_IND_info);          }          // For the purposes of LDV profiling, we have created an @@ -1143,7 +1151,7 @@ selector_chain:          // save any space in any case, and updating with an indirection is          // trickier in a non-collected gen: we would have to update the          // mutable list. -        if (bd->flags & (BF_EVACUATED | BF_NONMOVING)) { +        if (RELAXED_LOAD(&bd->flags) & (BF_EVACUATED | BF_NONMOVING)) {              unchain_thunk_selectors(prev_thunk_selector, (StgClosure *)p);              *q = (StgClosure *)p;              // shortcut, behave as for:  if (evac) evacuate(q); @@ -1198,8 +1206,7 @@ selector_chain:              //     need the write-barrier stuff.              //   - undo the chain we've built to point to p.              SET_INFO((StgClosure *)p, (const StgInfoTable *)info_ptr); -            write_barrier(); -            *q = (StgClosure *)p; +            RELEASE_STORE(q, (StgClosure *) p);              if (evac) evacuate(q);              unchain_thunk_selectors(prev_thunk_selector, (StgClosure *)p);              return; @@ -1225,7 +1232,7 @@ selector_loop:      // from-space during marking, for example.  We rely on the property      // that evacuate() doesn't mind if it gets passed a to-space pointer. 
-    info = (StgInfoTable*)selectee->header.info; +    info = RELAXED_LOAD((StgInfoTable**) &selectee->header.info);      if (IS_FORWARDING_PTR(info)) {          // We don't follow pointers into to-space; the constructor @@ -1252,7 +1259,7 @@ selector_loop:                                            info->layout.payload.nptrs));                // Select the right field from the constructor -              StgClosure *val = selectee->payload[field]; +              StgClosure *val = RELAXED_LOAD(&selectee->payload[field]);  #if defined(PROFILING)                // For the purposes of LDV profiling, we have destroyed @@ -1278,19 +1285,19 @@ selector_loop:                // evaluating until we find the real value, and then                // update the whole chain to point to the value.            val_loop: -              info_ptr = (StgWord)UNTAG_CLOSURE(val)->header.info; +              info_ptr = ACQUIRE_LOAD((StgWord*) &UNTAG_CLOSURE(val)->header.info);                if (!IS_FORWARDING_PTR(info_ptr))                {                    info = INFO_PTR_TO_STRUCT((StgInfoTable *)info_ptr);                    switch (info->type) {                    case IND:                    case IND_STATIC: -                      val = ((StgInd *)val)->indirectee; +                      val = RELAXED_LOAD(&((StgInd *)val)->indirectee);                        goto val_loop;                    case THUNK_SELECTOR:                        // Use payload to make a list of thunk selectors, to be                        // used in unchain_thunk_selectors -                      ((StgClosure*)p)->payload[0] = (StgClosure *)prev_thunk_selector; +                      RELAXED_STORE(&((StgClosure*)p)->payload[0], (StgClosure *)prev_thunk_selector);                        prev_thunk_selector = p;                        p = (StgSelector*)val;                        goto selector_chain; @@ -1298,7 +1305,7 @@ selector_loop:                        break;                    }                } -              ((StgClosure*)p)->payload[0] = (StgClosure *)prev_thunk_selector; +              RELAXED_STORE(&((StgClosure*)p)->payload[0], (StgClosure *)prev_thunk_selector);                prev_thunk_selector = p;                *q = val; @@ -1320,22 +1327,22 @@ selector_loop:        case IND:        case IND_STATIC:            // Again, we might need to untag a constructor. -          selectee = UNTAG_CLOSURE( ((StgInd *)selectee)->indirectee ); +          selectee = UNTAG_CLOSURE( RELAXED_LOAD(&((StgInd *)selectee)->indirectee) );            goto selector_loop;        case BLACKHOLE:        {            StgClosure *r;            const StgInfoTable *i; -          r = ((StgInd*)selectee)->indirectee; +          r = ACQUIRE_LOAD(&((StgInd*)selectee)->indirectee);            // establish whether this BH has been updated, and is now an            // indirection, as in evacuate().            
if (GET_CLOSURE_TAG(r) == 0) { -              i = r->header.info; +              i = ACQUIRE_LOAD(&r->header.info);                if (IS_FORWARDING_PTR(i)) {                    r = (StgClosure *)UN_FORWARDING_PTR(i); -                  i = r->header.info; +                  i = RELAXED_LOAD(&r->header.info);                }                if (i == &stg_TSO_info                    || i == &stg_WHITEHOLE_info @@ -1346,7 +1353,7 @@ selector_loop:                ASSERT(i != &stg_IND_info);            } -          selectee = UNTAG_CLOSURE( ((StgInd *)selectee)->indirectee ); +          selectee = UNTAG_CLOSURE( RELAXED_LOAD(&((StgInd *)selectee)->indirectee) );            goto selector_loop;        } diff --git a/rts/sm/GC.c b/rts/sm/GC.c index 0fa927f2ad..8a8acb1b53 100644 --- a/rts/sm/GC.c +++ b/rts/sm/GC.c @@ -112,14 +112,8 @@ static W_ g0_pcnt_kept = 30; // percentage of g0 live at last minor GC  /* Mut-list stats */  #if defined(DEBUG) -uint32_t mutlist_MUTVARS, -    mutlist_MUTARRS, -    mutlist_MVARS, -    mutlist_TVAR, -    mutlist_TVAR_WATCH_QUEUE, -    mutlist_TREC_CHUNK, -    mutlist_TREC_HEADER, -    mutlist_OTHERS; +// For lack of a better option we protect mutlist_scav_stats with oldest_gen->sync +MutListScavStats mutlist_scav_stats;  #endif  /* Thread-local data for each GC thread @@ -184,6 +178,36 @@ bdescr *mark_stack_top_bd; // topmost block in the mark stack  bdescr *mark_stack_bd;     // current block in the mark stack  StgPtr mark_sp;            // pointer to the next unallocated mark stack entry + +/* ----------------------------------------------------------------------------- +   Statistics from mut_list scavenging +   -------------------------------------------------------------------------- */ + +#if defined(DEBUG) +void +zeroMutListScavStats(MutListScavStats *src) +{ +    memset(src, 0, sizeof(MutListScavStats)); +} + +void +addMutListScavStats(const MutListScavStats *src, +                    MutListScavStats *dest) +{ +#define ADD_STATS(field) dest->field += src->field; +    ADD_STATS(n_MUTVAR); +    ADD_STATS(n_MUTARR); +    ADD_STATS(n_MVAR); +    ADD_STATS(n_TVAR); +    ADD_STATS(n_TREC_CHUNK); +    ADD_STATS(n_TVAR_WATCH_QUEUE); +    ADD_STATS(n_TREC_HEADER); +    ADD_STATS(n_OTHERS); +#undef ADD_STATS +} +#endif /* DEBUG */ + +  /* -----------------------------------------------------------------------------     GarbageCollect: the main entry point to the garbage collector. 
@@ -250,14 +274,7 @@ GarbageCollect (uint32_t collect_gen,    stablePtrLock();  #if defined(DEBUG) -  mutlist_MUTVARS = 0; -  mutlist_MUTARRS = 0; -  mutlist_MVARS = 0; -  mutlist_TVAR = 0; -  mutlist_TVAR_WATCH_QUEUE = 0; -  mutlist_TREC_CHUNK = 0; -  mutlist_TREC_HEADER = 0; -  mutlist_OTHERS = 0; +  zeroMutListScavStats(&mutlist_scav_stats);  #endif    // attribute any costs to CCS_GC @@ -520,37 +537,37 @@ GarbageCollect (uint32_t collect_gen,        const gc_thread* thread;        for (i=0; i < n_gc_threads; i++) { -          copied += gc_threads[i]->copied; +          copied += RELAXED_LOAD(&gc_threads[i]->copied);        }        for (i=0; i < n_gc_threads; i++) {            thread = gc_threads[i];            if (n_gc_threads > 1) {                debugTrace(DEBUG_gc,"thread %d:", i);                debugTrace(DEBUG_gc,"   copied           %ld", -                         thread->copied * sizeof(W_)); +                         RELAXED_LOAD(&thread->copied) * sizeof(W_));                debugTrace(DEBUG_gc,"   scanned          %ld", -                         thread->scanned * sizeof(W_)); +                         RELAXED_LOAD(&thread->scanned) * sizeof(W_));                debugTrace(DEBUG_gc,"   any_work         %ld", -                         thread->any_work); +                         RELAXED_LOAD(&thread->any_work));                debugTrace(DEBUG_gc,"   no_work          %ld", -                         thread->no_work); +                         RELAXED_LOAD(&thread->no_work));                debugTrace(DEBUG_gc,"   scav_find_work %ld", -                         thread->scav_find_work); +                         RELAXED_LOAD(&thread->scav_find_work));  #if defined(THREADED_RTS) && defined(PROF_SPIN) -              gc_spin_spin += thread->gc_spin.spin; -              gc_spin_yield += thread->gc_spin.yield; -              mut_spin_spin += thread->mut_spin.spin; -              mut_spin_yield += thread->mut_spin.yield; +              gc_spin_spin += RELAXED_LOAD(&thread->gc_spin.spin); +              gc_spin_yield += RELAXED_LOAD(&thread->gc_spin.yield); +              mut_spin_spin += RELAXED_LOAD(&thread->mut_spin.spin); +              mut_spin_yield += RELAXED_LOAD(&thread->mut_spin.yield);  #endif -              any_work += thread->any_work; -              no_work += thread->no_work; -              scav_find_work += thread->scav_find_work; +              any_work += RELAXED_LOAD(&thread->any_work); +              no_work += RELAXED_LOAD(&thread->no_work); +              scav_find_work += RELAXED_LOAD(&thread->scav_find_work); -              par_max_copied = stg_max(gc_threads[i]->copied, par_max_copied); +              par_max_copied = stg_max(RELAXED_LOAD(&thread->copied), par_max_copied);                par_balanced_copied_acc += -                  stg_min(n_gc_threads * gc_threads[i]->copied, copied); +                  stg_min(n_gc_threads * RELAXED_LOAD(&thread->copied), copied);            }        }        if (n_gc_threads > 1) { @@ -590,10 +607,14 @@ GarbageCollect (uint32_t collect_gen,          debugTrace(DEBUG_gc,                     "mut_list_size: %lu (%d vars, %d arrays, %d MVARs, %d TVARs, %d TVAR_WATCH_QUEUEs, %d TREC_CHUNKs, %d TREC_HEADERs, %d others)",                     (unsigned long)(mut_list_size * sizeof(W_)), -                   mutlist_MUTVARS, mutlist_MUTARRS, mutlist_MVARS, -                   mutlist_TVAR, mutlist_TVAR_WATCH_QUEUE, -                   mutlist_TREC_CHUNK, mutlist_TREC_HEADER, -                   mutlist_OTHERS); +                   
mutlist_scav_stats.n_MUTVAR, +                   mutlist_scav_stats.n_MUTARR, +                   mutlist_scav_stats.n_MVAR, +                   mutlist_scav_stats.n_TVAR, +                   mutlist_scav_stats.n_TVAR_WATCH_QUEUE, +                   mutlist_scav_stats.n_TREC_CHUNK, +                   mutlist_scav_stats.n_TREC_HEADER, +                   mutlist_scav_stats.n_OTHERS);      }      bdescr *next, *prev; @@ -1109,7 +1130,7 @@ inc_running (void)  static StgWord  dec_running (void)  { -    ASSERT(gc_running_threads != 0); +    ASSERT(RELAXED_LOAD(&gc_running_threads) != 0);      return atomic_dec(&gc_running_threads);  } @@ -1119,7 +1140,7 @@ any_work (void)      int g;      gen_workspace *ws; -    gct->any_work++; +    NONATOMIC_ADD(&gct->any_work, 1);      write_barrier(); @@ -1152,7 +1173,7 @@ any_work (void)      }  #endif -    gct->no_work++; +    __atomic_fetch_add(&gct->no_work, 1, __ATOMIC_RELAXED);  #if defined(THREADED_RTS)      yieldThread();  #endif @@ -1193,7 +1214,7 @@ loop:      debugTrace(DEBUG_gc, "%d GC threads still running", r); -    while (gc_running_threads != 0) { +    while (SEQ_CST_LOAD(&gc_running_threads) != 0) {          // usleep(1);          if (any_work()) {              inc_running(); @@ -1230,7 +1251,7 @@ gcWorkerThread (Capability *cap)      //    measurements more accurate on Linux, perhaps because it syncs      //    the CPU time across the multiple cores.  Without this, CPU time      //    is heavily skewed towards GC rather than MUT. -    gct->wakeup = GC_THREAD_STANDING_BY; +    SEQ_CST_STORE(&gct->wakeup, GC_THREAD_STANDING_BY);      debugTrace(DEBUG_gc, "GC thread %d standing by...", gct->thread_index);      ACQUIRE_SPIN_LOCK(&gct->gc_spin); @@ -1257,10 +1278,13 @@ gcWorkerThread (Capability *cap)      // Wait until we're told to continue      RELEASE_SPIN_LOCK(&gct->gc_spin); -    gct->wakeup = GC_THREAD_WAITING_TO_CONTINUE;      debugTrace(DEBUG_gc, "GC thread %d waiting to continue...",                 gct->thread_index);      stat_endGCWorker (cap, gct); +    // This must come *after* stat_endGCWorker since it serves to +    // synchronize us with the GC leader, which will later aggregate the +    // GC statistics. 
+    SEQ_CST_STORE(&gct->wakeup, GC_THREAD_WAITING_TO_CONTINUE);      ACQUIRE_SPIN_LOCK(&gct->mut_spin);      debugTrace(DEBUG_gc, "GC thread %d on my way...", gct->thread_index); @@ -1285,7 +1309,7 @@ waitForGcThreads (Capability *cap USED_IF_THREADS, bool idle_cap[])      while(retry) {          for (i=0; i < n_threads; i++) {              if (i == me || idle_cap[i]) continue; -            if (gc_threads[i]->wakeup != GC_THREAD_STANDING_BY) { +            if (SEQ_CST_LOAD(&gc_threads[i]->wakeup) != GC_THREAD_STANDING_BY) {                  prodCapability(capabilities[i], cap->running_task);              }          } @@ -1295,7 +1319,7 @@ waitForGcThreads (Capability *cap USED_IF_THREADS, bool idle_cap[])                  if (i == me || idle_cap[i]) continue;                  write_barrier();                  interruptCapability(capabilities[i]); -                if (gc_threads[i]->wakeup != GC_THREAD_STANDING_BY) { +                if (SEQ_CST_LOAD(&gc_threads[i]->wakeup) != GC_THREAD_STANDING_BY) {                      retry = true;                  }              } @@ -1352,10 +1376,10 @@ wakeup_gc_threads (uint32_t me USED_IF_THREADS,          if (i == me || idle_cap[i]) continue;          inc_running();          debugTrace(DEBUG_gc, "waking up gc thread %d", i); -        if (gc_threads[i]->wakeup != GC_THREAD_STANDING_BY) +        if (SEQ_CST_LOAD(&gc_threads[i]->wakeup) != GC_THREAD_STANDING_BY)              barf("wakeup_gc_threads"); -        gc_threads[i]->wakeup = GC_THREAD_RUNNING; +        SEQ_CST_STORE(&gc_threads[i]->wakeup, GC_THREAD_RUNNING);          ACQUIRE_SPIN_LOCK(&gc_threads[i]->mut_spin);          RELEASE_SPIN_LOCK(&gc_threads[i]->gc_spin);      } @@ -1376,9 +1400,8 @@ shutdown_gc_threads (uint32_t me USED_IF_THREADS,      for (i=0; i < n_gc_threads; i++) {          if (i == me || idle_cap[i]) continue; -        while (gc_threads[i]->wakeup != GC_THREAD_WAITING_TO_CONTINUE) { +        while (SEQ_CST_LOAD(&gc_threads[i]->wakeup) != GC_THREAD_WAITING_TO_CONTINUE) {              busy_wait_nop(); -            write_barrier();          }      }  #endif @@ -1393,10 +1416,10 @@ releaseGCThreads (Capability *cap USED_IF_THREADS, bool idle_cap[])      uint32_t i;      for (i=0; i < n_threads; i++) {          if (i == me || idle_cap[i]) continue; -        if (gc_threads[i]->wakeup != GC_THREAD_WAITING_TO_CONTINUE) +        if (SEQ_CST_LOAD(&gc_threads[i]->wakeup) != GC_THREAD_WAITING_TO_CONTINUE)              barf("releaseGCThreads"); -        gc_threads[i]->wakeup = GC_THREAD_INACTIVE; +        SEQ_CST_STORE(&gc_threads[i]->wakeup, GC_THREAD_INACTIVE);          ACQUIRE_SPIN_LOCK(&gc_threads[i]->gc_spin);          RELEASE_SPIN_LOCK(&gc_threads[i]->mut_spin);      } @@ -1412,7 +1435,7 @@ static void  stash_mut_list (Capability *cap, uint32_t gen_no)  {      cap->saved_mut_lists[gen_no] = cap->mut_lists[gen_no]; -    cap->mut_lists[gen_no] = allocBlockOnNode_sync(cap->node); +    RELEASE_STORE(&cap->mut_lists[gen_no], allocBlockOnNode_sync(cap->node));  }  /* ---------------------------------------------------------------------------- @@ -1438,9 +1461,11 @@ prepare_collected_gen (generation *gen)          // mutable list always has at least one block; this means we can avoid          // a check for NULL in recordMutable().          
for (i = 0; i < n_capabilities; i++) { -            freeChain(capabilities[i]->mut_lists[g]); -            capabilities[i]->mut_lists[g] = -                allocBlockOnNode(capNoToNumaNode(i)); +            bdescr *old = RELAXED_LOAD(&capabilities[i]->mut_lists[g]); +            freeChain(old); + +            bdescr *new = allocBlockOnNode(capNoToNumaNode(i)); +            RELAXED_STORE(&capabilities[i]->mut_lists[g], new);          }      } @@ -1654,7 +1679,7 @@ collect_pinned_object_blocks (void)          bdescr *last = NULL;          if (use_nonmoving && gen == oldest_gen) {              // Mark objects as belonging to the nonmoving heap -            for (bdescr *bd = capabilities[n]->pinned_object_blocks; bd != NULL; bd = bd->link) { +            for (bdescr *bd = RELAXED_LOAD(&capabilities[n]->pinned_object_blocks); bd != NULL; bd = bd->link) {                  bd->flags |= BF_NONMOVING;                  bd->gen = oldest_gen;                  bd->gen_no = oldest_gen->no; @@ -1673,8 +1698,8 @@ collect_pinned_object_blocks (void)              if (gen->large_objects != NULL) {                  gen->large_objects->u.back = last;              } -            gen->large_objects = capabilities[n]->pinned_object_blocks; -            capabilities[n]->pinned_object_blocks = NULL; +            g0->large_objects = RELAXED_LOAD(&capabilities[n]->pinned_object_blocks); +            RELAXED_STORE(&capabilities[n]->pinned_object_blocks, NULL);          }      }  } diff --git a/rts/sm/GC.h b/rts/sm/GC.h index bde006913b..c5d5f6ac81 100644 --- a/rts/sm/GC.h +++ b/rts/sm/GC.h @@ -42,20 +42,32 @@ extern StgPtr mark_sp;  extern bool work_stealing; -#if defined(DEBUG) -extern uint32_t mutlist_MUTVARS, mutlist_MUTARRS, mutlist_MVARS, mutlist_OTHERS, -    mutlist_TVAR, -    mutlist_TVAR_WATCH_QUEUE, -    mutlist_TREC_CHUNK, -    mutlist_TREC_HEADER; -#endif -  #if defined(PROF_SPIN) && defined(THREADED_RTS)  extern volatile StgWord64 whitehole_gc_spin;  extern volatile StgWord64 waitForGcThreads_spin;  extern volatile StgWord64 waitForGcThreads_yield;  #endif +// mutable list scavenging statistics +#if defined(DEBUG) +typedef struct { +    StgWord n_MUTVAR; +    StgWord n_MUTARR; +    StgWord n_MVAR; +    StgWord n_TVAR; +    StgWord n_TREC_CHUNK; +    StgWord n_TVAR_WATCH_QUEUE; +    StgWord n_TREC_HEADER; +    StgWord n_OTHERS; +} MutListScavStats; + +extern MutListScavStats mutlist_scav_stats; + +void zeroMutListScavStats(MutListScavStats *src); +void addMutListScavStats(const MutListScavStats *src, +                         MutListScavStats *dest); +#endif /* DEBUG */ +  void gcWorkerThread (Capability *cap);  void initGcThreads (uint32_t from, uint32_t to);  void freeGcThreads (void); diff --git a/rts/sm/GCAux.c b/rts/sm/GCAux.c index 11080c1f22..55b4f99596 100644 --- a/rts/sm/GCAux.c +++ b/rts/sm/GCAux.c @@ -83,7 +83,7 @@ isAlive(StgClosure *p)          return p;      } -    info = q->header.info; +    info = RELAXED_LOAD(&q->header.info);      if (IS_FORWARDING_PTR(info)) {          // alive! 
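The DEBUG-only mutable-list counters are also restructured in this merge: the eight global `mutlist_*` counters become a single `MutListScavStats` record, each scavenging thread accumulates into a local copy (the `// Local accumulator` in `scavenge_mutable_list` below), and the comment added to `GC.c` notes that the shared `mutlist_scav_stats` is protected by `oldest_gen->sync`. The stand-alone sketch below illustrates that accumulate-locally-then-merge-under-a-lock pattern; the pthread mutex, the driver code and the merge call site are assumptions for illustration, not GHC's actual synchronisation (the captured diff is cut off before the point where the local accumulator would be folded back in).

```c
/* Hedged sketch of the accumulate-locally, merge-under-a-lock pattern used
 * for the new MutListScavStats.  A pthread mutex stands in for the
 * oldest_gen->sync spin lock mentioned in GC.c; field names follow the
 * struct added to rts/sm/GC.h. */
#include <pthread.h>
#include <stdio.h>
#include <string.h>

typedef struct {
    unsigned long n_MUTVAR, n_MUTARR, n_MVAR, n_TVAR,
                  n_TREC_CHUNK, n_TVAR_WATCH_QUEUE, n_TREC_HEADER, n_OTHERS;
} MutListScavStats;

static MutListScavStats mutlist_scav_stats;      /* shared, lock-protected */
static pthread_mutex_t  stats_lock = PTHREAD_MUTEX_INITIALIZER;

static void addMutListScavStats(const MutListScavStats *src,
                                MutListScavStats *dest)
{
#define ADD_STATS(f) dest->f += src->f;
    ADD_STATS(n_MUTVAR) ADD_STATS(n_MUTARR) ADD_STATS(n_MVAR) ADD_STATS(n_TVAR)
    ADD_STATS(n_TREC_CHUNK) ADD_STATS(n_TVAR_WATCH_QUEUE)
    ADD_STATS(n_TREC_HEADER) ADD_STATS(n_OTHERS)
#undef ADD_STATS
}

/* Each scavenging thread keeps a private accumulator for the duration of
 * its mutable-list walk and merges it into the shared record exactly once,
 * so the hot loop touches no shared memory. */
static void scavenge_one_mut_list(unsigned long mutvars_seen)
{
    MutListScavStats local;
    memset(&local, 0, sizeof(local));            /* cf. zeroMutListScavStats */
    local.n_MUTVAR += mutvars_seen;              /* stand-in for real work   */

    pthread_mutex_lock(&stats_lock);
    addMutListScavStats(&local, &mutlist_scav_stats);
    pthread_mutex_unlock(&stats_lock);
}

int main(void)
{
    scavenge_one_mut_list(3);
    scavenge_one_mut_list(4);
    printf("MUT_VARs scavenged: %lu\n", mutlist_scav_stats.n_MUTVAR);
    return 0;
}
```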
diff --git a/rts/sm/GCUtils.c b/rts/sm/GCUtils.c index 02c26ddf5e..d58fdc48ae 100644 --- a/rts/sm/GCUtils.c +++ b/rts/sm/GCUtils.c @@ -249,8 +249,8 @@ todo_block_full (uint32_t size, gen_workspace *ws)          return p;      } -    gct->copied += ws->todo_free - bd->free; -    bd->free = ws->todo_free; +    gct->copied += ws->todo_free - RELAXED_LOAD(&bd->free); +    RELAXED_STORE(&bd->free, ws->todo_free);      ASSERT(bd->u.scan >= bd->start && bd->u.scan <= bd->free); @@ -330,10 +330,11 @@ alloc_todo_block (gen_workspace *ws, uint32_t size)                  gct->free_blocks = bd->link;              }          } -        // blocks in to-space get the BF_EVACUATED flag. -        bd->flags = BF_EVACUATED; -        bd->u.scan = bd->start;          initBdescr(bd, ws->gen, ws->gen->to); +        RELAXED_STORE(&bd->u.scan, RELAXED_LOAD(&bd->start)); +        // blocks in to-space get the BF_EVACUATED flag. +        // RELEASE here to ensure that bd->gen is visible to other cores. +        RELEASE_STORE(&bd->flags, BF_EVACUATED);      }      bd->link = NULL; @@ -345,7 +346,7 @@ alloc_todo_block (gen_workspace *ws, uint32_t size)                       // See Note [big objects]      debugTrace(DEBUG_gc, "alloc new todo block %p for gen  %d", -               bd->free, ws->gen->no); +               RELAXED_LOAD(&bd->free), ws->gen->no);      return ws->todo_free;  } diff --git a/rts/sm/GCUtils.h b/rts/sm/GCUtils.h index a71d6dcb92..798a795deb 100644 --- a/rts/sm/GCUtils.h +++ b/rts/sm/GCUtils.h @@ -67,7 +67,9 @@ recordMutableGen_GC (StgClosure *p, uint32_t gen_no)          bd = new_bd;          gct->mut_lists[gen_no] = bd;      } -    *bd->free++ = (StgWord)p; +    *bd->free++ = (StgWord) p; +    // N.B. we are allocating into our Capability-local mut_list, therefore +    // we don't need an atomic increment.  }  #include "EndPrivate.h" diff --git a/rts/sm/MarkWeak.c b/rts/sm/MarkWeak.c index 65b1338f10..b8d120823c 100644 --- a/rts/sm/MarkWeak.c +++ b/rts/sm/MarkWeak.c @@ -414,14 +414,13 @@ markWeakPtrList ( void )          StgWeak *w, **last_w;          last_w = &gen->weak_ptr_list; -        for (w = gen->weak_ptr_list; w != NULL; w = w->link) { +        for (w = gen->weak_ptr_list; w != NULL; w = RELAXED_LOAD(&w->link)) {              // w might be WEAK, EVACUATED, or DEAD_WEAK (actually CON_STATIC) here  #if defined(DEBUG)              {   // careful to do this assertion only reading the info ptr                  // once, because during parallel GC it might change under our feet. 
-                const StgInfoTable *info; -                info = w->header.info; +                const StgInfoTable *info = RELAXED_LOAD(&w->header.info);                  ASSERT(IS_FORWARDING_PTR(info)                         || info == &stg_DEAD_WEAK_info                         || INFO_PTR_TO_STRUCT(info)->type == WEAK); diff --git a/rts/sm/NonMoving.c b/rts/sm/NonMoving.c index 388ceae2fd..3eafd6be98 100644 --- a/rts/sm/NonMoving.c +++ b/rts/sm/NonMoving.c @@ -726,6 +726,7 @@ void nonmovingStop(void)                     "waiting for nonmoving collector thread to terminate");          ACQUIRE_LOCK(&concurrent_coll_finished_lock);          waitCondition(&concurrent_coll_finished, &concurrent_coll_finished_lock); +        joinOSThread(mark_thread);      }  #endif  } diff --git a/rts/sm/Scav.c b/rts/sm/Scav.c index dd9a96adf8..9fe2c6006e 100644 --- a/rts/sm/Scav.c +++ b/rts/sm/Scav.c @@ -65,6 +65,8 @@  #include "sm/NonMoving.h" // for nonmoving_set_closure_mark_bit  #include "sm/NonMovingScav.h" +#include <string.h> /* for memset */ +  static void scavenge_large_bitmap (StgPtr p,                                     StgLargeBitmap *large_bitmap,                                     StgWord size ); @@ -201,9 +203,9 @@ scavenge_compact(StgCompactNFData *str)      gct->eager_promotion = saved_eager;      if (gct->failed_to_evac) { -        ((StgClosure *)str)->header.info = &stg_COMPACT_NFDATA_DIRTY_info; +        RELEASE_STORE(&((StgClosure *)str)->header.info, &stg_COMPACT_NFDATA_DIRTY_info);      } else { -        ((StgClosure *)str)->header.info = &stg_COMPACT_NFDATA_CLEAN_info; +        RELEASE_STORE(&((StgClosure *)str)->header.info, &stg_COMPACT_NFDATA_CLEAN_info);      }  } @@ -464,9 +466,9 @@ scavenge_block (bdescr *bd)          gct->eager_promotion = saved_eager_promotion;          if (gct->failed_to_evac) { -            mvar->header.info = &stg_MVAR_DIRTY_info; +            RELEASE_STORE(&mvar->header.info, &stg_MVAR_DIRTY_info);          } else { -            mvar->header.info = &stg_MVAR_CLEAN_info; +            RELEASE_STORE(&mvar->header.info, &stg_MVAR_CLEAN_info);          }          p += sizeofW(StgMVar);          break; @@ -481,9 +483,9 @@ scavenge_block (bdescr *bd)          gct->eager_promotion = saved_eager_promotion;          if (gct->failed_to_evac) { -            tvar->header.info = &stg_TVAR_DIRTY_info; +            RELEASE_STORE(&tvar->header.info, &stg_TVAR_DIRTY_info);          } else { -            tvar->header.info = &stg_TVAR_CLEAN_info; +            RELEASE_STORE(&tvar->header.info, &stg_TVAR_CLEAN_info);          }          p += sizeofW(StgTVar);          break; @@ -615,9 +617,9 @@ scavenge_block (bdescr *bd)          gct->eager_promotion = saved_eager_promotion;          if (gct->failed_to_evac) { -            ((StgClosure *)q)->header.info = &stg_MUT_VAR_DIRTY_info; +            RELEASE_STORE(&((StgClosure *) q)->header.info, &stg_MUT_VAR_DIRTY_info);          } else { -            ((StgClosure *)q)->header.info = &stg_MUT_VAR_CLEAN_info; +            RELEASE_STORE(&((StgClosure *) q)->header.info, &stg_MUT_VAR_CLEAN_info);          }          p += sizeofW(StgMutVar);          break; @@ -634,9 +636,9 @@ scavenge_block (bdescr *bd)          gct->eager_promotion = saved_eager_promotion;          if (gct->failed_to_evac) { -            bq->header.info = &stg_BLOCKING_QUEUE_DIRTY_info; +            RELEASE_STORE(&bq->header.info, &stg_BLOCKING_QUEUE_DIRTY_info);          } else { -            bq->header.info = &stg_BLOCKING_QUEUE_CLEAN_info; +            
RELEASE_STORE(&bq->header.info, &stg_BLOCKING_QUEUE_CLEAN_info);          }          p += sizeofW(StgBlockingQueue);          break; @@ -686,9 +688,9 @@ scavenge_block (bdescr *bd)          p = scavenge_mut_arr_ptrs((StgMutArrPtrs*)p);          if (gct->failed_to_evac) { -            ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_DIRTY_info; +            RELEASE_STORE(&((StgClosure *) q)->header.info, &stg_MUT_ARR_PTRS_DIRTY_info);          } else { -            ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_CLEAN_info; +            RELEASE_STORE(&((StgClosure *) q)->header.info, &stg_MUT_ARR_PTRS_CLEAN_info);          }          gct->eager_promotion = saved_eager_promotion; @@ -703,9 +705,9 @@ scavenge_block (bdescr *bd)          p = scavenge_mut_arr_ptrs((StgMutArrPtrs*)p);          if (gct->failed_to_evac) { -            ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_FROZEN_DIRTY_info; +            RELEASE_STORE(&((StgClosure *) q)->header.info, &stg_MUT_ARR_PTRS_FROZEN_DIRTY_info);          } else { -            ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_FROZEN_CLEAN_info; +            RELEASE_STORE(&((StgClosure *) q)->header.info, &stg_MUT_ARR_PTRS_FROZEN_CLEAN_info);          }          break;      } @@ -728,9 +730,9 @@ scavenge_block (bdescr *bd)          gct->eager_promotion = saved_eager_promotion;          if (gct->failed_to_evac) { -            ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_DIRTY_info; +            RELEASE_STORE(&((StgClosure *) q)->header.info, &stg_SMALL_MUT_ARR_PTRS_DIRTY_info);          } else { -            ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_CLEAN_info; +            RELEASE_STORE(&((StgClosure *) q)->header.info, &stg_SMALL_MUT_ARR_PTRS_CLEAN_info);          }          gct->failed_to_evac = true; // always put it on the mutable list. 
@@ -749,9 +751,9 @@ scavenge_block (bdescr *bd)          }          if (gct->failed_to_evac) { -            ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_FROZEN_DIRTY_info; +            RELEASE_STORE(&((StgClosure *) q)->header.info, &stg_SMALL_MUT_ARR_PTRS_FROZEN_DIRTY_info);          } else { -            ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_FROZEN_CLEAN_info; +            RELEASE_STORE(&((StgClosure *) q)->header.info, &stg_SMALL_MUT_ARR_PTRS_FROZEN_CLEAN_info);          }          break;      } @@ -834,7 +836,7 @@ scavenge_block (bdescr *bd)    if (p > bd->free)  {        gct->copied += ws->todo_free - bd->free; -      bd->free = p; +      RELEASE_STORE(&bd->free, p);    }    debugTrace(DEBUG_gc, "   scavenged %ld bytes", @@ -889,9 +891,9 @@ scavenge_mark_stack(void)              gct->eager_promotion = saved_eager_promotion;              if (gct->failed_to_evac) { -                mvar->header.info = &stg_MVAR_DIRTY_info; +                RELEASE_STORE(&mvar->header.info, &stg_MVAR_DIRTY_info);              } else { -                mvar->header.info = &stg_MVAR_CLEAN_info; +                RELEASE_STORE(&mvar->header.info, &stg_MVAR_CLEAN_info);              }              break;          } @@ -905,9 +907,9 @@ scavenge_mark_stack(void)              gct->eager_promotion = saved_eager_promotion;              if (gct->failed_to_evac) { -                tvar->header.info = &stg_TVAR_DIRTY_info; +                RELEASE_STORE(&tvar->header.info, &stg_TVAR_DIRTY_info);              } else { -                tvar->header.info = &stg_TVAR_CLEAN_info; +                RELEASE_STORE(&tvar->header.info, &stg_TVAR_CLEAN_info);              }              break;          } @@ -1011,9 +1013,9 @@ scavenge_mark_stack(void)              gct->eager_promotion = saved_eager_promotion;              if (gct->failed_to_evac) { -                ((StgClosure *)q)->header.info = &stg_MUT_VAR_DIRTY_info; +                RELEASE_STORE(&((StgClosure *) q)->header.info, &stg_MUT_VAR_DIRTY_info);              } else { -                ((StgClosure *)q)->header.info = &stg_MUT_VAR_CLEAN_info; +                RELEASE_STORE(&((StgClosure *) q)->header.info, &stg_MUT_VAR_CLEAN_info);              }              break;          } @@ -1030,9 +1032,9 @@ scavenge_mark_stack(void)              gct->eager_promotion = saved_eager_promotion;              if (gct->failed_to_evac) { -                bq->header.info = &stg_BLOCKING_QUEUE_DIRTY_info; +                RELEASE_STORE(&bq->header.info, &stg_BLOCKING_QUEUE_DIRTY_info);              } else { -                bq->header.info = &stg_BLOCKING_QUEUE_CLEAN_info; +                RELEASE_STORE(&bq->header.info, &stg_BLOCKING_QUEUE_CLEAN_info);              }              break;          } @@ -1078,9 +1080,9 @@ scavenge_mark_stack(void)              scavenge_mut_arr_ptrs((StgMutArrPtrs *)p);              if (gct->failed_to_evac) { -                ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_DIRTY_info; +                RELEASE_STORE(&((StgClosure *) q)->header.info, &stg_MUT_ARR_PTRS_DIRTY_info);              } else { -                ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_CLEAN_info; +                RELEASE_STORE(&((StgClosure *) q)->header.info, &stg_MUT_ARR_PTRS_CLEAN_info);              }              gct->eager_promotion = saved_eager_promotion; @@ -1097,9 +1099,9 @@ scavenge_mark_stack(void)              scavenge_mut_arr_ptrs((StgMutArrPtrs *)p);              if (gct->failed_to_evac) { -                ((StgClosure 
@@ -1097,9 +1099,9 @@ scavenge_mark_stack(void)
             scavenge_mut_arr_ptrs((StgMutArrPtrs *)p);
 
             if (gct->failed_to_evac) {
-                ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_FROZEN_DIRTY_info;
+                RELEASE_STORE(&((StgClosure *) q)->header.info, &stg_MUT_ARR_PTRS_FROZEN_DIRTY_info);
             } else {
-                ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_FROZEN_CLEAN_info;
+                RELEASE_STORE(&((StgClosure *) q)->header.info, &stg_MUT_ARR_PTRS_FROZEN_CLEAN_info);
             }
             break;
         }
@@ -1124,9 +1126,9 @@ scavenge_mark_stack(void)
             gct->eager_promotion = saved_eager;
 
             if (gct->failed_to_evac) {
-                ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_DIRTY_info;
+                RELEASE_STORE(&((StgClosure *)q)->header.info, &stg_SMALL_MUT_ARR_PTRS_DIRTY_info);
             } else {
-                ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_CLEAN_info;
+                RELEASE_STORE(&((StgClosure *)q)->header.info, &stg_SMALL_MUT_ARR_PTRS_CLEAN_info);
             }
 
             gct->failed_to_evac = true; // mutable anyhow.
@@ -1145,9 +1147,9 @@ scavenge_mark_stack(void)
             }
 
             if (gct->failed_to_evac) {
-                ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_FROZEN_DIRTY_info;
+                RELEASE_STORE(&((StgClosure *)q)->header.info, &stg_SMALL_MUT_ARR_PTRS_FROZEN_DIRTY_info);
             } else {
-                ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_FROZEN_CLEAN_info;
+                RELEASE_STORE(&((StgClosure *)q)->header.info, &stg_SMALL_MUT_ARR_PTRS_FROZEN_CLEAN_info);
             }
             break;
         }
@@ -1251,9 +1253,9 @@ scavenge_one(StgPtr p)
         gct->eager_promotion = saved_eager_promotion;
 
         if (gct->failed_to_evac) {
-            mvar->header.info = &stg_MVAR_DIRTY_info;
+            RELEASE_STORE(&mvar->header.info, &stg_MVAR_DIRTY_info);
         } else {
-            mvar->header.info = &stg_MVAR_CLEAN_info;
+            RELEASE_STORE(&mvar->header.info, &stg_MVAR_CLEAN_info);
         }
         break;
     }
@@ -1267,9 +1269,9 @@ scavenge_one(StgPtr p)
         gct->eager_promotion = saved_eager_promotion;
 
         if (gct->failed_to_evac) {
-            tvar->header.info = &stg_TVAR_DIRTY_info;
+            RELEASE_STORE(&tvar->header.info, &stg_TVAR_DIRTY_info);
         } else {
-            tvar->header.info = &stg_TVAR_CLEAN_info;
+            RELEASE_STORE(&tvar->header.info, &stg_TVAR_CLEAN_info);
         }
         break;
     }
@@ -1331,9 +1333,9 @@ scavenge_one(StgPtr p)
         gct->eager_promotion = saved_eager_promotion;
 
         if (gct->failed_to_evac) {
-            ((StgClosure *)q)->header.info = &stg_MUT_VAR_DIRTY_info;
+            RELEASE_STORE(&((StgClosure *)q)->header.info, &stg_MUT_VAR_DIRTY_info);
         } else {
-            ((StgClosure *)q)->header.info = &stg_MUT_VAR_CLEAN_info;
+            RELEASE_STORE(&((StgClosure *)q)->header.info, &stg_MUT_VAR_CLEAN_info);
         }
         break;
     }
@@ -1350,9 +1352,9 @@ scavenge_one(StgPtr p)
         gct->eager_promotion = saved_eager_promotion;
 
         if (gct->failed_to_evac) {
-            bq->header.info = &stg_BLOCKING_QUEUE_DIRTY_info;
+            RELEASE_STORE(&bq->header.info, &stg_BLOCKING_QUEUE_DIRTY_info);
         } else {
-            bq->header.info = &stg_BLOCKING_QUEUE_CLEAN_info;
+            RELEASE_STORE(&bq->header.info, &stg_BLOCKING_QUEUE_CLEAN_info);
         }
         break;
     }
@@ -1398,9 +1400,9 @@ scavenge_one(StgPtr p)
         scavenge_mut_arr_ptrs((StgMutArrPtrs *)p);
 
         if (gct->failed_to_evac) {
-            ((StgClosure *)p)->header.info = &stg_MUT_ARR_PTRS_DIRTY_info;
+            RELEASE_STORE(&((StgClosure *)p)->header.info, &stg_MUT_ARR_PTRS_DIRTY_info);
         } else {
-            ((StgClosure *)p)->header.info = &stg_MUT_ARR_PTRS_CLEAN_info;
+            RELEASE_STORE(&((StgClosure *)p)->header.info, &stg_MUT_ARR_PTRS_CLEAN_info);
         }
 
         gct->eager_promotion = saved_eager_promotion;
@@ -1415,9 +1417,9 @@ scavenge_one(StgPtr p)
         scavenge_mut_arr_ptrs((StgMutArrPtrs *)p);
 
         if (gct->failed_to_evac) {
-            ((StgClosure *)p)->header.info = &stg_MUT_ARR_PTRS_FROZEN_DIRTY_info;
+            RELEASE_STORE(&((StgClosure *)p)->header.info, &stg_MUT_ARR_PTRS_FROZEN_DIRTY_info);
         } else {
-            ((StgClosure *)p)->header.info = &stg_MUT_ARR_PTRS_FROZEN_CLEAN_info;
+            RELEASE_STORE(&((StgClosure *)p)->header.info, &stg_MUT_ARR_PTRS_FROZEN_CLEAN_info);
         }
         break;
     }
@@ -1442,9 +1444,9 @@ scavenge_one(StgPtr p)
         gct->eager_promotion = saved_eager;
 
         if (gct->failed_to_evac) {
-            ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_DIRTY_info;
+            RELEASE_STORE(&((StgClosure *)q)->header.info, &stg_SMALL_MUT_ARR_PTRS_DIRTY_info);
         } else {
-            ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_CLEAN_info;
+            RELEASE_STORE(&((StgClosure *)q)->header.info, &stg_SMALL_MUT_ARR_PTRS_CLEAN_info);
         }
 
         gct->failed_to_evac = true;
@@ -1463,9 +1465,9 @@ scavenge_one(StgPtr p)
         }
 
         if (gct->failed_to_evac) {
-            ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_FROZEN_DIRTY_info;
+            RELEASE_STORE(&((StgClosure *)q)->header.info, &stg_SMALL_MUT_ARR_PTRS_FROZEN_DIRTY_info);
         } else {
-            ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_FROZEN_CLEAN_info;
+            RELEASE_STORE(&((StgClosure *)q)->header.info, &stg_SMALL_MUT_ARR_PTRS_FROZEN_CLEAN_info);
         }
         break;
     }
@@ -1583,6 +1585,10 @@ static void
 scavenge_mutable_list(bdescr *bd, generation *gen)
 {
     StgPtr p, q;
+#if defined(DEBUG)
+    MutListScavStats stats; // Local accumulator
+    zeroMutListScavStats(&stats);
+#endif
     uint32_t gen_no = gen->no;
 
     gct->evac_gen_no = gen_no;
@@ -1598,31 +1604,31 @@ scavenge_mutable_list(bdescr *bd, generation *gen)
             case MUT_VAR_CLEAN:
                 // can happen due to concurrent writeMutVars
             case MUT_VAR_DIRTY:
-                mutlist_MUTVARS++; break;
+                stats.n_MUTVAR++; break;
             case MUT_ARR_PTRS_CLEAN:
             case MUT_ARR_PTRS_DIRTY:
             case MUT_ARR_PTRS_FROZEN_CLEAN:
             case MUT_ARR_PTRS_FROZEN_DIRTY:
-                mutlist_MUTARRS++; break;
+                stats.n_MUTARR++; break;
             case MVAR_CLEAN:
                 barf("MVAR_CLEAN on mutable list");
             case MVAR_DIRTY:
-                mutlist_MVARS++; break;
+                stats.n_MVAR++; break;
             case TVAR:
-                mutlist_TVAR++; break;
+                stats.n_TVAR++; break;
             case TREC_CHUNK:
-                mutlist_TREC_CHUNK++; break;
+                stats.n_TREC_CHUNK++; break;
             case MUT_PRIM:
                 pinfo = ((StgClosure*)p)->header.info;
                 if (pinfo == &stg_TVAR_WATCH_QUEUE_info)
-                    mutlist_TVAR_WATCH_QUEUE++;
+                    stats.n_TVAR_WATCH_QUEUE++;
                 else if (pinfo == &stg_TREC_HEADER_info)
-                    mutlist_TREC_HEADER++;
+                    stats.n_TREC_HEADER++;
                 else
-                    mutlist_OTHERS++;
+                    stats.n_OTHERS++;
                 break;
             default:
-                mutlist_OTHERS++; break;
+                stats.n_OTHERS++; break;
             }
 #endif
 
@@ -1647,9 +1653,9 @@ scavenge_mutable_list(bdescr *bd, generation *gen)
                 scavenge_mut_arr_ptrs_marked((StgMutArrPtrs *)p);
 
                 if (gct->failed_to_evac) {
-                    ((StgClosure *)p)->header.info = &stg_MUT_ARR_PTRS_DIRTY_info;
+                    RELEASE_STORE(&((StgClosure *)p)->header.info, &stg_MUT_ARR_PTRS_DIRTY_info);
                 } else {
-                    ((StgClosure *)p)->header.info = &stg_MUT_ARR_PTRS_CLEAN_info;
+                    RELEASE_STORE(&((StgClosure *)p)->header.info, &stg_MUT_ARR_PTRS_CLEAN_info);
                 }
 
                 gct->eager_promotion = saved_eager_promotion;
@@ -1671,6 +1677,13 @@ scavenge_mutable_list(bdescr *bd, generation *gen)
             }
         }
     }
+
+#if defined(DEBUG)
+    // For lack of a better option we protect mutlist_scav_stats with oldest_gen->sync
+    ACQUIRE_SPIN_LOCK(&oldest_gen->sync);
+    addMutListScavStats(&stats, &mutlist_scav_stats);
+    RELEASE_SPIN_LOCK(&oldest_gen->sync);
+#endif
 }
 
 void
@@ -1740,8 +1753,9 @@ scavenge_static(void)
     /* Take this object *off* the static_objects list,
      * and put it on the scavenged_static_objects list.
      */
-    gct->static_objects = *STATIC_LINK(info,p);
-    *STATIC_LINK(info,p) = gct->scavenged_static_objects;
+    StgClosure **link = STATIC_LINK(info,p);
+    gct->static_objects = RELAXED_LOAD(link);
+    RELAXED_STORE(link, gct->scavenged_static_objects);
     gct->scavenged_static_objects = flagged_p;
 
     switch (info -> type) {
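
The DEBUG-only census in scavenge_mutable_list above no longer bumps global mutlist counters directly, which would race once several GC threads walk their mutable lists; each thread instead accumulates into a stack-allocated MutListScavStats and folds it into the shared copy once, under oldest_gen->sync. A compressed sketch of that accumulate-then-merge pattern, with hypothetical names and a C11 spin lock standing in for the RTS spin lock:

/* Sketch: per-thread accumulation, one locked merge per list walked.
 * Names are illustrative; atomic_flag stands in for oldest_gen->sync. */
#include <stdatomic.h>

typedef struct { unsigned long n_mutvar, n_mutarr, n_others; } ScavStats;

static ScavStats global_stats;              /* shared, protected by the lock */
static atomic_flag stats_lock = ATOMIC_FLAG_INIT;

static void stats_zero(ScavStats *s) { s->n_mutvar = s->n_mutarr = s->n_others = 0; }

static void stats_add(const ScavStats *src, ScavStats *dst)
{
    dst->n_mutvar += src->n_mutvar;
    dst->n_mutarr += src->n_mutarr;
    dst->n_others += src->n_others;
}

/* Each GC thread counts locally while walking its list, then takes the lock
 * exactly once to fold the local totals into the shared ones. */
void scavenge_list_census(const int *kinds, unsigned n)
{
    ScavStats local;
    stats_zero(&local);

    for (unsigned i = 0; i < n; i++) {
        switch (kinds[i]) {
        case 0:  local.n_mutvar++; break;
        case 1:  local.n_mutarr++; break;
        default: local.n_others++; break;
        }
    }

    while (atomic_flag_test_and_set_explicit(&stats_lock, memory_order_acquire))
        ;                                    /* spin: merges are rare and short */
    stats_add(&local, &global_stats);
    atomic_flag_clear_explicit(&stats_lock, memory_order_release);
}

Taking the lock once per list keeps the counting loop itself free of shared writes while still producing a consistent global total.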
diff --git a/rts/sm/Storage.c b/rts/sm/Storage.c
index 96bc133d02..251353de6d 100644
--- a/rts/sm/Storage.c
+++ b/rts/sm/Storage.c
@@ -445,7 +445,7 @@ lockCAF (StgRegTable *reg, StgIndStatic *caf)
     Capability *cap = regTableToCapability(reg);
     StgInd *bh;
 
-    orig_info = caf->header.info;
+    orig_info = RELAXED_LOAD(&caf->header.info);
 
 #if defined(THREADED_RTS)
     const StgInfoTable *cur_info;
@@ -501,12 +501,11 @@ lockCAF (StgRegTable *reg, StgIndStatic *caf)
     }
     bh->indirectee = (StgClosure *)cap->r.rCurrentTSO;
     SET_HDR(bh, &stg_CAF_BLACKHOLE_info, caf->header.prof.ccs);
-    // Ensure that above writes are visible before we introduce reference as CAF indirectee.
-    write_barrier();
-    caf->indirectee = (StgClosure *)bh;
-    write_barrier();
-    SET_INFO((StgClosure*)caf,&stg_IND_STATIC_info);
+    // RELEASE ordering to ensure that above writes are visible before we
+    // introduce reference as CAF indirectee.
+    RELEASE_STORE(&caf->indirectee, (StgClosure *) bh);
+    SET_INFO_RELEASE((StgClosure*)caf, &stg_IND_STATIC_info);
 
     return bh;
 }
 
@@ -1033,8 +1032,8 @@ allocateMightFail (Capability *cap, W_ n)
         g0->n_new_large_words += n;
         RELEASE_SM_LOCK;
         initBdescr(bd, g0, g0);
-        bd->flags = BF_LARGE;
-        bd->free = bd->start + n;
+        RELAXED_STORE(&bd->flags, BF_LARGE);
+        RELAXED_STORE(&bd->free, bd->start + n);
         cap->total_allocated += n;
         return bd->start;
     }
@@ -1300,8 +1299,8 @@ dirty_MUT_VAR(StgRegTable *reg, StgMutVar *mvar, StgClosure *old)
     Capability *cap = regTableToCapability(reg);
     // No barrier required here as no other heap object fields are read. See
     // note [Heap memory barriers] in SMP.h.
-    if (mvar->header.info == &stg_MUT_VAR_CLEAN_info) {
-        mvar->header.info = &stg_MUT_VAR_DIRTY_info;
+    if (RELAXED_LOAD(&mvar->header.info) == &stg_MUT_VAR_CLEAN_info) {
+        SET_INFO((StgClosure*) mvar, &stg_MUT_VAR_DIRTY_info);
         recordClosureMutated(cap, (StgClosure *) mvar);
         IF_NONMOVING_WRITE_BARRIER_ENABLED {
             // See Note [Dirty flags in the non-moving collector] in NonMoving.c
@@ -1323,8 +1322,8 @@ dirty_TVAR(Capability *cap, StgTVar *p,
 {
     // No barrier required here as no other heap object fields are read. See
     // note [Heap memory barriers] in SMP.h.
-    if (p->header.info == &stg_TVAR_CLEAN_info) {
-        p->header.info = &stg_TVAR_DIRTY_info;
+    if (RELAXED_LOAD(&p->header.info) == &stg_TVAR_CLEAN_info) {
+        SET_INFO((StgClosure*) p, &stg_TVAR_DIRTY_info);
         recordClosureMutated(cap,(StgClosure*)p);
         IF_NONMOVING_WRITE_BARRIER_ENABLED {
             // See Note [Dirty flags in the non-moving collector] in NonMoving.c
@@ -1341,8 +1340,8 @@ dirty_TVAR(Capability *cap, StgTVar *p,
 void
 setTSOLink (Capability *cap, StgTSO *tso, StgTSO *target)
 {
-    if (tso->dirty == 0) {
-        tso->dirty = 1;
+    if (RELAXED_LOAD(&tso->dirty) == 0) {
+        RELAXED_STORE(&tso->dirty, 1);
         recordClosureMutated(cap,(StgClosure*)tso);
         IF_NONMOVING_WRITE_BARRIER_ENABLED {
             updateRemembSetPushClosure(cap, (StgClosure *) tso->_link);
@@ -1354,8 +1353,8 @@ setTSOLink (Capability *cap, StgTSO *tso, StgTSO *target)
 void
 setTSOPrev (Capability *cap, StgTSO *tso, StgTSO *target)
 {
-    if (tso->dirty == 0) {
-        tso->dirty = 1;
+    if (RELAXED_LOAD(&tso->dirty) == 0) {
+        RELAXED_STORE(&tso->dirty, 1);
         recordClosureMutated(cap,(StgClosure*)tso);
         IF_NONMOVING_WRITE_BARRIER_ENABLED {
             updateRemembSetPushClosure(cap, (StgClosure *) tso->block_info.prev);
@@ -1367,8 +1366,8 @@ setTSOPrev (Capability *cap, StgTSO *tso, StgTSO *target)
 void
 dirty_TSO (Capability *cap, StgTSO *tso)
 {
-    if (tso->dirty == 0) {
-        tso->dirty = 1;
+    if (RELAXED_LOAD(&tso->dirty) == 0) {
+        RELAXED_STORE(&tso->dirty, 1);
         recordClosureMutated(cap,(StgClosure*)tso);
     }
 
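
The dirty_MUT_VAR, dirty_TVAR, setTSOLink, setTSOPrev and dirty_TSO hunks above, and dirty_STACK just below, all share one write-barrier fast path: a relaxed read of the clean/dirty mark, and only on the first mutation since the last GC is the object marked dirty and pushed onto the capability's mutable list. As the in-diff comments note, relaxed ordering suffices because nothing else in the object is read on this path. A toy sketch of that shape, with hypothetical types rather than the RTS's StgMutVar or recordClosureMutated:

/* Sketch only: a generational write barrier that tolerates a racy dirty
 * flag.  RELAXED_* assumed to wrap the __atomic builtins, as earlier. */
#define RELAXED_LOAD(ptr)       __atomic_load_n(ptr, __ATOMIC_RELAXED)
#define RELAXED_STORE(ptr, val) __atomic_store_n(ptr, val, __ATOMIC_RELAXED)

enum { CLEAN = 0, DIRTY = 1 };

struct obj {
    int dirty;                     /* clean/dirty mark, raced on deliberately */
};

struct capability {
    struct obj *mut_list[64];      /* toy capability-local remembered set */
    unsigned    mut_count;
};

/* First mutation after a GC marks the object dirty and remembers it so the
 * next GC scavenges it.  Only the mark itself is touched, so relaxed
 * accesses are enough in this sketch; duplicate entries are tolerated here
 * because the toy remembered set is only ever scanned, never deduplicated. */
void dirty_obj(struct capability *cap, struct obj *o)
{
    if (RELAXED_LOAD(&o->dirty) == CLEAN) {
        RELAXED_STORE(&o->dirty, DIRTY);
        if (cap->mut_count < 64)
            cap->mut_list[cap->mut_count++] = o;
    }
}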
@@ -1386,8 +1385,8 @@ dirty_STACK (Capability *cap, StgStack *stack)
         updateRemembSetPushStack(cap, stack);
     }
 
-    if (!(stack->dirty & STACK_DIRTY)) {
-        stack->dirty = STACK_DIRTY;
+    if (RELAXED_LOAD(&stack->dirty) == 0) {
+        RELAXED_STORE(&stack->dirty, 1);
         recordClosureMutated(cap,(StgClosure*)stack);
     }
 
@@ -1562,10 +1561,13 @@ calcNeeded (bool force_major, memcount *blocks_needed)
 
     for (uint32_t g = 0; g < RtsFlags.GcFlags.generations; g++) {
         generation *gen = &generations[g];
-
         W_ blocks = gen->live_estimate ? (gen->live_estimate / BLOCK_SIZE_W) : gen->n_blocks;
-        blocks += gen->n_large_blocks
-                + gen->n_compact_blocks;
+
+        // This can race with allocate() and compactAllocateBlockInternal()
+        // but only needs to be approximate
+        TSAN_ANNOTATE_BENIGN_RACE(&gen->n_large_blocks, "n_large_blocks");
+        blocks += RELAXED_LOAD(&gen->n_large_blocks)
+                + RELAXED_LOAD(&gen->n_compact_blocks);
 
         // we need at least this much space
         needed += blocks;
diff --git a/rts/sm/Storage.h b/rts/sm/Storage.h
index 8d90c3ba5f..48ddcf35f5 100644
--- a/rts/sm/Storage.h
+++ b/rts/sm/Storage.h
@@ -72,8 +72,11 @@ bool     getNewNursery        (Capability *cap);
 INLINE_HEADER
 bool doYouWantToGC(Capability *cap)
 {
+    // This is necessarily approximate since otherwise we would need to take
+    // SM_LOCK to safely look at n_new_large_words.
+    TSAN_ANNOTATE_BENIGN_RACE(&g0->n_new_large_words, "doYouWantToGC(n_new_large_words)");
     return ((cap->r.rCurrentNursery->link == NULL && !getNewNursery(cap)) ||
-            g0->n_new_large_words >= large_alloc_lim);
+            RELAXED_LOAD(&g0->n_new_large_words) >= large_alloc_lim);
 }
 
 /* -----------------------------------------------------------------------------
@@ -91,7 +94,7 @@ INLINE_HEADER void finishedNurseryBlock (Capability *cap, bdescr *bd) {
 }
 
 INLINE_HEADER void newNurseryBlock (bdescr *bd) {
-    bd->free = bd->start;
+    RELAXED_STORE(&bd->free, bd->start);
 }
 
 void     updateNurseriesStats (void);
diff --git a/rts/win32/OSThreads.c b/rts/win32/OSThreads.c
index f3bdefd998..ed8a598e51 100644
--- a/rts/win32/OSThreads.c
+++ b/rts/win32/OSThreads.c
@@ -444,6 +444,15 @@ interruptOSThread (OSThreadId id)
     CloseHandle(hdl);
 }
 
+void
+joinOSThread (OSThreadId id)
+{
+    int ret = WaitForSingleObject(id, INFINITE);
+    if (ret != WAIT_OBJECT_0) {
+        sysErrorBelch("joinOSThread: error %d", ret);
+    }
+}
+
 void setThreadNode (uint32_t node)
 {
     if (osNumaAvailable())
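
The new Windows joinOSThread above blocks on the thread handle with WaitForSingleObject. The POSIX build presumably exposes the same entry point on top of pthread_join; the sketch below is an assumption for illustration, not the contents of rts/posix/OSThreads.c.

/* Hypothetical POSIX counterpart, for illustration only; the real
 * implementation lives in rts/posix/OSThreads.c and is not shown here. */
#include <pthread.h>
#include <stdio.h>

typedef pthread_t OSThreadId;

void joinOSThread(OSThreadId id)
{
    int ret = pthread_join(id, NULL);   /* 0 on success, errno value otherwise */
    if (ret != 0) {
        /* The RTS would report this via sysErrorBelch; plain stderr keeps
         * the sketch self-contained. */
        fprintf(stderr, "joinOSThread: error %d\n", ret);
    }
}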
