Diffstat (limited to 'rts/sm')
-rw-r--r--  rts/sm/BlockAlloc.c  |  15
-rw-r--r--  rts/sm/CNF.c         |  16
-rw-r--r--  rts/sm/Compact.c     |  26
-rw-r--r--  rts/sm/Compact.h     |  12
-rw-r--r--  rts/sm/Evac.c        | 130
-rw-r--r--  rts/sm/GC.c          | 232
-rw-r--r--  rts/sm/GC.h          |  10
-rw-r--r--  rts/sm/GCThread.h    |   6
-rw-r--r--  rts/sm/GCUtils.c     |  31
-rw-r--r--  rts/sm/GCUtils.h     |   6
-rw-r--r--  rts/sm/MarkWeak.c    |  17
-rw-r--r--  rts/sm/OSMem.h       |   1
-rw-r--r--  rts/sm/Sanity.c      |  23
-rw-r--r--  rts/sm/Scav.c        | 217
-rw-r--r--  rts/sm/Storage.c     | 126
15 files changed, 482 insertions, 386 deletions
diff --git a/rts/sm/BlockAlloc.c b/rts/sm/BlockAlloc.c index 2a02ecc9c5..bbb4f8a6c1 100644 --- a/rts/sm/BlockAlloc.c +++ b/rts/sm/BlockAlloc.c @@ -210,6 +210,12 @@ void recordFreedBlocks(uint32_t node, uint32_t n) Allocation -------------------------------------------------------------------------- */ +STATIC_INLINE bdescr * +tail_of (bdescr *bd) +{ + return bd + bd->blocks - 1; +} + STATIC_INLINE void initGroup(bdescr *head) { @@ -223,7 +229,7 @@ initGroup(bdescr *head) // mblocks don't have bdescrs; freeing these is handled in a // different way by free_mblock_group(). if (head->blocks > 1 && head->blocks <= BLOCKS_PER_MBLOCK) { - bdescr *last = head + head->blocks-1; + bdescr *last = tail_of(head); last->blocks = 0; last->link = head; } @@ -285,13 +291,6 @@ free_list_insert (uint32_t node, bdescr *bd) dbl_link_onto(bd, &free_list[node][ln]); } - -STATIC_INLINE bdescr * -tail_of (bdescr *bd) -{ - return bd + bd->blocks - 1; -} - // After splitting a group, the last block of each group must have a // tail that points to the head block, to keep our invariants for // coalescing. diff --git a/rts/sm/CNF.c b/rts/sm/CNF.c index c12f53a120..6bc58cde75 100644 --- a/rts/sm/CNF.c +++ b/rts/sm/CNF.c @@ -722,14 +722,14 @@ verify_consistency_block (StgCompactNFData *str, StgCompactNFDataBlock *block) p += arr_words_sizeW((StgArrBytes*)p); break; - case MUT_ARR_PTRS_FROZEN: - case MUT_ARR_PTRS_FROZEN0: + case MUT_ARR_PTRS_FROZEN_CLEAN: + case MUT_ARR_PTRS_FROZEN_DIRTY: verify_mut_arr_ptrs(str, (StgMutArrPtrs*)p); p += mut_arr_ptrs_sizeW((StgMutArrPtrs*)p); break; - case SMALL_MUT_ARR_PTRS_FROZEN: - case SMALL_MUT_ARR_PTRS_FROZEN0: + case SMALL_MUT_ARR_PTRS_FROZEN_CLEAN: + case SMALL_MUT_ARR_PTRS_FROZEN_DIRTY: { uint32_t i; StgSmallMutArrPtrs *arr = (StgSmallMutArrPtrs*)p; @@ -969,14 +969,14 @@ fixup_block(StgCompactNFDataBlock *block, StgWord *fixup_table, uint32_t count) p += arr_words_sizeW((StgArrBytes*)p); break; - case MUT_ARR_PTRS_FROZEN: - case MUT_ARR_PTRS_FROZEN0: + case MUT_ARR_PTRS_FROZEN_CLEAN: + case MUT_ARR_PTRS_FROZEN_DIRTY: fixup_mut_arr_ptrs(fixup_table, count, (StgMutArrPtrs*)p); p += mut_arr_ptrs_sizeW((StgMutArrPtrs*)p); break; - case SMALL_MUT_ARR_PTRS_FROZEN: - case SMALL_MUT_ARR_PTRS_FROZEN0: + case SMALL_MUT_ARR_PTRS_FROZEN_CLEAN: + case SMALL_MUT_ARR_PTRS_FROZEN_DIRTY: { uint32_t i; StgSmallMutArrPtrs *arr = (StgSmallMutArrPtrs*)p; diff --git a/rts/sm/Compact.c b/rts/sm/Compact.c index 0e2fea8990..004e042069 100644 --- a/rts/sm/Compact.c +++ b/rts/sm/Compact.c @@ -25,7 +25,8 @@ #include "Trace.h" #include "Weak.h" #include "MarkWeak.h" -#include "Stable.h" +#include "StablePtr.h" +#include "StableName.h" // Turn off inlining when debugging - it obfuscates things #if defined(DEBUG) @@ -212,7 +213,7 @@ thread_static( StgClosure* p ) p = *THUNK_STATIC_LINK(p); continue; case FUN_STATIC: - p = *FUN_STATIC_LINK(p); + p = *STATIC_LINK(info,p); continue; case CONSTR: case CONSTR_NOCAF: @@ -482,8 +483,8 @@ update_fwd_large( bdescr *bd ) case MUT_ARR_PTRS_CLEAN: case MUT_ARR_PTRS_DIRTY: - case MUT_ARR_PTRS_FROZEN: - case MUT_ARR_PTRS_FROZEN0: + case MUT_ARR_PTRS_FROZEN_CLEAN: + case MUT_ARR_PTRS_FROZEN_DIRTY: // follow everything { StgMutArrPtrs *a; @@ -497,8 +498,8 @@ update_fwd_large( bdescr *bd ) case SMALL_MUT_ARR_PTRS_CLEAN: case SMALL_MUT_ARR_PTRS_DIRTY: - case SMALL_MUT_ARR_PTRS_FROZEN: - case SMALL_MUT_ARR_PTRS_FROZEN0: + case SMALL_MUT_ARR_PTRS_FROZEN_CLEAN: + case SMALL_MUT_ARR_PTRS_FROZEN_DIRTY: // follow everything { StgSmallMutArrPtrs *a; @@ -682,8 +683,8 @@ thread_obj 
(const StgInfoTable *info, StgPtr p) case MUT_ARR_PTRS_CLEAN: case MUT_ARR_PTRS_DIRTY: - case MUT_ARR_PTRS_FROZEN: - case MUT_ARR_PTRS_FROZEN0: + case MUT_ARR_PTRS_FROZEN_CLEAN: + case MUT_ARR_PTRS_FROZEN_DIRTY: // follow everything { StgMutArrPtrs *a; @@ -698,8 +699,8 @@ thread_obj (const StgInfoTable *info, StgPtr p) case SMALL_MUT_ARR_PTRS_CLEAN: case SMALL_MUT_ARR_PTRS_DIRTY: - case SMALL_MUT_ARR_PTRS_FROZEN: - case SMALL_MUT_ARR_PTRS_FROZEN0: + case SMALL_MUT_ARR_PTRS_FROZEN_CLEAN: + case SMALL_MUT_ARR_PTRS_FROZEN_DIRTY: // follow everything { StgSmallMutArrPtrs *a; @@ -1000,7 +1001,10 @@ compact(StgClosure *static_objects) thread_static(static_objects /* ToDo: ok? */); // the stable pointer table - threadStableTables((evac_fn)thread_root, NULL); + threadStablePtrTable((evac_fn)thread_root, NULL); + + // the stable name table + threadStableNameTable((evac_fn)thread_root, NULL); // the CAF list (used by GHCi) markCAFs((evac_fn)thread_root, NULL); diff --git a/rts/sm/Compact.h b/rts/sm/Compact.h index 6dcb50b1aa..63abfc7180 100644 --- a/rts/sm/Compact.h +++ b/rts/sm/Compact.h @@ -20,8 +20,8 @@ mark(StgPtr p, bdescr *bd) { uint32_t offset_within_block = p - bd->start; // in words StgPtr bitmap_word = (StgPtr)bd->u.bitmap + - (offset_within_block / (sizeof(W_)*BITS_PER_BYTE)); - StgWord bit_mask = (StgWord)1 << (offset_within_block & (sizeof(W_)*BITS_PER_BYTE - 1)); + (offset_within_block / BITS_IN(W_)); + StgWord bit_mask = (StgWord)1 << (offset_within_block & (BITS_IN(W_) - 1)); *bitmap_word |= bit_mask; } @@ -30,8 +30,8 @@ unmark(StgPtr p, bdescr *bd) { uint32_t offset_within_block = p - bd->start; // in words StgPtr bitmap_word = (StgPtr)bd->u.bitmap + - (offset_within_block / (sizeof(W_)*BITS_PER_BYTE)); - StgWord bit_mask = (StgWord)1 << (offset_within_block & (sizeof(W_)*BITS_PER_BYTE - 1)); + (offset_within_block / BITS_IN(W_)); + StgWord bit_mask = (StgWord)1 << (offset_within_block & (BITS_IN(W_) - 1)); *bitmap_word &= ~bit_mask; } @@ -40,8 +40,8 @@ is_marked(StgPtr p, bdescr *bd) { uint32_t offset_within_block = p - bd->start; // in words StgPtr bitmap_word = (StgPtr)bd->u.bitmap + - (offset_within_block / (sizeof(W_)*BITS_PER_BYTE)); - StgWord bit_mask = (StgWord)1 << (offset_within_block & (sizeof(W_)*BITS_PER_BYTE - 1)); + (offset_within_block / BITS_IN(W_)); + StgWord bit_mask = (StgWord)1 << (offset_within_block & (BITS_IN(W_)- 1)); return (*bitmap_word & bit_mask); } diff --git a/rts/sm/Evac.c b/rts/sm/Evac.c index fb1af0f692..289031945d 100644 --- a/rts/sm/Evac.c +++ b/rts/sm/Evac.c @@ -28,10 +28,6 @@ #include "CNF.h" #include "Scav.h" -#if defined(PROF_SPIN) && defined(THREADED_RTS) && defined(PARALLEL_GC) -StgWord64 whitehole_spin = 0; -#endif - #if defined(THREADED_RTS) && !defined(PARALLEL_GC) #define evacuate(p) evacuate1(p) #define evacuate_BLACKHOLE(p) evacuate_BLACKHOLE1(p) @@ -47,7 +43,7 @@ StgWord64 whitehole_spin = 0; */ #define MAX_THUNK_SELECTOR_DEPTH 16 -static void eval_thunk_selector (StgClosure **q, StgSelector * p, bool); +static void eval_thunk_selector (StgClosure **q, StgSelector *p, bool); STATIC_INLINE void evacuate_large(StgPtr p); /* ----------------------------------------------------------------------------- @@ -197,8 +193,9 @@ spin: info = xchg((StgPtr)&src->header.info, (W_)&stg_WHITEHOLE_info); if (info == (W_)&stg_WHITEHOLE_info) { #if defined(PROF_SPIN) - whitehole_spin++; + whitehole_gc_spin++; #endif + busy_wait_nop(); goto spin; } if (IS_FORWARDING_PTR(info)) { @@ -281,14 +278,7 @@ evacuate_large(StgPtr p) } // remove from large_object 
list - if (bd->u.back) { - bd->u.back->link = bd->link; - } else { // first object in the list - gen->large_objects = bd->link; - } - if (bd->link) { - bd->link->u.back = bd->u.back; - } + dbl_link_remove(bd, &gen->large_objects); /* link it on to the evacuated large object list of the destination gen */ @@ -417,14 +407,7 @@ evacuate_compact (StgPtr p) } // remove from compact_objects list - if (bd->u.back) { - bd->u.back->link = bd->link; - } else { // first object in the list - gen->compact_objects = bd->link; - } - if (bd->link) { - bd->link->u.back = bd->u.back; - } + dbl_link_remove(bd, &gen->compact_objects); /* link it on to the evacuated compact object list of the destination gen */ @@ -539,14 +522,14 @@ loop: switch (info->type) { case THUNK_STATIC: - if (info->srt_bitmap != 0) { + if (info->srt != 0) { evacuate_static_object(THUNK_STATIC_LINK((StgClosure *)q), q); } return; case FUN_STATIC: - if (info->srt_bitmap != 0) { - evacuate_static_object(FUN_STATIC_LINK((StgClosure *)q), q); + if (info->srt != 0 || info->layout.payload.ptrs != 0) { + evacuate_static_object(STATIC_LINK(info,(StgClosure *)q), q); } return; @@ -707,9 +690,6 @@ loop: case THUNK_1_1: case THUNK_2_0: case THUNK_0_2: -#if defined(NO_PROMOTE_THUNKS) -#error bitrotted -#endif copy(p,info,q,sizeofW(StgThunk)+2,gen_no); return; @@ -753,6 +733,19 @@ loop: copy(p,info,q,sizeofW(StgInd),gen_no); return; } + // Note [BLACKHOLE pointing to IND] + // + // BLOCKING_QUEUE can be overwritten by IND (see + // wakeBlockingQueue()). However, when this happens we must + // be updating the BLACKHOLE, so the BLACKHOLE's indirectee + // should now point to the value. + // + // The mutator might observe an inconsistent state, because + // the writes are happening in another thread, so it's + // possible for the mutator to follow an indirectee and find + // an IND. But this should never happen in the GC, because + // the mutators are all stopped and the writes have + // completed. ASSERT(i != &stg_IND_info); } q = r; @@ -818,16 +811,16 @@ loop: case MUT_ARR_PTRS_CLEAN: case MUT_ARR_PTRS_DIRTY: - case MUT_ARR_PTRS_FROZEN: - case MUT_ARR_PTRS_FROZEN0: + case MUT_ARR_PTRS_FROZEN_CLEAN: + case MUT_ARR_PTRS_FROZEN_DIRTY: // just copy the block copy(p,info,q,mut_arr_ptrs_sizeW((StgMutArrPtrs *)q),gen_no); return; case SMALL_MUT_ARR_PTRS_CLEAN: case SMALL_MUT_ARR_PTRS_DIRTY: - case SMALL_MUT_ARR_PTRS_FROZEN: - case SMALL_MUT_ARR_PTRS_FROZEN0: + case SMALL_MUT_ARR_PTRS_FROZEN_CLEAN: + case SMALL_MUT_ARR_PTRS_FROZEN_DIRTY: // just copy the block copy(p,info,q,small_mut_arr_ptrs_sizeW((StgSmallMutArrPtrs *)q),gen_no); return; @@ -898,9 +891,16 @@ evacuate_BLACKHOLE(StgClosure **p) bd = Bdescr((P_)q); - // blackholes can't be in a compact, or large - ASSERT((bd->flags & (BF_COMPACT | BF_LARGE)) == 0); + // blackholes can't be in a compact + ASSERT((bd->flags & BF_COMPACT) == 0); + // blackholes *can* be in a large object: when raiseAsync() creates an + // AP_STACK the payload might be large enough to create a large object. + // See #14497. + if (bd->flags & BF_LARGE) { + evacuate_large((P_)q); + return; + } if (bd->flags & BF_EVACUATED) { if (bd->gen_no < gct->evac_gen_no) { gct->failed_to_evac = true; @@ -934,23 +934,34 @@ evacuate_BLACKHOLE(StgClosure **p) copy(p,info,q,sizeofW(StgInd),gen_no); } -/* ----------------------------------------------------------------------------- - Evaluate a THUNK_SELECTOR if possible. 
+/* ---------------------------------------------------------------------------- + Update a chain of thunk selectors with the given value. All selectors in the + chain become IND pointing to the value, except when there is a loop (i.e. + the value of a THUNK_SELECTOR is the THUNK_SELECTOR itself), in that case we + leave the selector as-is. + + p is the current selector to update. In eval_thunk_selector we make a list + from selectors using ((StgThunk*)p)->payload[0] for the link field and use + that field to traverse the chain here. + + val is the final value of the selector chain. + + A chain is formed when we've got something like: - p points to a THUNK_SELECTOR that we want to evaluate. The - result of "evaluating" it will be evacuated and a pointer to the - to-space closure will be returned. + let x = C1 { f1 = e1 } + y = C2 { f2 = f1 x } + z = f2 y - If the THUNK_SELECTOR could not be evaluated (its selectee is still - a THUNK, for example), then the THUNK_SELECTOR itself will be - evacuated. + Here the chain (p) we get when evacuating z is: + + [ f2 y, f1 x ] + + and val is e1. -------------------------------------------------------------------------- */ + static void unchain_thunk_selectors(StgSelector *p, StgClosure *val) { - StgSelector *prev; - - prev = NULL; while (p) { ASSERT(p->header.info == &stg_WHITEHOLE_info); @@ -960,7 +971,7 @@ unchain_thunk_selectors(StgSelector *p, StgClosure *val) // not evacuate it), so in this case val is in from-space. // ASSERT(!HEAP_ALLOCED_GC(val) || Bdescr((P_)val)->gen_no > N || (Bdescr((P_)val)->flags & BF_EVACUATED)); - prev = (StgSelector*)((StgClosure *)p)->payload[0]; + StgSelector *prev = (StgSelector*)((StgClosure *)p)->payload[0]; // Update the THUNK_SELECTOR with an indirection to the // value. The value is still in from-space at this stage. @@ -997,8 +1008,18 @@ unchain_thunk_selectors(StgSelector *p, StgClosure *val) } } +/* ----------------------------------------------------------------------------- + Evaluate a THUNK_SELECTOR if possible. + + p points to a THUNK_SELECTOR that we want to evaluate. + + If the THUNK_SELECTOR could not be evaluated (its selectee is still a THUNK, + for example), then the THUNK_SELECTOR itself will be evacuated depending on + the evac parameter. + -------------------------------------------------------------------------- */ + static void -eval_thunk_selector (StgClosure **q, StgSelector * p, bool evac) +eval_thunk_selector (StgClosure **q, StgSelector *p, bool evac) // NB. for legacy reasons, p & q are swapped around :( { uint32_t field; @@ -1007,7 +1028,6 @@ eval_thunk_selector (StgClosure **q, StgSelector * p, bool evac) StgClosure *selectee; StgSelector *prev_thunk_selector; bdescr *bd; - StgClosure *val; prev_thunk_selector = NULL; // this is a chain of THUNK_SELECTORs that we are going to update @@ -1057,9 +1077,14 @@ selector_chain: // In threaded mode, we'll use WHITEHOLE to lock the selector // thunk while we evaluate it. { - do { + while(true) { info_ptr = xchg((StgPtr)&p->header.info, (W_)&stg_WHITEHOLE_info); - } while (info_ptr == (W_)&stg_WHITEHOLE_info); + if (info_ptr != (W_)&stg_WHITEHOLE_info) { break; } +#if defined(PROF_SPIN) + ++whitehole_gc_spin; +#endif + busy_wait_nop(); + } // make sure someone else didn't get here first... 
if (IS_FORWARDING_PTR(info_ptr) || @@ -1127,7 +1152,7 @@ selector_loop: info->layout.payload.nptrs)); // Select the right field from the constructor - val = selectee->payload[field]; + StgClosure *val = selectee->payload[field]; #if defined(PROFILING) // For the purposes of LDV profiling, we have destroyed @@ -1159,6 +1184,8 @@ selector_loop: val = ((StgInd *)val)->indirectee; goto val_loop; case THUNK_SELECTOR: + // Use payload to make a list of thunk selectors, to be + // used in unchain_thunk_selectors ((StgClosure*)p)->payload[0] = (StgClosure *)prev_thunk_selector; prev_thunk_selector = p; p = (StgSelector*)val; @@ -1273,5 +1300,4 @@ bale_out: copy(q,(const StgInfoTable *)info_ptr,(StgClosure *)p,THUNK_SELECTOR_sizeW(),bd->dest_no); } unchain_thunk_selectors(prev_thunk_selector, *q); - return; } diff --git a/rts/sm/GC.c b/rts/sm/GC.c index aa804a8b76..70d6d8efe5 100644 --- a/rts/sm/GC.c +++ b/rts/sm/GC.c @@ -28,6 +28,7 @@ #include "Sparks.h" #include "Sweep.h" +#include "Arena.h" #include "Storage.h" #include "RtsUtils.h" #include "Apply.h" @@ -45,9 +46,15 @@ #include "RetainerProfile.h" #include "LdvProfile.h" #include "RaiseAsync.h" -#include "Stable.h" +#include "StableName.h" +#include "StablePtr.h" #include "CheckUnload.h" #include "CNF.h" +#include "RtsFlags.h" + +#if defined(PROFILING) +#include "RetainerProfile.h" +#endif #include <string.h> // for memset() #include <unistd.h> @@ -89,6 +96,8 @@ * * We build up a static object list while collecting generations 0..N, * which is then appended to the static object list of generation N+1. + * + * See also: Note [STATIC_LINK fields] in Storage.h. */ /* N is the oldest generation being collected, where the generations @@ -112,8 +121,6 @@ uint32_t mutlist_MUTVARS, mutlist_TVAR_WATCH_QUEUE, mutlist_TREC_CHUNK, mutlist_TREC_HEADER, - mutlist_ATOMIC_INVARIANT, - mutlist_INVARIANT_CHECK_QUEUE, mutlist_OTHERS; #endif @@ -122,7 +129,9 @@ uint32_t mutlist_MUTVARS, gc_thread **gc_threads = NULL; #if !defined(THREADED_RTS) -StgWord8 the_gc_thread[sizeof(gc_thread) + 64 * sizeof(gen_workspace)]; +// Must be aligned to 64-bytes to meet stated 64-byte alignment of gen_workspace +StgWord8 the_gc_thread[sizeof(gc_thread) + 64 * sizeof(gen_workspace)] + ATTRIBUTE_ALIGNED(64); #endif // Number of threads running in *this* GC. Affects how many @@ -132,6 +141,13 @@ uint32_t n_gc_threads; // For stats: static long copied; // *words* copied & scavenged during this GC +#if defined(PROF_SPIN) && defined(THREADED_RTS) +// spin and yield counts for the quasi-SpinLock in waitForGcThreads +volatile StgWord64 waitForGcThreads_spin = 0; +volatile StgWord64 waitForGcThreads_yield = 0; +volatile StgWord64 whitehole_gc_spin = 0; +#endif + bool work_stealing; uint32_t static_flag = STATIC_FLAG_B; @@ -188,7 +204,9 @@ GarbageCollect (uint32_t collect_gen, { bdescr *bd; generation *gen; - StgWord live_blocks, live_words, par_max_copied, par_balanced_copied; + StgWord live_blocks, live_words, par_max_copied, par_balanced_copied, + gc_spin_spin, gc_spin_yield, mut_spin_spin, mut_spin_yield, + any_work, no_work, scav_find_work; #if defined(THREADED_RTS) gc_thread *saved_gct; #endif @@ -221,8 +239,9 @@ GarbageCollect (uint32_t collect_gen, // tell the stats department that we've started a GC stat_startGC(cap, gct); - // lock the StablePtr table - stableLock(); + // Lock the StablePtr table. This prevents FFI calls manipulating + // the table from occurring during GC. 
+ stablePtrLock(); #if defined(DEBUG) mutlist_MUTVARS = 0; @@ -232,8 +251,6 @@ GarbageCollect (uint32_t collect_gen, mutlist_TVAR_WATCH_QUEUE = 0; mutlist_TREC_CHUNK = 0; mutlist_TREC_HEADER = 0; - mutlist_ATOMIC_INVARIANT = 0; - mutlist_INVARIANT_CHECK_QUEUE = 0; mutlist_OTHERS = 0; #endif @@ -385,17 +402,15 @@ GarbageCollect (uint32_t collect_gen, markScheduler(mark_root, gct); -#if defined(RTS_USER_SIGNALS) - // mark the signal handlers (signals should be already blocked) - markSignalHandlers(mark_root, gct); -#endif - // Mark the weak pointer list, and prepare to detect dead weak pointers. markWeakPtrList(); initWeakForGC(); // Mark the stable pointer table. - markStableTables(mark_root, gct); + markStablePtrTable(mark_root, gct); + + // Remember old stable name addresses. + rememberOldStableNameAddresses (); /* ------------------------------------------------------------------------- * Repeatedly scavenge all the areas we know about until there's no @@ -421,7 +436,7 @@ GarbageCollect (uint32_t collect_gen, shutdown_gc_threads(gct->thread_index, idle_cap); // Now see which stable names are still alive. - gcStableTables(); + gcStableNameTable(); #if defined(THREADED_RTS) if (n_gc_threads == 1) { @@ -461,32 +476,53 @@ GarbageCollect (uint32_t collect_gen, copied = 0; par_max_copied = 0; par_balanced_copied = 0; + gc_spin_spin = 0; + gc_spin_yield = 0; + mut_spin_spin = 0; + mut_spin_yield = 0; + any_work = 0; + no_work = 0; + scav_find_work = 0; { uint32_t i; uint64_t par_balanced_copied_acc = 0; + const gc_thread* thread; for (i=0; i < n_gc_threads; i++) { copied += gc_threads[i]->copied; } for (i=0; i < n_gc_threads; i++) { + thread = gc_threads[i]; if (n_gc_threads > 1) { debugTrace(DEBUG_gc,"thread %d:", i); - debugTrace(DEBUG_gc," copied %ld", gc_threads[i]->copied * sizeof(W_)); - debugTrace(DEBUG_gc," scanned %ld", gc_threads[i]->scanned * sizeof(W_)); - debugTrace(DEBUG_gc," any_work %ld", gc_threads[i]->any_work); - debugTrace(DEBUG_gc," no_work %ld", gc_threads[i]->no_work); - debugTrace(DEBUG_gc," scav_find_work %ld", gc_threads[i]->scav_find_work); + debugTrace(DEBUG_gc," copied %ld", + thread->copied * sizeof(W_)); + debugTrace(DEBUG_gc," scanned %ld", + thread->scanned * sizeof(W_)); + debugTrace(DEBUG_gc," any_work %ld", + thread->any_work); + debugTrace(DEBUG_gc," no_work %ld", + thread->no_work); + debugTrace(DEBUG_gc," scav_find_work %ld", + thread->scav_find_work); + +#if defined(THREADED_RTS) && defined(PROF_SPIN) + gc_spin_spin += thread->gc_spin.spin; + gc_spin_yield += thread->gc_spin.yield; + mut_spin_spin += thread->mut_spin.spin; + mut_spin_yield += thread->mut_spin.yield; +#endif + + any_work += thread->any_work; + no_work += thread->no_work; + scav_find_work += thread->scav_find_work; + + par_max_copied = stg_max(gc_threads[i]->copied, par_max_copied); + par_balanced_copied_acc += + stg_min(n_gc_threads * gc_threads[i]->copied, copied); } - par_max_copied = stg_max(gc_threads[i]->copied, par_max_copied); - par_balanced_copied_acc += - stg_min(n_gc_threads * gc_threads[i]->copied, copied); } - if (n_gc_threads == 1) { - par_max_copied = 0; - par_balanced_copied = 0; - } - else - { + if (n_gc_threads > 1) { // See Note [Work Balance] for an explanation of this computation par_balanced_copied = (par_balanced_copied_acc - copied + (n_gc_threads - 1) / 2) / @@ -521,13 +557,11 @@ GarbageCollect (uint32_t collect_gen, copied += mut_list_size; debugTrace(DEBUG_gc, - "mut_list_size: %lu (%d vars, %d arrays, %d MVARs, %d TVARs, %d TVAR_WATCH_QUEUEs, %d TREC_CHUNKs, %d 
TREC_HEADERs, %d ATOMIC_INVARIANTs, %d INVARIANT_CHECK_QUEUEs, %d others)", + "mut_list_size: %lu (%d vars, %d arrays, %d MVARs, %d TVARs, %d TVAR_WATCH_QUEUEs, %d TREC_CHUNKs, %d TREC_HEADERs, %d others)", (unsigned long)(mut_list_size * sizeof(W_)), mutlist_MUTVARS, mutlist_MUTARRS, mutlist_MVARS, mutlist_TVAR, mutlist_TVAR_WATCH_QUEUE, mutlist_TREC_CHUNK, mutlist_TREC_HEADER, - mutlist_ATOMIC_INVARIANT, - mutlist_INVARIANT_CHECK_QUEUE, mutlist_OTHERS); } @@ -701,15 +735,15 @@ GarbageCollect (uint32_t collect_gen, if (major_gc) { gcCAFs(); } #endif - // Update the stable pointer hash table. - updateStableTables(major_gc); + // Update the stable name hash table + updateStableNameTable(major_gc); // unlock the StablePtr table. Must be before scheduleFinalizers(), // because a finalizer may call hs_free_fun_ptr() or // hs_free_stable_ptr(), both of which access the StablePtr table. - stableUnlock(); + stablePtrUnlock(); - // Must be after stableUnlock(), because it might free stable ptrs. + // Must be after stablePtrUnlock(), because it might free stable ptrs. if (major_gc) { checkUnload (gct->scavenged_static_objects); } @@ -751,24 +785,51 @@ GarbageCollect (uint32_t collect_gen, ACQUIRE_SM_LOCK; if (major_gc) { - W_ need, got; - need = BLOCKS_TO_MBLOCKS(n_alloc_blocks); - got = mblocks_allocated; + W_ need_prealloc, need_live, need, got; + uint32_t i; + + need_live = 0; + for (i = 0; i < RtsFlags.GcFlags.generations; i++) { + need_live += genLiveBlocks(&generations[i]); + } + need_live = stg_max(RtsFlags.GcFlags.minOldGenSize, need_live); + + need_prealloc = 0; + for (i = 0; i < n_nurseries; i++) { + need_prealloc += nurseries[i].n_blocks; + } + need_prealloc += RtsFlags.GcFlags.largeAllocLim; + need_prealloc += countAllocdBlocks(exec_block); + need_prealloc += arenaBlocks(); +#if defined(PROFILING) + if (RtsFlags.ProfFlags.doHeapProfile == HEAP_BY_RETAINER) { + need_prealloc = retainerStackBlocks(); + } +#endif + /* If the amount of data remains constant, next major GC we'll - require (F+1)*need. We leave (F+2)*need in order to reduce - repeated deallocation and reallocation. */ - need = (RtsFlags.GcFlags.oldGenFactor + 2) * need; + * require (F+1)*live + prealloc. We leave (F+2)*live + prealloc + * in order to reduce repeated deallocation and reallocation. #14702 + */ + need = need_prealloc + (RtsFlags.GcFlags.oldGenFactor + 2) * need_live; + + /* Also, if user set heap size, do not drop below it. + */ + need = stg_max(RtsFlags.GcFlags.heapSizeSuggestion, need); + /* But with a large nursery, the above estimate might exceed * maxHeapSize. A large resident set size might make the OS * kill this process, or swap unnecessarily. Therefore we * ensure that our estimate does not exceed maxHeapSize. */ if (RtsFlags.GcFlags.maxHeapSize != 0) { - W_ max = BLOCKS_TO_MBLOCKS(RtsFlags.GcFlags.maxHeapSize); - if (need > max) { - need = max; - } + need = stg_min(RtsFlags.GcFlags.maxHeapSize, need); } + + need = BLOCKS_TO_MBLOCKS(need); + + got = mblocks_allocated; + if (got > need) { returnMemoryToOS(got - need); } @@ -797,7 +858,9 @@ GarbageCollect (uint32_t collect_gen, // ok, GC over: tell the stats department what happened. 
stat_endGC(cap, gct, live_words, copied, live_blocks * BLOCK_SIZE_W - live_words /* slop */, - N, n_gc_threads, par_max_copied, par_balanced_copied); + N, n_gc_threads, par_max_copied, par_balanced_copied, + gc_spin_spin, gc_spin_yield, mut_spin_spin, mut_spin_yield, + any_work, no_work, scav_find_work); #if defined(RTS_USER_SIGNALS) if (RtsFlags.MiscFlags.install_signal_handlers) { @@ -825,11 +888,6 @@ static void heapOverflow(void) Initialise the gc_thread structures. -------------------------------------------------------------------------- */ -#define GC_THREAD_INACTIVE 0 -#define GC_THREAD_STANDING_BY 1 -#define GC_THREAD_RUNNING 2 -#define GC_THREAD_WAITING_TO_CONTINUE 3 - static void new_gc_thread (uint32_t n, gc_thread *t) { @@ -1132,6 +1190,9 @@ waitForGcThreads (Capability *cap USED_IF_THREADS, bool idle_cap[]) const uint32_t me = cap->no; uint32_t i, j; bool retry = true; + Time t0, t1, t2; + + t0 = t1 = t2 = getProcessElapsedTime(); while(retry) { for (i=0; i < n_threads; i++) { @@ -1151,8 +1212,32 @@ waitForGcThreads (Capability *cap USED_IF_THREADS, bool idle_cap[]) } } if (!retry) break; +#if defined(PROF_SPIN) + waitForGcThreads_yield++; +#endif yieldThread(); } + + t2 = getProcessElapsedTime(); + if (RtsFlags.GcFlags.longGCSync != 0 && + t2 - t1 > RtsFlags.GcFlags.longGCSync) { + /* call this every longGCSync of delay */ + rtsConfig.longGCSync(cap->no, t2 - t0); + t1 = t2; + } + if (retry) { +#if defined(PROF_SPIN) + // This is a bit strange, we'll get more yields than spins. + // I guess that means it's not a spin-lock at all, but these + // numbers are still useful (I think). + waitForGcThreads_spin++; +#endif + } + } + + if (RtsFlags.GcFlags.longGCSync != 0 && + t2 - t0 > RtsFlags.GcFlags.longGCSync) { + rtsConfig.longGCSyncEnd(t2 - t0); } } @@ -1324,7 +1409,7 @@ prepare_collected_gen (generation *gen) bdescr *bitmap_bdescr; StgWord *bitmap; - bitmap_size = gen->n_old_blocks * BLOCK_SIZE / (sizeof(W_)*BITS_PER_BYTE); + bitmap_size = gen->n_old_blocks * BLOCK_SIZE / BITS_IN(W_); if (bitmap_size > 0) { bitmap_bdescr = allocGroup((StgWord)BLOCK_ROUND_UP(bitmap_size) @@ -1342,7 +1427,7 @@ prepare_collected_gen (generation *gen) // block descriptor. for (bd=gen->old_blocks; bd != NULL; bd = bd->link) { bd->u.bitmap = bitmap; - bitmap += BLOCK_SIZE_W / (sizeof(W_)*BITS_PER_BYTE); + bitmap += BLOCK_SIZE_W / BITS_IN(W_); // Also at this point we set the BF_MARKED flag // for this block. The invariant is that @@ -1446,9 +1531,6 @@ collect_gct_blocks (void) take a global lock. Here we collect those blocks from the cap->pinned_object_blocks lists and put them on the main g0->large_object list. - - Returns: the number of words allocated this way, for stats - purposes. -------------------------------------------------------------------------- */ static void @@ -1744,8 +1826,8 @@ resize_nursery (void) Sanity code for CAF garbage collection. With DEBUG turned on, we manage a CAF list in addition to the SRT - mechanism. After GC, we run down the CAF list and blackhole any - CAFs which have been garbage collected. This means we get an error + mechanism. After GC, we run down the CAF list and make any + CAFs which have been garbage collected GCD_CAF. This means we get an error whenever the program tries to enter a garbage collected CAF. 
Any garbage collected CAFs are taken off the CAF list at the same @@ -1771,7 +1853,10 @@ static void gcCAFs(void) info = get_itbl((StgClosure*)p); ASSERT(info->type == IND_STATIC); - if (p->static_link == NULL) { + // See Note [STATIC_LINK fields] in Storage.h + // This condition identifies CAFs that have just been GC'd and + // don't have static_link==3 which means they should be ignored. + if ((((StgWord)(p->static_link)&STATIC_BITS) | prev_static_flag) != 3) { debugTrace(DEBUG_gccafs, "CAF gc'd at 0x%p", p); SET_INFO((StgClosure*)p,&stg_GCD_CAF_info); // stub it if (prev == NULL) { @@ -1788,3 +1873,28 @@ static void gcCAFs(void) debugTrace(DEBUG_gccafs, "%d CAFs live", i); } #endif + + +/* ----------------------------------------------------------------------------- + The GC can leave some work for the mutator to do before the next + GC, provided the work can be safely overlapped with mutation. This + can help reduce the GC pause time. + + The mutator can call doIdleGCWork() any time it likes, but + preferably when it is idle. It's safe for multiple capabilities to + call doIdleGCWork(). + + When 'all' is + * false: doIdleGCWork() should only take a short, bounded, amount + of time. + * true: doIdleGCWork() will complete all the outstanding GC work. + + The return value is + * true if there's more to do (only if 'all' is false). + * false otherwise. + -------------------------------------------------------------------------- */ + +bool doIdleGCWork(Capability *cap STG_UNUSED, bool all) +{ + return runSomeFinalizers(all); +} diff --git a/rts/sm/GC.h b/rts/sm/GC.h index c6b0c13a46..437a25f8d9 100644 --- a/rts/sm/GC.h +++ b/rts/sm/GC.h @@ -26,6 +26,8 @@ typedef void (*evac_fn)(void *user, StgClosure **root); StgClosure * isAlive ( StgClosure *p ); void markCAFs ( evac_fn evac, void *user ); +bool doIdleGCWork(Capability *cap, bool all); + extern uint32_t N; extern bool major_gc; @@ -40,13 +42,13 @@ extern uint32_t mutlist_MUTVARS, mutlist_MUTARRS, mutlist_MVARS, mutlist_OTHERS, mutlist_TVAR, mutlist_TVAR_WATCH_QUEUE, mutlist_TREC_CHUNK, - mutlist_TREC_HEADER, - mutlist_ATOMIC_INVARIANT, - mutlist_INVARIANT_CHECK_QUEUE; + mutlist_TREC_HEADER; #endif #if defined(PROF_SPIN) && defined(THREADED_RTS) -extern StgWord64 whitehole_spin; +extern volatile StgWord64 whitehole_gc_spin; +extern volatile StgWord64 waitForGcThreads_spin; +extern volatile StgWord64 waitForGcThreads_yield; #endif void gcWorkerThread (Capability *cap); diff --git a/rts/sm/GCThread.h b/rts/sm/GCThread.h index bb206db64c..e865dabe5d 100644 --- a/rts/sm/GCThread.h +++ b/rts/sm/GCThread.h @@ -116,6 +116,12 @@ typedef struct gen_workspace_ { of the GC threads ------------------------------------------------------------------------- */ +/* values for the wakeup field */ +#define GC_THREAD_INACTIVE 0 +#define GC_THREAD_STANDING_BY 1 +#define GC_THREAD_RUNNING 2 +#define GC_THREAD_WAITING_TO_CONTINUE 3 + typedef struct gc_thread_ { Capability *cap; diff --git a/rts/sm/GCUtils.c b/rts/sm/GCUtils.c index 0373c2b925..31b2913a37 100644 --- a/rts/sm/GCUtils.c +++ b/rts/sm/GCUtils.c @@ -81,6 +81,14 @@ freeChain_sync(bdescr *bd) RELEASE_SPIN_LOCK(&gc_alloc_block_sync); } +void +freeGroup_sync(bdescr *bd) +{ + ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync); + freeGroup(bd); + RELEASE_SPIN_LOCK(&gc_alloc_block_sync); +} + /* ----------------------------------------------------------------------------- Workspace utilities -------------------------------------------------------------------------- */ @@ -261,7 +269,7 @@ todo_block_full (uint32_t 
size, gen_workspace *ws) // object. However, if the object we're copying is // larger than a block, then we might have an empty // block here. - freeGroup(bd); + freeGroup_sync(bd); } else { push_scanned_block(bd, ws); } @@ -341,24 +349,3 @@ alloc_todo_block (gen_workspace *ws, uint32_t size) return ws->todo_free; } - -/* ----------------------------------------------------------------------------- - * Debugging - * -------------------------------------------------------------------------- */ - -#if defined(DEBUG) -void -printMutableList(bdescr *bd) -{ - StgPtr p; - - debugBelch("mutable list %p: ", bd); - - for (; bd != NULL; bd = bd->link) { - for (p = bd->start; p < bd->free; p++) { - debugBelch("%p (%s), ", (void *)*p, info_type((StgClosure *)*p)); - } - } - debugBelch("\n"); -} -#endif /* DEBUG */ diff --git a/rts/sm/GCUtils.h b/rts/sm/GCUtils.h index 2e2d4b199d..8b6040769e 100644 --- a/rts/sm/GCUtils.h +++ b/rts/sm/GCUtils.h @@ -31,6 +31,7 @@ INLINE_HEADER bdescr *allocBlockOnNode_sync(uint32_t node) } void freeChain_sync(bdescr *bd); +void freeGroup_sync(bdescr *bd); void push_scanned_block (bdescr *bd, gen_workspace *ws); StgPtr todo_block_full (uint32_t size, gen_workspace *ws); @@ -50,11 +51,6 @@ isPartiallyFull(bdescr *bd) return (bd->free + WORK_UNIT_WORDS < bd->start + BLOCK_SIZE_W); } - -#if defined(DEBUG) -void printMutableList (bdescr *bd); -#endif - // Version of recordMutableGen for use during GC. This uses the // mutable lists attached to the current gc_thread structure, which // are the same as the mutable lists on the Capability. diff --git a/rts/sm/MarkWeak.c b/rts/sm/MarkWeak.c index 9a077b3d14..88037f6a34 100644 --- a/rts/sm/MarkWeak.c +++ b/rts/sm/MarkWeak.c @@ -32,11 +32,11 @@ /* ----------------------------------------------------------------------------- Weak Pointers - traverse_weak_ptr_list is called possibly many times during garbage + traverseWeakPtrList is called possibly many times during garbage collection. It returns a flag indicating whether it did any work (i.e. called evacuate on any live pointers). - Invariant: traverse_weak_ptr_list is called when the heap is in an + Invariant: traverseWeakPtrList is called when the heap is in an idempotent state. That means that there are no pending evacuate/scavenge operations. This invariant helps the weak pointer code decide which weak pointers are dead - if there are no @@ -60,7 +60,7 @@ Now, we discover which *threads* are still alive. Pointers to threads from the all_threads and main thread lists are the - weakest of all: a pointers from the finalizer of a dead weak + weakest of all: a pointer from the finalizer of a dead weak pointer can keep a thread alive. Any threads found to be unreachable are evacuated and placed on the resurrected_threads list so we can send them a signal later. @@ -72,7 +72,7 @@ -------------------------------------------------------------------------- */ /* Which stage of processing various kinds of weak pointer are we at? - * (see traverse_weak_ptr_list() below for discussion). + * (see traverseWeakPtrList() below for discussion). */ typedef enum { WeakPtrs, WeakThreads, WeakDone } WeakStage; static WeakStage weak_stage; @@ -185,7 +185,7 @@ traverseWeakPtrList(void) } default: - barf("traverse_weak_ptr_list"); + barf("traverseWeakPtrList"); return true; } } @@ -344,7 +344,7 @@ static void tidyThreadList (generation *gen) if (tmp == NULL) { // not alive (yet): leave this thread on the - // old_all_threads list. + // old_threads list. 
prev = &(t->global_link); } else { @@ -378,14 +378,13 @@ static void checkWeakPtrSanity(StgWeak *hd, StgWeak *tl) void collectFreshWeakPtrs() { uint32_t i; - generation *gen = &generations[0]; // move recently allocated weak_ptr_list to the old list as well for (i = 0; i < n_capabilities; i++) { Capability *cap = capabilities[i]; if (cap->weak_ptr_list_tl != NULL) { IF_DEBUG(sanity, checkWeakPtrSanity(cap->weak_ptr_list_hd, cap->weak_ptr_list_tl)); - cap->weak_ptr_list_tl->link = gen->weak_ptr_list; - gen->weak_ptr_list = cap->weak_ptr_list_hd; + cap->weak_ptr_list_tl->link = g0->weak_ptr_list; + g0->weak_ptr_list = cap->weak_ptr_list_hd; cap->weak_ptr_list_tl = NULL; cap->weak_ptr_list_hd = NULL; } else { diff --git a/rts/sm/OSMem.h b/rts/sm/OSMem.h index 3b0cee9630..7dd0efdc23 100644 --- a/rts/sm/OSMem.h +++ b/rts/sm/OSMem.h @@ -18,6 +18,7 @@ void osFreeAllMBlocks(void); size_t getPageSize (void); StgWord64 getPhysicalMemorySize (void); void setExecutable (void *p, W_ len, bool exec); +bool osBuiltWithNumaSupport(void); // See #14956 bool osNumaAvailable(void); uint32_t osNumaNodes(void); uint64_t osNumaMask(void); diff --git a/rts/sm/Sanity.c b/rts/sm/Sanity.c index 53b101024a..8d4171b1cd 100644 --- a/rts/sm/Sanity.c +++ b/rts/sm/Sanity.c @@ -380,8 +380,8 @@ checkClosure( const StgClosure* p ) case MUT_ARR_PTRS_CLEAN: case MUT_ARR_PTRS_DIRTY: - case MUT_ARR_PTRS_FROZEN: - case MUT_ARR_PTRS_FROZEN0: + case MUT_ARR_PTRS_FROZEN_CLEAN: + case MUT_ARR_PTRS_FROZEN_DIRTY: { StgMutArrPtrs* a = (StgMutArrPtrs *)p; uint32_t i; @@ -391,6 +391,18 @@ checkClosure( const StgClosure* p ) return mut_arr_ptrs_sizeW(a); } + case SMALL_MUT_ARR_PTRS_CLEAN: + case SMALL_MUT_ARR_PTRS_DIRTY: + case SMALL_MUT_ARR_PTRS_FROZEN_CLEAN: + case SMALL_MUT_ARR_PTRS_FROZEN_DIRTY: + { + StgSmallMutArrPtrs *a = (StgSmallMutArrPtrs *)p; + for (uint32_t i = 0; i < a->ptrs; i++) { + ASSERT(LOOKS_LIKE_CLOSURE_PTR(a->payload[i])); + } + return small_mut_arr_ptrs_sizeW(a); + } + case TSO: checkTSO((StgTSO *)p); return sizeofW(StgTSO); @@ -535,7 +547,8 @@ checkTSO(StgTSO *tso) ASSERT(next == END_TSO_QUEUE || info == &stg_MVAR_TSO_QUEUE_info || info == &stg_TSO_info || - info == &stg_WHITEHOLE_info); // happens due to STM doing lockTSO() + info == &stg_WHITEHOLE_info); // used to happen due to STM doing + // lockTSO(), might not happen now if ( tso->why_blocked == BlockedOnMVar || tso->why_blocked == BlockedOnMVarRead @@ -677,7 +690,7 @@ checkStaticObjects ( StgClosure* static_objects ) break; case FUN_STATIC: - p = *FUN_STATIC_LINK((StgClosure *)p); + p = *STATIC_LINK(info,(StgClosure *)p); break; case CONSTR: @@ -859,7 +872,7 @@ void findSlop(bdescr *bd) slop = (bd->blocks * BLOCK_SIZE_W) - (bd->free - bd->start); if (slop > (1024/sizeof(W_))) { debugBelch("block at %p (bdescr %p) has %" FMT_Word "KB slop\n", - bd->start, bd, slop / (1024/sizeof(W_))); + bd->start, bd, slop / (1024/(W_)sizeof(W_))); } } } diff --git a/rts/sm/Scav.c b/rts/sm/Scav.c index 1ae8a4c19b..2f61914e55 100644 --- a/rts/sm/Scav.c +++ b/rts/sm/Scav.c @@ -11,6 +11,37 @@ * * ---------------------------------------------------------------------------*/ +/* ---------------------------------------------------------------------------- + We have two main scavenge functions: + + - scavenge_block(bdescr *bd) + - scavenge_one(StgPtr p) + + As the names and parameters suggest, first one scavenges a whole block while + the second one only scavenges one object. This however is not the only + difference. 
scavenge_block scavenges all SRTs while scavenge_one only + scavenges SRTs of stacks. The reason is because scavenge_one is called in two + cases: + + - When scavenging a mut_list + - When scavenging a large object + + We don't have to scavenge SRTs when scavenging a mut_list, because we only + scavenge mut_lists in minor GCs, and static objects are only collected in + major GCs. + + However, because scavenge_one is also used to scavenge large objects (which + are scavenged even in major GCs), we need to deal with SRTs of large + objects. We never allocate large FUNs and THUNKs, but we allocate large + STACKs (e.g. in threadStackOverflow), and stack frames can have SRTs. So + scavenge_one skips FUN and THUNK SRTs but scavenges stack frame SRTs. + + In summary, in a major GC: + + - scavenge_block() scavenges all SRTs + - scavenge_one() scavenges only stack frame SRTs + ------------------------------------------------------------------------- */ + #include "PosixSource.h" #include "Rts.h" @@ -329,105 +360,17 @@ scavenge_AP (StgAP *ap) Scavenge SRTs -------------------------------------------------------------------------- */ -/* Similar to scavenge_large_bitmap(), but we don't write back the - * pointers we get back from evacuate(). - */ -static void -scavenge_large_srt_bitmap( StgLargeSRT *large_srt ) -{ - uint32_t i, j, size; - StgWord bitmap; - StgClosure **p; - - size = (uint32_t)large_srt->l.size; - p = (StgClosure **)large_srt->srt; - - for (i = 0; i < size / BITS_IN(W_); i++) { - bitmap = large_srt->l.bitmap[i]; - // skip zero words: bitmaps can be very sparse, and this helps - // performance a lot in some cases. - if (bitmap != 0) { - for (j = 0; j < BITS_IN(W_); j++) { - if ((bitmap & 1) != 0) { - evacuate(p); - } - p++; - bitmap = bitmap >> 1; - } - } else { - p += BITS_IN(W_); - } - } - if (size % BITS_IN(W_) != 0) { - bitmap = large_srt->l.bitmap[i]; - for (j = 0; j < size % BITS_IN(W_); j++) { - if ((bitmap & 1) != 0) { - evacuate(p); - } - p++; - bitmap = bitmap >> 1; - } - } -} - -/* evacuate the SRT. If srt_bitmap is zero, then there isn't an - * srt field in the info table. That's ok, because we'll - * never dereference it. - */ -STATIC_INLINE GNUC_ATTR_HOT void -scavenge_srt (StgClosure **srt, uint32_t srt_bitmap) -{ - uint32_t bitmap; - StgClosure **p; - - bitmap = srt_bitmap; - p = srt; - - if (bitmap == (StgHalfWord)(-1)) { - scavenge_large_srt_bitmap( (StgLargeSRT *)srt ); - return; - } - - while (bitmap != 0) { - if ((bitmap & 1) != 0) { -#if defined(COMPILING_WINDOWS_DLL) - // Special-case to handle references to closures hiding out in DLLs, since - // double indirections required to get at those. The code generator knows - // which is which when generating the SRT, so it stores the (indirect) - // reference to the DLL closure in the table by first adding one to it. - // We check for this here, and undo the addition before evacuating it. - // - // If the SRT entry hasn't got bit 0 set, the SRT entry points to a - // closure that's fixed at link-time, and no extra magic is required. 
- if ( (W_)(*srt) & 0x1 ) { - evacuate( (StgClosure**) ((W_) (*srt) & ~0x1)); - } else { - evacuate(p); - } -#else - evacuate(p); -#endif - } - p++; - bitmap = bitmap >> 1; - } -} - - STATIC_INLINE GNUC_ATTR_HOT void scavenge_thunk_srt(const StgInfoTable *info) { StgThunkInfoTable *thunk_info; - uint32_t bitmap; if (!major_gc) return; thunk_info = itbl_to_thunk_itbl(info); - bitmap = thunk_info->i.srt_bitmap; - if (bitmap) { - // don't read srt_offset if bitmap==0, because it doesn't exist - // and so the memory might not be readable. - scavenge_srt((StgClosure **)GET_SRT(thunk_info), bitmap); + if (thunk_info->i.srt) { + StgClosure *srt = (StgClosure*)GET_SRT(thunk_info); + evacuate(&srt); } } @@ -435,16 +378,13 @@ STATIC_INLINE GNUC_ATTR_HOT void scavenge_fun_srt(const StgInfoTable *info) { StgFunInfoTable *fun_info; - uint32_t bitmap; if (!major_gc) return; fun_info = itbl_to_fun_itbl(info); - bitmap = fun_info->i.srt_bitmap; - if (bitmap) { - // don't read srt_offset if bitmap==0, because it doesn't exist - // and so the memory might not be readable. - scavenge_srt((StgClosure **)GET_FUN_SRT(fun_info), bitmap); + if (fun_info->i.srt) { + StgClosure *srt = (StgClosure*)GET_FUN_SRT(fun_info); + evacuate(&srt); } } @@ -737,18 +677,16 @@ scavenge_block (bdescr *bd) break; } - case MUT_ARR_PTRS_FROZEN: - case MUT_ARR_PTRS_FROZEN0: + case MUT_ARR_PTRS_FROZEN_CLEAN: + case MUT_ARR_PTRS_FROZEN_DIRTY: // follow everything { p = scavenge_mut_arr_ptrs((StgMutArrPtrs*)p); - // If we're going to put this object on the mutable list, then - // set its info ptr to MUT_ARR_PTRS_FROZEN0 to indicate that. if (gct->failed_to_evac) { - ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_FROZEN0_info; + ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_FROZEN_DIRTY_info; } else { - ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_FROZEN_info; + ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_FROZEN_CLEAN_info; } break; } @@ -780,8 +718,8 @@ scavenge_block (bdescr *bd) break; } - case SMALL_MUT_ARR_PTRS_FROZEN: - case SMALL_MUT_ARR_PTRS_FROZEN0: + case SMALL_MUT_ARR_PTRS_FROZEN_CLEAN: + case SMALL_MUT_ARR_PTRS_FROZEN_DIRTY: // follow everything { StgPtr next; @@ -791,12 +729,10 @@ scavenge_block (bdescr *bd) evacuate((StgClosure **)p); } - // If we're going to put this object on the mutable list, then - // set its info ptr to SMALL_MUT_ARR_PTRS_FROZEN0 to indicate that. if (gct->failed_to_evac) { - ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_FROZEN0_info; + ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_FROZEN_DIRTY_info; } else { - ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_FROZEN_info; + ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_FROZEN_CLEAN_info; } break; } @@ -1133,20 +1069,18 @@ scavenge_mark_stack(void) break; } - case MUT_ARR_PTRS_FROZEN: - case MUT_ARR_PTRS_FROZEN0: + case MUT_ARR_PTRS_FROZEN_CLEAN: + case MUT_ARR_PTRS_FROZEN_DIRTY: // follow everything { StgPtr q = p; scavenge_mut_arr_ptrs((StgMutArrPtrs *)p); - // If we're going to put this object on the mutable list, then - // set its info ptr to MUT_ARR_PTRS_FROZEN0 to indicate that. 
if (gct->failed_to_evac) { - ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_FROZEN0_info; + ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_FROZEN_DIRTY_info; } else { - ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_FROZEN_info; + ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_FROZEN_CLEAN_info; } break; } @@ -1180,8 +1114,8 @@ scavenge_mark_stack(void) break; } - case SMALL_MUT_ARR_PTRS_FROZEN: - case SMALL_MUT_ARR_PTRS_FROZEN0: + case SMALL_MUT_ARR_PTRS_FROZEN_CLEAN: + case SMALL_MUT_ARR_PTRS_FROZEN_DIRTY: // follow everything { StgPtr next, q = p; @@ -1191,12 +1125,10 @@ scavenge_mark_stack(void) evacuate((StgClosure **)p); } - // If we're going to put this object on the mutable list, then - // set its info ptr to SMALL_MUT_ARR_PTRS_FROZEN0 to indicate that. if (gct->failed_to_evac) { - ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_FROZEN0_info; + ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_FROZEN_DIRTY_info; } else { - ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_FROZEN_info; + ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_FROZEN_CLEAN_info; } break; } @@ -1365,7 +1297,7 @@ scavenge_one(StgPtr p) case WEAK: // This WEAK object will not be considered by tidyWeakList during this - // collection because it is in a generation >= N, but it is on the + // collection because it is in a generation > N, but it is on the // mutable list so we must evacuate all of its pointers because some // of them may point into a younger generation. scavengeLiveWeak((StgWeak *)p); @@ -1457,18 +1389,16 @@ scavenge_one(StgPtr p) break; } - case MUT_ARR_PTRS_FROZEN: - case MUT_ARR_PTRS_FROZEN0: + case MUT_ARR_PTRS_FROZEN_CLEAN: + case MUT_ARR_PTRS_FROZEN_DIRTY: { // follow everything scavenge_mut_arr_ptrs((StgMutArrPtrs *)p); - // If we're going to put this object on the mutable list, then - // set its info ptr to MUT_ARR_PTRS_FROZEN0 to indicate that. if (gct->failed_to_evac) { - ((StgClosure *)p)->header.info = &stg_MUT_ARR_PTRS_FROZEN0_info; + ((StgClosure *)p)->header.info = &stg_MUT_ARR_PTRS_FROZEN_DIRTY_info; } else { - ((StgClosure *)p)->header.info = &stg_MUT_ARR_PTRS_FROZEN_info; + ((StgClosure *)p)->header.info = &stg_MUT_ARR_PTRS_FROZEN_CLEAN_info; } break; } @@ -1502,8 +1432,8 @@ scavenge_one(StgPtr p) break; } - case SMALL_MUT_ARR_PTRS_FROZEN: - case SMALL_MUT_ARR_PTRS_FROZEN0: + case SMALL_MUT_ARR_PTRS_FROZEN_CLEAN: + case SMALL_MUT_ARR_PTRS_FROZEN_DIRTY: { // follow everything StgPtr next, q=p; @@ -1513,12 +1443,10 @@ scavenge_one(StgPtr p) evacuate((StgClosure **)p); } - // If we're going to put this object on the mutable list, then - // set its info ptr to SMALL_MUT_ARR_PTRS_FROZEN0 to indicate that. 
if (gct->failed_to_evac) { - ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_FROZEN0_info; + ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_FROZEN_DIRTY_info; } else { - ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_FROZEN_info; + ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_FROZEN_CLEAN_info; } break; } @@ -1653,8 +1581,8 @@ scavenge_mutable_list(bdescr *bd, generation *gen) mutlist_MUTVARS++; break; case MUT_ARR_PTRS_CLEAN: case MUT_ARR_PTRS_DIRTY: - case MUT_ARR_PTRS_FROZEN: - case MUT_ARR_PTRS_FROZEN0: + case MUT_ARR_PTRS_FROZEN_CLEAN: + case MUT_ARR_PTRS_FROZEN_DIRTY: mutlist_MUTARRS++; break; case MVAR_CLEAN: barf("MVAR_CLEAN on mutable list"); @@ -1669,10 +1597,6 @@ scavenge_mutable_list(bdescr *bd, generation *gen) mutlist_TVAR_WATCH_QUEUE++; else if (((StgClosure*)p)->header.info == &stg_TREC_HEADER_info) mutlist_TREC_HEADER++; - else if (((StgClosure*)p)->header.info == &stg_ATOMIC_INVARIANT_info) - mutlist_ATOMIC_INVARIANT++; - else if (((StgClosure*)p)->header.info == &stg_INVARIANT_CHECK_QUEUE_info) - mutlist_INVARIANT_CHECK_QUEUE++; else mutlist_OTHERS++; break; @@ -1690,6 +1614,7 @@ scavenge_mutable_list(bdescr *bd, generation *gen) // switch (get_itbl((StgClosure *)p)->type) { case MUT_ARR_PTRS_CLEAN: + case SMALL_MUT_ARR_PTRS_CLEAN: recordMutableGen_GC((StgClosure *)p,gen_no); continue; case MUT_ARR_PTRS_DIRTY: @@ -1813,7 +1738,11 @@ scavenge_static(void) case FUN_STATIC: scavenge_fun_srt(info); - break; + /* fallthrough */ + + // a FUN_STATIC can also be an SRT, so it may have pointer + // fields. See Note [SRTs] in CmmBuildInfoTables, specifically + // the [FUN] optimisation. case CONSTR: case CONSTR_NOCAF: @@ -1979,8 +1908,10 @@ scavenge_stack(StgPtr p, StgPtr stack_end) p = scavenge_small_bitmap(p, size, bitmap); follow_srt: - if (major_gc) - scavenge_srt((StgClosure **)GET_SRT(info), info->i.srt_bitmap); + if (major_gc && info->i.srt) { + StgClosure *srt = (StgClosure*)GET_SRT(info); + evacuate(&srt); + } continue; case RET_BCO: { diff --git a/rts/sm/Storage.c b/rts/sm/Storage.c index ffaed5f17c..dcc5b3a3c7 100644 --- a/rts/sm/Storage.c +++ b/rts/sm/Storage.c @@ -197,11 +197,7 @@ initStorage (void) #if defined(THREADED_RTS) initSpinLock(&gc_alloc_block_sync); -#if defined(PROF_SPIN) - whitehole_spin = 0; #endif -#endif - N = 0; for (n = 0; n < n_numa_nodes; n++) { @@ -224,6 +220,7 @@ initStorage (void) void storageAddCapabilities (uint32_t from, uint32_t to) { uint32_t n, g, i, new_n_nurseries; + nursery *old_nurseries; if (RtsFlags.GcFlags.nurseryChunkSize == 0) { new_n_nurseries = to; @@ -233,6 +230,7 @@ void storageAddCapabilities (uint32_t from, uint32_t to) stg_max(to, total_alloc / RtsFlags.GcFlags.nurseryChunkSize); } + old_nurseries = nurseries; if (from > 0) { nurseries = stgReallocBytes(nurseries, new_n_nurseries * sizeof(struct nursery_), @@ -244,8 +242,9 @@ void storageAddCapabilities (uint32_t from, uint32_t to) // we've moved the nurseries, so we have to update the rNursery // pointers from the Capabilities. - for (i = 0; i < to; i++) { - capabilities[i]->r.rNursery = &nurseries[i]; + for (i = 0; i < from; i++) { + uint32_t index = capabilities[i]->r.rNursery - old_nurseries; + capabilities[i]->r.rNursery = &nurseries[index]; } /* The allocation area. 
Policy: keep the allocation area @@ -307,21 +306,21 @@ freeStorage (bool free_heap) The entry code for every CAF does the following: - - calls newCaf, which builds a CAF_BLACKHOLE on the heap and atomically + - calls newCAF, which builds a CAF_BLACKHOLE on the heap and atomically updates the CAF with IND_STATIC pointing to the CAF_BLACKHOLE - - if newCaf returns zero, it re-enters the CAF (see Note [atomic + - if newCAF returns zero, it re-enters the CAF (see Note [atomic CAF entry]) - pushes an update frame pointing to the CAF_BLACKHOLE - Why do we build an BLACKHOLE in the heap rather than just updating + Why do we build a BLACKHOLE in the heap rather than just updating the thunk directly? It's so that we only need one kind of update frame - otherwise we'd need a static version of the update frame too, and various other parts of the RTS that deal with update frames would also need special cases for static update frames. - newCaf() does the following: + newCAF() does the following: - atomically locks the CAF (see [atomic CAF entry]) @@ -339,7 +338,7 @@ freeStorage (bool free_heap) ------------------ Note [atomic CAF entry] - With THREADED_RTS, newCaf() is required to be atomic (see + With THREADED_RTS, newCAF() is required to be atomic (see #5558). This is because if two threads happened to enter the same CAF simultaneously, they would create two distinct CAF_BLACKHOLEs, and so the normal threadPaused() machinery for detecting duplicate @@ -359,7 +358,7 @@ freeStorage (bool free_heap) - we must be able to *revert* CAFs that have been evaluated, to their pre-evaluated form. - To do this, we use an additional CAF list. When newCaf() is + To do this, we use an additional CAF list. When newCAF() is called on a dynamically-loaded CAF, we add it to the CAF list instead of the old-generation mutable list, and save away its old info pointer (in caf->saved_info) for later reversion. @@ -796,6 +795,20 @@ move_STACK (StgStack *src, StgStack *dest) dest->sp = (StgPtr)dest->sp + diff; } +STATIC_INLINE void +accountAllocation(Capability *cap, W_ n) +{ + TICK_ALLOC_HEAP_NOCTR(WDS(n)); + CCS_ALLOC(cap->r.rCCCS,n); + if (cap->r.rCurrentTSO != NULL) { + // cap->r.rCurrentTSO->alloc_limit -= n*sizeof(W_) + ASSIGN_Int64((W_*)&(cap->r.rCurrentTSO->alloc_limit), + (PK_Int64((W_*)&(cap->r.rCurrentTSO->alloc_limit)) + - n*sizeof(W_))); + } + +} + /* ----------------------------------------------------------------------------- StgPtr allocate (Capability *cap, W_ n) @@ -812,21 +825,37 @@ move_STACK (StgStack *src, StgStack *dest) that operation fails, then the whole process will be killed. -------------------------------------------------------------------------- */ +/* + * Allocate some n words of heap memory; terminating + * on heap overflow + */ StgPtr allocate (Capability *cap, W_ n) { + StgPtr p = allocateMightFail(cap, n); + if (p == NULL) { + reportHeapOverflow(); + // heapOverflow() doesn't exit (see #2592), but we aren't + // in a position to do a clean shutdown here: we + // either have to allocate the memory or exit now. + // Allocating the memory would be bad, because the user + // has requested that we not exceed maxHeapSize, so we + // just exit. 
+ stg_exit(EXIT_HEAPOVERFLOW); + } + return p; +} + +/* + * Allocate some n words of heap memory; returning NULL + * on heap overflow + */ +StgPtr +allocateMightFail (Capability *cap, W_ n) +{ bdescr *bd; StgPtr p; - TICK_ALLOC_HEAP_NOCTR(WDS(n)); - CCS_ALLOC(cap->r.rCCCS,n); - if (cap->r.rCurrentTSO != NULL) { - // cap->r.rCurrentTSO->alloc_limit -= n*sizeof(W_) - ASSIGN_Int64((W_*)&(cap->r.rCurrentTSO->alloc_limit), - (PK_Int64((W_*)&(cap->r.rCurrentTSO->alloc_limit)) - - n*sizeof(W_))); - } - if (n >= LARGE_OBJECT_THRESHOLD/sizeof(W_)) { // The largest number of words such that // the computation of req_blocks will not overflow. @@ -845,16 +874,12 @@ allocate (Capability *cap, W_ n) req_blocks >= HS_INT32_MAX) // avoid overflow when // calling allocGroup() below { - reportHeapOverflow(); - // heapOverflow() doesn't exit (see #2592), but we aren't - // in a position to do a clean shutdown here: we - // either have to allocate the memory or exit now. - // Allocating the memory would be bad, because the user - // has requested that we not exceed maxHeapSize, so we - // just exit. - stg_exit(EXIT_HEAPOVERFLOW); + return NULL; } + // Only credit allocation after we've passed the size check above + accountAllocation(cap, n); + ACQUIRE_SM_LOCK bd = allocGroupOnNode(cap->node,req_blocks); dbl_link_onto(bd, &g0->large_objects); @@ -870,6 +895,7 @@ allocate (Capability *cap, W_ n) /* small allocation (<LARGE_OBJECT_THRESHOLD) */ + accountAllocation(cap, n); bd = cap->r.rCurrentAlloc; if (bd == NULL || bd->free + n > bd->start + BLOCK_SIZE_W) { @@ -955,7 +981,8 @@ allocate (Capability *cap, W_ n) to pinned ByteArrays, not scavenging is ok. This function is called by newPinnedByteArray# which immediately - fills the allocated memory with a MutableByteArray#. + fills the allocated memory with a MutableByteArray#. Note that + this returns NULL on heap overflow. ------------------------------------------------------------------------- */ StgPtr @@ -967,20 +994,16 @@ allocatePinned (Capability *cap, W_ n) // If the request is for a large object, then allocate() // will give us a pinned object anyway. if (n >= LARGE_OBJECT_THRESHOLD/sizeof(W_)) { - p = allocate(cap, n); - Bdescr(p)->flags |= BF_PINNED; - return p; - } - - TICK_ALLOC_HEAP_NOCTR(WDS(n)); - CCS_ALLOC(cap->r.rCCCS,n); - if (cap->r.rCurrentTSO != NULL) { - // cap->r.rCurrentTSO->alloc_limit -= n*sizeof(W_); - ASSIGN_Int64((W_*)&(cap->r.rCurrentTSO->alloc_limit), - (PK_Int64((W_*)&(cap->r.rCurrentTSO->alloc_limit)) - - n*sizeof(W_))); + p = allocateMightFail(cap, n); + if (p == NULL) { + return NULL; + } else { + Bdescr(p)->flags |= BF_PINNED; + return p; + } } + accountAllocation(cap, n); bd = cap->pinned_object_block; // If we don't have a block of pinned objects yet, or the current @@ -1135,7 +1158,7 @@ dirty_MVAR(StgRegTable *reg, StgClosure *p) * -------------------------------------------------------------------------- */ /* ----------------------------------------------------------------------------- - * [Note allocation accounting] + * Note [allocation accounting] * * - When cap->r.rCurrentNusery moves to a new block in the nursery, * we add the size of the used portion of the previous block to @@ -1241,16 +1264,15 @@ W_ gcThreadLiveBlocks (uint32_t i, uint32_t g) * to store bitmaps and the mark stack. Note: blocks_needed does not * include the blocks in the nursery. * - * Assume: all data currently live will remain live. Generationss + * Assume: all data currently live will remain live. 
Generations * that will be collected next time will therefore need twice as many * blocks since all the data will be copied. */ extern W_ calcNeeded (bool force_major, memcount *blocks_needed) { - W_ needed = 0, blocks; - uint32_t g, N; - generation *gen; + W_ needed = 0; + uint32_t N; if (force_major) { N = RtsFlags.GcFlags.generations - 1; @@ -1258,12 +1280,12 @@ calcNeeded (bool force_major, memcount *blocks_needed) N = 0; } - for (g = 0; g < RtsFlags.GcFlags.generations; g++) { - gen = &generations[g]; + for (uint32_t g = 0; g < RtsFlags.GcFlags.generations; g++) { + generation *gen = &generations[g]; - blocks = gen->n_blocks // or: gen->n_words / BLOCK_SIZE_W (?) - + gen->n_large_blocks - + gen->n_compact_blocks; + W_ blocks = gen->n_blocks // or: gen->n_words / BLOCK_SIZE_W (?) + + gen->n_large_blocks + + gen->n_compact_blocks; // we need at least this much space needed += blocks; |
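
The Storage.c hunks above split heap allocation into allocate(), which reports the overflow and calls stg_exit(EXIT_HEAPOVERFLOW), and allocateMightFail()/allocatePinned(), which now return NULL when the request would exceed maxHeapSize. A minimal caller-side sketch of that contract follows; only the allocate/allocateMightFail/allocatePinned signatures come from the diff, while the wrapper name and its error handling are illustrative assumptions.

    /* Sketch: the NULL-on-overflow contract introduced in this diff.
     * allocateMightFail()/allocatePinned() are the Storage.c entry points;
     * the wrapper below and its recovery policy are hypothetical. */
    #include "Rts.h"   /* Capability, W_, StgPtr -- RTS-internal header */

    StgPtr allocWordsOrNull (Capability *cap, W_ n, bool pinned)
    {
        StgPtr p = pinned ? allocatePinned(cap, n)
                          : allocateMightFail(cap, n);
        if (p == NULL) {
            /* The request exceeded maxHeapSize; let the caller recover.
             * Plain allocate() would instead call reportHeapOverflow()
             * and stg_exit(EXIT_HEAPOVERFLOW). */
            return NULL;
        }
        return p;
    }

The split keeps the old terminate-on-overflow behaviour in allocate() itself while letting other callers handle a failed allocation themselves.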