path: root/rts/sm/NonMovingMark.c
Diffstat (limited to 'rts/sm/NonMovingMark.c')
-rw-r--r--  rts/sm/NonMovingMark.c  563
1 file changed, 531 insertions(+), 32 deletions(-)
diff --git a/rts/sm/NonMovingMark.c b/rts/sm/NonMovingMark.c
index cf1950471e..b273b09b05 100644
--- a/rts/sm/NonMovingMark.c
+++ b/rts/sm/NonMovingMark.c
@@ -67,6 +67,14 @@ bdescr *nonmoving_large_objects = NULL;
bdescr *nonmoving_marked_large_objects = NULL;
memcount n_nonmoving_large_blocks = 0;
memcount n_nonmoving_marked_large_blocks = 0;
+#if defined(THREADED_RTS)
+/* Protects everything above. Furthermore, we only set the BF_MARKED bit of
+ * large object blocks when this is held. This ensures that the write barrier
+ * (e.g. finish_upd_rem_set_mark) and the collector (mark_closure) don't try to
+ * move the same large object to nonmoving_marked_large_objects more than once.
+ */
+static Mutex nonmoving_large_objects_mutex;
+#endif
/*
* Where we keep our threads during collection since we must have a snapshot of
@@ -87,11 +95,257 @@ StgWeak *nonmoving_weak_ptr_list = NULL;
StgIndStatic *debug_caf_list_snapshot = (StgIndStatic*)END_OF_CAF_LIST;
#endif
+/* Note [Update remembered set]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * The concurrent non-moving collector uses a remembered set to ensure
+ * that its marking is consistent with the snapshot invariant defined in
+ * the design. This remembered set, known as the update remembered set,
+ * records all pointers that have been overwritten since the beginning
+ * of the concurrent mark. This ensures that concurrent mutation cannot hide
+ * pointers to live objects from the nonmoving garbage collector.
+ *
+ * The update remembered set is maintained via a write barrier that
+ * is enabled whenever a concurrent mark is active. This write barrier
+ * can be found in a number of places:
+ *
+ * - In rts/PrimOps.cmm in primops responsible for modifying mutable closures
+ * (e.g. MVARs, MUT_VARs, etc.)
+ *
+ * - In rts/STM.c, where the STM machinery mutates TVars (e.g. when updating
+ * their watch queues)
+ *
+ * - In the dirty_* functions found in rts/Storage.c where we dirty MVARs,
+ * MUT_VARs, TSOs and STACKs. STACK is a somewhat special case, as described
+ * in Note [StgStack dirtiness flags and concurrent marking] in TSO.h.
+ *
+ * - In the code generated by the STG code generator for pointer array writes
+ *
+ * There is also a read barrier to handle weak references, as described in
+ * Note [Concurrent read barrier on deRefWeak#].
+ *
+ * The representation of the update remembered set is the same as that of
+ * the mark queue. For efficiency, each capability maintains its own local
+ * accumulator of remembered set entries. When a capability fills its
+ * accumulator it is linked in to the global remembered set
+ * (upd_rem_set_block_list), where it is consumed by the mark phase.
+ *
+ * The mark phase is responsible for freeing update remembered set block
+ * allocations.
+ *
+ *
+ * Note [Concurrent read barrier on deRefWeak#]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * In general the non-moving GC assumes that all pointers reachable from a
+ * marked object are themselves marked (or in the mark queue). However,
+ * weak pointers are an obvious exception to this rule. In particular,
+ * deRefWeakPtr# allows the mutator to turn a weak reference into a strong
+ * reference. This interacts badly with concurrent collection. For
+ * instance, consider this program:
+ *
+ * f :: a -> b -> IO b
+ * f k v = do
+ * -- assume that k and v are the only references to the
+ * -- closures to which they refer.
+ * weak <- mkWeakPtr k v Nothing
+ *
+ * -- N.B. k is now technically dead since the only reference to it is
+ * -- weak, but we've not yet had a chance to tombstone the WeakPtr
+ * -- (which will happen in the course of major GC).
+ * performMajorGC
+ * -- Now we are running concurrently with the mark...
+ *
+ * Just x <- deRefWeak weak
+ * -- We have now introduced a reference to `v`, which will not be
+ * -- marked, since the only reference to `v` when the snapshot was
+ * -- taken was via a WeakPtr.
+ * return x
+ *
+ */
+static Mutex upd_rem_set_lock;
+bdescr *upd_rem_set_block_list = NULL;
+
+#if defined(THREADED_RTS)
+/* Used during the mark/sweep phase transition to track how many capabilities
+ * have pushed their update remembered sets. Protected by upd_rem_set_lock.
+ */
+static volatile StgWord upd_rem_set_flush_count = 0;
+#endif
+
+
+/* Signaled by each capability when it has flushed its update remembered set */
+static Condition upd_rem_set_flushed_cond;
+
+/* Indicates to mutators that the write barrier must be respected. Set while
+ * concurrent mark is running.
+ */
+StgWord nonmoving_write_barrier_enabled = false;
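For illustration, here is a minimal mutator-side sketch (an assumption for exposition, not code from this patch; the real barriers live in the places listed in Note [Update remembered set]) of how the flag above gates pushing an about-to-be-overwritten pointer:

    /* Hypothetical helper: overwrite the pointer held by a MUT_VAR while
     * preserving the snapshot invariant. */
    static void example_overwrite_mut_var(Capability *cap, StgMutVar *mv,
                                           StgClosure *new_value)
    {
        if (nonmoving_write_barrier_enabled) {
            // Push the old pointer so the concurrent mark can still trace it.
            updateRemembSetPushClosure(cap, mv->var);
        }
        mv->var = new_value;
    }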
+
/* Used to provide the current mark queue to the young generation
* collector for scavenging.
*/
MarkQueue *current_mark_queue = NULL;
+/* Initialise update remembered set data structures */
+void nonmovingMarkInitUpdRemSet() {
+ initMutex(&upd_rem_set_lock);
+ initCondition(&upd_rem_set_flushed_cond);
+#if defined(THREADED_RTS)
+ initMutex(&nonmoving_large_objects_mutex);
+#endif
+}
+
+#if defined(THREADED_RTS) && defined(DEBUG)
+static uint32_t markQueueLength(MarkQueue *q);
+#endif
+static void init_mark_queue_(MarkQueue *queue);
+
+/* Transfers the given capability's update-remembered set to the global
+ * remembered set.
+ *
+ * Really the argument type should be UpdRemSet* but this would be rather
+ * inconvenient without polymorphism.
+ */
+static void nonmovingAddUpdRemSetBlocks(MarkQueue *rset)
+{
+ if (markQueueIsEmpty(rset)) return;
+
+ // find the tail of the queue
+ bdescr *start = rset->blocks;
+ bdescr *end = start;
+ while (end->link != NULL)
+ end = end->link;
+
+ // add the blocks to the global remembered set
+ ACQUIRE_LOCK(&upd_rem_set_lock);
+ end->link = upd_rem_set_block_list;
+ upd_rem_set_block_list = start;
+ RELEASE_LOCK(&upd_rem_set_lock);
+
+ // Reset remembered set
+ ACQUIRE_SM_LOCK;
+ init_mark_queue_(rset);
+ rset->is_upd_rem_set = true;
+ RELEASE_SM_LOCK;
+}
+
+#if defined(THREADED_RTS)
+/* Called by capabilities to flush their update remembered sets when
+ * synchronising with the non-moving collector as it transitions from mark to
+ * sweep phase.
+ */
+void nonmovingFlushCapUpdRemSetBlocks(Capability *cap)
+{
+ debugTrace(DEBUG_nonmoving_gc,
+ "Capability %d flushing update remembered set: %d",
+ cap->no, markQueueLength(&cap->upd_rem_set.queue));
+ nonmovingAddUpdRemSetBlocks(&cap->upd_rem_set.queue);
+ atomic_inc(&upd_rem_set_flush_count, 1);
+ signalCondition(&upd_rem_set_flushed_cond);
+ // After this, mutation will remain suspended until nonmovingFinishFlush
+ // releases its capabilities.
+}
+
+/* Request that all capabilities flush their update remembered sets and suspend
+ * execution until further notice.
+ */
+void nonmovingBeginFlush(Task *task)
+{
+ debugTrace(DEBUG_nonmoving_gc, "Starting update remembered set flush...");
+ upd_rem_set_flush_count = 0;
+ stopAllCapabilitiesWith(NULL, task, SYNC_FLUSH_UPD_REM_SET);
+
+ // XXX: We may have been given a capability via releaseCapability (i.e. a
+ // task suspended due to a foreign call) in which case our requestSync
+ // logic won't have been hit. Make sure that everyone so far has flushed.
+ // Ideally we want to mark asynchronously with syncing.
+ for (uint32_t i = 0; i < n_capabilities; i++) {
+ nonmovingFlushCapUpdRemSetBlocks(capabilities[i]);
+ }
+}
+
+/* Wait until a capability has flushed its update remembered set. Returns true
+ * if all capabilities have flushed.
+ */
+bool nonmovingWaitForFlush()
+{
+ ACQUIRE_LOCK(&upd_rem_set_lock);
+ debugTrace(DEBUG_nonmoving_gc, "Flush count %d", upd_rem_set_flush_count);
+ bool finished = upd_rem_set_flush_count == n_capabilities;
+ if (!finished) {
+ waitCondition(&upd_rem_set_flushed_cond, &upd_rem_set_lock);
+ }
+ RELEASE_LOCK(&upd_rem_set_lock);
+ return finished;
+}
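Taken together, nonmovingBeginFlush, nonmovingWaitForFlush and nonmovingFinishFlush form a small synchronisation protocol. A rough sketch of how a collector-side caller might drive it (illustrative only; the actual driver is the non-moving mark entry point, e.g. nonmovingMark_ in the backtrace in the Note below):

    static void example_flush_protocol(Task *task, MarkQueue *mark_queue)
    {
        // Stop the mutators and ask every capability to flush its accumulator.
        nonmovingBeginFlush(task);

        bool all_flushed;
        do {
            // Mark what has been flushed so far while waiting for the rest.
            all_flushed = nonmovingWaitForFlush();
            nonmovingMark(mark_queue);
        } while (!all_flushed);

        // ... finish marking, process weak pointers, resurrect threads ...

        // Reset the update remembered sets and let the mutators resume.
        nonmovingFinishFlush(task);
    }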
+
+/* Note [Unintentional marking in resurrectThreads]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * In both the moving and non-moving collectors, threads found to be unreachable
+ * are evacuated/marked and then resurrected with resurrectThreads.
+ * resurrectThreads raises an exception in the unreachable thread via raiseAsync,
+ * which mutates the heap. These mutations cause entries to be added to the
+ * UpdRemSet of the thread's capability. Here's an example backtrace where this
+ * happens:
+ *
+ * #0 updateRemembSetPushClosure
+ * #1 0x000000000072b363 in dirty_TVAR
+ * #2 0x00000000007162e5 in remove_watch_queue_entries_for_trec
+ * #3 0x0000000000717098 in stmAbortTransaction
+ * #4 0x000000000070c6eb in raiseAsync
+ * #5 0x000000000070b473 in throwToSingleThreaded__
+ * #6 0x000000000070b4ab in throwToSingleThreaded
+ * #7 0x00000000006fce82 in resurrectThreads
+ * #8 0x00000000007215db in nonmovingMark_
+ * #9 0x0000000000721438 in nonmovingConcurrentMark
+ * #10 0x00007f1ee81cd6db in start_thread
+ * #11 0x00007f1ee850688f in clone
+ *
+ * However, we don't really want to run write barriers when calling
+ * resurrectThreads here, because we're in a GC pause, and overwritten values
+ * are definitely gone forever (as opposed to being inserted in a marked object
+ * or kept in registers and used later).
+ *
+ * If we don't reset the UpdRemSets after this, then during the next mark we see
+ * the objects that the previous mark's resurrectThreads added to the UpdRemSets
+ * and mark them too. This keeps unreachable objects alive and breaks weak
+ * finalization and thread resurrection (both of which rely on objects becoming
+ * unreachable). As an example, stm048 fails when we get this wrong: when we
+ * raiseAsync on a thread that was blocked on an STM transaction we mutate a
+ * TVAR_WATCH_QUEUE, which has a reference to the TSO that was running the STM
+ * transaction. If that TSO becomes unreachable again in the next GC we don't
+ * notice, because raiseAsync added it to an UpdRemSet during the previous GC's
+ * mark phase.
+ *
+ * To fix this we clear all UpdRemSets in nonmovingFinishFlush, right before
+ * releasing capabilities. This is somewhat inefficient (we allow adding objects
+ * to UpdRemSets, only to later reset them), but the only case where we add to
+ * UpdRemSets during mark is resurrectThreads, and I don't think we resurrect so
+ * many threads that we fill the UpdRemSets and allocate new blocks. So pushing
+ * to an UpdRemSet in this case is really fast, and resetting it is even faster
+ * (we just update a pointer).
+ *
+ * TODO (osa): What if we actually marked UpdRemSets in this case, in the mark
+ * loop? Would that work? Or what would break?
+ */
+
+/* Notify capabilities that the synchronisation is finished; they may resume
+ * execution.
+ */
+void nonmovingFinishFlush(Task *task)
+{
+ // See Note [Unintentional marking in resurrectThreads]
+ for (uint32_t i = 0; i < n_capabilities; i++) {
+ reset_upd_rem_set(&capabilities[i]->upd_rem_set);
+ }
+ // Also reset upd_rem_set_block_list in case some of the UpdRemSets were
+ // filled and we flushed them.
+ freeChain_lock(upd_rem_set_block_list);
+ upd_rem_set_block_list = NULL;
+
+ debugTrace(DEBUG_nonmoving_gc, "Finished update remembered set flush...");
+ releaseAllCapabilities(n_capabilities, NULL, task);
+}
+#endif
+
/*********************************************************
* Pushing to either the mark queue or remembered set
*********************************************************/
@@ -102,14 +356,18 @@ push (MarkQueue *q, const MarkQueueEnt *ent)
// Are we at the end of the block?
if (q->top->head == MARK_QUEUE_BLOCK_ENTRIES) {
// Yes, this block is full.
- // allocate a fresh block.
- ACQUIRE_SM_LOCK;
- bdescr *bd = allocGroup(1);
- bd->link = q->blocks;
- q->blocks = bd;
- q->top = (MarkQueueBlock *) bd->start;
- q->top->head = 0;
- RELEASE_SM_LOCK;
+ if (q->is_upd_rem_set) {
+ nonmovingAddUpdRemSetBlocks(q);
+ } else {
+ // allocate a fresh block.
+ ACQUIRE_SM_LOCK;
+ bdescr *bd = allocGroup(1);
+ bd->link = q->blocks;
+ q->blocks = bd;
+ q->top = (MarkQueueBlock *) bd->start;
+ q->top->head = 0;
+ RELEASE_SM_LOCK;
+ }
}
q->top->entries[q->top->head] = *ent;
@@ -183,6 +441,183 @@ void push_fun_srt (MarkQueue *q, const StgInfoTable *info)
}
/*********************************************************
+ * Pushing to the update remembered set
+ *
+ * upd_rem_set_push_* functions are directly called by
+ * mutators and need to check whether the value is in
+ * non-moving heap.
+ *********************************************************/
+
+// Check if the object is traced by the non-moving collector. This holds in the
+// following cases:
+//
+// - Object is in the non-moving heap
+// - Object is large (BF_LARGE) and marked as BF_NONMOVING
+// - Object is static (HEAP_ALLOCED_GC(obj) == false)
+//
+static
+bool check_in_nonmoving_heap(StgClosure *p) {
+ if (HEAP_ALLOCED_GC(p)) {
+ // This works for both large and small objects:
+ return Bdescr((P_)p)->flags & BF_NONMOVING;
+ } else {
+ return true; // a static object
+ }
+}
+
+/* Push the free variables of a (now-evaluated) thunk to the
+ * update remembered set.
+ */
+inline void updateRemembSetPushThunk(Capability *cap, StgThunk *thunk)
+{
+ const StgInfoTable *info;
+ do {
+ info = get_volatile_itbl((StgClosure *) thunk);
+ } while (info->type == WHITEHOLE);
+ updateRemembSetPushThunkEager(cap, (StgThunkInfoTable *) info, thunk);
+}
+
+void updateRemembSetPushThunkEager(Capability *cap,
+ const StgThunkInfoTable *info,
+ StgThunk *thunk)
+{
+ /* N.B. info->i.type mustn't be WHITEHOLE */
+ switch (info->i.type) {
+ case THUNK:
+ case THUNK_1_0:
+ case THUNK_0_1:
+ case THUNK_2_0:
+ case THUNK_1_1:
+ case THUNK_0_2:
+ {
+ MarkQueue *queue = &cap->upd_rem_set.queue;
+ push_thunk_srt(queue, &info->i);
+
+ // Don't record the origin of objects living outside of the nonmoving
+ // heap; we can't perform the selector optimisation on them anyway.
+ bool record_origin = check_in_nonmoving_heap((StgClosure*)thunk);
+
+ for (StgWord i = 0; i < info->i.layout.payload.ptrs; i++) {
+ if (check_in_nonmoving_heap(thunk->payload[i])) {
+ push_closure(queue,
+ thunk->payload[i],
+ record_origin ? &thunk->payload[i] : NULL);
+ }
+ }
+ break;
+ }
+ case AP:
+ {
+ MarkQueue *queue = &cap->upd_rem_set.queue;
+ StgAP *ap = (StgAP *) thunk;
+ push_closure(queue, ap->fun, &ap->fun);
+ mark_PAP_payload(queue, ap->fun, ap->payload, ap->n_args);
+ break;
+ }
+ case THUNK_SELECTOR:
+ case BLACKHOLE:
+ // TODO: This is right, right?
+ break;
+ default:
+ barf("updateRemembSetPushThunk: invalid thunk pushed: p=%p, type=%d",
+ thunk, info->i.type);
+ }
+}
+
+void updateRemembSetPushThunk_(StgRegTable *reg, StgThunk *p)
+{
+ updateRemembSetPushThunk(regTableToCapability(reg), p);
+}
+
+inline void updateRemembSetPushClosure(Capability *cap, StgClosure *p)
+{
+ if (!check_in_nonmoving_heap(p)) return;
+ MarkQueue *queue = &cap->upd_rem_set.queue;
+ push_closure(queue, p, NULL);
+}
+
+void updateRemembSetPushClosure_(StgRegTable *reg, StgClosure *p)
+{
+ updateRemembSetPushClosure(regTableToCapability(reg), p);
+}
+
+STATIC_INLINE bool needs_upd_rem_set_mark(StgClosure *p)
+{
+ // TODO: Deduplicate with mark_closure
+ bdescr *bd = Bdescr((StgPtr) p);
+ if (bd->gen != oldest_gen) {
+ return false;
+ } else if (bd->flags & BF_LARGE) {
+ if (! (bd->flags & BF_NONMOVING_SWEEPING)) {
+ return false;
+ } else {
+ return ! (bd->flags & BF_MARKED);
+ }
+ } else {
+ struct NonmovingSegment *seg = nonmovingGetSegment((StgPtr) p);
+ nonmoving_block_idx block_idx = nonmovingGetBlockIdx((StgPtr) p);
+ return nonmovingGetMark(seg, block_idx) != nonmovingMarkEpoch;
+ }
+}
+
+/* Set the mark bit; only to be called *after* we have fully marked the closure */
+STATIC_INLINE void finish_upd_rem_set_mark(StgClosure *p)
+{
+ bdescr *bd = Bdescr((StgPtr) p);
+ if (bd->flags & BF_LARGE) {
+ // Someone else may have already marked it.
+ ACQUIRE_LOCK(&nonmoving_large_objects_mutex);
+ if (! (bd->flags & BF_MARKED)) {
+ bd->flags |= BF_MARKED;
+ dbl_link_remove(bd, &nonmoving_large_objects);
+ dbl_link_onto(bd, &nonmoving_marked_large_objects);
+ n_nonmoving_large_blocks -= bd->blocks;
+ n_nonmoving_marked_large_blocks += bd->blocks;
+ }
+ RELEASE_LOCK(&nonmoving_large_objects_mutex);
+ } else {
+ struct NonmovingSegment *seg = nonmovingGetSegment((StgPtr) p);
+ nonmoving_block_idx block_idx = nonmovingGetBlockIdx((StgPtr) p);
+ nonmovingSetMark(seg, block_idx);
+ }
+}
+
+void updateRemembSetPushTSO(Capability *cap, StgTSO *tso)
+{
+ if (needs_upd_rem_set_mark((StgClosure *) tso)) {
+ debugTrace(DEBUG_nonmoving_gc, "upd_rem_set: TSO %p", tso);
+ mark_tso(&cap->upd_rem_set.queue, tso);
+ finish_upd_rem_set_mark((StgClosure *) tso);
+ }
+}
+
+void updateRemembSetPushStack(Capability *cap, StgStack *stack)
+{
+ // N.B. caller responsible for checking nonmoving_write_barrier_enabled
+ if (needs_upd_rem_set_mark((StgClosure *) stack)) {
+ StgWord marking = stack->marking;
+ // See Note [StgStack dirtiness flags and concurrent marking]
+ if (cas(&stack->marking, marking, nonmovingMarkEpoch)
+ != nonmovingMarkEpoch) {
+ // We have claimed the right to mark the stack.
+ debugTrace(DEBUG_nonmoving_gc, "upd_rem_set: STACK %p", stack->sp);
+ mark_stack(&cap->upd_rem_set.queue, stack);
+ finish_upd_rem_set_mark((StgClosure *) stack);
+ return;
+ } else {
+ // The concurrent GC has claimed the right to mark the stack.
+ // Wait until it finishes marking before proceeding with
+ // mutation.
+ while (needs_upd_rem_set_mark((StgClosure *) stack)) {
+#if defined(PARALLEL_GC)
+ busy_wait_nop(); // TODO: Spinning here is unfortunate
+#endif
+ }
+ return;
+ }
+ }
+}
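As the N.B. above says, the write-barrier check happens in the caller. A hedged sketch (assumed shape, not the code from this patch) of how the dirty_* machinery mentioned in Note [Update remembered set] might invoke the stack barrier when dirtying a clean stack:

    /* Hypothetical caller: dirty a clean stack, informing the concurrent
     * collector first so that the snapshot still sees the old frames. */
    static void example_dirty_stack(Capability *cap, StgStack *stack)
    {
        if (nonmoving_write_barrier_enabled) {
            // Caller-side check required by updateRemembSetPushStack.
            updateRemembSetPushStack(cap, stack);
        }
        stack->dirty = 1;
    }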
+
+/*********************************************************
* Pushing to the mark queue
*********************************************************/
@@ -192,8 +627,8 @@ void markQueuePush (MarkQueue *q, const MarkQueueEnt *ent)
}
void markQueuePushClosure (MarkQueue *q,
- StgClosure *p,
- StgClosure **origin)
+ StgClosure *p,
+ StgClosure **origin)
{
push_closure(q, p, origin);
}
@@ -264,7 +699,7 @@ again:
}
/*********************************************************
- * Creating and destroying MarkQueues
+ * Creating and destroying MarkQueues and UpdRemSets
*********************************************************/
/* Must hold sm_mutex. */
@@ -281,22 +716,45 @@ void initMarkQueue (MarkQueue *queue)
{
init_mark_queue_(queue);
queue->marked_objects = allocHashTable();
+ queue->is_upd_rem_set = false;
+}
+
+/* Must hold sm_mutex. */
+void init_upd_rem_set (UpdRemSet *rset)
+{
+ init_mark_queue_(&rset->queue);
+ // Update remembered sets don't have to worry about static objects
+ rset->queue.marked_objects = NULL;
+ rset->queue.is_upd_rem_set = true;
+}
+
+void reset_upd_rem_set (UpdRemSet *rset)
+{
+ // UpdRemSets always have exactly one block for the mark queue. This assertion
+ // is here so that we remember to update this code if we change that.
+ ASSERT(rset->queue.blocks->link == NULL);
+ rset->queue.top->head = 0;
}
void freeMarkQueue (MarkQueue *queue)
{
- bdescr* b = queue->blocks;
- ACQUIRE_SM_LOCK;
- while (b)
- {
- bdescr* b_ = b->link;
- freeGroup(b);
- b = b_;
- }
- RELEASE_SM_LOCK;
+ freeChain_lock(queue->blocks);
freeHashTable(queue->marked_objects, NULL);
}
+#if defined(THREADED_RTS) && defined(DEBUG)
+static uint32_t
+markQueueLength (MarkQueue *q)
+{
+ uint32_t n = 0;
+ for (bdescr *block = q->blocks; block; block = block->link) {
+ MarkQueueBlock *queue = (MarkQueueBlock*)block->start;
+ n += queue->head;
+ }
+ return n;
+}
+#endif
+
/*********************************************************
* Marking
@@ -307,7 +765,8 @@ void freeMarkQueue (MarkQueue *queue)
* barrier. Consequently it's quite important that we deeply mark
* any outstanding transactions.
*/
-static void mark_trec_header (MarkQueue *queue, StgTRecHeader *trec)
+static void
+mark_trec_header (MarkQueue *queue, StgTRecHeader *trec)
{
while (trec != NO_TREC) {
StgTRecChunk *chunk = trec->current_chunk;
@@ -326,7 +785,8 @@ static void mark_trec_header (MarkQueue *queue, StgTRecHeader *trec)
}
}
-static void mark_tso (MarkQueue *queue, StgTSO *tso)
+static void
+mark_tso (MarkQueue *queue, StgTSO *tso)
{
// TODO: Clear dirty if contains only old gen objects
@@ -535,7 +995,7 @@ mark_closure (MarkQueue *queue, StgClosure *p, StgClosure **origin)
p = UNTAG_CLOSURE(p);
# define PUSH_FIELD(obj, field) \
- markQueuePushClosure(queue, \
+ markQueuePushClosure(queue, \
(StgClosure *) (obj)->field, \
(StgClosure **) &(obj)->field)
@@ -592,7 +1052,7 @@ mark_closure (MarkQueue *queue, StgClosure *p, StgClosure **origin)
return;
case WHITEHOLE:
- while (get_itbl(p)->type == WHITEHOLE);
+ while (get_volatile_itbl(p)->type == WHITEHOLE);
// busy_wait_nop(); // FIXME
goto try_again;
@@ -608,9 +1068,12 @@ mark_closure (MarkQueue *queue, StgClosure *p, StgClosure **origin)
// we moved everything to the non-moving heap before starting the major
// collection, we know that we don't need to trace it: it was allocated
// after we took our snapshot.
-
+#if !defined(THREADED_RTS)
// This should never happen in the non-concurrent case
barf("Closure outside of non-moving heap: %p", p);
+#else
+ return;
+#endif
}
ASSERTM(LOOKS_LIKE_CLOSURE_PTR(p), "invalid closure, info=%p", p->header.info);
@@ -878,7 +1341,22 @@ mark_closure (MarkQueue *queue, StgClosure *p, StgClosure **origin)
case STACK: {
// See Note [StgStack dirtiness flags and concurrent marking]
StgStack *stack = (StgStack *) p;
- mark_stack(queue, stack);
+ StgWord marking = stack->marking;
+
+ // N.B. stack->marking must be != nonmovingMarkEpoch unless
+ // someone has already marked it.
+ if (cas(&stack->marking, marking, nonmovingMarkEpoch)
+ != nonmovingMarkEpoch) {
+ // We have claimed the right to mark the stack.
+ mark_stack(queue, stack);
+ } else {
+ // A mutator has already started marking the stack; we just let it
+ // do its thing and move on. There's no reason to wait; we know that
+ // the stack will be fully marked before we sweep due to the final
+ // post-mark synchronization. Most importantly, we do not set its
+ // mark bit, the mutator is responsible for this.
+ return;
+ }
break;
}
@@ -905,8 +1383,7 @@ mark_closure (MarkQueue *queue, StgClosure *p, StgClosure **origin)
}
case WHITEHOLE:
- while (get_itbl(p)->type == WHITEHOLE);
- // busy_wait_nop(); // FIXME
+ while (get_volatile_itbl(p)->type == WHITEHOLE);
goto try_again;
default:
@@ -921,6 +1398,12 @@ mark_closure (MarkQueue *queue, StgClosure *p, StgClosure **origin)
* mutator waiting for us to finish so it can start execution.
*/
if (bd->flags & BF_LARGE) {
+ /* Marking a large object isn't idempotent since we move it to
+ * nonmoving_marked_large_objects; to ensure that we don't repeatedly
+ * mark a large object, we only set BF_MARKED on large objects in the
+ * nonmoving heap while holding nonmoving_large_objects_mutex
+ */
+ ACQUIRE_LOCK(&nonmoving_large_objects_mutex);
if (! (bd->flags & BF_MARKED)) {
// Remove the object from nonmoving_large_objects and link it to
// nonmoving_marked_large_objects
@@ -930,6 +1413,7 @@ mark_closure (MarkQueue *queue, StgClosure *p, StgClosure **origin)
n_nonmoving_marked_large_blocks += bd->blocks;
bd->flags |= BF_MARKED;
}
+ RELEASE_LOCK(&nonmoving_large_objects_mutex);
} else {
// TODO: Kill repetition
struct NonmovingSegment *seg = nonmovingGetSegment((StgPtr) p);
@@ -947,7 +1431,8 @@ mark_closure (MarkQueue *queue, StgClosure *p, StgClosure **origin)
* c. the mark queue has been seeded with a set of roots.
*
*/
-GNUC_ATTR_HOT void nonmovingMark (MarkQueue *queue)
+GNUC_ATTR_HOT void
+nonmovingMark (MarkQueue *queue)
{
debugTrace(DEBUG_nonmoving_gc, "Starting mark pass");
unsigned int count = 0;
@@ -974,9 +1459,23 @@ GNUC_ATTR_HOT void nonmovingMark (MarkQueue *queue)
break;
}
case NULL_ENTRY:
- // Nothing more to do
- debugTrace(DEBUG_nonmoving_gc, "Finished mark pass: %d", count);
- return;
+ // Perhaps the update remembered set has more to mark...
+ if (upd_rem_set_block_list) {
+ ACQUIRE_LOCK(&upd_rem_set_lock);
+ bdescr *old = queue->blocks;
+ queue->blocks = upd_rem_set_block_list;
+ queue->top = (MarkQueueBlock *) queue->blocks->start;
+ upd_rem_set_block_list = NULL;
+ RELEASE_LOCK(&upd_rem_set_lock);
+
+ ACQUIRE_SM_LOCK;
+ freeGroup(old);
+ RELEASE_SM_LOCK;
+ } else {
+ // Nothing more to do
+ debugTrace(DEBUG_nonmoving_gc, "Finished mark pass: %d", count);
+ return;
+ }
}
}
}