| author    | Simon Marlow <marlowsd@gmail.com>        | 2012-02-13 11:17:50 +0000 |
|-----------|------------------------------------------|---------------------------|
| committer | Simon Marlow <marlowsd@gmail.com>        | 2012-02-13 12:26:10 +0000 |
| commit    | 67f4ab7e6b7705a9d617c6109a8c5434ede13cae |                           |
| tree      | 8f1ed63f526c3a88a4f234c9a3d5b5ac2a9eb0c6 |                           |
| parent    | 86ebfef9a5acc60b7a2ce3c8f025e6e707f17f87 |                           |

Allocate pinned object blocks from the nursery, not the global allocator.
Prompted by a benchmark posted to parallel-haskell@haskell.org by
Andreas Voellmy <andreas.voellmy@gmail.com>.  This program exhibits
contention for the block allocator when run with -N2 and greater
without the fix:

```haskell
{-# LANGUAGE MagicHash, UnboxedTuples, BangPatterns #-}
module Main where

import Control.Monad
import Control.Concurrent
import System.Environment
import GHC.IO
import GHC.Exts
import GHC.Conc

main :: IO ()
main = do
  [m] <- fmap (fmap read) getArgs
  n <- getNumCapabilities
  ms <- replicateM n newEmptyMVar
  sequence_ [ forkIO $ busyWorkerB (m `quot` n) >> putMVar mv () | mv <- ms ]
  mapM_ takeMVar ms

busyWorkerB :: Int -> IO ()
busyWorkerB n_loops = go 0
  where go !n | n >= n_loops = return ()
              | otherwise    =
          do _ <- IO $ \s ->
                    case newPinnedByteArray# 1024# s of
                      (# s', _mbarr# #) -> (# s', () #)
             go (n+1)
```
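
The commit message does not spell out how to run this; one plausible invocation is to compile with `ghc -O2 -threaded -rtsopts Main.hs` and run as `./Main 10000000 +RTS -N4 -s`. Without the fix, each 1024-byte pinned allocation that fills the current block takes the global storage-manager lock, so the `+RTS -s` summary shows the run scaling poorly beyond `-N1`.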

| mode       | file             | lines changed |
|------------|------------------|---------------|
| -rw-r--r-- | rts/Capability.c | 1             |
| -rw-r--r-- | rts/Capability.h | 2             |
| -rw-r--r-- | rts/sm/GC.c      | 42            |
| -rw-r--r-- | rts/sm/Sanity.c  | 3             |
| -rw-r--r-- | rts/sm/Storage.c | 59            |

5 files changed, 94 insertions, 13 deletions
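
Before reading the diff, it may help to recall how `allocatePinned()` is structured: small pinned allocations are served by bumping a free pointer inside a capability-local block, and a slow path runs only when the current block fills up. This patch changes only where the slow path obtains its next block. The following is a minimal schematic of the bump-pointer fast path, with illustrative types and sizes rather than the RTS's own definitions:

```c
#include <stddef.h>
#include <stdio.h>

typedef unsigned long W_;         // one heap word, as in the RTS
#define BLOCK_SIZE_W 1024         // words per block (illustrative value)

// A cut-down block descriptor: just the block start and the bump pointer.
typedef struct {
    W_ *start;
    W_ *free;
} bdescr;

// Fast path of a pinned allocator: bump bd->free when the request fits.
// Returning NULL stands in for the slow path -- fetching a fresh block --
// which is the part this patch changes in the real allocatePinned().
static W_ *alloc_pinned(bdescr *bd, size_t n) {
    if (bd->free + n > bd->start + BLOCK_SIZE_W)
        return NULL;              // block full: slow path takes over
    W_ *p = bd->free;
    bd->free += n;
    return p;
}

int main(void) {
    static W_ mem[BLOCK_SIZE_W];
    bdescr bd = { mem, mem };
    W_ *a = alloc_pinned(&bd, 128);   // fits: bumps free by 128 words
    W_ *b = alloc_pinned(&bd, 1000);  // 128 + 1000 > 1024: returns NULL
    printf("a=%p b=%p used=%ld words\n",
           (void *)a, (void *)b, (long)(bd.free - bd.start));
    return 0;
}
```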

```diff
diff --git a/rts/Capability.c b/rts/Capability.c
index 54f9196b99..2cb3042088 100644
--- a/rts/Capability.c
+++ b/rts/Capability.c
@@ -273,6 +273,7 @@ initCapability( Capability *cap, nat i )
     cap->transaction_tokens = 0;
     cap->context_switch = 0;
     cap->pinned_object_block = NULL;
+    cap->pinned_object_blocks = NULL;
 
 #ifdef PROFILING
     cap->r.rCCCS = CCS_SYSTEM;
diff --git a/rts/Capability.h b/rts/Capability.h
index 2ae2fcf6d7..2f616b5c6f 100644
--- a/rts/Capability.h
+++ b/rts/Capability.h
@@ -76,6 +76,8 @@ struct Capability_ {
     // block for allocating pinned objects into
     bdescr *pinned_object_block;
 
+    // full pinned object blocks allocated since the last GC
+    bdescr *pinned_object_blocks;
 
     // Context switch flag.  When non-zero, this means: stop running
     // Haskell code, and switch threads.
diff --git a/rts/sm/GC.c b/rts/sm/GC.c
index aeadf6f42f..86231948c1 100644
--- a/rts/sm/GC.c
+++ b/rts/sm/GC.c
@@ -150,6 +150,7 @@ static StgWord dec_running          (void);
 static void wakeup_gc_threads       (nat me);
 static void shutdown_gc_threads     (nat me);
 static void collect_gct_blocks      (void);
+static lnat collect_pinned_object_blocks (void);
 
 #if 0 && defined(DEBUG)
 static void gcCAFs                  (void);
@@ -285,6 +286,10 @@ GarbageCollect (rtsBool force_major_gc,
   // check sanity *before* GC
   IF_DEBUG(sanity, checkSanity(rtsFalse /* before GC */, major_gc));
 
+  // gather blocks allocated using allocatePinned() from each capability
+  // and put them on the g0->large_objects list.
+  collect_pinned_object_blocks();
+
   // Initialise all the generations/steps that we're collecting.
   for (g = 0; g <= N; g++) {
       prepare_collected_gen(&generations[g]);
@@ -1422,6 +1427,43 @@ collect_gct_blocks (void)
 }
 
 /* -----------------------------------------------------------------------------
+   During mutation, any blocks that are filled by allocatePinned() are
+   stashed on the local pinned_object_blocks list, to avoid needing to
+   take a global lock.  Here we collect those blocks from the
+   cap->pinned_object_blocks lists and put them on the
+   main g0->large_objects list.
+
+   Returns: the number of words allocated this way, for stats
+   purposes.
+   -------------------------------------------------------------------------- */
+
+static lnat
+collect_pinned_object_blocks (void)
+{
+    nat n;
+    bdescr *bd, *prev;
+    lnat allocated = 0;
+
+    for (n = 0; n < n_capabilities; n++) {
+        prev = NULL;
+        for (bd = capabilities[n].pinned_object_blocks; bd != NULL; bd = bd->link) {
+            allocated += bd->free - bd->start;
+            prev = bd;
+        }
+        if (prev != NULL) {
+            prev->link = g0->large_objects;
+            if (g0->large_objects != NULL) {
+                g0->large_objects->u.back = prev;
+            }
+            g0->large_objects = capabilities[n].pinned_object_blocks;
+            capabilities[n].pinned_object_blocks = 0;
+        }
+    }
+
+    return allocated;
+}
+
+/* -----------------------------------------------------------------------------
    Initialise a gc_thread before GC
    -------------------------------------------------------------------------- */
 
diff --git a/rts/sm/Sanity.c b/rts/sm/Sanity.c
index b6c5926ab8..78ecc96e0a 100644
--- a/rts/sm/Sanity.c
+++ b/rts/sm/Sanity.c
@@ -869,7 +869,7 @@ memInventory (rtsBool show)
           gen_blocks[g] += countBlocks(gc_threads[i]->gens[g].part_list);
           gen_blocks[g] += countBlocks(gc_threads[i]->gens[g].scavd_list);
           gen_blocks[g] += countBlocks(gc_threads[i]->gens[g].todo_bd);
-      }	  
+      }
       gen_blocks[g] += genBlocks(&generations[g]);
   }
 
@@ -880,6 +880,7 @@ memInventory (rtsBool show)
       if (capabilities[i].pinned_object_block != NULL) {
           nursery_blocks += capabilities[i].pinned_object_block->blocks;
       }
+      nursery_blocks += countBlocks(capabilities[i].pinned_object_blocks);
   }
 
   retainer_blocks = 0;
diff --git a/rts/sm/Storage.c b/rts/sm/Storage.c
index 012ba514db..0ff37d2582 100644
--- a/rts/sm/Storage.c
+++ b/rts/sm/Storage.c
@@ -744,8 +744,54 @@ allocatePinned (Capability *cap, lnat n)
     bd = cap->pinned_object_block;
 
     // If we don't have a block of pinned objects yet, or the current
-    // one isn't large enough to hold the new object, allocate a new one.
+    // one isn't large enough to hold the new object, get a new one.
     if (bd == NULL || (bd->free + n) > (bd->start + BLOCK_SIZE_W)) {
+
+        // stash the old block on cap->pinned_object_blocks.  On the
+        // next GC cycle these objects will be moved to
+        // g0->large_objects.
+        if (bd != NULL) {
+            dbl_link_onto(bd, &cap->pinned_object_blocks);
+        }
+
+        // We need to find another block.  We could just allocate one,
+        // but that means taking a global lock and we really want to
+        // avoid that (benchmarks that allocate a lot of pinned
+        // objects scale really badly if we do this).
+        //
+        // So first, we try taking the next block from the nursery, in
+        // the same way as allocate(), but note that we can only take
+        // an *empty* block, because we're about to mark it as
+        // BF_PINNED | BF_LARGE.
+        bd = cap->r.rCurrentNursery->link;
+        if (bd == NULL || bd->free != bd->start) { // must be empty!
+            // The nursery is empty, or the next block is non-empty:
+            // allocate a fresh block (we can't fail here).
+
+            // XXX in the case when the next nursery block is
+            // non-empty we aren't exerting any pressure to GC soon,
+            // so if this case ever happens then we could in theory
+            // keep allocating for ever without calling the GC. We
+            // can't bump g0->n_new_large_words because that will be
+            // counted towards allocation, and we're already counting
+            // our pinned objects as allocation in
+            // collect_pinned_object_blocks in the GC.
+            ACQUIRE_SM_LOCK;
+            bd = allocBlock();
+            RELEASE_SM_LOCK;
+            initBdescr(bd, g0, g0);
+        } else {
+            // we have a block in the nursery: steal it
+            cap->r.rCurrentNursery->link = bd->link;
+            if (bd->link != NULL) {
+                bd->link->u.back = cap->r.rCurrentNursery;
+            }
+            cap->r.rNursery->n_blocks--;
+        }
+
+        cap->pinned_object_block = bd;
+        bd->flags  = BF_PINNED | BF_LARGE | BF_EVACUATED;
+
         // The pinned_object_block remains attached to the capability
         // until it is full, even if a GC occurs.  We want this
         // behaviour because otherwise the unallocated portion of the
@@ -759,17 +805,6 @@ allocatePinned (Capability *cap, lnat n)
         // the next GC the BF_EVACUATED flag will be cleared, and the
         // block will be promoted as usual (if anything in it is
        // live).
-        ACQUIRE_SM_LOCK;
-        if (bd != NULL) {
-            dbl_link_onto(bd, &g0->large_objects);
-            g0->n_large_blocks++;
-            g0->n_new_large_words += bd->free - bd->start;
-        }
-        cap->pinned_object_block = bd = allocBlock();
-        RELEASE_SM_LOCK;
-        initBdescr(bd, g0, g0);
-        bd->flags  = BF_PINNED | BF_LARGE | BF_EVACUATED;
-	bd->free   = bd->start;
     }
 
     p = bd->free;
```
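
Two details of the patch are easier to follow with the supporting pieces in view. First, the stash onto `cap->pinned_object_blocks` uses `dbl_link_onto()`, a small helper in rts/sm/Storage.c that pushes a block onto the front of a doubly-linked `bdescr` list; its shape is roughly:

```c
// Roughly the shape of dbl_link_onto() in rts/sm/Storage.c: push bd
// onto the front of the doubly-linked list headed by *list.
void dbl_link_onto (bdescr *bd, bdescr **list)
{
    bd->link = *list;             // new head points at the old head
    bd->u.back = NULL;            // nothing precedes the new head
    if (*list != NULL) {
        (*list)->u.back = bd;     // old head's back pointer now points at bd
    }
    *list = bd;                   // the list now starts at bd
}
```

Second, the pattern that makes the benchmark scale — each mutator pushes onto a private intrusive list with no locking, and a stop-the-world phase splices every private list onto the global one — stands on its own outside the RTS. Below is a self-contained sketch of that pattern; all names (`cap_list`, `push_local`, `splice_all`) are illustrative, not RTS API, and the per-capability pushes are shown single-threaded for brevity:

```c
#include <stddef.h>
#include <stdio.h>

// A block descriptor reduced to the fields the pattern needs.
typedef struct block_ {
    struct block_ *link;    // forward pointer
    struct block_ *back;    // back pointer, fixed up when splicing
    size_t used;            // words used, for the allocation count
} block;

#define N_CAPS 4
static block *cap_list[N_CAPS]; // per-capability lists: no lock on the hot path
static block *global_list;      // global list: touched only at "GC" time

// Hot path: the owning capability pushes onto its own list, lock-free.
static void push_local(int cap, block *bd) {
    bd->link = cap_list[cap];
    bd->back = NULL;
    if (cap_list[cap] != NULL) cap_list[cap]->back = bd;
    cap_list[cap] = bd;
}

// Stop-the-world phase: splice each per-capability list onto the global
// list, mirroring collect_pinned_object_blocks() in the patch above.
static size_t splice_all(void) {
    size_t allocated = 0;
    for (int n = 0; n < N_CAPS; n++) {
        block *prev = NULL;
        for (block *bd = cap_list[n]; bd != NULL; bd = bd->link) {
            allocated += bd->used;   // count words for the stats
            prev = bd;               // remember the tail of this list
        }
        if (prev != NULL) {          // non-empty: hook tail to old global head
            prev->link = global_list;
            if (global_list != NULL) global_list->back = prev;
            global_list = cap_list[n];
            cap_list[n] = NULL;
        }
    }
    return allocated;
}

int main(void) {
    block blocks[6] = {0};
    for (int i = 0; i < 6; i++) {
        blocks[i].used = 100 + i;          // pretend each block holds some words
        push_local(i % N_CAPS, &blocks[i]);
    }
    printf("spliced %zu words onto the global list\n", splice_all()); // 615
    return 0;
}
```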
