NUMA cleanups

- Move the numaMap and nNumaNodes out of RtsFlags to Capability.c
- Add a test to tests/rts
author: Simon Marlow <marlowsd@gmail.com> 2016-06-11 11:07:14 +0100
committer: Simon Marlow <marlowsd@gmail.com> 2016-06-17 14:52:45 +0100
commit: 498ed2664219f7e8f1077f46ad2061aba2f57de4 (patch)
tree: 123f66f55096876114b89876e4adf287ad944818
parent: a7f65b8787b0521397ee09061394425aa69bc6e0 (diff)
download: haskell-498ed2664219f7e8f1077f46ad2061aba2f57de4.tar.gz
12 files changed, 94 insertions, 63 deletions
diff --git a/includes/rts/Flags.h b/includes/rts/Flags.h
index e229aa12b1..c66aed90a3 100644
--- a/includes/rts/Flags.h
+++ b/includes/rts/Flags.h
@@ -73,9 +73,7 @@ typedef struct _GC_FLAGS {
                                  */
 
     rtsBool numa;               /* Use NUMA */
-    uint32_t nNumaNodes;        /* Number of nodes */
-    uint32_t numaMap[MAX_NUMA_NODES]; /* Map our internal node numbers to OS
-                                       * node numbers */
+    StgWord numaMask;
 } GC_FLAGS;
 
 /* See Note [Synchronization of flags and base APIs] */
diff --git a/libraries/base/GHC/RTS/Flags.hsc b/libraries/base/GHC/RTS/Flags.hsc
index e067019a8c..5eba4860ff 100644
--- a/libraries/base/GHC/RTS/Flags.hsc
+++ b/libraries/base/GHC/RTS/Flags.hsc
@@ -114,7 +114,7 @@ data GCFlags = GCFlags
     , heapBase              :: Word -- ^ address to ask the OS for memory
     , allocLimitGrace       :: Word
     , numa                  :: Bool
-    , nNumaNodes            :: Word32
+    , numaMask              :: Word
     } deriving (Show)
 
 -- | Parameters concerning context switching
@@ -376,7 +376,7 @@ getGCFlags = do
           <*> #{peek GC_FLAGS, heapBase} ptr
           <*> #{peek GC_FLAGS, allocLimitGrace} ptr
           <*> #{peek GC_FLAGS, numa} ptr
-          <*> #{peek GC_FLAGS, nNumaNodes} ptr
+          <*> #{peek GC_FLAGS, numaMask} ptr
 
 getParFlags :: IO ParFlags
 getParFlags = do
diff --git a/rts/Capability.c b/rts/Capability.c
index 411e64dc7a..7ca220fbd9 100644
--- a/rts/Capability.c
+++ b/rts/Capability.c
@@ -26,6 +26,7 @@
 #include "sm/GC.h" // for gcWorkerThread()
 #include "STM.h"
 #include "RtsUtils.h"
+#include "sm/OSMem.h"
 
 #if !defined(mingw32_HOST_OS)
 #include "rts/IOManager.h" // for setIOManagerControlFd()
@@ -59,6 +60,12 @@ static Capability *last_free_capability[MAX_NUMA_NODES];
  */
 PendingSync * volatile pending_sync = 0;
 
+// Number of logical NUMA nodes
+uint32_t n_numa_nodes;
+
+// Map logical NUMA node to OS node numbers
+uint32_t numa_map[MAX_NUMA_NODES];
+
 /* Let foreign code get the current Capability -- assuming there is one!
  * This is useful for unsafe foreign calls because they are called with
  * the current Capability held, but they are not passed it. For example,
@@ -326,6 +333,31 @@ void initCapabilities (void)
     traceCapsetCreate(CAPSET_OSPROCESS_DEFAULT, CapsetTypeOsProcess);
     traceCapsetCreate(CAPSET_CLOCKDOMAIN_DEFAULT, CapsetTypeClockdomain);
 
+    // Initialise NUMA: fill numa_map (logical -> OS node) and n_numa_nodes
+    if (!RtsFlags.GcFlags.numa) {
+        n_numa_nodes = 1;   // NUMA off: one logical node, all mapped to 0
+        for (i = 0; i < MAX_NUMA_NODES; i++) {
+            numa_map[i] = 0;
+        }
+    } else {
+        uint32_t nNodes = osNumaNodes();
+        if (nNodes > MAX_NUMA_NODES) {
+            barf("Too many NUMA nodes (max %d)", MAX_NUMA_NODES);
+        }
+        StgWord mask = RtsFlags.GcFlags.numaMask & osNumaMask();
+        uint32_t logical = 0, physical = 0;
+        for (; physical < MAX_NUMA_NODES; physical++) {
+            if (mask & 1) {   // bit N set => OS node N gets a logical slot
+                numa_map[logical++] = physical;
+            }
+            mask = mask >> 1;
+        }
+        n_numa_nodes = logical;
+        if (logical == 0) {   // no format arguments here, so no %s either
+            barf("available NUMA node set is empty");
+        }
+    }
+
 #if defined(THREADED_RTS)
 
 #ifndef REG_Base
@@ -355,7 +387,7 @@ void initCapabilities (void)
     // There are no free capabilities to begin with.  We will start
     // a worker Task to each Capability, which will quickly put the
     // Capability on the free list when it finds nothing to do.
-    for (i = 0; i < RtsFlags.GcFlags.nNumaNodes; i++) {
+    for (i = 0; i < n_numa_nodes; i++) {
         last_free_capability[i] = capabilities[0];
     }
 }
@@ -730,9 +762,9 @@ void waitForCapability (Capability **pCap, Task *task)
                 // Otherwise, search for a free capability on this node.
                 cap = NULL;
                 for (i = task->node; i < enabled_capabilities;
-                     i += RtsFlags.GcFlags.nNumaNodes) {
+                     i += n_numa_nodes) {
                     // visits all the capabilities on this node, because
-                    // cap[i]->node == i % RtsFlags.GcFlags.nNumaNodes
+                    // cap[i]->node == i % n_numa_nodes
                     if (!capabilities[i]->running_task) {
                         cap = capabilities[i];
                         break;
diff --git a/rts/Capability.h b/rts/Capability.h
index 6874379c5f..67b43280eb 100644
--- a/rts/Capability.h
+++ b/rts/Capability.h
@@ -39,7 +39,7 @@ struct Capability_ {
     // The NUMA node on which this capability resides.  This is used to allocate
     // node-local memory in allocate().
     //
-    // Note: this is always equal to cap->no % RtsFlags.ParFlags.nNumaNodes.
+    // Note: this is always equal to cap->no % n_numa_nodes.
     // The reason we slice it this way is that if we add or remove capabilities
     // via setNumCapabilities(), then we keep the number of capabilities on each
     // NUMA node balanced.
@@ -159,9 +159,6 @@ struct Capability_ {
 #endif
   ;
 
-
-#define capNoToNumaNode(n) ((n) % RtsFlags.GcFlags.nNumaNodes)
-
 #if defined(THREADED_RTS)
 #define ASSERT_TASK_ID(task) ASSERT(task->id == osThreadId())
 #else
@@ -350,6 +347,18 @@ void markCapabilities (evac_fn evac, void *user);
 void traverseSparkQueues (evac_fn evac, void *user);
 
 /* -----------------------------------------------------------------------------
+   NUMA
+   -------------------------------------------------------------------------- */
+
+/* Number of logical NUMA nodes */
+extern uint32_t n_numa_nodes;
+
+/* Map logical NUMA node to OS node numbers */
+extern uint32_t numa_map[MAX_NUMA_NODES];
+
+#define capNoToNumaNode(n) ((n) % n_numa_nodes)
+
+/* -----------------------------------------------------------------------------
    Messages
    -------------------------------------------------------------------------- */
 
diff --git a/rts/RtsFlags.c b/rts/RtsFlags.c
index 25345bf57b..e23f760f43 100644
--- a/rts/RtsFlags.c
+++ b/rts/RtsFlags.c
@@ -123,7 +123,6 @@ static void errorRtsOptsDisabled (const char *s);
 
 void initRtsFlagsDefaults(void)
 {
-    uint32_t i;
     StgWord64 maxStkSize = 8 * getPhysicalMemorySize() / 10;
     // if getPhysicalMemorySize fails just move along with an 8MB limit
     if (maxStkSize == 0)
@@ -160,10 +159,7 @@ void initRtsFlagsDefaults(void)
     RtsFlags.GcFlags.heapBase           = 0;   /* means don't care */
     RtsFlags.GcFlags.allocLimitGrace    = (100*1024) / BLOCK_SIZE;
     RtsFlags.GcFlags.numa               = rtsFalse;
-    RtsFlags.GcFlags.nNumaNodes         = 1;
-    for (i = 0; i < MAX_NUMA_NODES; i++) {
-        RtsFlags.GcFlags.numaMap[i] = 0;
-    }
+    RtsFlags.GcFlags.numaMask           = 1;
 
     RtsFlags.DebugFlags.scheduler       = rtsFalse;
     RtsFlags.DebugFlags.interpreter     = rtsFalse;
@@ -776,28 +772,8 @@ error = rtsTrue;
                           break;
                       }
 
-                      uint32_t nNodes = osNumaNodes();
-                      if (nNodes > MAX_NUMA_NODES) {
-                          errorBelch("%s: Too many NUMA nodes (max %d)",
-                                     rts_argv[arg], MAX_NUMA_NODES);
-                          error = rtsTrue;
-                      } else {
-                          RtsFlags.GcFlags.numa = rtsTrue;
-                          mask = mask & osNumaMask();
-                          uint32_t logical = 0, physical = 0;
-                          for (; physical < MAX_NUMA_NODES; physical++) {
-                              if (mask & 1) {
-                                  RtsFlags.GcFlags.numaMap[logical++] = physical;
-                              }
-                              mask = mask >> 1;
-                          }
-                          RtsFlags.GcFlags.nNumaNodes = logical;
-                          if (logical == 0) {
-                              errorBelch("%s: available node set is empty",
-                                         rts_argv[arg]);
-                              error = rtsTrue;
-                          }
-                      }
+                      RtsFlags.GcFlags.numa = rtsTrue;
+                      RtsFlags.GcFlags.numaMask = mask;
                   }
 #endif
 #if defined(DEBUG) && defined(THREADED_RTS)
@@ -821,11 +797,7 @@ error = rtsTrue;
                       } else {
                           RtsFlags.GcFlags.numa = rtsTrue;
                           RtsFlags.DebugFlags.numa = rtsTrue;
-                          RtsFlags.GcFlags.nNumaNodes = nNodes;
-                          uint32_t physical = 0;
-                          for (; physical < MAX_NUMA_NODES; physical++) {
-                              RtsFlags.GcFlags.numaMap[physical] = physical;
-                          }
+                          RtsFlags.GcFlags.numaMask = (1<<nNodes) - 1;
                       }
                   }
 #endif
diff --git a/rts/Task.c b/rts/Task.c
index 9a827745ba..9a658e019c 100644
--- a/rts/Task.c
+++ b/rts/Task.c
@@ -429,7 +429,7 @@ workerStart(Task *task)
         setThreadAffinity(cap->no, n_capabilities);
     }
     if (RtsFlags.GcFlags.numa && !RtsFlags.DebugFlags.numa) {
-        setThreadNode(RtsFlags.GcFlags.numaMap[task->node]);
+        setThreadNode(numa_map[task->node]);
     }
 
     // set the thread-local pointer to the Task:
@@ -510,7 +510,7 @@ void rts_setInCallCapability (
         if (RtsFlags.GcFlags.numa) {
             task->node = capNoToNumaNode(preferred_capability);
             if (!DEBUG_IS_ON || !RtsFlags.DebugFlags.numa) { // faking NUMA
-                setThreadNode(RtsFlags.GcFlags.numaMap[task->node]);
+                setThreadNode(numa_map[task->node]);
             }
         }
     }
diff --git a/rts/posix/OSThreads.c b/rts/posix/OSThreads.c
index 35ea2bde21..112a311f79 100644
--- a/rts/posix/OSThreads.c
+++ b/rts/posix/OSThreads.c
@@ -321,7 +321,6 @@ setThreadAffinity (uint32_t n STG_UNUSED,
 #if HAVE_LIBNUMA
 void setThreadNode (uint32_t node)
 {
-    ASSERT(node < RtsFlags.GcFlags.nNumaNodes);
     if (numa_run_on_node(node) == -1) {
         sysErrorBelch("numa_run_on_node");
         stg_exit(1);
diff --git a/rts/sm/BlockAlloc.c b/rts/sm/BlockAlloc.c
index c2859b0c15..6c2e96414e 100644
--- a/rts/sm/BlockAlloc.c
+++ b/rts/sm/BlockAlloc.c
@@ -467,7 +467,7 @@ uint32_t nodeWithLeastBlocks (void)
 {
     uint32_t node = 0, i;
     uint32_t min_blocks = n_alloc_blocks_by_node[0];
-    for (i = 1; i < RtsFlags.GcFlags.nNumaNodes; i++) {
+    for (i = 1; i < n_numa_nodes; i++) {
         if (n_alloc_blocks_by_node[i] < min_blocks) {
             min_blocks = n_alloc_blocks_by_node[i];
             node = i;
@@ -504,7 +504,7 @@ bdescr* allocLargeChunkOnNode (uint32_t node, W_ min, W_ max)
     StgWord ln, lnmax;
 
     if (min >= BLOCKS_PER_MBLOCK) {
-        return allocGroup(max);
+        return allocGroupOnNode(node,max);
     }
 
     ln = log_2_ceil(min);
@@ -811,7 +811,7 @@ void returnMemoryToOS(uint32_t n /* megablocks */)
     StgWord size;
 
     // ToDo: not fair, we free all the memory starting with node 0.
-    for (node = 0; n > 0 && node < RtsFlags.GcFlags.nNumaNodes; node++) {
+    for (node = 0; n > 0 && node < n_numa_nodes; node++) {
         bd = free_mblock_list[node];
         while ((n > 0) && (bd != NULL)) {
             size = BLOCKS_TO_MBLOCKS(bd->blocks);
@@ -875,7 +875,7 @@ checkFreeListSanity(void)
     StgWord ln, min;
     uint32_t node;
 
-    for (node = 0; node < RtsFlags.GcFlags.nNumaNodes; node++) {
+    for (node = 0; node < n_numa_nodes; node++) {
         min = 1;
         for (ln = 0; ln < NUM_FREE_LISTS; ln++) {
             IF_DEBUG(block_alloc,
@@ -950,7 +950,7 @@ countFreeList(void)
   StgWord ln;
   uint32_t node;
 
-  for (node = 0; node < RtsFlags.GcFlags.nNumaNodes; node++) {
+  for (node = 0; node < n_numa_nodes; node++) {
       for (ln=0; ln < NUM_FREE_LISTS; ln++) {
           for (bd = free_list[node][ln]; bd != NULL; bd = bd->link) {
               total_blocks += bd->blocks;
diff --git a/rts/sm/MBlock.c b/rts/sm/MBlock.c
index 53999d2c4b..4be7fd4356 100644
--- a/rts/sm/MBlock.c
+++ b/rts/sm/MBlock.c
@@ -594,7 +594,7 @@ getMBlocksOnNode(uint32_t node, uint32_t n)
 #ifdef DEBUG
     if (RtsFlags.DebugFlags.numa) return addr; // faking NUMA
 #endif
-    osBindMBlocksToNode(addr, n * MBLOCK_SIZE, RtsFlags.GcFlags.numaMap[node]);
+    osBindMBlocksToNode(addr, n * MBLOCK_SIZE, numa_map[node]);
     return addr;
 }
 
diff --git a/rts/sm/Storage.c b/rts/sm/Storage.c
index a9a7857d43..7c41f8c64b 100644
--- a/rts/sm/Storage.c
+++ b/rts/sm/Storage.c
@@ -57,7 +57,7 @@ generation *oldest_gen  = NULL; /* oldest generation, for convenience */
 /*
  * Array of nurseries, size == n_capabilities
  *
- * nursery[i] belongs to NUMA node (i % RtsFlags.GcFlags.nNumaNodes)
+ * nursery[i] belongs to NUMA node (i % n_numa_nodes)
  * This is chosen to be the same convention as capabilities[i], so
  * that when not using nursery chunks (+RTS -n), we just map
  * capabilities to nurseries 1:1.
@@ -209,7 +209,7 @@ initStorage (void)
 
   N = 0;
 
-  for (n = 0; n < RtsFlags.GcFlags.nNumaNodes; n++) {
+  for (n = 0; n < n_numa_nodes; n++) {
       next_nursery[n] = n;
   }
   storageAddCapabilities(0, n_capabilities);
@@ -615,7 +615,7 @@ assignNurseriesToCapabilities (uint32_t from, uint32_t to)
     for (i = from; i < to; i++) {
         node = capabilities[i]->node;
         assignNurseryToCapability(capabilities[i], next_nursery[node]);
-        next_nursery[node] += RtsFlags.GcFlags.nNumaNodes;
+        next_nursery[node] += n_numa_nodes;
     }
 }
 
@@ -642,7 +642,7 @@ resetNurseries (void)
 {
     uint32_t n;
 
-    for (n = 0; n < RtsFlags.GcFlags.nNumaNodes; n++) {
+    for (n = 0; n < n_numa_nodes; n++) {
         next_nursery[n] = n;
     }
     assignNurseriesToCapabilities(0, n_capabilities);
@@ -758,22 +758,20 @@ getNewNursery (Capability *cap)
     for(;;) {
         i = next_nursery[node];
         if (i < n_nurseries) {
-            if (cas(&next_nursery[node], i,
-                    i+RtsFlags.GcFlags.nNumaNodes) == i) {
+            if (cas(&next_nursery[node], i, i+n_numa_nodes) == i) {
                 assignNurseryToCapability(cap, i);
                 return rtsTrue;
             }
-        } else if (RtsFlags.GcFlags.nNumaNodes > 1) {
+        } else if (n_numa_nodes > 1) {
             // Try to find an unused nursery chunk on other nodes.  We'll get
             // remote memory, but the rationale is that avoiding GC is better
             // than avoiding remote memory access.
             rtsBool lost = rtsFalse;
-            for (n = 0; n < RtsFlags.GcFlags.nNumaNodes; n++) {
+            for (n = 0; n < n_numa_nodes; n++) {
                 if (n == node) continue;
                 i = next_nursery[n];
                 if (i < n_nurseries) {
-                    if (cas(&next_nursery[n], i,
-                            i+RtsFlags.GcFlags.nNumaNodes) == i) {
+                    if (cas(&next_nursery[n], i, i+n_numa_nodes) == i) {
                         assignNurseryToCapability(cap, i);
                         return rtsTrue;
                     } else {
diff --git a/testsuite/tests/rts/all.T b/testsuite/tests/rts/all.T
index de11b3f3ec..f03309e6d8 100644
--- a/testsuite/tests/rts/all.T
+++ b/testsuite/tests/rts/all.T
@@ -350,3 +350,6 @@ test('T10296a', [extra_clean(['T10296a.o','T10296a_c.o','T10296a'])],
                 ['$MAKE -s --no-print-directory T10296a'])
 
 test('T10296b', [only_ways('threaded2')], compile_and_run, [''])
+
+test('numa001', [ extra_run_opts('8'), extra_ways(['debug_numa']) ]
+                , compile_and_run, [''])
diff --git a/testsuite/tests/rts/numa001.hs b/testsuite/tests/rts/numa001.hs
new file mode 100644
index 0000000000..860a794101
--- /dev/null
+++ b/testsuite/tests/rts/numa001.hs
@@ -0,0 +1,20 @@
+import System.Environment
+import Control.Monad
+import Control.Concurrent
+
+main = do
+  [workers] <- map read <$> getArgs   -- single argument: threads to fork
+  boxes <- replicateM workers newEmptyMVar
+  -- Each thread forces a 9-queens count before handing it to its MVar.
+  forM_ boxes $ \box -> forkIO (putMVar box $! nsoln 9)
+  mapM_ takeMVar boxes
+
+-- Count the solutions to the nq-queens problem by exhaustive search.
+nsoln nq = length (place nq)
+  where
+    -- Every safe placement for the first k rows, newest queen first.
+    place :: Int -> [[Int]]
+    place 0 = [[]]
+    place k = [ c : qs | qs <- place (k-1), c <- [1..nq], ok c qs ]
+    -- c equals no earlier queen and is on none of their diagonals.
+    ok c qs = and [ c /= q && abs (c - q) /= d | (d, q) <- zip [1..] qs ]
author	Simon Marlow <marlowsd@gmail.com>	2016-06-11 11:07:14 +0100
committer	Simon Marlow <marlowsd@gmail.com>	2016-06-17 14:52:45 +0100
commit	498ed2664219f7e8f1077f46ad2061aba2f57de4 (patch)
tree	123f66f55096876114b89876e4adf287ad944818
parent	a7f65b8787b0521397ee09061394425aa69bc6e0 (diff)
download	haskell-498ed2664219f7e8f1077f46ad2061aba2f57de4.tar.gz