diff options
author | Simon Marlow <marlowsd@gmail.com> | 2016-06-11 11:07:14 +0100 |
---|---|---|
committer | Simon Marlow <marlowsd@gmail.com> | 2016-06-17 14:52:45 +0100 |
commit | 498ed2664219f7e8f1077f46ad2061aba2f57de4 (patch) | |
tree | 123f66f55096876114b89876e4adf287ad944818 | |
parent | a7f65b8787b0521397ee09061394425aa69bc6e0 (diff) | |
download | haskell-498ed2664219f7e8f1077f46ad2061aba2f57de4.tar.gz |
NUMA cleanups
- Move the numaMap and nNumaNodes out of RtsFlags to Capability.c
- Add a test (numa001) to testsuite/tests/rts
-rw-r--r-- | includes/rts/Flags.h | 4 | ||||
-rw-r--r-- | libraries/base/GHC/RTS/Flags.hsc | 4 | ||||
-rw-r--r-- | rts/Capability.c | 38 | ||||
-rw-r--r-- | rts/Capability.h | 17 | ||||
-rw-r--r-- | rts/RtsFlags.c | 36 | ||||
-rw-r--r-- | rts/Task.c | 4 | ||||
-rw-r--r-- | rts/posix/OSThreads.c | 1 | ||||
-rw-r--r-- | rts/sm/BlockAlloc.c | 10 | ||||
-rw-r--r-- | rts/sm/MBlock.c | 2 | ||||
-rw-r--r-- | rts/sm/Storage.c | 18 | ||||
-rw-r--r-- | testsuite/tests/rts/all.T | 3 | ||||
-rw-r--r-- | testsuite/tests/rts/numa001.hs | 20 |
12 files changed, 94 insertions, 63 deletions
diff --git a/includes/rts/Flags.h b/includes/rts/Flags.h index e229aa12b1..c66aed90a3 100644 --- a/includes/rts/Flags.h +++ b/includes/rts/Flags.h @@ -73,9 +73,7 @@ typedef struct _GC_FLAGS { */ rtsBool numa; /* Use NUMA */ - uint32_t nNumaNodes; /* Number of nodes */ - uint32_t numaMap[MAX_NUMA_NODES]; /* Map our internal node numbers to OS - * node numbers */ + StgWord numaMask; } GC_FLAGS; /* See Note [Synchronization of flags and base APIs] */ diff --git a/libraries/base/GHC/RTS/Flags.hsc b/libraries/base/GHC/RTS/Flags.hsc index e067019a8c..5eba4860ff 100644 --- a/libraries/base/GHC/RTS/Flags.hsc +++ b/libraries/base/GHC/RTS/Flags.hsc @@ -114,7 +114,7 @@ data GCFlags = GCFlags , heapBase :: Word -- ^ address to ask the OS for memory , allocLimitGrace :: Word , numa :: Bool - , nNumaNodes :: Word32 + , numaMask :: Word } deriving (Show) -- | Parameters concerning context switching @@ -376,7 +376,7 @@ getGCFlags = do <*> #{peek GC_FLAGS, heapBase} ptr <*> #{peek GC_FLAGS, allocLimitGrace} ptr <*> #{peek GC_FLAGS, numa} ptr - <*> #{peek GC_FLAGS, nNumaNodes} ptr + <*> #{peek GC_FLAGS, numaMask} ptr getParFlags :: IO ParFlags getParFlags = do diff --git a/rts/Capability.c b/rts/Capability.c index 411e64dc7a..7ca220fbd9 100644 --- a/rts/Capability.c +++ b/rts/Capability.c @@ -26,6 +26,7 @@ #include "sm/GC.h" // for gcWorkerThread() #include "STM.h" #include "RtsUtils.h" +#include "sm/OSMem.h" #if !defined(mingw32_HOST_OS) #include "rts/IOManager.h" // for setIOManagerControlFd() @@ -59,6 +60,12 @@ static Capability *last_free_capability[MAX_NUMA_NODES]; */ PendingSync * volatile pending_sync = 0; +// Number of logical NUMA nodes +uint32_t n_numa_nodes; + +// Map logical NUMA node to OS node numbers +uint32_t numa_map[MAX_NUMA_NODES]; + /* Let foreign code get the current Capability -- assuming there is one! * This is useful for unsafe foreign calls because they are called with * the current Capability held, but they are not passed it. 
For example, @@ -326,6 +333,31 @@ void initCapabilities (void) traceCapsetCreate(CAPSET_OSPROCESS_DEFAULT, CapsetTypeOsProcess); traceCapsetCreate(CAPSET_CLOCKDOMAIN_DEFAULT, CapsetTypeClockdomain); + // Initialise NUMA + if (!RtsFlags.GcFlags.numa) { + n_numa_nodes = 1; + for (i = 0; i < MAX_NUMA_NODES; i++) { + numa_map[i] = 0; + } + } else { + uint32_t nNodes = osNumaNodes(); + if (nNodes > MAX_NUMA_NODES) { + barf("Too many NUMA nodes (max %d)", MAX_NUMA_NODES); + } + StgWord mask = RtsFlags.GcFlags.numaMask & osNumaMask(); + uint32_t logical = 0, physical = 0; + for (; physical < MAX_NUMA_NODES; physical++) { + if (mask & 1) { + numa_map[logical++] = physical; + } + mask = mask >> 1; + } + n_numa_nodes = logical; + if (logical == 0) { + barf("%s: available NUMA node set is empty"); + } + } + #if defined(THREADED_RTS) #ifndef REG_Base @@ -355,7 +387,7 @@ void initCapabilities (void) // There are no free capabilities to begin with. We will start // a worker Task to each Capability, which will quickly put the // Capability on the free list when it finds nothing to do. - for (i = 0; i < RtsFlags.GcFlags.nNumaNodes; i++) { + for (i = 0; i < n_numa_nodes; i++) { last_free_capability[i] = capabilities[0]; } } @@ -730,9 +762,9 @@ void waitForCapability (Capability **pCap, Task *task) // Otherwise, search for a free capability on this node. cap = NULL; for (i = task->node; i < enabled_capabilities; - i += RtsFlags.GcFlags.nNumaNodes) { + i += n_numa_nodes) { // visits all the capabilities on this node, because - // cap[i]->node == i % RtsFlags.GcFlags.nNumaNodes + // cap[i]->node == i % n_numa_nodes if (!capabilities[i]->running_task) { cap = capabilities[i]; break; diff --git a/rts/Capability.h b/rts/Capability.h index 6874379c5f..67b43280eb 100644 --- a/rts/Capability.h +++ b/rts/Capability.h @@ -39,7 +39,7 @@ struct Capability_ { // The NUMA node on which this capability resides. This is used to allocate // node-local memory in allocate(). 
// - // Note: this is always equal to cap->no % RtsFlags.ParFlags.nNumaNodes. + // Note: this is always equal to cap->no % n_numa_nodes. // The reason we slice it this way is that if we add or remove capabilities // via setNumCapabilities(), then we keep the number of capabilities on each // NUMA node balanced. @@ -159,9 +159,6 @@ struct Capability_ { #endif ; - -#define capNoToNumaNode(n) ((n) % RtsFlags.GcFlags.nNumaNodes) - #if defined(THREADED_RTS) #define ASSERT_TASK_ID(task) ASSERT(task->id == osThreadId()) #else @@ -350,6 +347,18 @@ void markCapabilities (evac_fn evac, void *user); void traverseSparkQueues (evac_fn evac, void *user); /* ----------------------------------------------------------------------------- + NUMA + -------------------------------------------------------------------------- */ + +/* Number of logical NUMA nodes */ +extern uint32_t n_numa_nodes; + +/* Map logical NUMA node to OS node numbers */ +extern uint32_t numa_map[MAX_NUMA_NODES]; + +#define capNoToNumaNode(n) ((n) % n_numa_nodes) + +/* ----------------------------------------------------------------------------- Messages -------------------------------------------------------------------------- */ diff --git a/rts/RtsFlags.c b/rts/RtsFlags.c index 25345bf57b..e23f760f43 100644 --- a/rts/RtsFlags.c +++ b/rts/RtsFlags.c @@ -123,7 +123,6 @@ static void errorRtsOptsDisabled (const char *s); void initRtsFlagsDefaults(void) { - uint32_t i; StgWord64 maxStkSize = 8 * getPhysicalMemorySize() / 10; // if getPhysicalMemorySize fails just move along with an 8MB limit if (maxStkSize == 0) @@ -160,10 +159,7 @@ void initRtsFlagsDefaults(void) RtsFlags.GcFlags.heapBase = 0; /* means don't care */ RtsFlags.GcFlags.allocLimitGrace = (100*1024) / BLOCK_SIZE; RtsFlags.GcFlags.numa = rtsFalse; - RtsFlags.GcFlags.nNumaNodes = 1; - for (i = 0; i < MAX_NUMA_NODES; i++) { - RtsFlags.GcFlags.numaMap[i] = 0; - } + RtsFlags.GcFlags.numaMask = 1; RtsFlags.DebugFlags.scheduler = rtsFalse; 
RtsFlags.DebugFlags.interpreter = rtsFalse; @@ -776,28 +772,8 @@ error = rtsTrue; break; } - uint32_t nNodes = osNumaNodes(); - if (nNodes > MAX_NUMA_NODES) { - errorBelch("%s: Too many NUMA nodes (max %d)", - rts_argv[arg], MAX_NUMA_NODES); - error = rtsTrue; - } else { - RtsFlags.GcFlags.numa = rtsTrue; - mask = mask & osNumaMask(); - uint32_t logical = 0, physical = 0; - for (; physical < MAX_NUMA_NODES; physical++) { - if (mask & 1) { - RtsFlags.GcFlags.numaMap[logical++] = physical; - } - mask = mask >> 1; - } - RtsFlags.GcFlags.nNumaNodes = logical; - if (logical == 0) { - errorBelch("%s: available node set is empty", - rts_argv[arg]); - error = rtsTrue; - } - } + RtsFlags.GcFlags.numa = rtsTrue; + RtsFlags.GcFlags.numaMask = mask; } #endif #if defined(DEBUG) && defined(THREADED_RTS) @@ -821,11 +797,7 @@ error = rtsTrue; } else { RtsFlags.GcFlags.numa = rtsTrue; RtsFlags.DebugFlags.numa = rtsTrue; - RtsFlags.GcFlags.nNumaNodes = nNodes; - uint32_t physical = 0; - for (; physical < MAX_NUMA_NODES; physical++) { - RtsFlags.GcFlags.numaMap[physical] = physical; - } + RtsFlags.GcFlags.numaMask = (1<<nNodes) - 1; } } #endif diff --git a/rts/Task.c b/rts/Task.c index 9a827745ba..9a658e019c 100644 --- a/rts/Task.c +++ b/rts/Task.c @@ -429,7 +429,7 @@ workerStart(Task *task) setThreadAffinity(cap->no, n_capabilities); } if (RtsFlags.GcFlags.numa && !RtsFlags.DebugFlags.numa) { - setThreadNode(RtsFlags.GcFlags.numaMap[task->node]); + setThreadNode(numa_map[task->node]); } // set the thread-local pointer to the Task: @@ -510,7 +510,7 @@ void rts_setInCallCapability ( if (RtsFlags.GcFlags.numa) { task->node = capNoToNumaNode(preferred_capability); if (!DEBUG_IS_ON || !RtsFlags.DebugFlags.numa) { // faking NUMA - setThreadNode(RtsFlags.GcFlags.numaMap[task->node]); + setThreadNode(numa_map[task->node]); } } } diff --git a/rts/posix/OSThreads.c b/rts/posix/OSThreads.c index 35ea2bde21..112a311f79 100644 --- a/rts/posix/OSThreads.c +++ b/rts/posix/OSThreads.c @@ -321,7 
+321,6 @@ setThreadAffinity (uint32_t n STG_UNUSED, #if HAVE_LIBNUMA void setThreadNode (uint32_t node) { - ASSERT(node < RtsFlags.GcFlags.nNumaNodes); if (numa_run_on_node(node) == -1) { sysErrorBelch("numa_run_on_node"); stg_exit(1); diff --git a/rts/sm/BlockAlloc.c b/rts/sm/BlockAlloc.c index c2859b0c15..6c2e96414e 100644 --- a/rts/sm/BlockAlloc.c +++ b/rts/sm/BlockAlloc.c @@ -467,7 +467,7 @@ uint32_t nodeWithLeastBlocks (void) { uint32_t node = 0, i; uint32_t min_blocks = n_alloc_blocks_by_node[0]; - for (i = 1; i < RtsFlags.GcFlags.nNumaNodes; i++) { + for (i = 1; i < n_numa_nodes; i++) { if (n_alloc_blocks_by_node[i] < min_blocks) { min_blocks = n_alloc_blocks_by_node[i]; node = i; @@ -504,7 +504,7 @@ bdescr* allocLargeChunkOnNode (uint32_t node, W_ min, W_ max) StgWord ln, lnmax; if (min >= BLOCKS_PER_MBLOCK) { - return allocGroup(max); + return allocGroupOnNode(node,max); } ln = log_2_ceil(min); @@ -811,7 +811,7 @@ void returnMemoryToOS(uint32_t n /* megablocks */) StgWord size; // ToDo: not fair, we free all the memory starting with node 0. 
- for (node = 0; n > 0 && node < RtsFlags.GcFlags.nNumaNodes; node++) { + for (node = 0; n > 0 && node < n_numa_nodes; node++) { bd = free_mblock_list[node]; while ((n > 0) && (bd != NULL)) { size = BLOCKS_TO_MBLOCKS(bd->blocks); @@ -875,7 +875,7 @@ checkFreeListSanity(void) StgWord ln, min; uint32_t node; - for (node = 0; node < RtsFlags.GcFlags.nNumaNodes; node++) { + for (node = 0; node < n_numa_nodes; node++) { min = 1; for (ln = 0; ln < NUM_FREE_LISTS; ln++) { IF_DEBUG(block_alloc, @@ -950,7 +950,7 @@ countFreeList(void) StgWord ln; uint32_t node; - for (node = 0; node < RtsFlags.GcFlags.nNumaNodes; node++) { + for (node = 0; node < n_numa_nodes; node++) { for (ln=0; ln < NUM_FREE_LISTS; ln++) { for (bd = free_list[node][ln]; bd != NULL; bd = bd->link) { total_blocks += bd->blocks; diff --git a/rts/sm/MBlock.c b/rts/sm/MBlock.c index 53999d2c4b..4be7fd4356 100644 --- a/rts/sm/MBlock.c +++ b/rts/sm/MBlock.c @@ -594,7 +594,7 @@ getMBlocksOnNode(uint32_t node, uint32_t n) #ifdef DEBUG if (RtsFlags.DebugFlags.numa) return addr; // faking NUMA #endif - osBindMBlocksToNode(addr, n * MBLOCK_SIZE, RtsFlags.GcFlags.numaMap[node]); + osBindMBlocksToNode(addr, n * MBLOCK_SIZE, numa_map[node]); return addr; } diff --git a/rts/sm/Storage.c b/rts/sm/Storage.c index a9a7857d43..7c41f8c64b 100644 --- a/rts/sm/Storage.c +++ b/rts/sm/Storage.c @@ -57,7 +57,7 @@ generation *oldest_gen = NULL; /* oldest generation, for convenience */ /* * Array of nurseries, size == n_capabilities * - * nursery[i] belongs to NUMA node (i % RtsFlags.GcFlags.nNumaNodes) + * nursery[i] belongs to NUMA node (i % n_numa_nodes) * This is chosen to be the same convention as capabilities[i], so * that when not using nursery chunks (+RTS -n), we just map * capabilities to nurseries 1:1. 
@@ -209,7 +209,7 @@ initStorage (void) N = 0; - for (n = 0; n < RtsFlags.GcFlags.nNumaNodes; n++) { + for (n = 0; n < n_numa_nodes; n++) { next_nursery[n] = n; } storageAddCapabilities(0, n_capabilities); @@ -615,7 +615,7 @@ assignNurseriesToCapabilities (uint32_t from, uint32_t to) for (i = from; i < to; i++) { node = capabilities[i]->node; assignNurseryToCapability(capabilities[i], next_nursery[node]); - next_nursery[node] += RtsFlags.GcFlags.nNumaNodes; + next_nursery[node] += n_numa_nodes; } } @@ -642,7 +642,7 @@ resetNurseries (void) { uint32_t n; - for (n = 0; n < RtsFlags.GcFlags.nNumaNodes; n++) { + for (n = 0; n < n_numa_nodes; n++) { next_nursery[n] = n; } assignNurseriesToCapabilities(0, n_capabilities); @@ -758,22 +758,20 @@ getNewNursery (Capability *cap) for(;;) { i = next_nursery[node]; if (i < n_nurseries) { - if (cas(&next_nursery[node], i, - i+RtsFlags.GcFlags.nNumaNodes) == i) { + if (cas(&next_nursery[node], i, i+n_numa_nodes) == i) { assignNurseryToCapability(cap, i); return rtsTrue; } - } else if (RtsFlags.GcFlags.nNumaNodes > 1) { + } else if (n_numa_nodes > 1) { // Try to find an unused nursery chunk on other nodes. We'll get // remote memory, but the rationale is that avoiding GC is better // than avoiding remote memory access. 
rtsBool lost = rtsFalse; - for (n = 0; n < RtsFlags.GcFlags.nNumaNodes; n++) { + for (n = 0; n < n_numa_nodes; n++) { if (n == node) continue; i = next_nursery[n]; if (i < n_nurseries) { - if (cas(&next_nursery[n], i, - i+RtsFlags.GcFlags.nNumaNodes) == i) { + if (cas(&next_nursery[n], i, i+n_numa_nodes) == i) { assignNurseryToCapability(cap, i); return rtsTrue; } else { diff --git a/testsuite/tests/rts/all.T b/testsuite/tests/rts/all.T index de11b3f3ec..f03309e6d8 100644 --- a/testsuite/tests/rts/all.T +++ b/testsuite/tests/rts/all.T @@ -350,3 +350,6 @@ test('T10296a', [extra_clean(['T10296a.o','T10296a_c.o','T10296a'])], ['$MAKE -s --no-print-directory T10296a']) test('T10296b', [only_ways('threaded2')], compile_and_run, ['']) + +test('numa001', [ extra_run_opts('8'), extra_ways(['debug_numa']) ] + , compile_and_run, ['']) diff --git a/testsuite/tests/rts/numa001.hs b/testsuite/tests/rts/numa001.hs new file mode 100644 index 0000000000..860a794101 --- /dev/null +++ b/testsuite/tests/rts/numa001.hs @@ -0,0 +1,20 @@ +import System.Environment +import Control.Monad +import Control.Concurrent + +main = do + [n] <- map read <$> getArgs + mvars <- replicateM n newEmptyMVar + sequence_ [ forkIO $ putMVar m $! nsoln n + | (m,n) <- zip mvars (repeat 9) ] + mapM_ takeMVar mvars + +nsoln nq = length (gen nq) + where + safe :: Int -> Int -> [Int] -> Bool + safe x d [] = True + safe x d (q:l) = x /= q && x /= q+d && x /= q-d && safe x (d+1) l + + gen :: Int -> [[Int]] + gen 0 = [[]] + gen n = [ (q:b) | b <- gen (n-1), q <- [1..nq], safe q 1 b] |