From 9e5ea67e268be2659cd30ebaed7044d298198ab0 Mon Sep 17 00:00:00 2001 From: Simon Marlow Date: Sat, 23 Apr 2016 21:14:49 +0100 Subject: NUMA support Summary: The aim here is to reduce the number of remote memory accesses on systems with a NUMA memory architecture, typically multi-socket servers. Linux provides a NUMA API for doing two things: * Allocating memory local to a particular node * Binding a thread to a particular node When given the +RTS --numa flag, the runtime will * Determine the number of NUMA nodes (N) by querying the OS * Assign capabilities to nodes, so cap C is on node C%N * Bind worker threads on a capability to the correct node * Keep a separate free lists in the block layer for each node * Allocate the nursery for a capability from node-local memory * Allocate blocks in the GC from node-local memory For example, using nofib/parallel/queens on a 24-core 2-socket machine: ``` $ ./Main 15 +RTS -N24 -s -A64m Total time 173.960s ( 7.467s elapsed) $ ./Main 15 +RTS -N24 -s -A64m --numa Total time 150.836s ( 6.423s elapsed) ``` The biggest win here is expected to be allocating from node-local memory, so that means programs using a large -A value (as here). According to perf, on this program the number of remote memory accesses were reduced by more than 50% by using `--numa`. Test Plan: * validate * There's a new flag --debug-numa= that pretends to do NUMA without actually making the OS calls, which is useful for testing the code on non-NUMA systems. * TODO: I need to add some unit tests Reviewers: erikd, austin, rwbarton, ezyang, bgamari, hvr, niteria Subscribers: thomie Differential Revision: https://phabricator.haskell.org/D2199 --- configure.ac | 7 + docs/users_guide/runtime_control.rst | 50 ++++ includes/Cmm.h | 1 - includes/Rts.h | 1 - includes/RtsAPI.h | 6 +- includes/rts/Constants.h | 6 + includes/rts/Flags.h | 8 +- includes/rts/OSThreads.h | 4 +- includes/rts/Threads.h | 4 +- includes/rts/storage/Block.h | 20 +- includes/rts/storage/MBlock.h | 2 + includes/rts/storage/SMPClosureOps.h | 125 ---------- rts/Capability.c | 38 +-- rts/Capability.h | 14 +- rts/HeapStackCheck.cmm | 1 + rts/Inlines.c | 1 + rts/Messages.h | 1 + rts/PrimOps.cmm | 1 + rts/ProfHeap.c | 1 + rts/RtsFlags.c | 118 ++++++++-- rts/SMPClosureOps.h | 129 ++++++++++ rts/STM.c | 1 + rts/Schedule.c | 5 +- rts/Task.c | 26 ++- rts/Task.h | 6 + rts/eventlog/EventLog.c | 2 +- rts/package.conf.in | 4 +- rts/posix/OSMem.c | 64 +++++ rts/posix/OSThreads.c | 30 ++- rts/sm/BlockAlloc.c | 350 +++++++++++++++++----------- rts/sm/BlockAlloc.h | 1 + rts/sm/GC.c | 8 +- rts/sm/GCUtils.c | 15 +- rts/sm/GCUtils.h | 14 +- rts/sm/MBlock.c | 19 +- rts/sm/MarkStack.h | 1 + rts/sm/OSMem.h | 4 + rts/sm/Storage.c | 232 ++++++++++-------- rts/win32/OSMem.c | 22 ++ rts/win32/OSThreads.c | 3 + testsuite/config/ghc | 6 +- testsuite/tests/codeGen/should_run/all.T | 5 +- testsuite/tests/concurrent/prog001/all.T | 2 +- testsuite/tests/concurrent/should_run/all.T | 9 +- 44 files changed, 937 insertions(+), 430 deletions(-) delete mode 100644 includes/rts/storage/SMPClosureOps.h create mode 100644 rts/SMPClosureOps.h diff --git a/configure.ac b/configure.ac index d7eb7380cd..070bae54d9 100644 --- a/configure.ac +++ b/configure.ac @@ -1103,6 +1103,13 @@ if test $UseLibdw = "YES" ; then fi AC_DEFINE_UNQUOTED([USE_LIBDW], [$USE_LIBDW], [Set to 1 to use libdw]) +dnl ** Have libnuma? 
+dnl -------------------------------------------------------------- +AC_CHECK_HEADERS([numa.h numaif.h]) +AC_CHECK_LIB(numa, numa_available, + [AC_DEFINE([HAVE_LIBNUMA], [1], [Define to 1 if you have libnuma.])] + []) + dnl ** Documentation dnl -------------------------------------------------------------- if test -n "$SPHINXBUILD"; then diff --git a/docs/users_guide/runtime_control.rst b/docs/users_guide/runtime_control.rst index 19135c61ce..1ae51ddc49 100644 --- a/docs/users_guide/runtime_control.rst +++ b/docs/users_guide/runtime_control.rst @@ -643,6 +643,56 @@ performance. ``-F`` parameter will be reduced in order to avoid exceeding the maximum heap size. +.. rts-flag:: --numa + --numa= + + .. index:: + single: NUMA, enabling in the runtime + + Enable NUMA-aware memory allocation in the runtime (only available + with ``-threaded``, and only on Linux currently). + + Background: some systems have a Non-Uniform Memory Architecture, + whereby main memory is split into banks which are "local" to + specific CPU cores. Accessing local memory is faster than + accessing remote memory. The OS provides APIs for allocating + local memory and binding threads to particular CPU cores, so that + we can ensure certain memory accesses are using local memory. + + The ``--numa`` option tells the RTS to tune its memory usage to + maximize local memory accesses. In particular, the RTS will: + + - Determine the number of NUMA nodes (N) by querying the OS. + - Manage separate memory pools for each node. + - Map capabilities to NUMA nodes. Capability C is mapped to + NUMA node C mod N. + - Bind worker threads on a capability to the appropriate node. + - Allocate the nursery from node-local memory. + - Perform other memory allocation, including in the GC, from + node-local memory. + - When load-balancing, we prefer to migrate threads to another + Capability on the same node. + + The ``--numa`` flag is typically beneficial when a program is + using all cores of a large multi-core NUMA system, with a large + allocation area (``-A``). All memory accesses to the allocation + area will go to local memory, which can save a significant amount + of remote memory access. A runtime speedup on the order of 10% + is typical, but can vary a lot depending on the hardware and the + memory behaviour of the program. + + Note that the RTS will not set CPU affinity for bound threads and + threads entering Haskell from C/C++, so if your program uses bound + threads you should ensure that each bound thread calls the RTS API + `rts_setInCallCapability(c,1)` from C/C++ before calling into + Haskell. Otherwise there could be a mismatch between the CPU that + the thread is running on and the memory it is using while running + Haskell code, which will negate any benefits of ``--numa``. + + If given an explicit , the is interpreted as a bitmap + that indicates the NUMA nodes on which to run the program. For + example, ``--numa=3`` would run the program on NUMA nodes 0 and 1. + .. 
_rts-options-statistics: RTS options to produce runtime statistics diff --git a/includes/Cmm.h b/includes/Cmm.h index cbd7e36ac1..3b9a5a6794 100644 --- a/includes/Cmm.h +++ b/includes/Cmm.h @@ -325,7 +325,6 @@ #include "DerivedConstants.h" #include "rts/storage/ClosureTypes.h" #include "rts/storage/FunTypes.h" -#include "rts/storage/SMPClosureOps.h" #include "rts/OSThreads.h" /* diff --git a/includes/Rts.h b/includes/Rts.h index 1ad1bba5f8..3d4538f41d 100644 --- a/includes/Rts.h +++ b/includes/Rts.h @@ -203,7 +203,6 @@ INLINE_HEADER Time fsecondsToTime (double t) #include "rts/storage/ClosureTypes.h" #include "rts/storage/TSO.h" #include "stg/MiscClosures.h" /* InfoTables, closures etc. defined in the RTS */ -#include "rts/storage/SMPClosureOps.h" #include "rts/storage/Block.h" #include "rts/storage/ClosureMacros.h" #include "rts/storage/MBlock.h" diff --git a/includes/RtsAPI.h b/includes/RtsAPI.h index a4a094fd14..230c982c12 100644 --- a/includes/RtsAPI.h +++ b/includes/RtsAPI.h @@ -179,7 +179,11 @@ Capability *rts_unsafeGetMyCapability (void); // Note that the thread may still be migrated by the RTS scheduler, but that // will only happen if there are multiple threads running on one Capability and // another Capability is free. -void setInCallCapability (int preferred_capability); +// +// If affinity is non-zero, the current thread will be bound to +// specific CPUs according to the prevailing affinity policy for the +// specified capability, set by either +RTS -qa or +RTS --numa. +void rts_setInCallCapability (int preferred_capability, int affinity); /* ---------------------------------------------------------------------------- Building Haskell objects from C datatypes. diff --git a/includes/rts/Constants.h b/includes/rts/Constants.h index b65b8d3a9d..114f30ce2f 100644 --- a/includes/rts/Constants.h +++ b/includes/rts/Constants.h @@ -295,4 +295,10 @@ #define MAX_SPARE_WORKERS 6 +/* + * The maximum number of NUMA nodes we support. This is a fixed limit so that + * we can have static arrays of this size in the RTS for speed. + */ +#define MAX_NUMA_NODES 16 + #endif /* RTS_CONSTANTS_H */ diff --git a/includes/rts/Flags.h b/includes/rts/Flags.h index 8020a177b0..ff303dc5e6 100644 --- a/includes/rts/Flags.h +++ b/includes/rts/Flags.h @@ -73,6 +73,11 @@ typedef struct _GC_FLAGS { * to handle the exception before we * raise it again. 
*/ + + rtsBool numa; /* Use NUMA */ + uint32_t nNumaNodes; /* Number of nodes */ + uint32_t numaMap[MAX_NUMA_NODES]; /* Map our internal node numbers to OS + * node numbers */ } GC_FLAGS; /* See Note [Synchronization of flags and base APIs] */ @@ -93,6 +98,7 @@ typedef struct _DEBUG_FLAGS { rtsBool squeeze; /* 'z' stack squeezing & lazy blackholing */ rtsBool hpc; /* 'c' coverage */ rtsBool sparks; /* 'r' */ + rtsBool numa; /* '--debug-numa' */ } DEBUG_FLAGS; /* See Note [Synchronization of flags and base APIs] */ @@ -184,7 +190,7 @@ typedef struct _MISC_FLAGS { #ifdef THREADED_RTS /* See Note [Synchronization of flags and base APIs] */ typedef struct _PAR_FLAGS { - uint32_t nNodes; /* number of threads to run simultaneously */ + uint32_t nCapabilities; /* number of threads to run simultaneously */ rtsBool migrate; /* migrate threads between capabilities */ uint32_t maxLocalSparks; rtsBool parGcEnabled; /* enable parallel GC */ diff --git a/includes/rts/OSThreads.h b/includes/rts/OSThreads.h index ee1855b4da..bc84b714db 100644 --- a/includes/rts/OSThreads.h +++ b/includes/rts/OSThreads.h @@ -200,7 +200,9 @@ void setThreadLocalVar (ThreadLocalKey *key, void *value); void freeThreadLocalKey (ThreadLocalKey *key); // Processors and affinity -void setThreadAffinity (uint32_t n, uint32_t m); +void setThreadAffinity (uint32_t n, uint32_t m); +void setThreadNode (uint32_t node); +void releaseThreadNode (void); #endif // !CMINUSMINUS #else diff --git a/includes/rts/Threads.h b/includes/rts/Threads.h index 67d01db7d4..866c4692bd 100644 --- a/includes/rts/Threads.h +++ b/includes/rts/Threads.h @@ -58,7 +58,9 @@ pid_t forkProcess (HsStablePtr *entry) HsBool rtsSupportsBoundThreads (void); -// The number of Capabilities +// The number of Capabilities. +// ToDo: I would like this to be private to the RTS and instead expose a +// function getNumCapabilities(), but it is used in compiler/cbits/genSym.c extern unsigned int n_capabilities; // The number of Capabilities that are not disabled diff --git a/includes/rts/storage/Block.h b/includes/rts/storage/Block.h index 1a31de5512..e04cfdfec6 100644 --- a/includes/rts/storage/Block.h +++ b/includes/rts/storage/Block.h @@ -111,7 +111,7 @@ typedef struct bdescr_ { StgWord16 gen_no; // gen->no, cached StgWord16 dest_no; // number of destination generation - StgWord16 _pad1; + StgWord16 node; // which memory node does this block live on? 
StgWord16 flags; // block flags, see below @@ -280,12 +280,28 @@ extern void initBlockAllocator(void); /* Allocation -------------------------------------------------------------- */ bdescr *allocGroup(W_ n); -bdescr *allocBlock(void); + +EXTERN_INLINE bdescr* allocBlock(void); +EXTERN_INLINE bdescr* allocBlock(void) +{ + return allocGroup(1); +} + +bdescr *allocGroupOnNode(uint32_t node, W_ n); + +EXTERN_INLINE bdescr* allocBlockOnNode(uint32_t node); +EXTERN_INLINE bdescr* allocBlockOnNode(uint32_t node) +{ + return allocGroupOnNode(node,1); +} // versions that take the storage manager lock for you: bdescr *allocGroup_lock(W_ n); bdescr *allocBlock_lock(void); +bdescr *allocGroupOnNode_lock(uint32_t node, W_ n); +bdescr *allocBlockOnNode_lock(uint32_t node); + /* De-Allocation ----------------------------------------------------------- */ void freeGroup(bdescr *p); diff --git a/includes/rts/storage/MBlock.h b/includes/rts/storage/MBlock.h index 419a96e225..a8251c8c0b 100644 --- a/includes/rts/storage/MBlock.h +++ b/includes/rts/storage/MBlock.h @@ -18,6 +18,8 @@ extern W_ mblocks_allocated; extern void initMBlocks(void); extern void * getMBlock(void); extern void * getMBlocks(uint32_t n); +extern void * getMBlockOnNode(uint32_t node); +extern void * getMBlocksOnNode(uint32_t node, uint32_t n); extern void freeMBlocks(void *addr, uint32_t n); extern void releaseFreeMemory(void); extern void freeAllMBlocks(void); diff --git a/includes/rts/storage/SMPClosureOps.h b/includes/rts/storage/SMPClosureOps.h deleted file mode 100644 index ee92186f5f..0000000000 --- a/includes/rts/storage/SMPClosureOps.h +++ /dev/null @@ -1,125 +0,0 @@ -/* ---------------------------------------------------------------------------- - * - * (c) The GHC Team, 2005-2013 - * - * Macros for THREADED_RTS support - * - * -------------------------------------------------------------------------- */ - -#ifndef RTS_STORAGE_SMPCLOSUREOPS_H -#define RTS_STORAGE_SMPCLOSUREOPS_H - -#ifdef CMINUSMINUS - -/* Lock closure, equivalent to ccall lockClosure but the condition is inlined. - * Arguments are swapped for uniformity with unlockClosure. */ -#if defined(THREADED_RTS) -#define LOCK_CLOSURE(closure, info) \ - if (CInt[n_capabilities] == 1 :: CInt) { \ - info = GET_INFO(closure); \ - } else { \ - ("ptr" info) = ccall reallyLockClosure(closure "ptr"); \ - } -#else -#define LOCK_CLOSURE(closure, info) info = GET_INFO(closure) -#endif - -#define unlockClosure(ptr,info) \ - prim_write_barrier; \ - StgHeader_info(ptr) = info; - -#else - -INLINE_HEADER StgInfoTable *lockClosure(StgClosure *p); -EXTERN_INLINE StgInfoTable *reallyLockClosure(StgClosure *p); -EXTERN_INLINE StgInfoTable *tryLockClosure(StgClosure *p); -EXTERN_INLINE void unlockClosure(StgClosure *p, const StgInfoTable *info); - -#if defined(THREADED_RTS) - -/* ----------------------------------------------------------------------------- - * Locking/unlocking closures - * - * This is used primarily in the implementation of MVars. - * -------------------------------------------------------------------------- */ - -// We want a callable copy of reallyLockClosure() so that we can refer to it -// from .cmm files compiled using the native codegen, so these are given -// EXTERN_INLINE. C-- should use LOCK_CLOSURE not lockClosure, so we've -// kept it INLINE_HEADER. 
-EXTERN_INLINE StgInfoTable *reallyLockClosure(StgClosure *p) -{ - StgWord info; - do { - uint32_t i = 0; - do { - info = xchg((P_)(void *)&p->header.info, (W_)&stg_WHITEHOLE_info); - if (info != (W_)&stg_WHITEHOLE_info) return (StgInfoTable *)info; - } while (++i < SPIN_COUNT); - yieldThread(); - } while (1); -} - -INLINE_HEADER StgInfoTable *lockClosure(StgClosure *p) -{ - if (n_capabilities == 1) { - return (StgInfoTable *)p->header.info; - } - else { - return reallyLockClosure(p); - } -} - -// ToDo: consider splitting tryLockClosure into reallyTryLockClosure, -// same as lockClosure -EXTERN_INLINE StgInfoTable *tryLockClosure(StgClosure *p) -{ - StgWord info; - if (n_capabilities == 1) { - return (StgInfoTable *)p->header.info; - } - else { - info = xchg((P_)(void *)&p->header.info, (W_)&stg_WHITEHOLE_info); - if (info != (W_)&stg_WHITEHOLE_info) { - return (StgInfoTable *)info; - } else { - return NULL; - } - } -} - -#else /* !THREADED_RTS */ - -EXTERN_INLINE StgInfoTable * -reallyLockClosure(StgClosure *p) -{ return (StgInfoTable *)p->header.info; } - -INLINE_HEADER StgInfoTable * -lockClosure(StgClosure *p) -{ return (StgInfoTable *)p->header.info; } - -EXTERN_INLINE StgInfoTable * -tryLockClosure(StgClosure *p) -{ return (StgInfoTable *)p->header.info; } - -#endif /* THREADED_RTS */ - -EXTERN_INLINE void unlockClosure(StgClosure *p, const StgInfoTable *info) -{ - // This is a strictly ordered write, so we need a write_barrier(): - write_barrier(); - p->header.info = info; -} - -// Handy specialised versions of lockClosure()/unlockClosure() -INLINE_HEADER void lockTSO(StgTSO *tso); -INLINE_HEADER void lockTSO(StgTSO *tso) -{ lockClosure((StgClosure *)tso); } - -INLINE_HEADER void unlockTSO(StgTSO *tso); -INLINE_HEADER void unlockTSO(StgTSO *tso) -{ unlockClosure((StgClosure*)tso, (const StgInfoTable *)&stg_TSO_info); } - -#endif /* CMINUSMINUS */ - -#endif /* RTS_STORAGE_SMPCLOSUREOPS_H */ diff --git a/rts/Capability.c b/rts/Capability.c index 1b5f51a92b..411e64dc7a 100644 --- a/rts/Capability.c +++ b/rts/Capability.c @@ -51,7 +51,7 @@ Capability **capabilities = NULL; // an in-call has a chance of quickly finding a free Capability. // Maintaining a global free list of Capabilities would require global // locking, so we don't do that. -static Capability *last_free_capability = NULL; +static Capability *last_free_capability[MAX_NUMA_NODES]; /* * Indicates that the RTS wants to synchronise all the Capabilities @@ -230,11 +230,12 @@ popReturningTask (Capability *cap) * ------------------------------------------------------------------------- */ static void -initCapability( Capability *cap, uint32_t i ) +initCapability (Capability *cap, uint32_t i) { uint32_t g; cap->no = i; + cap->node = capNoToNumaNode(i); cap->in_haskell = rtsFalse; cap->idle = 0; cap->disabled = rtsFalse; @@ -316,9 +317,10 @@ initCapability( Capability *cap, uint32_t i ) * controlled by the user via the RTS flag -N. * * ------------------------------------------------------------------------- */ -void -initCapabilities( void ) +void initCapabilities (void) { + uint32_t i; + /* Declare a couple capability sets representing the process and clock domain. Each capability will get added to these capsets. 
*/ traceCapsetCreate(CAPSET_OSPROCESS_DEFAULT, CapsetTypeOsProcess); @@ -328,21 +330,22 @@ initCapabilities( void ) #ifndef REG_Base // We can't support multiple CPUs if BaseReg is not a register - if (RtsFlags.ParFlags.nNodes > 1) { + if (RtsFlags.ParFlags.nCapabilities > 1) { errorBelch("warning: multiple CPUs not supported in this build, reverting to 1"); - RtsFlags.ParFlags.nNodes = 1; + RtsFlags.ParFlags.nCapabilities = 1; } #endif n_capabilities = 0; - moreCapabilities(0, RtsFlags.ParFlags.nNodes); - n_capabilities = RtsFlags.ParFlags.nNodes; + moreCapabilities(0, RtsFlags.ParFlags.nCapabilities); + n_capabilities = RtsFlags.ParFlags.nCapabilities; #else /* !THREADED_RTS */ n_capabilities = 1; capabilities = stgMallocBytes(sizeof(Capability*), "initCapabilities"); capabilities[0] = &MainCapability; + initCapability(&MainCapability, 0); #endif @@ -352,7 +355,9 @@ initCapabilities( void ) // There are no free capabilities to begin with. We will start // a worker Task to each Capability, which will quickly put the // Capability on the free list when it finds nothing to do. - last_free_capability = capabilities[0]; + for (i = 0; i < RtsFlags.GcFlags.nNumaNodes; i++) { + last_free_capability[i] = capabilities[0]; + } } void @@ -532,7 +537,7 @@ releaseCapability_ (Capability* cap, #ifdef PROFILING cap->r.rCCCS = CCS_IDLE; #endif - last_free_capability = cap; + last_free_capability[cap->node] = cap; debugTrace(DEBUG_sched, "freeing capability %d", cap->no); } @@ -711,6 +716,7 @@ void waitForCapability (Capability **pCap, Task *task) *pCap = &MainCapability; #else + uint32_t i; Capability *cap = *pCap; if (cap == NULL) { @@ -719,12 +725,14 @@ void waitForCapability (Capability **pCap, Task *task) enabled_capabilities]; } else { // Try last_free_capability first - cap = last_free_capability; + cap = last_free_capability[task->node]; if (cap->running_task) { - uint32_t i; - // otherwise, search for a free capability + // Otherwise, search for a free capability on this node. cap = NULL; - for (i = 0; i < n_capabilities; i++) { + for (i = task->node; i < enabled_capabilities; + i += RtsFlags.GcFlags.nNumaNodes) { + // visits all the capabilities on this node, because + // cap[i]->node == i % RtsFlags.GcFlags.nNumaNodes if (!capabilities[i]->running_task) { cap = capabilities[i]; break; @@ -732,7 +740,7 @@ void waitForCapability (Capability **pCap, Task *task) } if (cap == NULL) { // Can't find a free one, use last_free_capability. - cap = last_free_capability; + cap = last_free_capability[task->node]; } } } diff --git a/rts/Capability.h b/rts/Capability.h index 22c1d2a5c7..6874379c5f 100644 --- a/rts/Capability.h +++ b/rts/Capability.h @@ -36,6 +36,15 @@ struct Capability_ { uint32_t no; // capability number. + // The NUMA node on which this capability resides. This is used to allocate + // node-local memory in allocate(). + // + // Note: this is always equal to cap->no % RtsFlags.ParFlags.nNumaNodes. + // The reason we slice it this way is that if we add or remove capabilities + // via setNumCapabilities(), then we keep the number of capabilities on each + // NUMA node balanced. + uint32_t node; + // The Task currently holding this Capability. This task has // exclusive access to the contents of this Capability (apart from // returning_tasks_hd/returning_tasks_tl). 
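A minimal standalone sketch of the capability-to-node mapping described in the
comment above (the helper name `cap_to_node` and the concrete capability/node
counts are made up for illustration; the RTS itself uses the `capNoToNumaNode`
macro added in the hunk below):

```c
#include <stdint.h>
#include <stdio.h>

/* Same round-robin formula as capNoToNumaNode(): capability C lives on
 * NUMA node C mod nNumaNodes. */
static uint32_t cap_to_node(uint32_t cap_no, uint32_t n_numa_nodes)
{
    return cap_no % n_numa_nodes;
}

int main(void)
{
    /* With 2 nodes: caps 0,2,4,6 -> node 0 and caps 1,3,5,7 -> node 1, so
     * shrinking from 8 to 6 capabilities still leaves 3 per node, whereas a
     * contiguous split (first half on node 0) would leave 4 and 2. */
    for (uint32_t cap = 0; cap < 8; cap++) {
        printf("cap %u -> node %u\n", (unsigned)cap,
               (unsigned)cap_to_node(cap, 2));
    }
    return 0;
}
```

Striping by modulus means that dropping capabilities from the end (via
setNumCapabilities()) removes them evenly across nodes rather than emptying
one node first.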
@@ -151,6 +160,8 @@ struct Capability_ { ; +#define capNoToNumaNode(n) ((n) % RtsFlags.GcFlags.nNumaNodes) + #if defined(THREADED_RTS) #define ASSERT_TASK_ID(task) ASSERT(task->id == osThreadId()) #else @@ -221,7 +232,6 @@ INLINE_HEADER void releaseCapability_ (Capability* cap STG_UNUSED, // extern uint32_t enabled_capabilities; // Array of all the capabilities -// extern Capability **capabilities; // @@ -364,7 +374,7 @@ recordMutableCap (const StgClosure *p, Capability *cap, uint32_t gen) bd = cap->mut_lists[gen]; if (bd->free >= bd->start + BLOCK_SIZE_W) { bdescr *new_bd; - new_bd = allocBlock_lock(); + new_bd = allocBlockOnNode_lock(cap->node); new_bd->link = bd; bd = new_bd; cap->mut_lists[gen] = bd; diff --git a/rts/HeapStackCheck.cmm b/rts/HeapStackCheck.cmm index 825eaef27f..69bff74f41 100644 --- a/rts/HeapStackCheck.cmm +++ b/rts/HeapStackCheck.cmm @@ -12,6 +12,7 @@ #include "Cmm.h" #include "Updates.h" +#include "SMPClosureOps.h" #ifdef __PIC__ import pthread_mutex_unlock; diff --git a/rts/Inlines.c b/rts/Inlines.c index e6f29b6e1b..a3eb956d0c 100644 --- a/rts/Inlines.c +++ b/rts/Inlines.c @@ -7,3 +7,4 @@ #include "Schedule.h" #include "Capability.h" #include "WSDeque.h" +#include "SMPClosureOps.h" diff --git a/rts/Messages.h b/rts/Messages.h index 302cb946b3..1459b581d4 100644 --- a/rts/Messages.h +++ b/rts/Messages.h @@ -18,6 +18,7 @@ void sendMessage (Capability *from_cap, Capability *to_cap, Message *msg); #include "Capability.h" #include "Updates.h" // for DEBUG_FILL_SLOP +#include "SMPClosureOps.h" INLINE_HEADER void doneWithMsgThrowTo (MessageThrowTo *m) diff --git a/rts/PrimOps.cmm b/rts/PrimOps.cmm index 160bccd2f9..b82eebe07f 100644 --- a/rts/PrimOps.cmm +++ b/rts/PrimOps.cmm @@ -23,6 +23,7 @@ #include "Cmm.h" #include "MachDeps.h" +#include "SMPClosureOps.h" #ifdef __PIC__ import pthread_mutex_lock; diff --git a/rts/ProfHeap.c b/rts/ProfHeap.c index 9557648fcc..18c3e41a32 100644 --- a/rts/ProfHeap.c +++ b/rts/ProfHeap.c @@ -9,6 +9,7 @@ #include "PosixSource.h" #include "Rts.h" +#include "Capability.h" #include "RtsFlags.h" #include "RtsUtils.h" #include "Profiling.h" diff --git a/rts/RtsFlags.c b/rts/RtsFlags.c index 1ec5db0bb0..25345bf57b 100644 --- a/rts/RtsFlags.c +++ b/rts/RtsFlags.c @@ -15,6 +15,7 @@ #include "RtsFlags.h" #include "sm/OSMem.h" #include "hooks/Hooks.h" +#include "Capability.h" #ifdef HAVE_CTYPE_H #include @@ -122,6 +123,7 @@ static void errorRtsOptsDisabled (const char *s); void initRtsFlagsDefaults(void) { + uint32_t i; StgWord64 maxStkSize = 8 * getPhysicalMemorySize() / 10; // if getPhysicalMemorySize fails just move along with an 8MB limit if (maxStkSize == 0) @@ -157,8 +159,12 @@ void initRtsFlagsDefaults(void) #endif RtsFlags.GcFlags.heapBase = 0; /* means don't care */ RtsFlags.GcFlags.allocLimitGrace = (100*1024) / BLOCK_SIZE; + RtsFlags.GcFlags.numa = rtsFalse; + RtsFlags.GcFlags.nNumaNodes = 1; + for (i = 0; i < MAX_NUMA_NODES; i++) { + RtsFlags.GcFlags.numaMap[i] = 0; + } -#ifdef DEBUG RtsFlags.DebugFlags.scheduler = rtsFalse; RtsFlags.DebugFlags.interpreter = rtsFalse; RtsFlags.DebugFlags.weak = rtsFalse; @@ -174,7 +180,7 @@ void initRtsFlagsDefaults(void) RtsFlags.DebugFlags.squeeze = rtsFalse; RtsFlags.DebugFlags.hpc = rtsFalse; RtsFlags.DebugFlags.sparks = rtsFalse; -#endif + RtsFlags.DebugFlags.numa = rtsFalse; #if defined(PROFILING) RtsFlags.CcFlags.doCostCentres = 0; @@ -220,7 +226,7 @@ void initRtsFlagsDefaults(void) RtsFlags.MiscFlags.linkerMemBase = 0; #ifdef THREADED_RTS - RtsFlags.ParFlags.nNodes = 1; + 
RtsFlags.ParFlags.nCapabilities = 1; RtsFlags.ParFlags.migrate = rtsTrue; RtsFlags.ParFlags.parGcEnabled = 1; RtsFlags.ParFlags.parGcGen = 0; @@ -398,6 +404,14 @@ usage_text[] = { " -qi If a processor has been idle for the last GCs, do not", " wake it up for a non-load-balancing parallel GC.", " (0 disables, default: 0)", +" --numa[=]", +" Use NUMA, nodes given by (default: off)", +#if defined(DEBUG) +" --debug-numa[=]", +" Pretend NUMA: like --numa, but without the system calls.", +" Can be used on non-NUMA systems for debugging.", +"", +#endif #endif " --install-signal-handlers=", " Install signal handlers (default: yes)", @@ -745,6 +759,76 @@ error = rtsTrue; printRtsInfo(); stg_exit(0); } +#if defined(THREADED_RTS) + else if (!strncmp("numa", &rts_argv[arg][2], 4)) { + OPTION_SAFE; + StgWord mask; + if (rts_argv[arg][6] == '=') { + mask = (StgWord)strtol(rts_argv[arg]+7, + (char **) NULL, 10); + } else { + mask = (StgWord)~0; + } + if (!osNumaAvailable()) { + errorBelch("%s: OS reports NUMA is not available", + rts_argv[arg]); + error = rtsTrue; + break; + } + + uint32_t nNodes = osNumaNodes(); + if (nNodes > MAX_NUMA_NODES) { + errorBelch("%s: Too many NUMA nodes (max %d)", + rts_argv[arg], MAX_NUMA_NODES); + error = rtsTrue; + } else { + RtsFlags.GcFlags.numa = rtsTrue; + mask = mask & osNumaMask(); + uint32_t logical = 0, physical = 0; + for (; physical < MAX_NUMA_NODES; physical++) { + if (mask & 1) { + RtsFlags.GcFlags.numaMap[logical++] = physical; + } + mask = mask >> 1; + } + RtsFlags.GcFlags.nNumaNodes = logical; + if (logical == 0) { + errorBelch("%s: available node set is empty", + rts_argv[arg]); + error = rtsTrue; + } + } + } +#endif +#if defined(DEBUG) && defined(THREADED_RTS) + else if (!strncmp("debug-numa", &rts_argv[arg][2], 10)) { + OPTION_SAFE; + size_t nNodes; + if (rts_argv[arg][12] == '=' && + isdigit(rts_argv[arg][13])) { + nNodes = (StgWord)strtol(rts_argv[arg]+13, + (char **) NULL, 10); + } else { + errorBelch("%s: missing number of nodes", + rts_argv[arg]); + error = rtsTrue; + break; + } + if (nNodes > MAX_NUMA_NODES) { + errorBelch("%s: Too many NUMA nodes (max %d)", + rts_argv[arg], MAX_NUMA_NODES); + error = rtsTrue; + } else { + RtsFlags.GcFlags.numa = rtsTrue; + RtsFlags.DebugFlags.numa = rtsTrue; + RtsFlags.GcFlags.nNumaNodes = nNodes; + uint32_t physical = 0; + for (; physical < MAX_NUMA_NODES; physical++) { + RtsFlags.GcFlags.numaMap[physical] = physical; + } + } + } +#endif else { OPTION_SAFE; errorBelch("unknown RTS option: %s",rts_argv[arg]); @@ -856,20 +940,20 @@ error = rtsTrue; if (strncmp("maxN", &rts_argv[arg][1], 4) == 0) { OPTION_SAFE; THREADED_BUILD_ONLY( - int nNodes; + int nCapabilities; int proc = (int)getNumberOfProcessors(); - nNodes = strtol(rts_argv[arg]+5, (char **) NULL, 10); - if (nNodes > proc) { nNodes = proc; } + nCapabilities = strtol(rts_argv[arg]+5, (char **) NULL, 10); + if (nCapabilities > proc) { nCapabilities = proc; } - if (nNodes <= 0) { + if (nCapabilities <= 0) { errorBelch("bad value for -maxN"); error = rtsTrue; } #if defined(PROFILING) - RtsFlags.ParFlags.nNodes = 1; + RtsFlags.ParFlags.nCapabilities = 1; #else - RtsFlags.ParFlags.nNodes = (uint32_t)nNodes; + RtsFlags.ParFlags.nCapabilities = (uint32_t)nCapabilities; #endif ) break; } else { @@ -1071,26 +1155,26 @@ error = rtsTrue; THREADED_BUILD_ONLY( if (rts_argv[arg][2] == '\0') { #if defined(PROFILING) - RtsFlags.ParFlags.nNodes = 1; + RtsFlags.ParFlags.nCapabilities = 1; #else - RtsFlags.ParFlags.nNodes = getNumberOfProcessors(); + 
RtsFlags.ParFlags.nCapabilities = getNumberOfProcessors(); #endif } else { - int nNodes; + int nCapabilities; OPTION_SAFE; /* but see extra checks below... */ - nNodes = strtol(rts_argv[arg]+2, (char **) NULL, 10); + nCapabilities = strtol(rts_argv[arg]+2, (char **) NULL, 10); - if (nNodes <= 0) { + if (nCapabilities <= 0) { errorBelch("bad value for -N"); error = rtsTrue; } if (rtsOptsEnabled == RtsOptsSafeOnly && - nNodes > (int)getNumberOfProcessors()) { + nCapabilities > (int)getNumberOfProcessors()) { errorRtsOptsDisabled("Using large values for -N is not allowed by default. %s"); stg_exit(EXIT_FAILURE); } - RtsFlags.ParFlags.nNodes = (uint32_t)nNodes; + RtsFlags.ParFlags.nCapabilities = (uint32_t)nCapabilities; } ) break; @@ -1395,7 +1479,7 @@ static void normaliseRtsOpts (void) } #ifdef THREADED_RTS - if (RtsFlags.ParFlags.parGcThreads > RtsFlags.ParFlags.nNodes) { + if (RtsFlags.ParFlags.parGcThreads > RtsFlags.ParFlags.nCapabilities) { errorBelch("GC threads (-qn) must be between 1 and the value of -N"); errorUsage(); } diff --git a/rts/SMPClosureOps.h b/rts/SMPClosureOps.h new file mode 100644 index 0000000000..39cde45544 --- /dev/null +++ b/rts/SMPClosureOps.h @@ -0,0 +1,129 @@ +/* ---------------------------------------------------------------------------- + * + * (c) The GHC Team, 2005-2013 + * + * Macros for THREADED_RTS support + * + * -------------------------------------------------------------------------- */ + +#ifndef RTS_STORAGE_SMPCLOSUREOPS_H +#define RTS_STORAGE_SMPCLOSUREOPS_H + +#include "BeginPrivate.h" + +#ifdef CMINUSMINUS + +/* Lock closure, equivalent to ccall lockClosure but the condition is inlined. + * Arguments are swapped for uniformity with unlockClosure. */ +#if defined(THREADED_RTS) +#define LOCK_CLOSURE(closure, info) \ + if (CInt[n_capabilities] == 1 :: CInt) { \ + info = GET_INFO(closure); \ + } else { \ + ("ptr" info) = ccall reallyLockClosure(closure "ptr"); \ + } +#else +#define LOCK_CLOSURE(closure, info) info = GET_INFO(closure) +#endif + +#define unlockClosure(ptr,info) \ + prim_write_barrier; \ + StgHeader_info(ptr) = info; + +#else + +INLINE_HEADER StgInfoTable *lockClosure(StgClosure *p); +EXTERN_INLINE StgInfoTable *reallyLockClosure(StgClosure *p); +EXTERN_INLINE StgInfoTable *tryLockClosure(StgClosure *p); +EXTERN_INLINE void unlockClosure(StgClosure *p, const StgInfoTable *info); + +#if defined(THREADED_RTS) + +/* ----------------------------------------------------------------------------- + * Locking/unlocking closures + * + * This is used primarily in the implementation of MVars. + * -------------------------------------------------------------------------- */ + +// We want a callable copy of reallyLockClosure() so that we can refer to it +// from .cmm files compiled using the native codegen, so these are given +// EXTERN_INLINE. C-- should use LOCK_CLOSURE not lockClosure, so we've +// kept it INLINE_HEADER. 
+EXTERN_INLINE StgInfoTable *reallyLockClosure(StgClosure *p) +{ + StgWord info; + do { + uint32_t i = 0; + do { + info = xchg((P_)(void *)&p->header.info, (W_)&stg_WHITEHOLE_info); + if (info != (W_)&stg_WHITEHOLE_info) return (StgInfoTable *)info; + } while (++i < SPIN_COUNT); + yieldThread(); + } while (1); +} + +INLINE_HEADER StgInfoTable *lockClosure(StgClosure *p) +{ + if (n_capabilities == 1) { + return (StgInfoTable *)p->header.info; + } + else { + return reallyLockClosure(p); + } +} + +// ToDo: consider splitting tryLockClosure into reallyTryLockClosure, +// same as lockClosure +EXTERN_INLINE StgInfoTable *tryLockClosure(StgClosure *p) +{ + StgWord info; + if (n_capabilities == 1) { + return (StgInfoTable *)p->header.info; + } + else { + info = xchg((P_)(void *)&p->header.info, (W_)&stg_WHITEHOLE_info); + if (info != (W_)&stg_WHITEHOLE_info) { + return (StgInfoTable *)info; + } else { + return NULL; + } + } +} + +#else /* !THREADED_RTS */ + +EXTERN_INLINE StgInfoTable * +reallyLockClosure(StgClosure *p) +{ return (StgInfoTable *)p->header.info; } + +INLINE_HEADER StgInfoTable * +lockClosure(StgClosure *p) +{ return (StgInfoTable *)p->header.info; } + +EXTERN_INLINE StgInfoTable * +tryLockClosure(StgClosure *p) +{ return (StgInfoTable *)p->header.info; } + +#endif /* THREADED_RTS */ + +EXTERN_INLINE void unlockClosure(StgClosure *p, const StgInfoTable *info) +{ + // This is a strictly ordered write, so we need a write_barrier(): + write_barrier(); + p->header.info = info; +} + +// Handy specialised versions of lockClosure()/unlockClosure() +INLINE_HEADER void lockTSO(StgTSO *tso); +INLINE_HEADER void lockTSO(StgTSO *tso) +{ lockClosure((StgClosure *)tso); } + +INLINE_HEADER void unlockTSO(StgTSO *tso); +INLINE_HEADER void unlockTSO(StgTSO *tso) +{ unlockClosure((StgClosure*)tso, (const StgInfoTable *)&stg_TSO_info); } + +#endif /* CMINUSMINUS */ + +#include "EndPrivate.h" + +#endif /* RTS_STORAGE_SMPCLOSUREOPS_H */ diff --git a/rts/STM.c b/rts/STM.c index 9cd0833550..d5c2713a6f 100644 --- a/rts/STM.c +++ b/rts/STM.c @@ -92,6 +92,7 @@ #include "Trace.h" #include "Threads.h" #include "sm/Storage.h" +#include "SMPClosureOps.h" #include diff --git a/rts/Schedule.c b/rts/Schedule.c index 8a08e35cc3..fca276dc08 100644 --- a/rts/Schedule.c +++ b/rts/Schedule.c @@ -726,7 +726,8 @@ schedulePushWork(Capability *cap USED_IF_THREADS, } while (n_wanted_caps < n_capabilities-1); } - // Grab free capabilities, starting from cap->no+1. + // First grab as many free Capabilities as we can. ToDo: we should use + // capabilities on the same NUMA node preferably, but not exclusively. for (i = (cap->no + 1) % n_capabilities, n_free_caps=0; n_free_caps < n_wanted_caps && i != cap->no; i = (i + 1) % n_capabilities) { @@ -1134,7 +1135,7 @@ scheduleHandleHeapOverflow( Capability *cap, StgTSO *t ) // nursery has only one // block. 
- bd = allocGroup_lock(blocks); + bd = allocGroupOnNode_lock(cap->node,blocks); cap->r.rNursery->n_blocks += blocks; // link the new group after CurrentNursery diff --git a/rts/Task.c b/rts/Task.c index 45ef77ba0b..9a827745ba 100644 --- a/rts/Task.c +++ b/rts/Task.c @@ -220,6 +220,7 @@ newTask (rtsBool worker) initCondition(&task->cond); initMutex(&task->lock); task->wakeup = rtsFalse; + task->node = 0; #endif task->next = NULL; @@ -427,6 +428,9 @@ workerStart(Task *task) if (RtsFlags.ParFlags.setAffinity) { setThreadAffinity(cap->no, n_capabilities); } + if (RtsFlags.GcFlags.numa && !RtsFlags.DebugFlags.numa) { + setThreadNode(RtsFlags.GcFlags.numaMap[task->node]); + } // set the thread-local pointer to the Task: setMyTask(task); @@ -457,6 +461,7 @@ startWorkerTask (Capability *cap) // We don't emit a task creation event here, but in workerStart, // where the kernel thread id is known. task->cap = cap; + task->node = cap->node; // Give the capability directly to the worker; we can't let anyone // else get in, because the new worker Task has nowhere to go to @@ -490,13 +495,27 @@ interruptWorkerTask (Task *task) #endif /* THREADED_RTS */ -void -setInCallCapability (int preferred_capability) +void rts_setInCallCapability ( + int preferred_capability, + int affinity USED_IF_THREADS) { Task *task = allocTask(); task->preferred_capability = preferred_capability; -} +#ifdef THREADED_RTS + if (affinity) { + if (RtsFlags.ParFlags.setAffinity) { + setThreadAffinity(preferred_capability, n_capabilities); + } + if (RtsFlags.GcFlags.numa) { + task->node = capNoToNumaNode(preferred_capability); + if (!DEBUG_IS_ON || !RtsFlags.DebugFlags.numa) { // faking NUMA + setThreadNode(RtsFlags.GcFlags.numaMap[task->node]); + } + } + } +#endif +} #ifdef DEBUG @@ -525,4 +544,3 @@ printAllTasks(void) } #endif - diff --git a/rts/Task.h b/rts/Task.h index 5fea2badda..558f543fac 100644 --- a/rts/Task.h +++ b/rts/Task.h @@ -115,6 +115,12 @@ typedef struct Task_ { #if defined(THREADED_RTS) OSThreadId id; // The OS Thread ID of this task + // The NUMA node this Task belongs to. If this is a worker thread, then the + // OS thread will be bound to this node (see workerStart()). If this is an + // external thread calling into Haskell, it can be bound to a node using + // rts_setInCallCapability(). 
+ uint32_t node; + Condition cond; // used for sleeping & waking up this task Mutex lock; // lock for the condition variable diff --git a/rts/eventlog/EventLog.c b/rts/eventlog/EventLog.c index 9e2ed33d22..7cb69e8d7d 100644 --- a/rts/eventlog/EventLog.c +++ b/rts/eventlog/EventLog.c @@ -273,7 +273,7 @@ initEventLogging(void) */ #ifdef THREADED_RTS // XXX n_capabilities hasn't been initislised yet - n_caps = RtsFlags.ParFlags.nNodes; + n_caps = RtsFlags.ParFlags.nCapabilities; #else n_caps = 1; #endif diff --git a/rts/package.conf.in b/rts/package.conf.in index b52a8672d5..d2b728ea88 100644 --- a/rts/package.conf.in +++ b/rts/package.conf.in @@ -37,6 +37,9 @@ extra-libraries: #ifdef HAVE_LIBFFI , "ffi" #endif +#ifdef HAVE_LIBNUMA + , "numa" +#endif #ifdef mingw32_HOST_OS ,"wsock32" /* for the linker */ ,"gdi32" /* for the linker */ @@ -179,4 +182,3 @@ framework-dirs: haddock-interfaces: haddock-html: - diff --git a/rts/posix/OSMem.c b/rts/posix/OSMem.c index 5ff4bc86e4..a534219902 100644 --- a/rts/posix/OSMem.c +++ b/rts/posix/OSMem.c @@ -30,6 +30,12 @@ #ifdef HAVE_FCNTL_H #include #endif +#ifdef HAVE_NUMA_H +#include +#endif +#ifdef HAVE_NUMAIF_H +#include +#endif #include @@ -287,6 +293,7 @@ osGetMBlocks(uint32_t n) ret = gen_map_mblocks(size); } } + // Next time, we'll try to allocate right after the block we just got. // ToDo: check that we haven't already grabbed the memory at next_request next_request = (char *)ret + size; @@ -294,6 +301,31 @@ osGetMBlocks(uint32_t n) return ret; } +void osBindMBlocksToNode( + void *addr STG_UNUSED, + StgWord size STG_UNUSED, + uint32_t node STG_UNUSED) +{ +#ifdef HAVE_NUMAIF_H + int ret; + StgWord mask = 0; + mask |= 1 << node; + if (RtsFlags.GcFlags.numa) { + ret = mbind(addr, (unsigned long)size, + MPOL_BIND, &mask, sizeof(StgWord)*8, MPOL_MF_STRICT); + // paranoia: MPOL_BIND guarantees memory on the correct node; + // MPOL_MF_STRICT will tell us if it didn't work. We might want to + // relax these in due course, but I want to be sure it's doing what we + // want first. 
+ if (ret != 0) { + sysErrorBelch("mbind"); + stg_exit(EXIT_FAILURE); + } + } +#endif +} + + void osFreeMBlocks(void *addr, uint32_t n) { munmap(addr, n * MBLOCK_SIZE); @@ -512,4 +544,36 @@ void osReleaseHeapMemory(void) sysErrorBelch("unable to release address space"); } +rtsBool osNumaAvailable(void) +{ +#ifdef HAVE_NUMA_H + return (numa_available() != -1); +#else + return rtsFalse; +#endif +} + +uint32_t osNumaNodes(void) +{ +#ifdef HAVE_NUMA_H + return numa_num_configured_nodes(); +#else + return 1; +#endif +} + +StgWord osNumaMask(void) +{ +#ifdef HAVE_NUMA_H + struct bitmask *mask; + mask = numa_get_mems_allowed(); + if (mask->size > sizeof(StgWord)*8) { + barf("Too many NUMA nodes"); + } + return mask->maskp[0]; +#else + return 1; +#endif +} + #endif diff --git a/rts/posix/OSThreads.c b/rts/posix/OSThreads.c index ad138d314d..72538c1bf3 100644 --- a/rts/posix/OSThreads.c +++ b/rts/posix/OSThreads.c @@ -70,6 +70,10 @@ # include #endif +#ifdef HAVE_NUMA_H +#include +#endif + /* * This (allegedly) OS threads independent layer was initially * abstracted away from code that used Pthreads, so the functions @@ -308,10 +312,32 @@ setThreadAffinity(uint32_t n, uint32_t m) #else void -setThreadAffinity (uint32_t n GNUC3_ATTRIBUTE(__unused__), - uint32_t m GNUC3_ATTRIBUTE(__unused__)) +setThreadAffinity (uint32_t n STG_UNUSED, + uint32_t m STG_UNUSED) +{ +} +#endif + +#ifdef HAVE_NUMA_H +void setThreadNode (uint32_t node) { + ASSERT(node < RtsFlags.GcFlags.nNumaNodes); + if (numa_run_on_node(node) == -1) { + sysErrorBelch("numa_run_on_node"); + stg_exit(1); + } +} + +void releaseThreadNode (void) +{ + if (numa_run_on_node(-1) == -1) { + sysErrorBelch("numa_run_on_node"); + stg_exit(1); + } } +#else +void setThreadNode (uint32_t node STG_UNUSED) { /* nothing */ } +void releaseThreadNode (void) { /* nothing */ } #endif void diff --git a/rts/sm/BlockAlloc.c b/rts/sm/BlockAlloc.c index ff1a6460a4..c2859b0c15 100644 --- a/rts/sm/BlockAlloc.c +++ b/rts/sm/BlockAlloc.c @@ -7,11 +7,11 @@ * This is the architecture independent part of the block allocator. * It requires only the following support from the operating system: * - * void *getMBlock(uint32_t n); + * void *getMBlocks(uint32_t n); * * returns the address of an n*MBLOCK_SIZE region of memory, aligned on * an MBLOCK_SIZE boundary. There are no other restrictions on the - * addresses of memory returned by getMBlock(). + * addresses of memory returned by getMBlocks(). * * ---------------------------------------------------------------------------*/ @@ -25,7 +25,7 @@ #include -static void initMBlock(void *mblock); +static void initMBlock(void *mblock, uint32_t node); /* ----------------------------------------------------------------------------- @@ -158,27 +158,54 @@ static void initMBlock(void *mblock); // In THREADED_RTS mode, the free list is protected by sm_mutex. 
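// With NUMA support the free lists below are kept per node:
//   free_list[node][log2 size class]  and  free_mblock_list[node]
// allocGroup() picks the node that currently has the fewest allocated
// blocks (see nodeWithLeastBlocks()), unless a node is requested
// explicitly via allocGroupOnNode().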
-static bdescr *free_list[NUM_FREE_LISTS]; -static bdescr *free_mblock_list; +static bdescr *free_list[MAX_NUMA_NODES][NUM_FREE_LISTS]; +static bdescr *free_mblock_list[MAX_NUMA_NODES]; W_ n_alloc_blocks; // currently allocated blocks W_ hw_alloc_blocks; // high-water allocated blocks +W_ n_alloc_blocks_by_node[MAX_NUMA_NODES]; + /* ----------------------------------------------------------------------------- Initialisation -------------------------------------------------------------------------- */ void initBlockAllocator(void) { - uint32_t i; - for (i=0; i < NUM_FREE_LISTS; i++) { - free_list[i] = NULL; + uint32_t i, node; + for (node = 0; node < MAX_NUMA_NODES; node++) { + for (i=0; i < NUM_FREE_LISTS; i++) { + free_list[node][i] = NULL; + } + free_mblock_list[node] = NULL; + n_alloc_blocks_by_node[node] = 0; } - free_mblock_list = NULL; n_alloc_blocks = 0; hw_alloc_blocks = 0; } +/* ----------------------------------------------------------------------------- + Accounting + -------------------------------------------------------------------------- */ + +STATIC_INLINE +void recordAllocatedBlocks(uint32_t node, uint32_t n) +{ + n_alloc_blocks += n; + n_alloc_blocks_by_node[node] += n; + if (n > 0 && n_alloc_blocks > hw_alloc_blocks) { + hw_alloc_blocks = n_alloc_blocks; + } +} + +STATIC_INLINE +void recordFreedBlocks(uint32_t node, uint32_t n) +{ + ASSERT(n_alloc_blocks >= n); + n_alloc_blocks -= n; + n_alloc_blocks_by_node[node] -= n; +} + /* ----------------------------------------------------------------------------- Allocation -------------------------------------------------------------------------- */ @@ -248,14 +275,14 @@ log_2_ceil(W_ n) } STATIC_INLINE void -free_list_insert (bdescr *bd) +free_list_insert (uint32_t node, bdescr *bd) { uint32_t ln; ASSERT(bd->blocks < BLOCKS_PER_MBLOCK); ln = log_2(bd->blocks); - dbl_link_onto(bd, &free_list[ln]); + dbl_link_onto(bd, &free_list[node][ln]); } @@ -284,18 +311,18 @@ setup_tail (bdescr *bd) // Take a free block group bd, and split off a group of size n from // it. Adjust the free list as necessary, and return the new group. static bdescr * -split_free_block (bdescr *bd, W_ n, uint32_t ln) +split_free_block (bdescr *bd, uint32_t node, W_ n, uint32_t ln) { bdescr *fg; // free group ASSERT(bd->blocks > n); - dbl_link_remove(bd, &free_list[ln]); + dbl_link_remove(bd, &free_list[node][ln]); fg = bd + bd->blocks - n; // take n blocks off the end fg->blocks = n; bd->blocks -= n; setup_tail(bd); ln = log_2(bd->blocks); - dbl_link_onto(bd, &free_list[ln]); + dbl_link_onto(bd, &free_list[node][ln]); return fg; } @@ -304,7 +331,7 @@ split_free_block (bdescr *bd, W_ n, uint32_t ln) * initGroup afterwards. 
*/ static bdescr * -alloc_mega_group (StgWord mblocks) +alloc_mega_group (uint32_t node, StgWord mblocks) { bdescr *best, *bd, *prev; StgWord n; @@ -313,14 +340,14 @@ alloc_mega_group (StgWord mblocks) best = NULL; prev = NULL; - for (bd = free_mblock_list; bd != NULL; prev = bd, bd = bd->link) + for (bd = free_mblock_list[node]; bd != NULL; prev = bd, bd = bd->link) { if (bd->blocks == n) { if (prev) { prev->link = bd->link; } else { - free_mblock_list = bd->link; + free_mblock_list[node] = bd->link; } return bd; } @@ -341,12 +368,17 @@ alloc_mega_group (StgWord mblocks) (best_mblocks-mblocks)*MBLOCK_SIZE); best->blocks = MBLOCK_GROUP_BLOCKS(best_mblocks - mblocks); - initMBlock(MBLOCK_ROUND_DOWN(bd)); + initMBlock(MBLOCK_ROUND_DOWN(bd), node); } else { - void *mblock = getMBlocks(mblocks); - initMBlock(mblock); // only need to init the 1st one + void *mblock; + if (RtsFlags.GcFlags.numa) { + mblock = getMBlocksOnNode(node, mblocks); + } else { + mblock = getMBlocks(mblocks); + } + initMBlock(mblock, node); // only need to init the 1st one bd = FIRST_BDESCR(mblock); } bd->blocks = MBLOCK_GROUP_BLOCKS(mblocks); @@ -354,7 +386,7 @@ alloc_mega_group (StgWord mblocks) } bdescr * -allocGroup (W_ n) +allocGroupOnNode (uint32_t node, W_ n) { bdescr *bd, *rem; StgWord ln; @@ -369,21 +401,19 @@ allocGroup (W_ n) // n_alloc_blocks doesn't count the extra blocks we get in a // megablock group. - n_alloc_blocks += mblocks * BLOCKS_PER_MBLOCK; - if (n_alloc_blocks > hw_alloc_blocks) hw_alloc_blocks = n_alloc_blocks; + recordAllocatedBlocks(node, mblocks * BLOCKS_PER_MBLOCK); - bd = alloc_mega_group(mblocks); + bd = alloc_mega_group(node, mblocks); // only the bdescrs of the first MB are required to be initialised initGroup(bd); goto finish; } - n_alloc_blocks += n; - if (n_alloc_blocks > hw_alloc_blocks) hw_alloc_blocks = n_alloc_blocks; + recordAllocatedBlocks(node, n); ln = log_2_ceil(n); - while (ln < NUM_FREE_LISTS && free_list[ln] == NULL) { + while (ln < NUM_FREE_LISTS && free_list[node][ln] == NULL) { ln++; } @@ -397,27 +427,27 @@ allocGroup (W_ n) } #endif - bd = alloc_mega_group(1); + bd = alloc_mega_group(node,1); bd->blocks = n; initGroup(bd); // we know the group will fit rem = bd + n; rem->blocks = BLOCKS_PER_MBLOCK-n; - initGroup(rem); // init the slop - n_alloc_blocks += rem->blocks; + initGroup(rem); // init the slop + recordAllocatedBlocks(node,rem->blocks); freeGroup(rem); // add the slop on to the free list goto finish; } - bd = free_list[ln]; + bd = free_list[node][ln]; if (bd->blocks == n) // exactly the right size! { - dbl_link_remove(bd, &free_list[ln]); + dbl_link_remove(bd, &free_list[node][ln]); initGroup(bd); } else if (bd->blocks > n) // block too big... { - bd = split_free_block(bd, n, ln); + bd = split_free_block(bd, node, n, ln); ASSERT(bd->blocks == n); initGroup(bd); } @@ -432,6 +462,26 @@ finish: return bd; } +STATIC_INLINE +uint32_t nodeWithLeastBlocks (void) +{ + uint32_t node = 0, i; + uint32_t min_blocks = n_alloc_blocks_by_node[0]; + for (i = 1; i < RtsFlags.GcFlags.nNumaNodes; i++) { + if (n_alloc_blocks_by_node[i] < min_blocks) { + min_blocks = n_alloc_blocks_by_node[i]; + node = i; + } + } + return node; +} + +bdescr* allocGroup (W_ n) +{ + return allocGroupOnNode(nodeWithLeastBlocks(),n); +} + + // // Allocate a chunk of blocks that is at least min and at most max // blocks in size. This API is used by the nursery allocator that @@ -448,8 +498,7 @@ finish: // fragmentation, but we make sure that we allocate large blocks // preferably if there are any. 
// -bdescr * -allocLargeChunk (W_ min, W_ max) +bdescr* allocLargeChunkOnNode (uint32_t node, W_ min, W_ max) { bdescr *bd; StgWord ln, lnmax; @@ -461,34 +510,38 @@ allocLargeChunk (W_ min, W_ max) ln = log_2_ceil(min); lnmax = log_2_ceil(max); - while (ln < NUM_FREE_LISTS && ln < lnmax && free_list[ln] == NULL) { + while (ln < NUM_FREE_LISTS && ln < lnmax && free_list[node][ln] == NULL) { ln++; } if (ln == NUM_FREE_LISTS || ln == lnmax) { - return allocGroup(max); + return allocGroupOnNode(node,max); } - bd = free_list[ln]; + bd = free_list[node][ln]; if (bd->blocks <= max) // exactly the right size! { - dbl_link_remove(bd, &free_list[ln]); + dbl_link_remove(bd, &free_list[node][ln]); initGroup(bd); } else // block too big... { - bd = split_free_block(bd, max, ln); + bd = split_free_block(bd, node, max, ln); ASSERT(bd->blocks == max); initGroup(bd); } - n_alloc_blocks += bd->blocks; - if (n_alloc_blocks > hw_alloc_blocks) hw_alloc_blocks = n_alloc_blocks; + recordAllocatedBlocks(node, bd->blocks); IF_DEBUG(sanity, memset(bd->start, 0xaa, bd->blocks * BLOCK_SIZE)); IF_DEBUG(sanity, checkFreeListSanity()); return bd; } +bdescr* allocLargeChunk (W_ min, W_ max) +{ + return allocLargeChunkOnNode(nodeWithLeastBlocks(), min, max); +} + bdescr * allocGroup_lock(W_ n) { @@ -500,17 +553,31 @@ allocGroup_lock(W_ n) } bdescr * -allocBlock(void) +allocBlock_lock(void) { - return allocGroup(1); + bdescr *bd; + ACQUIRE_SM_LOCK; + bd = allocBlock(); + RELEASE_SM_LOCK; + return bd; } bdescr * -allocBlock_lock(void) +allocGroupOnNode_lock(uint32_t node, W_ n) { bdescr *bd; ACQUIRE_SM_LOCK; - bd = allocBlock(); + bd = allocGroupOnNode(node,n); + RELEASE_SM_LOCK; + return bd; +} + +bdescr * +allocBlockOnNode_lock(uint32_t node) +{ + bdescr *bd; + ACQUIRE_SM_LOCK; + bd = allocBlockOnNode(node); RELEASE_SM_LOCK; return bd; } @@ -542,11 +609,13 @@ static void free_mega_group (bdescr *mg) { bdescr *bd, *prev; + uint32_t node; // Find the right place in the free list. free_mblock_list is // sorted by *address*, not by size as the free_list is. 
prev = NULL; - bd = free_mblock_list; + node = mg->node; + bd = free_mblock_list[node]; while (bd && bd->start < mg->start) { prev = bd; bd = bd->link; @@ -561,8 +630,8 @@ free_mega_group (bdescr *mg) } else { - mg->link = free_mblock_list; - free_mblock_list = mg; + mg->link = free_mblock_list[node]; + free_mblock_list[node] = mg; } // coalesce forwards coalesce_mblocks(mg); @@ -575,12 +644,15 @@ void freeGroup(bdescr *p) { StgWord ln; + uint32_t node; - // Todo: not true in multithreaded GC + // not true in multithreaded GC: // ASSERT_SM_LOCK(); ASSERT(p->free != (P_)-1); + node = p->node; + p->free = (void *)-1; /* indicates that this block is free */ p->gen = NULL; p->gen_no = 0; @@ -597,14 +669,13 @@ freeGroup(bdescr *p) // If this is an mgroup, make sure it has the right number of blocks ASSERT(p->blocks == MBLOCK_GROUP_BLOCKS(mblocks)); - n_alloc_blocks -= mblocks * BLOCKS_PER_MBLOCK; + recordFreedBlocks(node, mblocks * BLOCKS_PER_MBLOCK); free_mega_group(p); return; } - ASSERT(n_alloc_blocks >= p->blocks); - n_alloc_blocks -= p->blocks; + recordFreedBlocks(node, p->blocks); // coalesce forwards { @@ -614,7 +685,7 @@ freeGroup(bdescr *p) { p->blocks += next->blocks; ln = log_2(next->blocks); - dbl_link_remove(next, &free_list[ln]); + dbl_link_remove(next, &free_list[node][ln]); if (p->blocks == BLOCKS_PER_MBLOCK) { free_mega_group(p); @@ -634,7 +705,7 @@ freeGroup(bdescr *p) if (prev->free == (P_)-1) { ln = log_2(prev->blocks); - dbl_link_remove(prev, &free_list[ln]); + dbl_link_remove(prev, &free_list[node][ln]); prev->blocks += p->blocks; if (prev->blocks >= BLOCKS_PER_MBLOCK) { @@ -646,7 +717,7 @@ freeGroup(bdescr *p) } setup_tail(p); - free_list_insert(p); + free_list_insert(node,p); IF_DEBUG(sanity, checkFreeListSanity()); } @@ -679,7 +750,7 @@ freeChain_lock(bdescr *bd) } static void -initMBlock(void *mblock) +initMBlock(void *mblock, uint32_t node) { bdescr *bd; StgWord8 *block; @@ -695,6 +766,7 @@ initMBlock(void *mblock) for (; block <= (StgWord8*)LAST_BLOCK(mblock); bd += 1, block += BLOCK_SIZE) { bd->start = (void*)block; + bd->node = node; } } @@ -734,28 +806,32 @@ countAllocdBlocks(bdescr *bd) void returnMemoryToOS(uint32_t n /* megablocks */) { - static bdescr *bd; + bdescr *bd; + uint32_t node; StgWord size; - bd = free_mblock_list; - while ((n > 0) && (bd != NULL)) { - size = BLOCKS_TO_MBLOCKS(bd->blocks); - if (size > n) { - StgWord newSize = size - n; - char *freeAddr = MBLOCK_ROUND_DOWN(bd->start); - freeAddr += newSize * MBLOCK_SIZE; - bd->blocks = MBLOCK_GROUP_BLOCKS(newSize); - freeMBlocks(freeAddr, n); - n = 0; - } - else { - char *freeAddr = MBLOCK_ROUND_DOWN(bd->start); - n -= size; - bd = bd->link; - freeMBlocks(freeAddr, size); + // ToDo: not fair, we free all the memory starting with node 0. 
+ for (node = 0; n > 0 && node < RtsFlags.GcFlags.nNumaNodes; node++) { + bd = free_mblock_list[node]; + while ((n > 0) && (bd != NULL)) { + size = BLOCKS_TO_MBLOCKS(bd->blocks); + if (size > n) { + StgWord newSize = size - n; + char *freeAddr = MBLOCK_ROUND_DOWN(bd->start); + freeAddr += newSize * MBLOCK_SIZE; + bd->blocks = MBLOCK_GROUP_BLOCKS(newSize); + freeMBlocks(freeAddr, n); + n = 0; + } + else { + char *freeAddr = MBLOCK_ROUND_DOWN(bd->start); + n -= size; + bd = bd->link; + freeMBlocks(freeAddr, size); + } } + free_mblock_list[node] = bd; } - free_mblock_list = bd; // Ask the OS to release any address space portion // that was associated with the just released MBlocks @@ -797,68 +873,71 @@ checkFreeListSanity(void) { bdescr *bd, *prev; StgWord ln, min; + uint32_t node; - - min = 1; - for (ln = 0; ln < NUM_FREE_LISTS; ln++) { - IF_DEBUG(block_alloc, - debugBelch("free block list [%" FMT_Word "]:\n", ln)); - - prev = NULL; - for (bd = free_list[ln]; bd != NULL; prev = bd, bd = bd->link) - { + for (node = 0; node < RtsFlags.GcFlags.nNumaNodes; node++) { + min = 1; + for (ln = 0; ln < NUM_FREE_LISTS; ln++) { IF_DEBUG(block_alloc, - debugBelch("group at %p, length %ld blocks\n", - bd->start, (long)bd->blocks)); - ASSERT(bd->free == (P_)-1); - ASSERT(bd->blocks > 0 && bd->blocks < BLOCKS_PER_MBLOCK); - ASSERT(bd->blocks >= min && bd->blocks <= (min*2 - 1)); - ASSERT(bd->link != bd); // catch easy loops - - check_tail(bd); - - if (prev) - ASSERT(bd->u.back == prev); - else - ASSERT(bd->u.back == NULL); + debugBelch("free block list [%" FMT_Word "]:\n", ln)); + prev = NULL; + for (bd = free_list[node][ln]; bd != NULL; prev = bd, bd = bd->link) { - bdescr *next; - next = bd + bd->blocks; - if (next <= LAST_BDESCR(MBLOCK_ROUND_DOWN(bd))) + IF_DEBUG(block_alloc, + debugBelch("group at %p, length %ld blocks\n", + bd->start, (long)bd->blocks)); + ASSERT(bd->free == (P_)-1); + ASSERT(bd->blocks > 0 && bd->blocks < BLOCKS_PER_MBLOCK); + ASSERT(bd->blocks >= min && bd->blocks <= (min*2 - 1)); + ASSERT(bd->link != bd); // catch easy loops + ASSERT(bd->node == node); + + check_tail(bd); + + if (prev) + ASSERT(bd->u.back == prev); + else + ASSERT(bd->u.back == NULL); + { - ASSERT(next->free != (P_)-1); + bdescr *next; + next = bd + bd->blocks; + if (next <= LAST_BDESCR(MBLOCK_ROUND_DOWN(bd))) + { + ASSERT(next->free != (P_)-1); + } } } + min = min << 1; } - min = min << 1; - } - prev = NULL; - for (bd = free_mblock_list; bd != NULL; prev = bd, bd = bd->link) - { - IF_DEBUG(block_alloc, - debugBelch("mega group at %p, length %ld blocks\n", - bd->start, (long)bd->blocks)); + prev = NULL; + for (bd = free_mblock_list[node]; bd != NULL; prev = bd, bd = bd->link) + { + IF_DEBUG(block_alloc, + debugBelch("mega group at %p, length %ld blocks\n", + bd->start, (long)bd->blocks)); - ASSERT(bd->link != bd); // catch easy loops + ASSERT(bd->link != bd); // catch easy loops - if (bd->link != NULL) - { - // make sure the list is sorted - ASSERT(bd->start < bd->link->start); - } + if (bd->link != NULL) + { + // make sure the list is sorted + ASSERT(bd->start < bd->link->start); + } - ASSERT(bd->blocks >= BLOCKS_PER_MBLOCK); - ASSERT(MBLOCK_GROUP_BLOCKS(BLOCKS_TO_MBLOCKS(bd->blocks)) - == bd->blocks); + ASSERT(bd->blocks >= BLOCKS_PER_MBLOCK); + ASSERT(MBLOCK_GROUP_BLOCKS(BLOCKS_TO_MBLOCKS(bd->blocks)) + == bd->blocks); - // make sure we're fully coalesced - if (bd->link != NULL) - { - ASSERT(MBLOCK_ROUND_DOWN(bd->link) != - (StgWord8*)MBLOCK_ROUND_DOWN(bd) + - BLOCKS_TO_MBLOCKS(bd->blocks) * MBLOCK_SIZE); + // 
make sure we're fully coalesced + if (bd->link != NULL) + { + ASSERT(MBLOCK_ROUND_DOWN(bd->link) != + (StgWord8*)MBLOCK_ROUND_DOWN(bd) + + BLOCKS_TO_MBLOCKS(bd->blocks) * MBLOCK_SIZE); + } } } } @@ -869,18 +948,21 @@ countFreeList(void) bdescr *bd; W_ total_blocks = 0; StgWord ln; + uint32_t node; - for (ln=0; ln < NUM_FREE_LISTS; ln++) { - for (bd = free_list[ln]; bd != NULL; bd = bd->link) { - total_blocks += bd->blocks; + for (node = 0; node < RtsFlags.GcFlags.nNumaNodes; node++) { + for (ln=0; ln < NUM_FREE_LISTS; ln++) { + for (bd = free_list[node][ln]; bd != NULL; bd = bd->link) { + total_blocks += bd->blocks; + } + } + for (bd = free_mblock_list[node]; bd != NULL; bd = bd->link) { + total_blocks += BLOCKS_PER_MBLOCK * BLOCKS_TO_MBLOCKS(bd->blocks); + // The caller of this function, memInventory(), expects to match + // the total number of blocks in the system against mblocks * + // BLOCKS_PER_MBLOCK, so we must subtract the space for the + // block descriptors from *every* mblock. } - } - for (bd = free_mblock_list; bd != NULL; bd = bd->link) { - total_blocks += BLOCKS_PER_MBLOCK * BLOCKS_TO_MBLOCKS(bd->blocks); - // The caller of this function, memInventory(), expects to match - // the total number of blocks in the system against mblocks * - // BLOCKS_PER_MBLOCK, so we must subtract the space for the - // block descriptors from *every* mblock. } return total_blocks; } diff --git a/rts/sm/BlockAlloc.h b/rts/sm/BlockAlloc.h index 2ba7c02c08..c26ae104e1 100644 --- a/rts/sm/BlockAlloc.h +++ b/rts/sm/BlockAlloc.h @@ -12,6 +12,7 @@ #include "BeginPrivate.h" bdescr *allocLargeChunk (W_ min, W_ max); +bdescr *allocLargeChunkOnNode (uint32_t node, W_ min, W_ max); /* Debugging -------------------------------------------------------------- */ diff --git a/rts/sm/GC.c b/rts/sm/GC.c index 996ce8cbce..3bfdaa25ff 100644 --- a/rts/sm/GC.c +++ b/rts/sm/GC.c @@ -802,7 +802,8 @@ new_gc_thread (uint32_t n, gc_thread *t) // but can't, because it uses gct which isn't set up at this point. 
// Hence, allocate a block for todo_bd manually: { - bdescr *bd = allocBlock(); // no lock, locks aren't initialised yet + bdescr *bd = allocBlockOnNode(capNoToNumaNode(n)); + // no lock, locks aren't initialised yet initBdescr(bd, ws->gen, ws->gen->to); bd->flags = BF_EVACUATED; bd->u.scan = bd->free = bd->start; @@ -1182,7 +1183,8 @@ prepare_collected_gen (generation *gen) if (g != 0) { for (i = 0; i < n_capabilities; i++) { freeChain(capabilities[i]->mut_lists[g]); - capabilities[i]->mut_lists[g] = allocBlock(); + capabilities[i]->mut_lists[g] = + allocBlockOnNode(capNoToNumaNode(i)); } } @@ -1296,7 +1298,7 @@ static void stash_mut_list (Capability *cap, uint32_t gen_no) { cap->saved_mut_lists[gen_no] = cap->mut_lists[gen_no]; - cap->mut_lists[gen_no] = allocBlock_sync(); + cap->mut_lists[gen_no] = allocBlockOnNode_sync(cap->node); } /* ---------------------------------------------------------------------------- diff --git a/rts/sm/GCUtils.c b/rts/sm/GCUtils.c index 5edf9dedbc..a515665d07 100644 --- a/rts/sm/GCUtils.c +++ b/rts/sm/GCUtils.c @@ -30,34 +30,33 @@ SpinLock gc_alloc_block_sync; #endif -bdescr * -allocBlock_sync(void) +bdescr* allocGroup_sync(uint32_t n) { bdescr *bd; + uint32_t node = capNoToNumaNode(gct->thread_index); ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync); - bd = allocBlock(); + bd = allocGroupOnNode(node,n); RELEASE_SPIN_LOCK(&gc_alloc_block_sync); return bd; } -static bdescr * -allocGroup_sync(uint32_t n) +bdescr* allocGroupOnNode_sync(uint32_t node, uint32_t n) { bdescr *bd; ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync); - bd = allocGroup(n); + bd = allocGroupOnNode(node,n); RELEASE_SPIN_LOCK(&gc_alloc_block_sync); return bd; } - static uint32_t allocBlocks_sync(uint32_t n, bdescr **hd) { bdescr *bd; uint32_t i; + uint32_t node = capNoToNumaNode(gct->thread_index); ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync); - bd = allocLargeChunk(1,n); + bd = allocLargeChunkOnNode(node,1,n); // NB. allocLargeChunk, rather than allocGroup(n), to allocate in a // fragmentation-friendly way. 
n = bd->blocks; diff --git a/rts/sm/GCUtils.h b/rts/sm/GCUtils.h index 0f87eee3f1..7e5a827ce0 100644 --- a/rts/sm/GCUtils.h +++ b/rts/sm/GCUtils.h @@ -18,7 +18,19 @@ #include "GCTDecl.h" -bdescr *allocBlock_sync(void); +bdescr* allocGroup_sync(uint32_t n); +bdescr* allocGroupOnNode_sync(uint32_t node, uint32_t n); + +INLINE_HEADER bdescr *allocBlock_sync(void) +{ + return allocGroup_sync(1); +} + +INLINE_HEADER bdescr *allocBlockOnNode_sync(uint32_t node) +{ + return allocGroupOnNode_sync(node,1); +} + void freeChain_sync(bdescr *bd); void push_scanned_block (bdescr *bd, gen_workspace *ws); diff --git a/rts/sm/MBlock.c b/rts/sm/MBlock.c index 440b03efa7..53999d2c4b 100644 --- a/rts/sm/MBlock.c +++ b/rts/sm/MBlock.c @@ -566,7 +566,7 @@ void releaseFreeMemory(void) void * getMBlock(void) { - return getMBlocks(1); + return getMBlocks(1); } // The external interface: allocate 'n' mblocks, and return the @@ -587,6 +587,23 @@ getMBlocks(uint32_t n) return ret; } +void * +getMBlocksOnNode(uint32_t node, uint32_t n) +{ + void *addr = getMBlocks(n); +#ifdef DEBUG + if (RtsFlags.DebugFlags.numa) return addr; // faking NUMA +#endif + osBindMBlocksToNode(addr, n * MBLOCK_SIZE, RtsFlags.GcFlags.numaMap[node]); + return addr; +} + +void * +getMBlockOnNode(uint32_t node) +{ + return getMBlocksOnNode(node, 1); +} + void freeMBlocks(void *addr, uint32_t n) { diff --git a/rts/sm/MarkStack.h b/rts/sm/MarkStack.h index f978a32563..d90b5e47b4 100644 --- a/rts/sm/MarkStack.h +++ b/rts/sm/MarkStack.h @@ -15,6 +15,7 @@ #define SM_MARKSTACK_H #include "BeginPrivate.h" +#include "GCUtils.h" INLINE_HEADER void push_mark_stack(StgPtr p) diff --git a/rts/sm/OSMem.h b/rts/sm/OSMem.h index 8518f05d1b..660942827d 100644 --- a/rts/sm/OSMem.h +++ b/rts/sm/OSMem.h @@ -19,6 +19,10 @@ void osFreeAllMBlocks(void); size_t getPageSize (void); StgWord64 getPhysicalMemorySize (void); void setExecutable (void *p, W_ len, rtsBool exec); +rtsBool osNumaAvailable(void); +uint32_t osNumaNodes(void); +StgWord osNumaMask(void); +void osBindMBlocksToNode(void *addr, StgWord size, uint32_t node); INLINE_HEADER size_t roundDownToPage (size_t x) diff --git a/rts/sm/Storage.c b/rts/sm/Storage.c index 717c96ae81..a9a7857d43 100644 --- a/rts/sm/Storage.c +++ b/rts/sm/Storage.c @@ -6,7 +6,7 @@ * * Documentation on the architecture of the Storage Manager can be * found in the online commentary: - * + * * http://ghc.haskell.org/trac/ghc/wiki/Commentary/Rts/Storage * * ---------------------------------------------------------------------------*/ @@ -37,7 +37,7 @@ #include "ffi.h" -/* +/* * All these globals require sm_mutex to access in THREADED_RTS mode. */ StgIndStatic *dyn_caf_list = NULL; @@ -54,9 +54,22 @@ generation *generations = NULL; /* all the generations */ generation *g0 = NULL; /* generation 0, for convenience */ generation *oldest_gen = NULL; /* oldest generation, for convenience */ -nursery *nurseries = NULL; /* array of nurseries, size == n_capabilities */ +/* + * Array of nurseries, size == n_capabilities + * + * nursery[i] belongs to NUMA node (i % RtsFlags.GcFlags.nNumaNodes) + * This is chosen to be the same convention as capabilities[i], so + * that when not using nursery chunks (+RTS -n), we just map + * capabilities to nurseries 1:1. + */ +nursery *nurseries = NULL; uint32_t n_nurseries; -volatile StgWord next_nursery = 0; + +/* + * When we are using nursery chunks, we need a separate next_nursery + * pointer for each NUMA node. 
+ */ +volatile StgWord next_nursery[MAX_NUMA_NODES]; #ifdef THREADED_RTS /* @@ -104,7 +117,7 @@ initGeneration (generation *gen, int g) void initStorage (void) { - uint32_t g; + uint32_t g, n; if (generations != NULL) { // multi-init protection @@ -120,22 +133,22 @@ initStorage (void) ASSERT(LOOKS_LIKE_INFO_PTR_NOT_NULL((StgWord)&stg_BLOCKING_QUEUE_CLEAN_info)); ASSERT(LOOKS_LIKE_CLOSURE_PTR(&stg_dummy_ret_closure)); ASSERT(!HEAP_ALLOCED(&stg_dummy_ret_closure)); - + if (RtsFlags.GcFlags.maxHeapSize != 0 && - RtsFlags.GcFlags.heapSizeSuggestion > + RtsFlags.GcFlags.heapSizeSuggestion > RtsFlags.GcFlags.maxHeapSize) { RtsFlags.GcFlags.maxHeapSize = RtsFlags.GcFlags.heapSizeSuggestion; } if (RtsFlags.GcFlags.maxHeapSize != 0 && - RtsFlags.GcFlags.minAllocAreaSize > + RtsFlags.GcFlags.minAllocAreaSize > RtsFlags.GcFlags.maxHeapSize) { errorBelch("maximum heap size (-M) is smaller than minimum alloc area size (-A)"); RtsFlags.GcFlags.minAllocAreaSize = RtsFlags.GcFlags.maxHeapSize; } initBlockAllocator(); - + #if defined(THREADED_RTS) initMutex(&sm_mutex); #endif @@ -143,7 +156,7 @@ initStorage (void) ACQUIRE_SM_LOCK; /* allocate generation info array */ - generations = (generation *)stgMallocBytes(RtsFlags.GcFlags.generations + generations = (generation *)stgMallocBytes(RtsFlags.GcFlags.generations * sizeof(struct generation_), "initStorage: gens"); @@ -161,7 +174,7 @@ initStorage (void) generations[g].to = &generations[g+1]; } oldest_gen->to = oldest_gen; - + /* The oldest generation has one step. */ if (RtsFlags.GcFlags.compact || RtsFlags.GcFlags.sweep) { if (RtsFlags.GcFlags.generations == 1) { @@ -178,7 +191,7 @@ initStorage (void) dyn_caf_list = (StgIndStatic*)END_OF_CAF_LIST; debug_caf_list = (StgIndStatic*)END_OF_CAF_LIST; revertible_caf_list = (StgIndStatic*)END_OF_CAF_LIST; - + if (RtsFlags.GcFlags.largeAllocLim > 0) { large_alloc_lim = RtsFlags.GcFlags.largeAllocLim * BLOCK_SIZE_W; } else { @@ -196,7 +209,9 @@ initStorage (void) N = 0; - next_nursery = 0; + for (n = 0; n < RtsFlags.GcFlags.nNumaNodes; n++) { + next_nursery[n] = n; + } storageAddCapabilities(0, n_capabilities); IF_DEBUG(gc, statDescribeGens()); @@ -257,7 +272,8 @@ void storageAddCapabilities (uint32_t from, uint32_t to) // allocate a block for each mut list for (n = from; n < to; n++) { for (g = 1; g < RtsFlags.GcFlags.generations; g++) { - capabilities[n]->mut_lists[g] = allocBlock(); + capabilities[n]->mut_lists[g] = + allocBlockOnNode(capNoToNumaNode(n)); } } @@ -526,7 +542,7 @@ StgInd* newGCdCAF (StgRegTable *reg, StgIndStatic *caf) -------------------------------------------------------------------------- */ static bdescr * -allocNursery (bdescr *tail, W_ blocks) +allocNursery (uint32_t node, bdescr *tail, W_ blocks) { bdescr *bd = NULL; W_ i, n; @@ -542,7 +558,7 @@ allocNursery (bdescr *tail, W_ blocks) // allocLargeChunk will prefer large chunks, but will pick up // small chunks if there are any available. 
We must allow // single blocks here to avoid fragmentation (#7257) - bd = allocLargeChunk(1, n); + bd = allocLargeChunkOnNode(node, 1, n); n = bd->blocks; blocks -= n; @@ -584,6 +600,7 @@ assignNurseryToCapability (Capability *cap, uint32_t n) cap->r.rCurrentNursery = nurseries[n].blocks; newNurseryBlock(nurseries[n].blocks); cap->r.rCurrentAlloc = NULL; + ASSERT(cap->r.rCurrentNursery->node == cap->node); } /* @@ -593,16 +610,18 @@ assignNurseryToCapability (Capability *cap, uint32_t n) static void assignNurseriesToCapabilities (uint32_t from, uint32_t to) { - uint32_t i; + uint32_t i, node; for (i = from; i < to; i++) { - assignNurseryToCapability(capabilities[i], next_nursery++); + node = capabilities[i]->node; + assignNurseryToCapability(capabilities[i], next_nursery[node]); + next_nursery[node] += RtsFlags.GcFlags.nNumaNodes; } } static void allocNurseries (uint32_t from, uint32_t to) -{ +{ uint32_t i; memcount n_blocks; @@ -613,24 +632,28 @@ allocNurseries (uint32_t from, uint32_t to) } for (i = from; i < to; i++) { - nurseries[i].blocks = allocNursery(NULL, n_blocks); + nurseries[i].blocks = allocNursery(capNoToNumaNode(i), NULL, n_blocks); nurseries[i].n_blocks = n_blocks; } } - + void resetNurseries (void) { - next_nursery = 0; + uint32_t n; + + for (n = 0; n < RtsFlags.GcFlags.nNumaNodes; n++) { + next_nursery[n] = n; + } assignNurseriesToCapabilities(0, n_capabilities); #ifdef DEBUG bdescr *bd; - uint32_t n; for (n = 0; n < n_nurseries; n++) { for (bd = nurseries[n].blocks; bd; bd = bd->link) { ASSERT(bd->gen_no == 0); ASSERT(bd->gen == g0); + ASSERT(bd->node == capNoToNumaNode(n)); IF_DEBUG(sanity, memset(bd->start, 0xaa, BLOCK_SIZE)); } } @@ -649,56 +672,54 @@ countNurseryBlocks (void) return blocks; } -static void -resizeNursery (nursery *nursery, W_ blocks) -{ - bdescr *bd; - W_ nursery_blocks; - - nursery_blocks = nursery->n_blocks; - if (nursery_blocks == blocks) return; - - if (nursery_blocks < blocks) { - debugTrace(DEBUG_gc, "increasing size of nursery to %d blocks", - blocks); - nursery->blocks = allocNursery(nursery->blocks, blocks-nursery_blocks); - } - else { - bdescr *next_bd; - - debugTrace(DEBUG_gc, "decreasing size of nursery to %d blocks", - blocks); - - bd = nursery->blocks; - while (nursery_blocks > blocks) { - next_bd = bd->link; - next_bd->u.back = NULL; - nursery_blocks -= bd->blocks; // might be a large block - freeGroup(bd); - bd = next_bd; - } - nursery->blocks = bd; - // might have gone just under, by freeing a large block, so make - // up the difference. - if (nursery_blocks < blocks) { - nursery->blocks = allocNursery(nursery->blocks, blocks-nursery_blocks); - } - } - - nursery->n_blocks = blocks; - ASSERT(countBlocks(nursery->blocks) == nursery->n_blocks); -} - -// +// // Resize each of the nurseries to the specified size. 
// static void resizeNurseriesEach (W_ blocks) { - uint32_t i; + uint32_t i, node; + bdescr *bd; + W_ nursery_blocks; + nursery *nursery; for (i = 0; i < n_nurseries; i++) { - resizeNursery(&nurseries[i], blocks); + nursery = &nurseries[i]; + nursery_blocks = nursery->n_blocks; + if (nursery_blocks == blocks) continue; + + node = capNoToNumaNode(i); + if (nursery_blocks < blocks) { + debugTrace(DEBUG_gc, "increasing size of nursery to %d blocks", + blocks); + nursery->blocks = allocNursery(node, nursery->blocks, + blocks-nursery_blocks); + } + else + { + bdescr *next_bd; + + debugTrace(DEBUG_gc, "decreasing size of nursery to %d blocks", + blocks); + + bd = nursery->blocks; + while (nursery_blocks > blocks) { + next_bd = bd->link; + next_bd->u.back = NULL; + nursery_blocks -= bd->blocks; // might be a large block + freeGroup(bd); + bd = next_bd; + } + nursery->blocks = bd; + // might have gone just under, by freeing a large block, so make + // up the difference. + if (nursery_blocks < blocks) { + nursery->blocks = allocNursery(node, nursery->blocks, + blocks-nursery_blocks); + } + } + nursery->n_blocks = blocks; + ASSERT(countBlocks(nursery->blocks) == nursery->n_blocks); } } @@ -716,7 +737,7 @@ resizeNurseriesFixed (void) resizeNurseriesEach(blocks); } -// +// // Resize the nurseries to the total specified size. // void @@ -731,18 +752,42 @@ rtsBool getNewNursery (Capability *cap) { StgWord i; + uint32_t node = cap->node; + uint32_t n; for(;;) { - i = next_nursery; - if (i >= n_nurseries) { + i = next_nursery[node]; + if (i < n_nurseries) { + if (cas(&next_nursery[node], i, + i+RtsFlags.GcFlags.nNumaNodes) == i) { + assignNurseryToCapability(cap, i); + return rtsTrue; + } + } else if (RtsFlags.GcFlags.nNumaNodes > 1) { + // Try to find an unused nursery chunk on other nodes. We'll get + // remote memory, but the rationale is that avoiding GC is better + // than avoiding remote memory access. + rtsBool lost = rtsFalse; + for (n = 0; n < RtsFlags.GcFlags.nNumaNodes; n++) { + if (n == node) continue; + i = next_nursery[n]; + if (i < n_nurseries) { + if (cas(&next_nursery[n], i, + i+RtsFlags.GcFlags.nNumaNodes) == i) { + assignNurseryToCapability(cap, i); + return rtsTrue; + } else { + lost = rtsTrue; /* lost a race */ + } + } + } + if (!lost) return rtsFalse; + } else { return rtsFalse; } - if (cas(&next_nursery, i, i+1) == i) { - assignNurseryToCapability(cap, i); - return rtsTrue; - } } } + /* ----------------------------------------------------------------------------- move_STACK is called to update the TSO structure after it has been moved from one place to another. @@ -753,8 +798,8 @@ move_STACK (StgStack *src, StgStack *dest) { ptrdiff_t diff; - // relocate the stack pointer... - diff = (StgPtr)dest - (StgPtr)src; // In *words* + // relocate the stack pointer... + diff = (StgPtr)dest - (StgPtr)src; // In *words* dest->sp = (StgPtr)dest->sp + diff; } @@ -818,7 +863,7 @@ allocate (Capability *cap, W_ n) } ACQUIRE_SM_LOCK - bd = allocGroup(req_blocks); + bd = allocGroupOnNode(cap->node,req_blocks); dbl_link_onto(bd, &g0->large_objects); g0->n_large_blocks += bd->blocks; // might be larger than req_blocks g0->n_new_large_words += n; @@ -834,19 +879,19 @@ allocate (Capability *cap, W_ n) bd = cap->r.rCurrentAlloc; if (bd == NULL || bd->free + n > bd->start + BLOCK_SIZE_W) { - + if (bd) finishedNurseryBlock(cap,bd); // The CurrentAlloc block is full, we need to find another // one. 
First, we try taking the next block from the // nursery: bd = cap->r.rCurrentNursery->link; - + if (bd == NULL) { // The nursery is empty: allocate a fresh block (we can't // fail here). ACQUIRE_SM_LOCK; - bd = allocBlock(); + bd = allocBlockOnNode(cap->node); cap->r.rNursery->n_blocks++; RELEASE_SM_LOCK; initBdescr(bd, g0, g0); @@ -944,7 +989,7 @@ allocatePinned (Capability *cap, W_ n) } bd = cap->pinned_object_block; - + // If we don't have a block of pinned objects yet, or the current // one isn't large enough to hold the new object, get a new one. if (bd == NULL || (bd->free + n) > (bd->start + BLOCK_SIZE_W)) { @@ -964,24 +1009,13 @@ allocatePinned (Capability *cap, W_ n) // objects scale really badly if we do this). // // So first, we try taking the next block from the nursery, in - // the same way as allocate(), but note that we can only take - // an *empty* block, because we're about to mark it as - // BF_PINNED | BF_LARGE. + // the same way as allocate(). bd = cap->r.rCurrentNursery->link; - if (bd == NULL) { // must be empty! - // The nursery is empty, or the next block is non-empty: - // allocate a fresh block (we can't fail here). - - // XXX in the case when the next nursery block is - // non-empty we aren't exerting any pressure to GC soon, - // so if this case ever happens then we could in theory - // keep allocating for ever without calling the GC. We - // can't bump g0->n_new_large_words because that will be - // counted towards allocation, and we're already counting - // our pinned obects as allocation in - // collect_pinned_object_blocks in the GC. + if (bd == NULL) { + // The nursery is empty: allocate a fresh block (we can't fail + // here). ACQUIRE_SM_LOCK; - bd = allocBlock(); + bd = allocBlockOnNode(cap->node); RELEASE_SM_LOCK; initBdescr(bd, g0, g0); } else { @@ -1217,13 +1251,13 @@ W_ gcThreadLiveBlocks (uint32_t i, uint32_t g) * that will be collected next time will therefore need twice as many * blocks since all the data will be copied. */ -extern W_ +extern W_ calcNeeded (rtsBool force_major, memcount *blocks_needed) { W_ needed = 0, blocks; uint32_t g, N; generation *gen; - + if (force_major) { N = RtsFlags.GcFlags.generations - 1; } else { @@ -1238,7 +1272,7 @@ calcNeeded (rtsBool force_major, memcount *blocks_needed) // we need at least this much space needed += blocks; - + // are we collecting this gen? 
if (g == 0 || // always collect gen 0 blocks > gen->max_blocks) diff --git a/rts/win32/OSMem.c b/rts/win32/OSMem.c index f0680e953a..3450267d9c 100644 --- a/rts/win32/OSMem.c +++ b/rts/win32/OSMem.c @@ -482,3 +482,25 @@ void osReleaseHeapMemory (void) } #endif + +rtsBool osNumaAvailable(void) +{ + return rtsFalse; +} + +uint32_t osNumaNodes(void) +{ + return 1; +} + +StgWord osNumaMask(void) +{ + return 1; +} + +void osBindMBlocksToNode( + void *addr STG_UNUSED, + StgWord size STG_UNUSED, + uint32_t node STG_UNUSED) +{ +} diff --git a/rts/win32/OSThreads.c b/rts/win32/OSThreads.c index 7a51ec5191..78fe29784b 100644 --- a/rts/win32/OSThreads.c +++ b/rts/win32/OSThreads.c @@ -300,6 +300,9 @@ interruptOSThread (OSThreadId id) CloseHandle(hdl); } +void setThreadNode (uint32_t node STG_UNUSED) { /* nothing */ } +void releaseThreadNode (void) { /* nothing */ } + #else /* !defined(THREADED_RTS) */ int diff --git a/testsuite/config/ghc b/testsuite/config/ghc index 355535153c..678cc56bc5 100644 --- a/testsuite/config/ghc +++ b/testsuite/config/ghc @@ -26,7 +26,7 @@ config.run_ways = ['normal', 'hpc'] config.other_ways = ['prof', 'normal_h', 'prof_hc_hb','prof_hb', 'prof_hd','prof_hy','prof_hr', - 'threaded1_ls', 'threaded2_hT', + 'threaded1_ls', 'threaded2_hT', 'debug_numa', 'llvm', 'debugllvm', 'profllvm', 'profoptllvm', 'profthreadedllvm', 'debug', @@ -87,6 +87,8 @@ config.way_flags = lambda name : { 'normal' : [], 'normal_h' : [], 'g1' : [], + 'nursery_chunks' : [], + 'debug_numa' : ['-threaded', '-debug'], 'optasm' : ['-O', '-fasm'], 'llvm' : ['-fllvm'], 'optllvm' : ['-O', '-fllvm'], @@ -119,6 +121,8 @@ config.way_rts_flags = { 'normal' : [], 'normal_h' : ['-h'], # works without -prof 'g1' : ['-G1'], + 'nursery_chunks' : ['-n32k'], + 'debug_numa' : ['-N2', '--debug-numa=2'], 'optasm' : [], 'llvm' : [], 'optllvm' : [], diff --git a/testsuite/tests/codeGen/should_run/all.T b/testsuite/tests/codeGen/should_run/all.T index b8bc4f60c2..42ec7d3e7f 100644 --- a/testsuite/tests/codeGen/should_run/all.T +++ b/testsuite/tests/codeGen/should_run/all.T @@ -21,7 +21,10 @@ test('cgrun017', normal, compile_and_run, ['']) test('cgrun018', normal, compile_and_run, ['']) test('cgrun019', normal, compile_and_run, ['']) test('cgrun020', normal, compile_and_run, ['']) -test('cgrun021', normal, compile_and_run, ['']) + +# cgrun021 does some GC, so let's use it to test GC parameters +test('cgrun021', extra_ways(['nursery_chunks']), compile_and_run, ['']) + test('cgrun022', normal, compile_and_run, ['']) test('cgrun024', normal, compile_and_run, ['']) test('cgrun025', diff --git a/testsuite/tests/concurrent/prog001/all.T b/testsuite/tests/concurrent/prog001/all.T index a3ba7b61f1..b5d4b1c4eb 100644 --- a/testsuite/tests/concurrent/prog001/all.T +++ b/testsuite/tests/concurrent/prog001/all.T @@ -4,7 +4,7 @@ # Also tests for bug #1466. # NB. This is a VERY IMPORTANT test! It is the only good test we have -# for throwTo. It has shown up several bugs that were not caught by the +# for throwTo. It has shown up several bugs that were not caught by the # other concurrency tests. 
# The program appears to be sensitive to scheduling, and can diverge diff --git a/testsuite/tests/concurrent/should_run/all.T b/testsuite/tests/concurrent/should_run/all.T index a451dfeb80..a974f6a304 100644 --- a/testsuite/tests/concurrent/should_run/all.T +++ b/testsuite/tests/concurrent/should_run/all.T @@ -35,7 +35,9 @@ test('T2910a', normal, compile_and_run, ['']) test('T3279', normal, compile_and_run, ['']) # This test takes a long time with the default context switch interval -test('T3429', extra_run_opts('+RTS -C0.001 -RTS'), compile_and_run, ['']) +test('T3429', [ extra_run_opts('+RTS -C0.001 -RTS'), + extra_ways(['debug_numa']) ], + compile_and_run, ['']) # without -O, goes into an infinite loop # GHCi does not detect the infinite loop. We should really fix this. @@ -144,7 +146,7 @@ test('conc020', normal, compile_and_run, ['']) test('conc021', [ omit_ways(['ghci']), exit_code(1) ], compile_and_run, ['']) test('conc022', normal, compile_and_run, ['']) -# On Windows, the non-threaded RTS creates a real OS thread for each +# On Windows, the non-threaded RTS creates a real OS thread for each # threadDelay. conc023 creates 5000 concurrent threadDelays, and the # resulting creation of OS threads seems to cause the system to run # out of memory sometimes (I'm not sure exactly how/why this happens, @@ -207,7 +209,7 @@ test('conc037', only_ways(['threaded1','threaded2']), compile_and_run, ['']) test('conc038', only_ways(['threaded1','threaded2']), compile_and_run, ['']) # Omit for GHCi, uses foreign export -# Omit for the threaded ways, because in this case the main thread is allowed to +# Omit for the threaded ways, because in this case the main thread is allowed to # complete, which causes the child thread to be interrupted. test('conc039', omit_ways(['ghci'] + threaded_ways), compile_and_run, ['']) @@ -253,4 +255,3 @@ test('setnumcapabilities001', # omit ghci, which can't handle unboxed tuples: test('compareAndSwap', [omit_ways(['ghci','hpc']), reqlib('primitive')], compile_and_run, ['']) - -- cgit v1.2.1