summaryrefslogtreecommitdiff
path: root/rts/Capability.h
diff options
context:
space:
mode:
authorSimon Marlow <marlowsd@gmail.com>2016-04-23 21:14:49 +0100
committerSimon Marlow <marlowsd@gmail.com>2016-06-10 21:25:54 +0100
commit9e5ea67e268be2659cd30ebaed7044d298198ab0 (patch)
treec395e74ee772ae0d59c852b3cbde743784b08d09 /rts/Capability.h
parentb9fa72a24ba2cc3120912e6afedc9280d28d2077 (diff)
downloadhaskell-9e5ea67e268be2659cd30ebaed7044d298198ab0.tar.gz
NUMA support
Summary:
The aim here is to reduce the number of remote memory accesses on systems with a NUMA memory architecture, typically multi-socket servers.

Linux provides a NUMA API for doing two things:
* Allocating memory local to a particular node
* Binding a thread to a particular node

When given the +RTS --numa flag, the runtime will
* Determine the number of NUMA nodes (N) by querying the OS
* Assign capabilities to nodes, so cap C is on node C%N
* Bind worker threads on a capability to the correct node
* Keep separate free lists in the block layer for each node
* Allocate the nursery for a capability from node-local memory
* Allocate blocks in the GC from node-local memory

For example, using nofib/parallel/queens on a 24-core 2-socket machine:

```
$ ./Main 15 +RTS -N24 -s -A64m
Total time 173.960s ( 7.467s elapsed)

$ ./Main 15 +RTS -N24 -s -A64m --numa
Total time 150.836s ( 6.423s elapsed)
```

The biggest win here is expected to be allocating from node-local memory, so that means programs using a large -A value (as here).

According to perf, on this program the number of remote memory accesses was reduced by more than 50% by using `--numa`.

Test Plan:
* validate
* There's a new flag --debug-numa=<n> that pretends to do NUMA without actually making the OS calls, which is useful for testing the code on non-NUMA systems.
* TODO: I need to add some unit tests

Reviewers: erikd, austin, rwbarton, ezyang, bgamari, hvr, niteria

Subscribers: thomie

Differential Revision: https://phabricator.haskell.org/D2199
Diffstat (limited to 'rts/Capability.h')
-rw-r--r--rts/Capability.h14
1 files changed, 12 insertions, 2 deletions
diff --git a/rts/Capability.h b/rts/Capability.h
index 22c1d2a5c7..6874379c5f 100644
--- a/rts/Capability.h
+++ b/rts/Capability.h
@@ -36,6 +36,15 @@ struct Capability_ {
uint32_t no; // capability number.
+ // The NUMA node on which this capability resides. This is used to allocate
+ // node-local memory in allocate().
+ //
+ // Note: this is always equal to cap->no % RtsFlags.GcFlags.nNumaNodes.
+ // The reason we slice it this way is that if we add or remove capabilities
+ // via setNumCapabilities(), then we keep the number of capabilities on each
+ // NUMA node balanced.
+ uint32_t node;
+
// The Task currently holding this Capability. This task has
// exclusive access to the contents of this Capability (apart from
// returning_tasks_hd/returning_tasks_tl).
@@ -151,6 +160,8 @@ struct Capability_ {
;
+#define capNoToNumaNode(n) ((n) % RtsFlags.GcFlags.nNumaNodes) // NUMA node for capability number n: n modulo the node count
+
#if defined(THREADED_RTS)
#define ASSERT_TASK_ID(task) ASSERT(task->id == osThreadId())
#else
@@ -221,7 +232,6 @@ INLINE_HEADER void releaseCapability_ (Capability* cap STG_UNUSED,
// extern uint32_t enabled_capabilities;
// Array of all the capabilities
-//
extern Capability **capabilities;
//
@@ -364,7 +374,7 @@ recordMutableCap (const StgClosure *p, Capability *cap, uint32_t gen)
bd = cap->mut_lists[gen];
if (bd->free >= bd->start + BLOCK_SIZE_W) {
bdescr *new_bd;
- new_bd = allocBlock_lock();
+ new_bd = allocBlockOnNode_lock(cap->node);
new_bd->link = bd;
bd = new_bd;
cap->mut_lists[gen] = bd;