summary | refs | log | tree | commit | diff
path: root/rts/posix
diff options
context:
space:
mode:
author Simon Marlow <marlowsd@gmail.com> 2016-04-23 21:14:49 +0100
committer Simon Marlow <marlowsd@gmail.com> 2016-06-10 21:25:54 +0100
commit 9e5ea67e268be2659cd30ebaed7044d298198ab0 (patch)
tree c395e74ee772ae0d59c852b3cbde743784b08d09 /rts/posix
parent b9fa72a24ba2cc3120912e6afedc9280d28d2077 (diff)
download haskell-9e5ea67e268be2659cd30ebaed7044d298198ab0.tar.gz
NUMA support
Summary:
The aim here is to reduce the number of remote memory accesses on systems with a NUMA memory architecture, typically multi-socket servers.

Linux provides a NUMA API for doing two things:
* Allocating memory local to a particular node
* Binding a thread to a particular node

When given the +RTS --numa flag, the runtime will
* Determine the number of NUMA nodes (N) by querying the OS
* Assign capabilities to nodes, so cap C is on node C%N
* Bind worker threads on a capability to the correct node
* Keep separate free lists in the block layer for each node
* Allocate the nursery for a capability from node-local memory
* Allocate blocks in the GC from node-local memory

For example, using nofib/parallel/queens on a 24-core 2-socket machine:
```
$ ./Main 15 +RTS -N24 -s -A64m
  Total time 173.960s ( 7.467s elapsed)
$ ./Main 15 +RTS -N24 -s -A64m --numa
  Total time 150.836s ( 6.423s elapsed)
```
The biggest win here is expected to be allocating from node-local memory, so programs using a large -A value (as here) stand to gain the most. According to perf, on this program the number of remote memory accesses was reduced by more than 50% by using `--numa`.

Test Plan:
* validate
* There's a new flag --debug-numa=<n> that pretends to do NUMA without actually making the OS calls, which is useful for testing the code on non-NUMA systems.
* TODO: I need to add some unit tests

Reviewers: erikd, austin, rwbarton, ezyang, bgamari, hvr, niteria
Subscribers: thomie
Differential Revision: https://phabricator.haskell.org/D2199
Diffstat (limited to 'rts/posix')
-rw-r--r-- rts/posix/OSMem.c 64
-rw-r--r-- rts/posix/OSThreads.c 30
2 files changed, 92 insertions, 2 deletions
diff --git a/rts/posix/OSMem.c b/rts/posix/OSMem.c
index 5ff4bc86e4..a534219902 100644
--- a/rts/posix/OSMem.c
+++ b/rts/posix/OSMem.c
@@ -30,6 +30,12 @@
#ifdef HAVE_FCNTL_H
#include <fcntl.h>
#endif
+#ifdef HAVE_NUMA_H
+#include <numa.h>
+#endif
+#ifdef HAVE_NUMAIF_H
+#include <numaif.h>
+#endif
#include <errno.h>
@@ -287,6 +293,7 @@ osGetMBlocks(uint32_t n)
ret = gen_map_mblocks(size);
}
}
+
// Next time, we'll try to allocate right after the block we just got.
// ToDo: check that we haven't already grabbed the memory at next_request
next_request = (char *)ret + size;
@@ -294,6 +301,31 @@ osGetMBlocks(uint32_t n)
return ret;
}
+/*
+ * Bind the memory range starting at addr, of the given size in bytes, to a
+ * single NUMA node using mbind().
+ *
+ * This is a no-op unless the RTS was built with <numaif.h> and
+ * RtsFlags.GcFlags.numa is set.  If mbind() fails, the system error is
+ * reported and the process exits with EXIT_FAILURE.
+ */
+void osBindMBlocksToNode(
+    void *addr STG_UNUSED,
+    StgWord size STG_UNUSED,
+    uint32_t node STG_UNUSED)
+{
+#ifdef HAVE_NUMAIF_H
+    int ret;
+    StgWord mask;
+
+    if (RtsFlags.GcFlags.numa) {
+        // The node mask handed to mbind() is one StgWord, so the node number
+        // has to index a bit inside it; shifting any further is undefined.
+        if (node >= sizeof(StgWord)*8) {
+            barf("osBindMBlocksToNode: NUMA node out of range");
+        }
+        // Shift a StgWord-wide 1.  A plain `1` is an int, so `1 << node`
+        // could never reach the upper bits of the mask.
+        mask = (StgWord)1 << node;
+        ret = mbind(addr, (unsigned long)size,
+                    MPOL_BIND, &mask, sizeof(StgWord)*8, MPOL_MF_STRICT);
+        // paranoia: MPOL_BIND guarantees memory on the correct node;
+        // MPOL_MF_STRICT will tell us if it didn't work.  We might want to
+        // relax these in due course, but I want to be sure it's doing what we
+        // want first.
+        if (ret != 0) {
+            sysErrorBelch("mbind");
+            stg_exit(EXIT_FAILURE);
+        }
+    }
+#endif
+}
+
+
void osFreeMBlocks(void *addr, uint32_t n)
{
munmap(addr, n * MBLOCK_SIZE);
@@ -512,4 +544,36 @@ void osReleaseHeapMemory(void)
sysErrorBelch("unable to release address space");
}
+/*
+ * Report whether the OS NUMA API can be used.  True only when the RTS was
+ * built with <numa.h> and numa_available() does not return -1.
+ */
+rtsBool osNumaAvailable(void)
+{
+#if defined(HAVE_NUMA_H)
+    int status = numa_available();
+    return (status != -1);
+#else
+    return rtsFalse;
+#endif
+}
+
+/*
+ * Number of NUMA nodes: numa_num_configured_nodes() when built with
+ * <numa.h>, otherwise 1.
+ */
+uint32_t osNumaNodes(void)
+{
+    uint32_t count = 1;   // default when the NUMA API is not compiled in
+#if defined(HAVE_NUMA_H)
+    count = numa_num_configured_nodes();
+#endif
+    return count;
+}
+
+/*
+ * Return the set of NUMA nodes reported by numa_get_mems_allowed() as a
+ * bitmask with one bit per node number.  Without <numa.h> the result is 1,
+ * i.e. only bit 0 is set.
+ *
+ * Aborts via barf() if the highest node number does not fit in a StgWord.
+ */
+StgWord osNumaMask(void)
+{
+#ifdef HAVE_NUMA_H
+    struct bitmask *mask;
+    StgWord r;
+
+    // Only the first word of the bitmask is returned, so what matters is
+    // the highest node number actually present, not the capacity of the
+    // bitmask (mask->size), which can be larger than the number of nodes.
+    if (numa_max_node() >= (int)(sizeof(StgWord)*8)) {
+        barf("Too many NUMA nodes");
+    }
+    mask = numa_get_mems_allowed();
+    r = mask->maskp[0];
+    // The bitmask was allocated by libnuma for us; release it.
+    numa_bitmask_free(mask);
+    return r;
+#else
+    return 1;
+#endif
+}
+
#endif
diff --git a/rts/posix/OSThreads.c b/rts/posix/OSThreads.c
index ad138d314d..72538c1bf3 100644
--- a/rts/posix/OSThreads.c
+++ b/rts/posix/OSThreads.c
@@ -70,6 +70,10 @@
# include <signal.h>
#endif
+#ifdef HAVE_NUMA_H
+#include <numa.h>
+#endif
+
/*
* This (allegedly) OS threads independent layer was initially
* abstracted away from code that used Pthreads, so the functions
@@ -308,10 +312,32 @@ setThreadAffinity(uint32_t n, uint32_t m)
#else
void
-setThreadAffinity (uint32_t n GNUC3_ATTRIBUTE(__unused__),
- uint32_t m GNUC3_ATTRIBUTE(__unused__))
+setThreadAffinity (uint32_t n STG_UNUSED,
+ uint32_t m STG_UNUSED)
+{ /* no-op variant in the #else branch: both arguments are ignored */
+}
+#endif
+
+#ifdef HAVE_NUMA_H
+/*
+ * Ask libnuma (numa_run_on_node) to run the current thread on NUMA node
+ * `node`.  If the call fails, report the system error and exit with
+ * status 1.
+ */
+void setThreadNode (uint32_t node)
{
+    int rc;
+
+    ASSERT(node < RtsFlags.GcFlags.nNumaNodes);
+    rc = numa_run_on_node(node);
+    if (rc != -1) {
+        return;
+    }
+    sysErrorBelch("numa_run_on_node");
+    stg_exit(1);
+}
+
+/*
+ * Call numa_run_on_node(-1); per numa(3), -1 lifts the node restriction so
+ * the thread may run on any node again.  If the call fails, report the
+ * system error and exit with status 1.
+ */
+void releaseThreadNode (void)
+{
+    const int any_node = -1;
+
+    if (numa_run_on_node(any_node) != -1) {
+        return;
+    }
+    sysErrorBelch("numa_run_on_node");
+    stg_exit(1);
}
+#else
+/* Built without <numa.h>: accept the node number and ignore it. */
+void setThreadNode (uint32_t node STG_UNUSED)
+{
+    /* nothing */
+}
+/* Built without <numa.h>: there is no node binding to undo. */
+void releaseThreadNode (void)
+{
+    /* nothing */
+}
#endif
void