diff options
author | Simon Marlow <marlowsd@gmail.com> | 2016-04-23 21:14:49 +0100 |
---|---|---|
committer | Simon Marlow <marlowsd@gmail.com> | 2016-06-10 21:25:54 +0100 |
commit | 9e5ea67e268be2659cd30ebaed7044d298198ab0 (patch) | |
tree | c395e74ee772ae0d59c852b3cbde743784b08d09 /rts/posix | |
parent | b9fa72a24ba2cc3120912e6afedc9280d28d2077 (diff) | |
download | haskell-9e5ea67e268be2659cd30ebaed7044d298198ab0.tar.gz |
NUMA support
Summary:
The aim here is to reduce the number of remote memory accesses on
systems with a NUMA memory architecture, typically multi-socket servers.
Linux provides a NUMA API for doing two things:
* Allocating memory local to a particular node
* Binding a thread to a particular node
When given the +RTS --numa flag, the runtime will:
* Determine the number of NUMA nodes (N) by querying the OS
* Assign capabilities to nodes, so cap C is on node C%N
* Bind worker threads on a capability to the correct node
* Keep separate free lists in the block layer for each node
* Allocate the nursery for a capability from node-local memory
* Allocate blocks in the GC from node-local memory
For example, using nofib/parallel/queens on a 24-core 2-socket machine:
```
$ ./Main 15 +RTS -N24 -s -A64m
Total time 173.960s ( 7.467s elapsed)
$ ./Main 15 +RTS -N24 -s -A64m --numa
Total time 150.836s ( 6.423s elapsed)
```
The biggest win here is expected to be allocating from node-local
memory, so programs using a large -A value (as here) should benefit most.
According to perf, on this program the number of remote memory accesses
was reduced by more than 50% by using `--numa`.
Test Plan:
* validate
* There's a new flag --debug-numa=<n> that pretends to do NUMA without
actually making the OS calls, which is useful for testing the code
on non-NUMA systems.
* TODO: I need to add some unit tests
Reviewers: erikd, austin, rwbarton, ezyang, bgamari, hvr, niteria
Subscribers: thomie
Differential Revision: https://phabricator.haskell.org/D2199
Diffstat (limited to 'rts/posix')
-rw-r--r-- | rts/posix/OSMem.c | 64 | ||||
-rw-r--r-- | rts/posix/OSThreads.c | 30 |
2 files changed, 92 insertions, 2 deletions
diff --git a/rts/posix/OSMem.c b/rts/posix/OSMem.c index 5ff4bc86e4..a534219902 100644 --- a/rts/posix/OSMem.c +++ b/rts/posix/OSMem.c @@ -30,6 +30,12 @@ #ifdef HAVE_FCNTL_H #include <fcntl.h> #endif +#ifdef HAVE_NUMA_H +#include <numa.h> +#endif +#ifdef HAVE_NUMAIF_H +#include <numaif.h> +#endif #include <errno.h> @@ -287,6 +293,7 @@ osGetMBlocks(uint32_t n) ret = gen_map_mblocks(size); } } + // Next time, we'll try to allocate right after the block we just got. // ToDo: check that we haven't already grabbed the memory at next_request next_request = (char *)ret + size; @@ -294,6 +301,31 @@ osGetMBlocks(uint32_t n) return ret; } +void osBindMBlocksToNode( + void *addr STG_UNUSED, + StgWord size STG_UNUSED, + uint32_t node STG_UNUSED) +{ +#ifdef HAVE_NUMAIF_H + int ret; + StgWord mask = 0; + mask |= 1 << node; + if (RtsFlags.GcFlags.numa) { + ret = mbind(addr, (unsigned long)size, + MPOL_BIND, &mask, sizeof(StgWord)*8, MPOL_MF_STRICT); + // paranoia: MPOL_BIND guarantees memory on the correct node; + // MPOL_MF_STRICT will tell us if it didn't work. We might want to + // relax these in due course, but I want to be sure it's doing what we + // want first. 
+ if (ret != 0) { + sysErrorBelch("mbind"); + stg_exit(EXIT_FAILURE); + } + } +#endif +} + + void osFreeMBlocks(void *addr, uint32_t n) { munmap(addr, n * MBLOCK_SIZE); @@ -512,4 +544,36 @@ void osReleaseHeapMemory(void) sysErrorBelch("unable to release address space"); } +rtsBool osNumaAvailable(void) +{ +#ifdef HAVE_NUMA_H + return (numa_available() != -1); +#else + return rtsFalse; +#endif +} + +uint32_t osNumaNodes(void) +{ +#ifdef HAVE_NUMA_H + return numa_num_configured_nodes(); +#else + return 1; +#endif +} + +StgWord osNumaMask(void) +{ +#ifdef HAVE_NUMA_H + struct bitmask *mask; + mask = numa_get_mems_allowed(); + if (mask->size > sizeof(StgWord)*8) { + barf("Too many NUMA nodes"); + } + return mask->maskp[0]; +#else + return 1; +#endif +} + #endif diff --git a/rts/posix/OSThreads.c b/rts/posix/OSThreads.c index ad138d314d..72538c1bf3 100644 --- a/rts/posix/OSThreads.c +++ b/rts/posix/OSThreads.c @@ -70,6 +70,10 @@ # include <signal.h> #endif +#ifdef HAVE_NUMA_H +#include <numa.h> +#endif + /* * This (allegedly) OS threads independent layer was initially * abstracted away from code that used Pthreads, so the functions @@ -308,10 +312,32 @@ setThreadAffinity(uint32_t n, uint32_t m) #else void -setThreadAffinity (uint32_t n GNUC3_ATTRIBUTE(__unused__), - uint32_t m GNUC3_ATTRIBUTE(__unused__)) +setThreadAffinity (uint32_t n STG_UNUSED, + uint32_t m STG_UNUSED) +{ +} +#endif + +#ifdef HAVE_NUMA_H +void setThreadNode (uint32_t node) { + ASSERT(node < RtsFlags.GcFlags.nNumaNodes); + if (numa_run_on_node(node) == -1) { + sysErrorBelch("numa_run_on_node"); + stg_exit(1); + } +} + +void releaseThreadNode (void) +{ + if (numa_run_on_node(-1) == -1) { + sysErrorBelch("numa_run_on_node"); + stg_exit(1); + } } +#else +void setThreadNode (uint32_t node STG_UNUSED) { /* nothing */ } +void releaseThreadNode (void) { /* nothing */ } #endif void |