From 9e5ea67e268be2659cd30ebaed7044d298198ab0 Mon Sep 17 00:00:00 2001 From: Simon Marlow Date: Sat, 23 Apr 2016 21:14:49 +0100 Subject: NUMA support Summary: The aim here is to reduce the number of remote memory accesses on systems with a NUMA memory architecture, typically multi-socket servers. Linux provides a NUMA API for doing two things: * Allocating memory local to a particular node * Binding a thread to a particular node When given the +RTS --numa flag, the runtime will * Determine the number of NUMA nodes (N) by querying the OS * Assign capabilities to nodes, so cap C is on node C%N * Bind worker threads on a capability to the correct node * Keep separate free lists in the block layer for each node * Allocate the nursery for a capability from node-local memory * Allocate blocks in the GC from node-local memory For example, using nofib/parallel/queens on a 24-core 2-socket machine: ``` $ ./Main 15 +RTS -N24 -s -A64m Total time 173.960s ( 7.467s elapsed) $ ./Main 15 +RTS -N24 -s -A64m --numa Total time 150.836s ( 6.423s elapsed) ``` The biggest win here is expected to be allocating from node-local memory, so the programs that benefit most are those using a large -A value (as here). According to perf, on this program the number of remote memory accesses was reduced by more than 50% by using `--numa`. Test Plan: * validate * There's a new flag --debug-numa=<n> that pretends to do NUMA without actually making the OS calls, which is useful for testing the code on non-NUMA systems. 
* TODO: I need to add some unit tests Reviewers: erikd, austin, rwbarton, ezyang, bgamari, hvr, niteria Subscribers: thomie Differential Revision: https://phabricator.haskell.org/D2199 --- rts/posix/OSThreads.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) (limited to 'rts/posix/OSThreads.c') diff --git a/rts/posix/OSThreads.c b/rts/posix/OSThreads.c index ad138d314d..72538c1bf3 100644 --- a/rts/posix/OSThreads.c +++ b/rts/posix/OSThreads.c @@ -70,6 +70,10 @@ # include #endif +#ifdef HAVE_NUMA_H +#include <numa.h> +#endif + /* * This (allegedly) OS threads independent layer was initially * abstracted away from code that used Pthreads, so the functions @@ -308,10 +312,32 @@ setThreadAffinity(uint32_t n, uint32_t m) #else void -setThreadAffinity (uint32_t n GNUC3_ATTRIBUTE(__unused__), - uint32_t m GNUC3_ATTRIBUTE(__unused__)) +setThreadAffinity (uint32_t n STG_UNUSED, + uint32_t m STG_UNUSED) +{ +} +#endif + +#ifdef HAVE_NUMA_H +void setThreadNode (uint32_t node) { + ASSERT(node < RtsFlags.GcFlags.nNumaNodes); + if (numa_run_on_node(node) == -1) { + sysErrorBelch("numa_run_on_node"); + stg_exit(1); + } +} + +void releaseThreadNode (void) +{ + if (numa_run_on_node(-1) == -1) { + sysErrorBelch("numa_run_on_node"); + stg_exit(1); + } } +#else +void setThreadNode (uint32_t node STG_UNUSED) { /* nothing */ } +void releaseThreadNode (void) { /* nothing */ } #endif void -- cgit v1.2.1