From 9e5ea67e268be2659cd30ebaed7044d298198ab0 Mon Sep 17 00:00:00 2001 From: Simon Marlow Date: Sat, 23 Apr 2016 21:14:49 +0100 Subject: NUMA support Summary: The aim here is to reduce the number of remote memory accesses on systems with a NUMA memory architecture, typically multi-socket servers. Linux provides a NUMA API for doing two things: * Allocating memory local to a particular node * Binding a thread to a particular node When given the +RTS --numa flag, the runtime will * Determine the number of NUMA nodes (N) by querying the OS * Assign capabilities to nodes, so cap C is on node C%N * Bind worker threads on a capability to the correct node * Keep a separate free lists in the block layer for each node * Allocate the nursery for a capability from node-local memory * Allocate blocks in the GC from node-local memory For example, using nofib/parallel/queens on a 24-core 2-socket machine: ``` $ ./Main 15 +RTS -N24 -s -A64m Total time 173.960s ( 7.467s elapsed) $ ./Main 15 +RTS -N24 -s -A64m --numa Total time 150.836s ( 6.423s elapsed) ``` The biggest win here is expected to be allocating from node-local memory, so that means programs using a large -A value (as here). According to perf, on this program the number of remote memory accesses were reduced by more than 50% by using `--numa`. Test Plan: * validate * There's a new flag --debug-numa= that pretends to do NUMA without actually making the OS calls, which is useful for testing the code on non-NUMA systems. * TODO: I need to add some unit tests Reviewers: erikd, austin, rwbarton, ezyang, bgamari, hvr, niteria Subscribers: thomie Differential Revision: https://phabricator.haskell.org/D2199 --- configure.ac | 7 + docs/users_guide/runtime_control.rst | 50 ++++ includes/Cmm.h | 1 - includes/Rts.h | 1 - includes/RtsAPI.h | 6 +- includes/rts/Constants.h | 6 + includes/rts/Flags.h | 8 +- includes/rts/OSThreads.h | 4 +- includes/rts/Threads.h | 4 +- includes/rts/storage/Block.h | 20 +- includes/rts/storage/MBlock.h | 2 + includes/rts/storage/SMPClosureOps.h | 125 ---------- rts/Capability.c | 38 +-- rts/Capability.h | 14 +- rts/HeapStackCheck.cmm | 1 + rts/Inlines.c | 1 + rts/Messages.h | 1 + rts/PrimOps.cmm | 1 + rts/ProfHeap.c | 1 + rts/RtsFlags.c | 118 ++++++++-- rts/SMPClosureOps.h | 129 ++++++++++ rts/STM.c | 1 + rts/Schedule.c | 5 +- rts/Task.c | 26 ++- rts/Task.h | 6 + rts/eventlog/EventLog.c | 2 +- rts/package.conf.in | 4 +- rts/posix/OSMem.c | 64 +++++ rts/posix/OSThreads.c | 30 ++- rts/sm/BlockAlloc.c | 350 +++++++++++++++++----------- rts/sm/BlockAlloc.h | 1 + rts/sm/GC.c | 8 +- rts/sm/GCUtils.c | 15 +- rts/sm/GCUtils.h | 14 +- rts/sm/MBlock.c | 19 +- rts/sm/MarkStack.h | 1 + rts/sm/OSMem.h | 4 + rts/sm/Storage.c | 232 ++++++++++-------- rts/win32/OSMem.c | 22 ++ rts/win32/OSThreads.c | 3 + testsuite/config/ghc | 6 +- testsuite/tests/codeGen/should_run/all.T | 5 +- testsuite/tests/concurrent/prog001/all.T | 2 +- testsuite/tests/concurrent/should_run/all.T | 9 +- 44 files changed, 937 insertions(+), 430 deletions(-) delete mode 100644 includes/rts/storage/SMPClosureOps.h create mode 100644 rts/SMPClosureOps.h diff --git a/configure.ac b/configure.ac index d7eb7380cd..070bae54d9 100644 --- a/configure.ac +++ b/configure.ac @@ -1103,6 +1103,13 @@ if test $UseLibdw = "YES" ; then fi AC_DEFINE_UNQUOTED([USE_LIBDW], [$USE_LIBDW], [Set to 1 to use libdw]) +dnl ** Have libnuma? 
+dnl -------------------------------------------------------------- +AC_CHECK_HEADERS([numa.h numaif.h]) +AC_CHECK_LIB(numa, numa_available, + [AC_DEFINE([HAVE_LIBNUMA], [1], [Define to 1 if you have libnuma.])] + []) + dnl ** Documentation dnl -------------------------------------------------------------- if test -n "$SPHINXBUILD"; then diff --git a/docs/users_guide/runtime_control.rst b/docs/users_guide/runtime_control.rst index 19135c61ce..1ae51ddc49 100644 --- a/docs/users_guide/runtime_control.rst +++ b/docs/users_guide/runtime_control.rst @@ -643,6 +643,56 @@ performance. ``-F`` parameter will be reduced in order to avoid exceeding the maximum heap size. +.. rts-flag:: --numa + --numa= + + .. index:: + single: NUMA, enabling in the runtime + + Enable NUMA-aware memory allocation in the runtime (only available + with ``-threaded``, and only on Linux currently). + + Background: some systems have a Non-Uniform Memory Architecture, + whereby main memory is split into banks which are "local" to + specific CPU cores. Accessing local memory is faster than + accessing remote memory. The OS provides APIs for allocating + local memory and binding threads to particular CPU cores, so that + we can ensure certain memory accesses are using local memory. + + The ``--numa`` option tells the RTS to tune its memory usage to + maximize local memory accesses. In particular, the RTS will: + + - Determine the number of NUMA nodes (N) by querying the OS. + - Manage separate memory pools for each node. + - Map capabilities to NUMA nodes. Capability C is mapped to + NUMA node C mod N. + - Bind worker threads on a capability to the appropriate node. + - Allocate the nursery from node-local memory. + - Perform other memory allocation, including in the GC, from + node-local memory. + - When load-balancing, we prefer to migrate threads to another + Capability on the same node. + + The ``--numa`` flag is typically beneficial when a program is + using all cores of a large multi-core NUMA system, with a large + allocation area (``-A``). All memory accesses to the allocation + area will go to local memory, which can save a significant amount + of remote memory access. A runtime speedup on the order of 10% + is typical, but can vary a lot depending on the hardware and the + memory behaviour of the program. + + Note that the RTS will not set CPU affinity for bound threads and + threads entering Haskell from C/C++, so if your program uses bound + threads you should ensure that each bound thread calls the RTS API + `rts_setInCallCapability(c,1)` from C/C++ before calling into + Haskell. Otherwise there could be a mismatch between the CPU that + the thread is running on and the memory it is using while running + Haskell code, which will negate any benefits of ``--numa``. + + If given an explicit , the is interpreted as a bitmap + that indicates the NUMA nodes on which to run the program. For + example, ``--numa=3`` would run the program on NUMA nodes 0 and 1. + .. 
_rts-options-statistics: RTS options to produce runtime statistics diff --git a/includes/Cmm.h b/includes/Cmm.h index cbd7e36ac1..3b9a5a6794 100644 --- a/includes/Cmm.h +++ b/includes/Cmm.h @@ -325,7 +325,6 @@ #include "DerivedConstants.h" #include "rts/storage/ClosureTypes.h" #include "rts/storage/FunTypes.h" -#include "rts/storage/SMPClosureOps.h" #include "rts/OSThreads.h" /* diff --git a/includes/Rts.h b/includes/Rts.h index 1ad1bba5f8..3d4538f41d 100644 --- a/includes/Rts.h +++ b/includes/Rts.h @@ -203,7 +203,6 @@ INLINE_HEADER Time fsecondsToTime (double t) #include "rts/storage/ClosureTypes.h" #include "rts/storage/TSO.h" #include "stg/MiscClosures.h" /* InfoTables, closures etc. defined in the RTS */ -#include "rts/storage/SMPClosureOps.h" #include "rts/storage/Block.h" #include "rts/storage/ClosureMacros.h" #include "rts/storage/MBlock.h" diff --git a/includes/RtsAPI.h b/includes/RtsAPI.h index a4a094fd14..230c982c12 100644 --- a/includes/RtsAPI.h +++ b/includes/RtsAPI.h @@ -179,7 +179,11 @@ Capability *rts_unsafeGetMyCapability (void); // Note that the thread may still be migrated by the RTS scheduler, but that // will only happen if there are multiple threads running on one Capability and // another Capability is free. -void setInCallCapability (int preferred_capability); +// +// If affinity is non-zero, the current thread will be bound to +// specific CPUs according to the prevailing affinity policy for the +// specified capability, set by either +RTS -qa or +RTS --numa. +void rts_setInCallCapability (int preferred_capability, int affinity); /* ---------------------------------------------------------------------------- Building Haskell objects from C datatypes. diff --git a/includes/rts/Constants.h b/includes/rts/Constants.h index b65b8d3a9d..114f30ce2f 100644 --- a/includes/rts/Constants.h +++ b/includes/rts/Constants.h @@ -295,4 +295,10 @@ #define MAX_SPARE_WORKERS 6 +/* + * The maximum number of NUMA nodes we support. This is a fixed limit so that + * we can have static arrays of this size in the RTS for speed. + */ +#define MAX_NUMA_NODES 16 + #endif /* RTS_CONSTANTS_H */ diff --git a/includes/rts/Flags.h b/includes/rts/Flags.h index 8020a177b0..ff303dc5e6 100644 --- a/includes/rts/Flags.h +++ b/includes/rts/Flags.h @@ -73,6 +73,11 @@ typedef struct _GC_FLAGS { * to handle the exception before we * raise it again. 
*/ + + rtsBool numa; /* Use NUMA */ + uint32_t nNumaNodes; /* Number of nodes */ + uint32_t numaMap[MAX_NUMA_NODES]; /* Map our internal node numbers to OS + * node numbers */ } GC_FLAGS; /* See Note [Synchronization of flags and base APIs] */ @@ -93,6 +98,7 @@ typedef struct _DEBUG_FLAGS { rtsBool squeeze; /* 'z' stack squeezing & lazy blackholing */ rtsBool hpc; /* 'c' coverage */ rtsBool sparks; /* 'r' */ + rtsBool numa; /* '--debug-numa' */ } DEBUG_FLAGS; /* See Note [Synchronization of flags and base APIs] */ @@ -184,7 +190,7 @@ typedef struct _MISC_FLAGS { #ifdef THREADED_RTS /* See Note [Synchronization of flags and base APIs] */ typedef struct _PAR_FLAGS { - uint32_t nNodes; /* number of threads to run simultaneously */ + uint32_t nCapabilities; /* number of threads to run simultaneously */ rtsBool migrate; /* migrate threads between capabilities */ uint32_t maxLocalSparks; rtsBool parGcEnabled; /* enable parallel GC */ diff --git a/includes/rts/OSThreads.h b/includes/rts/OSThreads.h index ee1855b4da..bc84b714db 100644 --- a/includes/rts/OSThreads.h +++ b/includes/rts/OSThreads.h @@ -200,7 +200,9 @@ void setThreadLocalVar (ThreadLocalKey *key, void *value); void freeThreadLocalKey (ThreadLocalKey *key); // Processors and affinity -void setThreadAffinity (uint32_t n, uint32_t m); +void setThreadAffinity (uint32_t n, uint32_t m); +void setThreadNode (uint32_t node); +void releaseThreadNode (void); #endif // !CMINUSMINUS #else diff --git a/includes/rts/Threads.h b/includes/rts/Threads.h index 67d01db7d4..866c4692bd 100644 --- a/includes/rts/Threads.h +++ b/includes/rts/Threads.h @@ -58,7 +58,9 @@ pid_t forkProcess (HsStablePtr *entry) HsBool rtsSupportsBoundThreads (void); -// The number of Capabilities +// The number of Capabilities. +// ToDo: I would like this to be private to the RTS and instead expose a +// function getNumCapabilities(), but it is used in compiler/cbits/genSym.c extern unsigned int n_capabilities; // The number of Capabilities that are not disabled diff --git a/includes/rts/storage/Block.h b/includes/rts/storage/Block.h index 1a31de5512..e04cfdfec6 100644 --- a/includes/rts/storage/Block.h +++ b/includes/rts/storage/Block.h @@ -111,7 +111,7 @@ typedef struct bdescr_ { StgWord16 gen_no; // gen->no, cached StgWord16 dest_no; // number of destination generation - StgWord16 _pad1; + StgWord16 node; // which memory node does this block live on? 
StgWord16 flags; // block flags, see below @@ -280,12 +280,28 @@ extern void initBlockAllocator(void); /* Allocation -------------------------------------------------------------- */ bdescr *allocGroup(W_ n); -bdescr *allocBlock(void); + +EXTERN_INLINE bdescr* allocBlock(void); +EXTERN_INLINE bdescr* allocBlock(void) +{ + return allocGroup(1); +} + +bdescr *allocGroupOnNode(uint32_t node, W_ n); + +EXTERN_INLINE bdescr* allocBlockOnNode(uint32_t node); +EXTERN_INLINE bdescr* allocBlockOnNode(uint32_t node) +{ + return allocGroupOnNode(node,1); +} // versions that take the storage manager lock for you: bdescr *allocGroup_lock(W_ n); bdescr *allocBlock_lock(void); +bdescr *allocGroupOnNode_lock(uint32_t node, W_ n); +bdescr *allocBlockOnNode_lock(uint32_t node); + /* De-Allocation ----------------------------------------------------------- */ void freeGroup(bdescr *p); diff --git a/includes/rts/storage/MBlock.h b/includes/rts/storage/MBlock.h index 419a96e225..a8251c8c0b 100644 --- a/includes/rts/storage/MBlock.h +++ b/includes/rts/storage/MBlock.h @@ -18,6 +18,8 @@ extern W_ mblocks_allocated; extern void initMBlocks(void); extern void * getMBlock(void); extern void * getMBlocks(uint32_t n); +extern void * getMBlockOnNode(uint32_t node); +extern void * getMBlocksOnNode(uint32_t node, uint32_t n); extern void freeMBlocks(void *addr, uint32_t n); extern void releaseFreeMemory(void); extern void freeAllMBlocks(void); diff --git a/includes/rts/storage/SMPClosureOps.h b/includes/rts/storage/SMPClosureOps.h deleted file mode 100644 index ee92186f5f..0000000000 --- a/includes/rts/storage/SMPClosureOps.h +++ /dev/null @@ -1,125 +0,0 @@ -/* ---------------------------------------------------------------------------- - * - * (c) The GHC Team, 2005-2013 - * - * Macros for THREADED_RTS support - * - * -------------------------------------------------------------------------- */ - -#ifndef RTS_STORAGE_SMPCLOSUREOPS_H -#define RTS_STORAGE_SMPCLOSUREOPS_H - -#ifdef CMINUSMINUS - -/* Lock closure, equivalent to ccall lockClosure but the condition is inlined. - * Arguments are swapped for uniformity with unlockClosure. */ -#if defined(THREADED_RTS) -#define LOCK_CLOSURE(closure, info) \ - if (CInt[n_capabilities] == 1 :: CInt) { \ - info = GET_INFO(closure); \ - } else { \ - ("ptr" info) = ccall reallyLockClosure(closure "ptr"); \ - } -#else -#define LOCK_CLOSURE(closure, info) info = GET_INFO(closure) -#endif - -#define unlockClosure(ptr,info) \ - prim_write_barrier; \ - StgHeader_info(ptr) = info; - -#else - -INLINE_HEADER StgInfoTable *lockClosure(StgClosure *p); -EXTERN_INLINE StgInfoTable *reallyLockClosure(StgClosure *p); -EXTERN_INLINE StgInfoTable *tryLockClosure(StgClosure *p); -EXTERN_INLINE void unlockClosure(StgClosure *p, const StgInfoTable *info); - -#if defined(THREADED_RTS) - -/* ----------------------------------------------------------------------------- - * Locking/unlocking closures - * - * This is used primarily in the implementation of MVars. - * -------------------------------------------------------------------------- */ - -// We want a callable copy of reallyLockClosure() so that we can refer to it -// from .cmm files compiled using the native codegen, so these are given -// EXTERN_INLINE. C-- should use LOCK_CLOSURE not lockClosure, so we've -// kept it INLINE_HEADER. 
-EXTERN_INLINE StgInfoTable *reallyLockClosure(StgClosure *p) -{ - StgWord info; - do { - uint32_t i = 0; - do { - info = xchg((P_)(void *)&p->header.info, (W_)&stg_WHITEHOLE_info); - if (info != (W_)&stg_WHITEHOLE_info) return (StgInfoTable *)info; - } while (++i < SPIN_COUNT); - yieldThread(); - } while (1); -} - -INLINE_HEADER StgInfoTable *lockClosure(StgClosure *p) -{ - if (n_capabilities == 1) { - return (StgInfoTable *)p->header.info; - } - else { - return reallyLockClosure(p); - } -} - -// ToDo: consider splitting tryLockClosure into reallyTryLockClosure, -// same as lockClosure -EXTERN_INLINE StgInfoTable *tryLockClosure(StgClosure *p) -{ - StgWord info; - if (n_capabilities == 1) { - return (StgInfoTable *)p->header.info; - } - else { - info = xchg((P_)(void *)&p->header.info, (W_)&stg_WHITEHOLE_info); - if (info != (W_)&stg_WHITEHOLE_info) { - return (StgInfoTable *)info; - } else { - return NULL; - } - } -} - -#else /* !THREADED_RTS */ - -EXTERN_INLINE StgInfoTable * -reallyLockClosure(StgClosure *p) -{ return (StgInfoTable *)p->header.info; } - -INLINE_HEADER StgInfoTable * -lockClosure(StgClosure *p) -{ return (StgInfoTable *)p->header.info; } - -EXTERN_INLINE StgInfoTable * -tryLockClosure(StgClosure *p) -{ return (StgInfoTable *)p->header.info; } - -#endif /* THREADED_RTS */ - -EXTERN_INLINE void unlockClosure(StgClosure *p, const StgInfoTable *info) -{ - // This is a strictly ordered write, so we need a write_barrier(): - write_barrier(); - p->header.info = info; -} - -// Handy specialised versions of lockClosure()/unlockClosure() -INLINE_HEADER void lockTSO(StgTSO *tso); -INLINE_HEADER void lockTSO(StgTSO *tso) -{ lockClosure((StgClosure *)tso); } - -INLINE_HEADER void unlockTSO(StgTSO *tso); -INLINE_HEADER void unlockTSO(StgTSO *tso) -{ unlockClosure((StgClosure*)tso, (const StgInfoTable *)&stg_TSO_info); } - -#endif /* CMINUSMINUS */ - -#endif /* RTS_STORAGE_SMPCLOSUREOPS_H */ diff --git a/rts/Capability.c b/rts/Capability.c index 1b5f51a92b..411e64dc7a 100644 --- a/rts/Capability.c +++ b/rts/Capability.c @@ -51,7 +51,7 @@ Capability **capabilities = NULL; // an in-call has a chance of quickly finding a free Capability. // Maintaining a global free list of Capabilities would require global // locking, so we don't do that. -static Capability *last_free_capability = NULL; +static Capability *last_free_capability[MAX_NUMA_NODES]; /* * Indicates that the RTS wants to synchronise all the Capabilities @@ -230,11 +230,12 @@ popReturningTask (Capability *cap) * ------------------------------------------------------------------------- */ static void -initCapability( Capability *cap, uint32_t i ) +initCapability (Capability *cap, uint32_t i) { uint32_t g; cap->no = i; + cap->node = capNoToNumaNode(i); cap->in_haskell = rtsFalse; cap->idle = 0; cap->disabled = rtsFalse; @@ -316,9 +317,10 @@ initCapability( Capability *cap, uint32_t i ) * controlled by the user via the RTS flag -N. * * ------------------------------------------------------------------------- */ -void -initCapabilities( void ) +void initCapabilities (void) { + uint32_t i; + /* Declare a couple capability sets representing the process and clock domain. Each capability will get added to these capsets. 
*/ traceCapsetCreate(CAPSET_OSPROCESS_DEFAULT, CapsetTypeOsProcess); @@ -328,21 +330,22 @@ initCapabilities( void ) #ifndef REG_Base // We can't support multiple CPUs if BaseReg is not a register - if (RtsFlags.ParFlags.nNodes > 1) { + if (RtsFlags.ParFlags.nCapabilities > 1) { errorBelch("warning: multiple CPUs not supported in this build, reverting to 1"); - RtsFlags.ParFlags.nNodes = 1; + RtsFlags.ParFlags.nCapabilities = 1; } #endif n_capabilities = 0; - moreCapabilities(0, RtsFlags.ParFlags.nNodes); - n_capabilities = RtsFlags.ParFlags.nNodes; + moreCapabilities(0, RtsFlags.ParFlags.nCapabilities); + n_capabilities = RtsFlags.ParFlags.nCapabilities; #else /* !THREADED_RTS */ n_capabilities = 1; capabilities = stgMallocBytes(sizeof(Capability*), "initCapabilities"); capabilities[0] = &MainCapability; + initCapability(&MainCapability, 0); #endif @@ -352,7 +355,9 @@ initCapabilities( void ) // There are no free capabilities to begin with. We will start // a worker Task to each Capability, which will quickly put the // Capability on the free list when it finds nothing to do. - last_free_capability = capabilities[0]; + for (i = 0; i < RtsFlags.GcFlags.nNumaNodes; i++) { + last_free_capability[i] = capabilities[0]; + } } void @@ -532,7 +537,7 @@ releaseCapability_ (Capability* cap, #ifdef PROFILING cap->r.rCCCS = CCS_IDLE; #endif - last_free_capability = cap; + last_free_capability[cap->node] = cap; debugTrace(DEBUG_sched, "freeing capability %d", cap->no); } @@ -711,6 +716,7 @@ void waitForCapability (Capability **pCap, Task *task) *pCap = &MainCapability; #else + uint32_t i; Capability *cap = *pCap; if (cap == NULL) { @@ -719,12 +725,14 @@ void waitForCapability (Capability **pCap, Task *task) enabled_capabilities]; } else { // Try last_free_capability first - cap = last_free_capability; + cap = last_free_capability[task->node]; if (cap->running_task) { - uint32_t i; - // otherwise, search for a free capability + // Otherwise, search for a free capability on this node. cap = NULL; - for (i = 0; i < n_capabilities; i++) { + for (i = task->node; i < enabled_capabilities; + i += RtsFlags.GcFlags.nNumaNodes) { + // visits all the capabilities on this node, because + // cap[i]->node == i % RtsFlags.GcFlags.nNumaNodes if (!capabilities[i]->running_task) { cap = capabilities[i]; break; @@ -732,7 +740,7 @@ void waitForCapability (Capability **pCap, Task *task) } if (cap == NULL) { // Can't find a free one, use last_free_capability. - cap = last_free_capability; + cap = last_free_capability[task->node]; } } } diff --git a/rts/Capability.h b/rts/Capability.h index 22c1d2a5c7..6874379c5f 100644 --- a/rts/Capability.h +++ b/rts/Capability.h @@ -36,6 +36,15 @@ struct Capability_ { uint32_t no; // capability number. + // The NUMA node on which this capability resides. This is used to allocate + // node-local memory in allocate(). + // + // Note: this is always equal to cap->no % RtsFlags.ParFlags.nNumaNodes. + // The reason we slice it this way is that if we add or remove capabilities + // via setNumCapabilities(), then we keep the number of capabilities on each + // NUMA node balanced. + uint32_t node; + // The Task currently holding this Capability. This task has // exclusive access to the contents of this Capability (apart from // returning_tasks_hd/returning_tasks_tl). 
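A minimal standalone sketch of the capability-to-node mapping described in the
comment above (the helper name `cap_to_node` and the concrete capability/node
counts are made up for illustration; the RTS itself uses the `capNoToNumaNode`
macro added in the hunk below):

```c
#include <stdint.h>
#include <stdio.h>

/* Same round-robin formula as capNoToNumaNode(): capability C lives on
 * NUMA node C mod nNumaNodes. */
static uint32_t cap_to_node(uint32_t cap_no, uint32_t n_numa_nodes)
{
    return cap_no % n_numa_nodes;
}

int main(void)
{
    /* With 2 nodes: caps 0,2,4,6 -> node 0 and caps 1,3,5,7 -> node 1, so
     * shrinking from 8 to 6 capabilities still leaves 3 per node, whereas a
     * contiguous split (first half on node 0) would leave 4 and 2. */
    for (uint32_t cap = 0; cap < 8; cap++) {
        printf("cap %u -> node %u\n", (unsigned)cap,
               (unsigned)cap_to_node(cap, 2));
    }
    return 0;
}
```

Striping by modulus means that dropping capabilities from the end (via
setNumCapabilities()) removes them evenly across nodes rather than emptying
one node first.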
@@ -151,6 +160,8 @@ struct Capability_ { ; +#define capNoToNumaNode(n) ((n) % RtsFlags.GcFlags.nNumaNodes) + #if defined(THREADED_RTS) #define ASSERT_TASK_ID(task) ASSERT(task->id == osThreadId()) #else @@ -221,7 +232,6 @@ INLINE_HEADER void releaseCapability_ (Capability* cap STG_UNUSED, // extern uint32_t enabled_capabilities; // Array of all the capabilities -// extern Capability **capabilities; // @@ -364,7 +374,7 @@ recordMutableCap (const StgClosure *p, Capability *cap, uint32_t gen) bd = cap->mut_lists[gen]; if (bd->free >= bd->start + BLOCK_SIZE_W) { bdescr *new_bd; - new_bd = allocBlock_lock(); + new_bd = allocBlockOnNode_lock(cap->node); new_bd->link = bd; bd = new_bd; cap->mut_lists[gen] = bd; diff --git a/rts/HeapStackCheck.cmm b/rts/HeapStackCheck.cmm index 825eaef27f..69bff74f41 100644 --- a/rts/HeapStackCheck.cmm +++ b/rts/HeapStackCheck.cmm @@ -12,6 +12,7 @@ #include "Cmm.h" #include "Updates.h" +#include "SMPClosureOps.h" #ifdef __PIC__ import pthread_mutex_unlock; diff --git a/rts/Inlines.c b/rts/Inlines.c index e6f29b6e1b..a3eb956d0c 100644 --- a/rts/Inlines.c +++ b/rts/Inlines.c @@ -7,3 +7,4 @@ #include "Schedule.h" #include "Capability.h" #include "WSDeque.h" +#include "SMPClosureOps.h" diff --git a/rts/Messages.h b/rts/Messages.h index 302cb946b3..1459b581d4 100644 --- a/rts/Messages.h +++ b/rts/Messages.h @@ -18,6 +18,7 @@ void sendMessage (Capability *from_cap, Capability *to_cap, Message *msg); #include "Capability.h" #include "Updates.h" // for DEBUG_FILL_SLOP +#include "SMPClosureOps.h" INLINE_HEADER void doneWithMsgThrowTo (MessageThrowTo *m) diff --git a/rts/PrimOps.cmm b/rts/PrimOps.cmm index 160bccd2f9..b82eebe07f 100644 --- a/rts/PrimOps.cmm +++ b/rts/PrimOps.cmm @@ -23,6 +23,7 @@ #include "Cmm.h" #include "MachDeps.h" +#include "SMPClosureOps.h" #ifdef __PIC__ import pthread_mutex_lock; diff --git a/rts/ProfHeap.c b/rts/ProfHeap.c index 9557648fcc..18c3e41a32 100644 --- a/rts/ProfHeap.c +++ b/rts/ProfHeap.c @@ -9,6 +9,7 @@ #include "PosixSource.h" #include "Rts.h" +#include "Capability.h" #include "RtsFlags.h" #include "RtsUtils.h" #include "Profiling.h" diff --git a/rts/RtsFlags.c b/rts/RtsFlags.c index 1ec5db0bb0..25345bf57b 100644 --- a/rts/RtsFlags.c +++ b/rts/RtsFlags.c @@ -15,6 +15,7 @@ #include "RtsFlags.h" #include "sm/OSMem.h" #include "hooks/Hooks.h" +#include "Capability.h" #ifdef HAVE_CTYPE_H #include @@ -122,6 +123,7 @@ static void errorRtsOptsDisabled (const char *s); void initRtsFlagsDefaults(void) { + uint32_t i; StgWord64 maxStkSize = 8 * getPhysicalMemorySize() / 10; // if getPhysicalMemorySize fails just move along with an 8MB limit if (maxStkSize == 0) @@ -157,8 +159,12 @@ void initRtsFlagsDefaults(void) #endif RtsFlags.GcFlags.heapBase = 0; /* means don't care */ RtsFlags.GcFlags.allocLimitGrace = (100*1024) / BLOCK_SIZE; + RtsFlags.GcFlags.numa = rtsFalse; + RtsFlags.GcFlags.nNumaNodes = 1; + for (i = 0; i < MAX_NUMA_NODES; i++) { + RtsFlags.GcFlags.numaMap[i] = 0; + } -#ifdef DEBUG RtsFlags.DebugFlags.scheduler = rtsFalse; RtsFlags.DebugFlags.interpreter = rtsFalse; RtsFlags.DebugFlags.weak = rtsFalse; @@ -174,7 +180,7 @@ void initRtsFlagsDefaults(void) RtsFlags.DebugFlags.squeeze = rtsFalse; RtsFlags.DebugFlags.hpc = rtsFalse; RtsFlags.DebugFlags.sparks = rtsFalse; -#endif + RtsFlags.DebugFlags.numa = rtsFalse; #if defined(PROFILING) RtsFlags.CcFlags.doCostCentres = 0; @@ -220,7 +226,7 @@ void initRtsFlagsDefaults(void) RtsFlags.MiscFlags.linkerMemBase = 0; #ifdef THREADED_RTS - RtsFlags.ParFlags.nNodes = 1; + 
RtsFlags.ParFlags.nCapabilities = 1; RtsFlags.ParFlags.migrate = rtsTrue; RtsFlags.ParFlags.parGcEnabled = 1; RtsFlags.ParFlags.parGcGen = 0; @@ -398,6 +404,14 @@ usage_text[] = { " -qi If a processor has been idle for the last GCs, do not", " wake it up for a non-load-balancing parallel GC.", " (0 disables, default: 0)", +" --numa[=]", +" Use NUMA, nodes given by (default: off)", +#if defined(DEBUG) +" --debug-numa[=]", +" Pretend NUMA: like --numa, but without the system calls.", +" Can be used on non-NUMA systems for debugging.", +"", +#endif #endif " --install-signal-handlers=", " Install signal handlers (default: yes)", @@ -745,6 +759,76 @@ error = rtsTrue; printRtsInfo(); stg_exit(0); } +#if defined(THREADED_RTS) + else if (!strncmp("numa", &rts_argv[arg][2], 4)) { + OPTION_SAFE; + StgWord mask; + if (rts_argv[arg][6] == '=') { + mask = (StgWord)strtol(rts_argv[arg]+7, + (char **) NULL, 10); + } else { + mask = (StgWord)~0; + } + if (!osNumaAvailable()) { + errorBelch("%s: OS reports NUMA is not available", + rts_argv[arg]); + error = rtsTrue; + break; + } + + uint32_t nNodes = osNumaNodes(); + if (nNodes > MAX_NUMA_NODES) { + errorBelch("%s: Too many NUMA nodes (max %d)", + rts_argv[arg], MAX_NUMA_NODES); + error = rtsTrue; + } else { + RtsFlags.GcFlags.numa = rtsTrue; + mask = mask & osNumaMask(); + uint32_t logical = 0, physical = 0; + for (; physical < MAX_NUMA_NODES; physical++) { + if (mask & 1) { + RtsFlags.GcFlags.numaMap[logical++] = physical; + } + mask = mask >> 1; + } + RtsFlags.GcFlags.nNumaNodes = logical; + if (logical == 0) { + errorBelch("%s: available node set is empty", + rts_argv[arg]); + error = rtsTrue; + } + } + } +#endif +#if defined(DEBUG) && defined(THREADED_RTS) + else if (!strncmp("debug-numa", &rts_argv[arg][2], 10)) { + OPTION_SAFE; + size_t nNodes; + if (rts_argv[arg][12] == '=' && + isdigit(rts_argv[arg][13])) { + nNodes = (StgWord)strtol(rts_argv[arg]+13, + (char **) NULL, 10); + } else { + errorBelch("%s: missing number of nodes", + rts_argv[arg]); + error = rtsTrue; + break; + } + if (nNodes > MAX_NUMA_NODES) { + errorBelch("%s: Too many NUMA nodes (max %d)", + rts_argv[arg], MAX_NUMA_NODES); + error = rtsTrue; + } else { + RtsFlags.GcFlags.numa = rtsTrue; + RtsFlags.DebugFlags.numa = rtsTrue; + RtsFlags.GcFlags.nNumaNodes = nNodes; + uint32_t physical = 0; + for (; physical < MAX_NUMA_NODES; physical++) { + RtsFlags.GcFlags.numaMap[physical] = physical; + } + } + } +#endif else { OPTION_SAFE; errorBelch("unknown RTS option: %s",rts_argv[arg]); @@ -856,20 +940,20 @@ error = rtsTrue; if (strncmp("maxN", &rts_argv[arg][1], 4) == 0) { OPTION_SAFE; THREADED_BUILD_ONLY( - int nNodes; + int nCapabilities; int proc = (int)getNumberOfProcessors(); - nNodes = strtol(rts_argv[arg]+5, (char **) NULL, 10); - if (nNodes > proc) { nNodes = proc; } + nCapabilities = strtol(rts_argv[arg]+5, (char **) NULL, 10); + if (nCapabilities > proc) { nCapabilities = proc; } - if (nNodes <= 0) { + if (nCapabilities <= 0) { errorBelch("bad value for -maxN"); error = rtsTrue; } #if defined(PROFILING) - RtsFlags.ParFlags.nNodes = 1; + RtsFlags.ParFlags.nCapabilities = 1; #else - RtsFlags.ParFlags.nNodes = (uint32_t)nNodes; + RtsFlags.ParFlags.nCapabilities = (uint32_t)nCapabilities; #endif ) break; } else { @@ -1071,26 +1155,26 @@ error = rtsTrue; THREADED_BUILD_ONLY( if (rts_argv[arg][2] == '\0') { #if defined(PROFILING) - RtsFlags.ParFlags.nNodes = 1; + RtsFlags.ParFlags.nCapabilities = 1; #else - RtsFlags.ParFlags.nNodes = getNumberOfProcessors(); + 
RtsFlags.ParFlags.nCapabilities = getNumberOfProcessors(); #endif } else { - int nNodes; + int nCapabilities; OPTION_SAFE; /* but see extra checks below... */ - nNodes = strtol(rts_argv[arg]+2, (char **) NULL, 10); + nCapabilities = strtol(rts_argv[arg]+2, (char **) NULL, 10); - if (nNodes <= 0) { + if (nCapabilities <= 0) { errorBelch("bad value for -N"); error = rtsTrue; } if (rtsOptsEnabled == RtsOptsSafeOnly && - nNodes > (int)getNumberOfProcessors()) { + nCapabilities > (int)getNumberOfProcessors()) { errorRtsOptsDisabled("Using large values for -N is not allowed by default. %s"); stg_exit(EXIT_FAILURE); } - RtsFlags.ParFlags.nNodes = (uint32_t)nNodes; + RtsFlags.ParFlags.nCapabilities = (uint32_t)nCapabilities; } ) break; @@ -1395,7 +1479,7 @@ static void normaliseRtsOpts (void) } #ifdef THREADED_RTS - if (RtsFlags.ParFlags.parGcThreads > RtsFlags.ParFlags.nNodes) { + if (RtsFlags.ParFlags.parGcThreads > RtsFlags.ParFlags.nCapabilities) { errorBelch("GC threads (-qn) must be between 1 and the value of -N"); errorUsage(); } diff --git a/rts/SMPClosureOps.h b/rts/SMPClosureOps.h new file mode 100644 index 0000000000..39cde45544 --- /dev/null +++ b/rts/SMPClosureOps.h @@ -0,0 +1,129 @@ +/* ---------------------------------------------------------------------------- + * + * (c) The GHC Team, 2005-2013 + * + * Macros for THREADED_RTS support + * + * -------------------------------------------------------------------------- */ + +#ifndef RTS_STORAGE_SMPCLOSUREOPS_H +#define RTS_STORAGE_SMPCLOSUREOPS_H + +#include "BeginPrivate.h" + +#ifdef CMINUSMINUS + +/* Lock closure, equivalent to ccall lockClosure but the condition is inlined. + * Arguments are swapped for uniformity with unlockClosure. */ +#if defined(THREADED_RTS) +#define LOCK_CLOSURE(closure, info) \ + if (CInt[n_capabilities] == 1 :: CInt) { \ + info = GET_INFO(closure); \ + } else { \ + ("ptr" info) = ccall reallyLockClosure(closure "ptr"); \ + } +#else +#define LOCK_CLOSURE(closure, info) info = GET_INFO(closure) +#endif + +#define unlockClosure(ptr,info) \ + prim_write_barrier; \ + StgHeader_info(ptr) = info; + +#else + +INLINE_HEADER StgInfoTable *lockClosure(StgClosure *p); +EXTERN_INLINE StgInfoTable *reallyLockClosure(StgClosure *p); +EXTERN_INLINE StgInfoTable *tryLockClosure(StgClosure *p); +EXTERN_INLINE void unlockClosure(StgClosure *p, const StgInfoTable *info); + +#if defined(THREADED_RTS) + +/* ----------------------------------------------------------------------------- + * Locking/unlocking closures + * + * This is used primarily in the implementation of MVars. + * -------------------------------------------------------------------------- */ + +// We want a callable copy of reallyLockClosure() so that we can refer to it +// from .cmm files compiled using the native codegen, so these are given +// EXTERN_INLINE. C-- should use LOCK_CLOSURE not lockClosure, so we've +// kept it INLINE_HEADER. 
+EXTERN_INLINE StgInfoTable *reallyLockClosure(StgClosure *p) +{ + StgWord info; + do { + uint32_t i = 0; + do { + info = xchg((P_)(void *)&p->header.info, (W_)&stg_WHITEHOLE_info); + if (info != (W_)&stg_WHITEHOLE_info) return (StgInfoTable *)info; + } while (++i < SPIN_COUNT); + yieldThread(); + } while (1); +} + +INLINE_HEADER StgInfoTable *lockClosure(StgClosure *p) +{ + if (n_capabilities == 1) { + return (StgInfoTable *)p->header.info; + } + else { + return reallyLockClosure(p); + } +} + +// ToDo: consider splitting tryLockClosure into reallyTryLockClosure, +// same as lockClosure +EXTERN_INLINE StgInfoTable *tryLockClosure(StgClosure *p) +{ + StgWord info; + if (n_capabilities == 1) { + return (StgInfoTable *)p->header.info; + } + else { + info = xchg((P_)(void *)&p->header.info, (W_)&stg_WHITEHOLE_info); + if (info != (W_)&stg_WHITEHOLE_info) { + return (StgInfoTable *)info; + } else { + return NULL; + } + } +} + +#else /* !THREADED_RTS */ + +EXTERN_INLINE StgInfoTable * +reallyLockClosure(StgClosure *p) +{ return (StgInfoTable *)p->header.info; } + +INLINE_HEADER StgInfoTable * +lockClosure(StgClosure *p) +{ return (StgInfoTable *)p->header.info; } + +EXTERN_INLINE StgInfoTable * +tryLockClosure(StgClosure *p) +{ return (StgInfoTable *)p->header.info; } + +#endif /* THREADED_RTS */ + +EXTERN_INLINE void unlockClosure(StgClosure *p, const StgInfoTable *info) +{ + // This is a strictly ordered write, so we need a write_barrier(): + write_barrier(); + p->header.info = info; +} + +// Handy specialised versions of lockClosure()/unlockClosure() +INLINE_HEADER void lockTSO(StgTSO *tso); +INLINE_HEADER void lockTSO(StgTSO *tso) +{ lockClosure((StgClosure *)tso); } + +INLINE_HEADER void unlockTSO(StgTSO *tso); +INLINE_HEADER void unlockTSO(StgTSO *tso) +{ unlockClosure((StgClosure*)tso, (const StgInfoTable *)&stg_TSO_info); } + +#endif /* CMINUSMINUS */ + +#include "EndPrivate.h" + +#endif /* RTS_STORAGE_SMPCLOSUREOPS_H */ diff --git a/rts/STM.c b/rts/STM.c index 9cd0833550..d5c2713a6f 100644 --- a/rts/STM.c +++ b/rts/STM.c @@ -92,6 +92,7 @@ #include "Trace.h" #include "Threads.h" #include "sm/Storage.h" +#include "SMPClosureOps.h" #include diff --git a/rts/Schedule.c b/rts/Schedule.c index 8a08e35cc3..fca276dc08 100644 --- a/rts/Schedule.c +++ b/rts/Schedule.c @@ -726,7 +726,8 @@ schedulePushWork(Capability *cap USED_IF_THREADS, } while (n_wanted_caps < n_capabilities-1); } - // Grab free capabilities, starting from cap->no+1. + // First grab as many free Capabilities as we can. ToDo: we should use + // capabilities on the same NUMA node preferably, but not exclusively. for (i = (cap->no + 1) % n_capabilities, n_free_caps=0; n_free_caps < n_wanted_caps && i != cap->no; i = (i + 1) % n_capabilities) { @@ -1134,7 +1135,7 @@ scheduleHandleHeapOverflow( Capability *cap, StgTSO *t ) // nursery has only one // block. 
- bd = allocGroup_lock(blocks); + bd = allocGroupOnNode_lock(cap->node,blocks); cap->r.rNursery->n_blocks += blocks; // link the new group after CurrentNursery diff --git a/rts/Task.c b/rts/Task.c index 45ef77ba0b..9a827745ba 100644 --- a/rts/Task.c +++ b/rts/Task.c @@ -220,6 +220,7 @@ newTask (rtsBool worker) initCondition(&task->cond); initMutex(&task->lock); task->wakeup = rtsFalse; + task->node = 0; #endif task->next = NULL; @@ -427,6 +428,9 @@ workerStart(Task *task) if (RtsFlags.ParFlags.setAffinity) { setThreadAffinity(cap->no, n_capabilities); } + if (RtsFlags.GcFlags.numa && !RtsFlags.DebugFlags.numa) { + setThreadNode(RtsFlags.GcFlags.numaMap[task->node]); + } // set the thread-local pointer to the Task: setMyTask(task); @@ -457,6 +461,7 @@ startWorkerTask (Capability *cap) // We don't emit a task creation event here, but in workerStart, // where the kernel thread id is known. task->cap = cap; + task->node = cap->node; // Give the capability directly to the worker; we can't let anyone // else get in, because the new worker Task has nowhere to go to @@ -490,13 +495,27 @@ interruptWorkerTask (Task *task) #endif /* THREADED_RTS */ -void -setInCallCapability (int preferred_capability) +void rts_setInCallCapability ( + int preferred_capability, + int affinity USED_IF_THREADS) { Task *task = allocTask(); task->preferred_capability = preferred_capability; -} +#ifdef THREADED_RTS + if (affinity) { + if (RtsFlags.ParFlags.setAffinity) { + setThreadAffinity(preferred_capability, n_capabilities); + } + if (RtsFlags.GcFlags.numa) { + task->node = capNoToNumaNode(preferred_capability); + if (!DEBUG_IS_ON || !RtsFlags.DebugFlags.numa) { // faking NUMA + setThreadNode(RtsFlags.GcFlags.numaMap[task->node]); + } + } + } +#endif +} #ifdef DEBUG @@ -525,4 +544,3 @@ printAllTasks(void) } #endif - diff --git a/rts/Task.h b/rts/Task.h index 5fea2badda..558f543fac 100644 --- a/rts/Task.h +++ b/rts/Task.h @@ -115,6 +115,12 @@ typedef struct Task_ { #if defined(THREADED_RTS) OSThreadId id; // The OS Thread ID of this task + // The NUMA node this Task belongs to. If this is a worker thread, then the + // OS thread will be bound to this node (see workerStart()). If this is an + // external thread calling into Haskell, it can be bound to a node using + // rts_setInCallCapability(). 
+ uint32_t node; + Condition cond; // used for sleeping & waking up this task Mutex lock; // lock for the condition variable diff --git a/rts/eventlog/EventLog.c b/rts/eventlog/EventLog.c index 9e2ed33d22..7cb69e8d7d 100644 --- a/rts/eventlog/EventLog.c +++ b/rts/eventlog/EventLog.c @@ -273,7 +273,7 @@ initEventLogging(void) */ #ifdef THREADED_RTS // XXX n_capabilities hasn't been initislised yet - n_caps = RtsFlags.ParFlags.nNodes; + n_caps = RtsFlags.ParFlags.nCapabilities; #else n_caps = 1; #endif diff --git a/rts/package.conf.in b/rts/package.conf.in index b52a8672d5..d2b728ea88 100644 --- a/rts/package.conf.in +++ b/rts/package.conf.in @@ -37,6 +37,9 @@ extra-libraries: #ifdef HAVE_LIBFFI , "ffi" #endif +#ifdef HAVE_LIBNUMA + , "numa" +#endif #ifdef mingw32_HOST_OS ,"wsock32" /* for the linker */ ,"gdi32" /* for the linker */ @@ -179,4 +182,3 @@ framework-dirs: haddock-interfaces: haddock-html: - diff --git a/rts/posix/OSMem.c b/rts/posix/OSMem.c index 5ff4bc86e4..a534219902 100644 --- a/rts/posix/OSMem.c +++ b/rts/posix/OSMem.c @@ -30,6 +30,12 @@ #ifdef HAVE_FCNTL_H #include #endif +#ifdef HAVE_NUMA_H +#include +#endif +#ifdef HAVE_NUMAIF_H +#include +#endif #include @@ -287,6 +293,7 @@ osGetMBlocks(uint32_t n) ret = gen_map_mblocks(size); } } + // Next time, we'll try to allocate right after the block we just got. // ToDo: check that we haven't already grabbed the memory at next_request next_request = (char *)ret + size; @@ -294,6 +301,31 @@ osGetMBlocks(uint32_t n) return ret; } +void osBindMBlocksToNode( + void *addr STG_UNUSED, + StgWord size STG_UNUSED, + uint32_t node STG_UNUSED) +{ +#ifdef HAVE_NUMAIF_H + int ret; + StgWord mask = 0; + mask |= 1 << node; + if (RtsFlags.GcFlags.numa) { + ret = mbind(addr, (unsigned long)size, + MPOL_BIND, &mask, sizeof(StgWord)*8, MPOL_MF_STRICT); + // paranoia: MPOL_BIND guarantees memory on the correct node; + // MPOL_MF_STRICT will tell us if it didn't work. We might want to + // relax these in due course, but I want to be sure it's doing what we + // want first. 
+ if (ret != 0) { + sysErrorBelch("mbind"); + stg_exit(EXIT_FAILURE); + } + } +#endif +} + + void osFreeMBlocks(void *addr, uint32_t n) { munmap(addr, n * MBLOCK_SIZE); @@ -512,4 +544,36 @@ void osReleaseHeapMemory(void) sysErrorBelch("unable to release address space"); } +rtsBool osNumaAvailable(void) +{ +#ifdef HAVE_NUMA_H + return (numa_available() != -1); +#else + return rtsFalse; +#endif +} + +uint32_t osNumaNodes(void) +{ +#ifdef HAVE_NUMA_H + return numa_num_configured_nodes(); +#else + return 1; +#endif +} + +StgWord osNumaMask(void) +{ +#ifdef HAVE_NUMA_H + struct bitmask *mask; + mask = numa_get_mems_allowed(); + if (mask->size > sizeof(StgWord)*8) { + barf("Too many NUMA nodes"); + } + return mask->maskp[0]; +#else + return 1; +#endif +} + #endif diff --git a/rts/posix/OSThreads.c b/rts/posix/OSThreads.c index ad138d314d..72538c1bf3 100644 --- a/rts/posix/OSThreads.c +++ b/rts/posix/OSThreads.c @@ -70,6 +70,10 @@ # include #endif +#ifdef HAVE_NUMA_H +#include +#endif + /* * This (allegedly) OS threads independent layer was initially * abstracted away from code that used Pthreads, so the functions @@ -308,10 +312,32 @@ setThreadAffinity(uint32_t n, uint32_t m) #else void -setThreadAffinity (uint32_t n GNUC3_ATTRIBUTE(__unused__), - uint32_t m GNUC3_ATTRIBUTE(__unused__)) +setThreadAffinity (uint32_t n STG_UNUSED, + uint32_t m STG_UNUSED) +{ +} +#endif + +#ifdef HAVE_NUMA_H +void setThreadNode (uint32_t node) { + ASSERT(node < RtsFlags.GcFlags.nNumaNodes); + if (numa_run_on_node(node) == -1) { + sysErrorBelch("numa_run_on_node"); + stg_exit(1); + } +} + +void releaseThreadNode (void) +{ + if (numa_run_on_node(-1) == -1) { + sysErrorBelch("numa_run_on_node"); + stg_exit(1); + } } +#else +void setThreadNode (uint32_t node STG_UNUSED) { /* nothing */ } +void releaseThreadNode (void) { /* nothing */ } #endif void diff --git a/rts/sm/BlockAlloc.c b/rts/sm/BlockAlloc.c index ff1a6460a4..c2859b0c15 100644 --- a/rts/sm/BlockAlloc.c +++ b/rts/sm/BlockAlloc.c @@ -7,11 +7,11 @@ * This is the architecture independent part of the block allocator. * It requires only the following support from the operating system: * - * void *getMBlock(uint32_t n); + * void *getMBlocks(uint32_t n); * * returns the address of an n*MBLOCK_SIZE region of memory, aligned on * an MBLOCK_SIZE boundary. There are no other restrictions on the - * addresses of memory returned by getMBlock(). + * addresses of memory returned by getMBlocks(). * * ---------------------------------------------------------------------------*/ @@ -25,7 +25,7 @@ #include -static void initMBlock(void *mblock); +static void initMBlock(void *mblock, uint32_t node); /* ----------------------------------------------------------------------------- @@ -158,27 +158,54 @@ static void initMBlock(void *mblock); // In THREADED_RTS mode, the free list is protected by sm_mutex. 
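// With NUMA support the free lists below are kept per node:
//   free_list[node][log2 size class]  and  free_mblock_list[node]
// allocGroup() picks the node that currently has the fewest allocated
// blocks (see nodeWithLeastBlocks()), unless a node is requested
// explicitly via allocGroupOnNode().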
-static bdescr *free_list[NUM_FREE_LISTS]; -static bdescr *free_mblock_list; +static bdescr *free_list[MAX_NUMA_NODES][NUM_FREE_LISTS]; +static bdescr *free_mblock_list[MAX_NUMA_NODES]; W_ n_alloc_blocks; // currently allocated blocks W_ hw_alloc_blocks; // high-water allocated blocks +W_ n_alloc_blocks_by_node[MAX_NUMA_NODES]; + /* ----------------------------------------------------------------------------- Initialisation -------------------------------------------------------------------------- */ void initBlockAllocator(void) { - uint32_t i; - for (i=0; i < NUM_FREE_LISTS; i++) { - free_list[i] = NULL; + uint32_t i, node; + for (node = 0; node < MAX_NUMA_NODES; node++) { + for (i=0; i < NUM_FREE_LISTS; i++) { + free_list[node][i] = NULL; + } + free_mblock_list[node] = NULL; + n_alloc_blocks_by_node[node] = 0; } - free_mblock_list = NULL; n_alloc_blocks = 0; hw_alloc_blocks = 0; } +/* ----------------------------------------------------------------------------- + Accounting + -------------------------------------------------------------------------- */ + +STATIC_INLINE +void recordAllocatedBlocks(uint32_t node, uint32_t n) +{ + n_alloc_blocks += n; + n_alloc_blocks_by_node[node] += n; + if (n > 0 && n_alloc_blocks > hw_alloc_blocks) { + hw_alloc_blocks = n_alloc_blocks; + } +} + +STATIC_INLINE +void recordFreedBlocks(uint32_t node, uint32_t n) +{ + ASSERT(n_alloc_blocks >= n); + n_alloc_blocks -= n; + n_alloc_blocks_by_node[node] -= n; +} + /* ----------------------------------------------------------------------------- Allocation -------------------------------------------------------------------------- */ @@ -248,14 +275,14 @@ log_2_ceil(W_ n) } STATIC_INLINE void -free_list_insert (bdescr *bd) +free_list_insert (uint32_t node, bdescr *bd) { uint32_t ln; ASSERT(bd->blocks < BLOCKS_PER_MBLOCK); ln = log_2(bd->blocks); - dbl_link_onto(bd, &free_list[ln]); + dbl_link_onto(bd, &free_list[node][ln]); } @@ -284,18 +311,18 @@ setup_tail (bdescr *bd) // Take a free block group bd, and split off a group of size n from // it. Adjust the free list as necessary, and return the new group. static bdescr * -split_free_block (bdescr *bd, W_ n, uint32_t ln) +split_free_block (bdescr *bd, uint32_t node, W_ n, uint32_t ln) { bdescr *fg; // free group ASSERT(bd->blocks > n); - dbl_link_remove(bd, &free_list[ln]); + dbl_link_remove(bd, &free_list[node][ln]); fg = bd + bd->blocks - n; // take n blocks off the end fg->blocks = n; bd->blocks -= n; setup_tail(bd); ln = log_2(bd->blocks); - dbl_link_onto(bd, &free_list[ln]); + dbl_link_onto(bd, &free_list[node][ln]); return fg; } @@ -304,7 +331,7 @@ split_free_block (bdescr *bd, W_ n, uint32_t ln) * initGroup afterwards. 
*/ static bdescr * -alloc_mega_group (StgWord mblocks) +alloc_mega_group (uint32_t node, StgWord mblocks) { bdescr *best, *bd, *prev; StgWord n; @@ -313,14 +340,14 @@ alloc_mega_group (StgWord mblocks) best = NULL; prev = NULL; - for (bd = free_mblock_list; bd != NULL; prev = bd, bd = bd->link) + for (bd = free_mblock_list[node]; bd != NULL; prev = bd, bd = bd->link) { if (bd->blocks == n) { if (prev) { prev->link = bd->link; } else { - free_mblock_list = bd->link; + free_mblock_list[node] = bd->link; } return bd; } @@ -341,12 +368,17 @@ alloc_mega_group (StgWord mblocks) (best_mblocks-mblocks)*MBLOCK_SIZE); best->blocks = MBLOCK_GROUP_BLOCKS(best_mblocks - mblocks); - initMBlock(MBLOCK_ROUND_DOWN(bd)); + initMBlock(MBLOCK_ROUND_DOWN(bd), node); } else { - void *mblock = getMBlocks(mblocks); - initMBlock(mblock); // only need to init the 1st one + void *mblock; + if (RtsFlags.GcFlags.numa) { + mblock = getMBlocksOnNode(node, mblocks); + } else { + mblock = getMBlocks(mblocks); + } + initMBlock(mblock, node); // only need to init the 1st one bd = FIRST_BDESCR(mblock); } bd->blocks = MBLOCK_GROUP_BLOCKS(mblocks); @@ -354,7 +386,7 @@ alloc_mega_group (StgWord mblocks) } bdescr * -allocGroup (W_ n) +allocGroupOnNode (uint32_t node, W_ n) { bdescr *bd, *rem; StgWord ln; @@ -369,21 +401,19 @@ allocGroup (W_ n) // n_alloc_blocks doesn't count the extra blocks we get in a // megablock group. - n_alloc_blocks += mblocks * BLOCKS_PER_MBLOCK; - if (n_alloc_blocks > hw_alloc_blocks) hw_alloc_blocks = n_alloc_blocks; + recordAllocatedBlocks(node, mblocks * BLOCKS_PER_MBLOCK); - bd = alloc_mega_group(mblocks); + bd = alloc_mega_group(node, mblocks); // only the bdescrs of the first MB are required to be initialised initGroup(bd); goto finish; } - n_alloc_blocks += n; - if (n_alloc_blocks > hw_alloc_blocks) hw_alloc_blocks = n_alloc_blocks; + recordAllocatedBlocks(node, n); ln = log_2_ceil(n); - while (ln < NUM_FREE_LISTS && free_list[ln] == NULL) { + while (ln < NUM_FREE_LISTS && free_list[node][ln] == NULL) { ln++; } @@ -397,27 +427,27 @@ allocGroup (W_ n) } #endif - bd = alloc_mega_group(1); + bd = alloc_mega_group(node,1); bd->blocks = n; initGroup(bd); // we know the group will fit rem = bd + n; rem->blocks = BLOCKS_PER_MBLOCK-n; - initGroup(rem); // init the slop - n_alloc_blocks += rem->blocks; + initGroup(rem); // init the slop + recordAllocatedBlocks(node,rem->blocks); freeGroup(rem); // add the slop on to the free list goto finish; } - bd = free_list[ln]; + bd = free_list[node][ln]; if (bd->blocks == n) // exactly the right size! { - dbl_link_remove(bd, &free_list[ln]); + dbl_link_remove(bd, &free_list[node][ln]); initGroup(bd); } else if (bd->blocks > n) // block too big... { - bd = split_free_block(bd, n, ln); + bd = split_free_block(bd, node, n, ln); ASSERT(bd->blocks == n); initGroup(bd); } @@ -432,6 +462,26 @@ finish: return bd; } +STATIC_INLINE +uint32_t nodeWithLeastBlocks (void) +{ + uint32_t node = 0, i; + uint32_t min_blocks = n_alloc_blocks_by_node[0]; + for (i = 1; i < RtsFlags.GcFlags.nNumaNodes; i++) { + if (n_alloc_blocks_by_node[i] < min_blocks) { + min_blocks = n_alloc_blocks_by_node[i]; + node = i; + } + } + return node; +} + +bdescr* allocGroup (W_ n) +{ + return allocGroupOnNode(nodeWithLeastBlocks(),n); +} + + // // Allocate a chunk of blocks that is at least min and at most max // blocks in size. This API is used by the nursery allocator that @@ -448,8 +498,7 @@ finish: // fragmentation, but we make sure that we allocate large blocks // preferably if there are any. 
// -bdescr * -allocLargeChunk (W_ min, W_ max) +bdescr* allocLargeChunkOnNode (uint32_t node, W_ min, W_ max) { bdescr *bd; StgWord ln, lnmax; @@ -461,34 +510,38 @@ allocLargeChunk (W_ min, W_ max) ln = log_2_ceil(min); lnmax = log_2_ceil(max); - while (ln < NUM_FREE_LISTS && ln < lnmax && free_list[ln] == NULL) { + while (ln < NUM_FREE_LISTS && ln < lnmax && free_list[node][ln] == NULL) { ln++; } if (ln == NUM_FREE_LISTS || ln == lnmax) { - return allocGroup(max); + return allocGroupOnNode(node,max); } - bd = free_list[ln]; + bd = free_list[node][ln]; if (bd->blocks <= max) // exactly the right size! { - dbl_link_remove(bd, &free_list[ln]); + dbl_link_remove(bd, &free_list[node][ln]); initGroup(bd); } else // block too big... { - bd = split_free_block(bd, max, ln); + bd = split_free_block(bd, node, max, ln); ASSERT(bd->blocks == max); initGroup(bd); } - n_alloc_blocks += bd->blocks; - if (n_alloc_blocks > hw_alloc_blocks) hw_alloc_blocks = n_alloc_blocks; + recordAllocatedBlocks(node, bd->blocks); IF_DEBUG(sanity, memset(bd->start, 0xaa, bd->blocks * BLOCK_SIZE)); IF_DEBUG(sanity, checkFreeListSanity()); return bd; } +bdescr* allocLargeChunk (W_ min, W_ max) +{ + return allocLargeChunkOnNode(nodeWithLeastBlocks(), min, max); +} + bdescr * allocGroup_lock(W_ n) { @@ -500,17 +553,31 @@ allocGroup_lock(W_ n) } bdescr * -allocBlock(void) +allocBlock_lock(void) { - return allocGroup(1); + bdescr *bd; + ACQUIRE_SM_LOCK; + bd = allocBlock(); + RELEASE_SM_LOCK; + return bd; } bdescr * -allocBlock_lock(void) +allocGroupOnNode_lock(uint32_t node, W_ n) { bdescr *bd; ACQUIRE_SM_LOCK; - bd = allocBlock(); + bd = allocGroupOnNode(node,n); + RELEASE_SM_LOCK; + return bd; +} + +bdescr * +allocBlockOnNode_lock(uint32_t node) +{ + bdescr *bd; + ACQUIRE_SM_LOCK; + bd = allocBlockOnNode(node); RELEASE_SM_LOCK; return bd; } @@ -542,11 +609,13 @@ static void free_mega_group (bdescr *mg) { bdescr *bd, *prev; + uint32_t node; // Find the right place in the free list. free_mblock_list is // sorted by *address*, not by size as the free_list is. 
prev = NULL; - bd = free_mblock_list; + node = mg->node; + bd = free_mblock_list[node]; while (bd && bd->start < mg->start) { prev = bd; bd = bd->link; @@ -561,8 +630,8 @@ free_mega_group (bdescr *mg) } else { - mg->link = free_mblock_list; - free_mblock_list = mg; + mg->link = free_mblock_list[node]; + free_mblock_list[node] = mg; } // coalesce forwards coalesce_mblocks(mg); @@ -575,12 +644,15 @@ void freeGroup(bdescr *p) { StgWord ln; + uint32_t node; - // Todo: not true in multithreaded GC + // not true in multithreaded GC: // ASSERT_SM_LOCK(); ASSERT(p->free != (P_)-1); + node = p->node; + p->free = (void *)-1; /* indicates that this block is free */ p->gen = NULL; p->gen_no = 0; @@ -597,14 +669,13 @@ freeGroup(bdescr *p) // If this is an mgroup, make sure it has the right number of blocks ASSERT(p->blocks == MBLOCK_GROUP_BLOCKS(mblocks)); - n_alloc_blocks -= mblocks * BLOCKS_PER_MBLOCK; + recordFreedBlocks(node, mblocks * BLOCKS_PER_MBLOCK); free_mega_group(p); return; } - ASSERT(n_alloc_blocks >= p->blocks); - n_alloc_blocks -= p->blocks; + recordFreedBlocks(node, p->blocks); // coalesce forwards { @@ -614,7 +685,7 @@ freeGroup(bdescr *p) { p->blocks += next->blocks; ln = log_2(next->blocks); - dbl_link_remove(next, &free_list[ln]); + dbl_link_remove(next, &free_list[node][ln]); if (p->blocks == BLOCKS_PER_MBLOCK) { free_mega_group(p); @@ -634,7 +705,7 @@ freeGroup(bdescr *p) if (prev->free == (P_)-1) { ln = log_2(prev->blocks); - dbl_link_remove(prev, &free_list[ln]); + dbl_link_remove(prev, &free_list[node][ln]); prev->blocks += p->blocks; if (prev->blocks >= BLOCKS_PER_MBLOCK) { @@ -646,7 +717,7 @@ freeGroup(bdescr *p) } setup_tail(p); - free_list_insert(p); + free_list_insert(node,p); IF_DEBUG(sanity, checkFreeListSanity()); } @@ -679,7 +750,7 @@ freeChain_lock(bdescr *bd) } static void -initMBlock(void *mblock) +initMBlock(void *mblock, uint32_t node) { bdescr *bd; StgWord8 *block; @@ -695,6 +766,7 @@ initMBlock(void *mblock) for (; block <= (StgWord8*)LAST_BLOCK(mblock); bd += 1, block += BLOCK_SIZE) { bd->start = (void*)block; + bd->node = node; } } @@ -734,28 +806,32 @@ countAllocdBlocks(bdescr *bd) void returnMemoryToOS(uint32_t n /* megablocks */) { - static bdescr *bd; + bdescr *bd; + uint32_t node; StgWord size; - bd = free_mblock_list; - while ((n > 0) && (bd != NULL)) { - size = BLOCKS_TO_MBLOCKS(bd->blocks); - if (size > n) { - StgWord newSize = size - n; - char *freeAddr = MBLOCK_ROUND_DOWN(bd->start); - freeAddr += newSize * MBLOCK_SIZE; - bd->blocks = MBLOCK_GROUP_BLOCKS(newSize); - freeMBlocks(freeAddr, n); - n = 0; - } - else { - char *freeAddr = MBLOCK_ROUND_DOWN(bd->start); - n -= size; - bd = bd->link; - freeMBlocks(freeAddr, size); + // ToDo: not fair, we free all the memory starting with node 0. 
+ for (node = 0; n > 0 && node < RtsFlags.GcFlags.nNumaNodes; node++) { + bd = free_mblock_list[node]; + while ((n > 0) && (bd != NULL)) { + size = BLOCKS_TO_MBLOCKS(bd->blocks); + if (size > n) { + StgWord newSize = size - n; + char *freeAddr = MBLOCK_ROUND_DOWN(bd->start); + freeAddr += newSize * MBLOCK_SIZE; + bd->blocks = MBLOCK_GROUP_BLOCKS(newSize); + freeMBlocks(freeAddr, n); + n = 0; + } + else { + char *freeAddr = MBLOCK_ROUND_DOWN(bd->start); + n -= size; + bd = bd->link; + freeMBlocks(freeAddr, size); + } } + free_mblock_list[node] = bd; } - free_mblock_list = bd; // Ask the OS to release any address space portion // that was associated with the just released MBlocks @@ -797,68 +873,71 @@ checkFreeListSanity(void) { bdescr *bd, *prev; StgWord ln, min; + uint32_t node; - - min = 1; - for (ln = 0; ln < NUM_FREE_LISTS; ln++) { - IF_DEBUG(block_alloc, - debugBelch("free block list [%" FMT_Word "]:\n", ln)); - - prev = NULL; - for (bd = free_list[ln]; bd != NULL; prev = bd, bd = bd->link) - { + for (node = 0; node < RtsFlags.GcFlags.nNumaNodes; node++) { + min = 1; + for (ln = 0; ln < NUM_FREE_LISTS; ln++) { IF_DEBUG(block_alloc, - debugBelch("group at %p, length %ld blocks\n", - bd->start, (long)bd->blocks)); - ASSERT(bd->free == (P_)-1); - ASSERT(bd->blocks > 0 && bd->blocks < BLOCKS_PER_MBLOCK); - ASSERT(bd->blocks >= min && bd->blocks <= (min*2 - 1)); - ASSERT(bd->link != bd); // catch easy loops - - check_tail(bd); - - if (prev) - ASSERT(bd->u.back == prev); - else - ASSERT(bd->u.back == NULL); + debugBelch("free block list [%" FMT_Word "]:\n", ln)); + prev = NULL; + for (bd = free_list[node][ln]; bd != NULL; prev = bd, bd = bd->link) { - bdescr *next; - next = bd + bd->blocks; - if (next <= LAST_BDESCR(MBLOCK_ROUND_DOWN(bd))) + IF_DEBUG(block_alloc, + debugBelch("group at %p, length %ld blocks\n", + bd->start, (long)bd->blocks)); + ASSERT(bd->free == (P_)-1); + ASSERT(bd->blocks > 0 && bd->blocks < BLOCKS_PER_MBLOCK); + ASSERT(bd->blocks >= min && bd->blocks <= (min*2 - 1)); + ASSERT(bd->link != bd); // catch easy loops + ASSERT(bd->node == node); + + check_tail(bd); + + if (prev) + ASSERT(bd->u.back == prev); + else + ASSERT(bd->u.back == NULL); + { - ASSERT(next->free != (P_)-1); + bdescr *next; + next = bd + bd->blocks; + if (next <= LAST_BDESCR(MBLOCK_ROUND_DOWN(bd))) + { + ASSERT(next->free != (P_)-1); + } } } + min = min << 1; } - min = min << 1; - } - prev = NULL; - for (bd = free_mblock_list; bd != NULL; prev = bd, bd = bd->link) - { - IF_DEBUG(block_alloc, - debugBelch("mega group at %p, length %ld blocks\n", - bd->start, (long)bd->blocks)); + prev = NULL; + for (bd = free_mblock_list[node]; bd != NULL; prev = bd, bd = bd->link) + { + IF_DEBUG(block_alloc, + debugBelch("mega group at %p, length %ld blocks\n", + bd->start, (long)bd->blocks)); - ASSERT(bd->link != bd); // catch easy loops + ASSERT(bd->link != bd); // catch easy loops - if (bd->link != NULL) - { - // make sure the list is sorted - ASSERT(bd->start < bd->link->start); - } + if (bd->link != NULL) + { + // make sure the list is sorted + ASSERT(bd->start < bd->link->start); + } - ASSERT(bd->blocks >= BLOCKS_PER_MBLOCK); - ASSERT(MBLOCK_GROUP_BLOCKS(BLOCKS_TO_MBLOCKS(bd->blocks)) - == bd->blocks); + ASSERT(bd->blocks >= BLOCKS_PER_MBLOCK); + ASSERT(MBLOCK_GROUP_BLOCKS(BLOCKS_TO_MBLOCKS(bd->blocks)) + == bd->blocks); - // make sure we're fully coalesced - if (bd->link != NULL) - { - ASSERT(MBLOCK_ROUND_DOWN(bd->link) != - (StgWord8*)MBLOCK_ROUND_DOWN(bd) + - BLOCKS_TO_MBLOCKS(bd->blocks) * MBLOCK_SIZE); + // 
make sure we're fully coalesced + if (bd->link != NULL) + { + ASSERT(MBLOCK_ROUND_DOWN(bd->link) != + (StgWord8*)MBLOCK_ROUND_DOWN(bd) + + BLOCKS_TO_MBLOCKS(bd->blocks) * MBLOCK_SIZE); + } } } } @@ -869,18 +948,21 @@ countFreeList(void) bdescr *bd; W_ total_blocks = 0; StgWord ln; + uint32_t node; - for (ln=0; ln < NUM_FREE_LISTS; ln++) { - for (bd = free_list[ln]; bd != NULL; bd = bd->link) { - total_blocks += bd->blocks; + for (node = 0; node < RtsFlags.GcFlags.nNumaNodes; node++) { + for (ln=0; ln < NUM_FREE_LISTS; ln++) { + for (bd = free_list[node][ln]; bd != NULL; bd = bd->link) { + total_blocks += bd->blocks; + } + } + for (bd = free_mblock_list[node]; bd != NULL; bd = bd->link) { + total_blocks += BLOCKS_PER_MBLOCK * BLOCKS_TO_MBLOCKS(bd->blocks); + // The caller of this function, memInventory(), expects to match + // the total number of blocks in the system against mblocks * + // BLOCKS_PER_MBLOCK, so we must subtract the space for the + // block descriptors from *every* mblock. } - } - for (bd = free_mblock_list; bd != NULL; bd = bd->link) { - total_blocks += BLOCKS_PER_MBLOCK * BLOCKS_TO_MBLOCKS(bd->blocks); - // The caller of this function, memInventory(), expects to match - // the total number of blocks in the system against mblocks * - // BLOCKS_PER_MBLOCK, so we must subtract the space for the - // block descriptors from *every* mblock. } return total_blocks; } diff --git a/rts/sm/BlockAlloc.h b/rts/sm/BlockAlloc.h index 2ba7c02c08..c26ae104e1 100644 --- a/rts/sm/BlockAlloc.h +++ b/rts/sm/BlockAlloc.h @@ -12,6 +12,7 @@ #include "BeginPrivate.h" bdescr *allocLargeChunk (W_ min, W_ max); +bdescr *allocLargeChunkOnNode (uint32_t node, W_ min, W_ max); /* Debugging -------------------------------------------------------------- */ diff --git a/rts/sm/GC.c b/rts/sm/GC.c index 996ce8cbce..3bfdaa25ff 100644 --- a/rts/sm/GC.c +++ b/rts/sm/GC.c @@ -802,7 +802,8 @@ new_gc_thread (uint32_t n, gc_thread *t) // but can't, because it uses gct which isn't set up at this point. 
// Hence, allocate a block for todo_bd manually: { - bdescr *bd = allocBlock(); // no lock, locks aren't initialised yet + bdescr *bd = allocBlockOnNode(capNoToNumaNode(n)); + // no lock, locks aren't initialised yet initBdescr(bd, ws->gen, ws->gen->to); bd->flags = BF_EVACUATED; bd->u.scan = bd->free = bd->start; @@ -1182,7 +1183,8 @@ prepare_collected_gen (generation *gen) if (g != 0) { for (i = 0; i < n_capabilities; i++) { freeChain(capabilities[i]->mut_lists[g]); - capabilities[i]->mut_lists[g] = allocBlock(); + capabilities[i]->mut_lists[g] = + allocBlockOnNode(capNoToNumaNode(i)); } } @@ -1296,7 +1298,7 @@ static void stash_mut_list (Capability *cap, uint32_t gen_no) { cap->saved_mut_lists[gen_no] = cap->mut_lists[gen_no]; - cap->mut_lists[gen_no] = allocBlock_sync(); + cap->mut_lists[gen_no] = allocBlockOnNode_sync(cap->node); } /* ---------------------------------------------------------------------------- diff --git a/rts/sm/GCUtils.c b/rts/sm/GCUtils.c index 5edf9dedbc..a515665d07 100644 --- a/rts/sm/GCUtils.c +++ b/rts/sm/GCUtils.c @@ -30,34 +30,33 @@ SpinLock gc_alloc_block_sync; #endif -bdescr * -allocBlock_sync(void) +bdescr* allocGroup_sync(uint32_t n) { bdescr *bd; + uint32_t node = capNoToNumaNode(gct->thread_index); ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync); - bd = allocBlock(); + bd = allocGroupOnNode(node,n); RELEASE_SPIN_LOCK(&gc_alloc_block_sync); return bd; } -static bdescr * -allocGroup_sync(uint32_t n) +bdescr* allocGroupOnNode_sync(uint32_t node, uint32_t n) { bdescr *bd; ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync); - bd = allocGroup(n); + bd = allocGroupOnNode(node,n); RELEASE_SPIN_LOCK(&gc_alloc_block_sync); return bd; } - static uint32_t allocBlocks_sync(uint32_t n, bdescr **hd) { bdescr *bd; uint32_t i; + uint32_t node = capNoToNumaNode(gct->thread_index); ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync); - bd = allocLargeChunk(1,n); + bd = allocLargeChunkOnNode(node,1,n); // NB. allocLargeChunk, rather than allocGroup(n), to allocate in a // fragmentation-friendly way. 
n = bd->blocks; diff --git a/rts/sm/GCUtils.h b/rts/sm/GCUtils.h index 0f87eee3f1..7e5a827ce0 100644 --- a/rts/sm/GCUtils.h +++ b/rts/sm/GCUtils.h @@ -18,7 +18,19 @@ #include "GCTDecl.h" -bdescr *allocBlock_sync(void); +bdescr* allocGroup_sync(uint32_t n); +bdescr* allocGroupOnNode_sync(uint32_t node, uint32_t n); + +INLINE_HEADER bdescr *allocBlock_sync(void) +{ + return allocGroup_sync(1); +} + +INLINE_HEADER bdescr *allocBlockOnNode_sync(uint32_t node) +{ + return allocGroupOnNode_sync(node,1); +} + void freeChain_sync(bdescr *bd); void push_scanned_block (bdescr *bd, gen_workspace *ws); diff --git a/rts/sm/MBlock.c b/rts/sm/MBlock.c index 440b03efa7..53999d2c4b 100644 --- a/rts/sm/MBlock.c +++ b/rts/sm/MBlock.c @@ -566,7 +566,7 @@ void releaseFreeMemory(void) void * getMBlock(void) { - return getMBlocks(1); + return getMBlocks(1); } // The external interface: allocate 'n' mblocks, and return the @@ -587,6 +587,23 @@ getMBlocks(uint32_t n) return ret; } +void * +getMBlocksOnNode(uint32_t node, uint32_t n) +{ + void *addr = getMBlocks(n); +#ifdef DEBUG + if (RtsFlags.DebugFlags.numa) return addr; // faking NUMA +#endif + osBindMBlocksToNode(addr, n * MBLOCK_SIZE, RtsFlags.GcFlags.numaMap[node]); + return addr; +} + +void * +getMBlockOnNode(uint32_t node) +{ + return getMBlocksOnNode(node, 1); +} + void freeMBlocks(void *addr, uint32_t n) { diff --git a/rts/sm/MarkStack.h b/rts/sm/MarkStack.h index f978a32563..d90b5e47b4 100644 --- a/rts/sm/MarkStack.h +++ b/rts/sm/MarkStack.h @@ -15,6 +15,7 @@ #define SM_MARKSTACK_H #include "BeginPrivate.h" +#include "GCUtils.h" INLINE_HEADER void push_mark_stack(StgPtr p) diff --git a/rts/sm/OSMem.h b/rts/sm/OSMem.h index 8518f05d1b..660942827d 100644 --- a/rts/sm/OSMem.h +++ b/rts/sm/OSMem.h @@ -19,6 +19,10 @@ void osFreeAllMBlocks(void); size_t getPageSize (void); StgWord64 getPhysicalMemorySize (void); void setExecutable (void *p, W_ len, rtsBool exec); +rtsBool osNumaAvailable(void); +uint32_t osNumaNodes(void); +StgWord osNumaMask(void); +void osBindMBlocksToNode(void *addr, StgWord size, uint32_t node); INLINE_HEADER size_t roundDownToPage (size_t x) diff --git a/rts/sm/Storage.c b/rts/sm/Storage.c index 717c96ae81..a9a7857d43 100644 --- a/rts/sm/Storage.c +++ b/rts/sm/Storage.c @@ -6,7 +6,7 @@ * * Documentation on the architecture of the Storage Manager can be * found in the online commentary: - * + * * http://ghc.haskell.org/trac/ghc/wiki/Commentary/Rts/Storage * * ---------------------------------------------------------------------------*/ @@ -37,7 +37,7 @@ #include "ffi.h" -/* +/* * All these globals require sm_mutex to access in THREADED_RTS mode. */ StgIndStatic *dyn_caf_list = NULL; @@ -54,9 +54,22 @@ generation *generations = NULL; /* all the generations */ generation *g0 = NULL; /* generation 0, for convenience */ generation *oldest_gen = NULL; /* oldest generation, for convenience */ -nursery *nurseries = NULL; /* array of nurseries, size == n_capabilities */ +/* + * Array of nurseries, size == n_capabilities + * + * nursery[i] belongs to NUMA node (i % RtsFlags.GcFlags.nNumaNodes) + * This is chosen to be the same convention as capabilities[i], so + * that when not using nursery chunks (+RTS -n), we just map + * capabilities to nurseries 1:1. + */ +nursery *nurseries = NULL; uint32_t n_nurseries; -volatile StgWord next_nursery = 0; + +/* + * When we are using nursery chunks, we need a separate next_nursery + * pointer for each NUMA node. 
+ */ +volatile StgWord next_nursery[MAX_NUMA_NODES]; #ifdef THREADED_RTS /* @@ -104,7 +117,7 @@ initGeneration (generation *gen, int g) void initStorage (void) { - uint32_t g; + uint32_t g, n; if (generations != NULL) { // multi-init protection @@ -120,22 +133,22 @@ initStorage (void) ASSERT(LOOKS_LIKE_INFO_PTR_NOT_NULL((StgWord)&stg_BLOCKING_QUEUE_CLEAN_info)); ASSERT(LOOKS_LIKE_CLOSURE_PTR(&stg_dummy_ret_closure)); ASSERT(!HEAP_ALLOCED(&stg_dummy_ret_closure)); - + if (RtsFlags.GcFlags.maxHeapSize != 0 && - RtsFlags.GcFlags.heapSizeSuggestion > + RtsFlags.GcFlags.heapSizeSuggestion > RtsFlags.GcFlags.maxHeapSize) { RtsFlags.GcFlags.maxHeapSize = RtsFlags.GcFlags.heapSizeSuggestion; } if (RtsFlags.GcFlags.maxHeapSize != 0 && - RtsFlags.GcFlags.minAllocAreaSize > + RtsFlags.GcFlags.minAllocAreaSize > RtsFlags.GcFlags.maxHeapSize) { errorBelch("maximum heap size (-M) is smaller than minimum alloc area size (-A)"); RtsFlags.GcFlags.minAllocAreaSize = RtsFlags.GcFlags.maxHeapSize; } initBlockAllocator(); - + #if defined(THREADED_RTS) initMutex(&sm_mutex); #endif @@ -143,7 +156,7 @@ initStorage (void) ACQUIRE_SM_LOCK; /* allocate generation info array */ - generations = (generation *)stgMallocBytes(RtsFlags.GcFlags.generations + generations = (generation *)stgMallocBytes(RtsFlags.GcFlags.generations * sizeof(struct generation_), "initStorage: gens"); @@ -161,7 +174,7 @@ initStorage (void) generations[g].to = &generations[g+1]; } oldest_gen->to = oldest_gen; - + /* The oldest generation has one step. */ if (RtsFlags.GcFlags.compact || RtsFlags.GcFlags.sweep) { if (RtsFlags.GcFlags.generations == 1) { @@ -178,7 +191,7 @@ initStorage (void) dyn_caf_list = (StgIndStatic*)END_OF_CAF_LIST; debug_caf_list = (StgIndStatic*)END_OF_CAF_LIST; revertible_caf_list = (StgIndStatic*)END_OF_CAF_LIST; - + if (RtsFlags.GcFlags.largeAllocLim > 0) { large_alloc_lim = RtsFlags.GcFlags.largeAllocLim * BLOCK_SIZE_W; } else { @@ -196,7 +209,9 @@ initStorage (void) N = 0; - next_nursery = 0; + for (n = 0; n < RtsFlags.GcFlags.nNumaNodes; n++) { + next_nursery[n] = n; + } storageAddCapabilities(0, n_capabilities); IF_DEBUG(gc, statDescribeGens()); @@ -257,7 +272,8 @@ void storageAddCapabilities (uint32_t from, uint32_t to) // allocate a block for each mut list for (n = from; n < to; n++) { for (g = 1; g < RtsFlags.GcFlags.generations; g++) { - capabilities[n]->mut_lists[g] = allocBlock(); + capabilities[n]->mut_lists[g] = + allocBlockOnNode(capNoToNumaNode(n)); } } @@ -526,7 +542,7 @@ StgInd* newGCdCAF (StgRegTable *reg, StgIndStatic *caf) -------------------------------------------------------------------------- */ static bdescr * -allocNursery (bdescr *tail, W_ blocks) +allocNursery (uint32_t node, bdescr *tail, W_ blocks) { bdescr *bd = NULL; W_ i, n; @@ -542,7 +558,7 @@ allocNursery (bdescr *tail, W_ blocks) // allocLargeChunk will prefer large chunks, but will pick up // small chunks if there are any available. 
We must allow // single blocks here to avoid fragmentation (#7257) - bd = allocLargeChunk(1, n); + bd = allocLargeChunkOnNode(node, 1, n); n = bd->blocks; blocks -= n; @@ -584,6 +600,7 @@ assignNurseryToCapability (Capability *cap, uint32_t n) cap->r.rCurrentNursery = nurseries[n].blocks; newNurseryBlock(nurseries[n].blocks); cap->r.rCurrentAlloc = NULL; + ASSERT(cap->r.rCurrentNursery->node == cap->node); } /* @@ -593,16 +610,18 @@ assignNurseryToCapability (Capability *cap, uint32_t n) static void assignNurseriesToCapabilities (uint32_t from, uint32_t to) { - uint32_t i; + uint32_t i, node; for (i = from; i < to; i++) { - assignNurseryToCapability(capabilities[i], next_nursery++); + node = capabilities[i]->node; + assignNurseryToCapability(capabilities[i], next_nursery[node]); + next_nursery[node] += RtsFlags.GcFlags.nNumaNodes; } } static void allocNurseries (uint32_t from, uint32_t to) -{ +{ uint32_t i; memcount n_blocks; @@ -613,24 +632,28 @@ allocNurseries (uint32_t from, uint32_t to) } for (i = from; i < to; i++) { - nurseries[i].blocks = allocNursery(NULL, n_blocks); + nurseries[i].blocks = allocNursery(capNoToNumaNode(i), NULL, n_blocks); nurseries[i].n_blocks = n_blocks; } } - + void resetNurseries (void) { - next_nursery = 0; + uint32_t n; + + for (n = 0; n < RtsFlags.GcFlags.nNumaNodes; n++) { + next_nursery[n] = n; + } assignNurseriesToCapabilities(0, n_capabilities); #ifdef DEBUG bdescr *bd; - uint32_t n; for (n = 0; n < n_nurseries; n++) { for (bd = nurseries[n].blocks; bd; bd = bd->link) { ASSERT(bd->gen_no == 0); ASSERT(bd->gen == g0); + ASSERT(bd->node == capNoToNumaNode(n)); IF_DEBUG(sanity, memset(bd->start, 0xaa, BLOCK_SIZE)); } } @@ -649,56 +672,54 @@ countNurseryBlocks (void) return blocks; } -static void -resizeNursery (nursery *nursery, W_ blocks) -{ - bdescr *bd; - W_ nursery_blocks; - - nursery_blocks = nursery->n_blocks; - if (nursery_blocks == blocks) return; - - if (nursery_blocks < blocks) { - debugTrace(DEBUG_gc, "increasing size of nursery to %d blocks", - blocks); - nursery->blocks = allocNursery(nursery->blocks, blocks-nursery_blocks); - } - else { - bdescr *next_bd; - - debugTrace(DEBUG_gc, "decreasing size of nursery to %d blocks", - blocks); - - bd = nursery->blocks; - while (nursery_blocks > blocks) { - next_bd = bd->link; - next_bd->u.back = NULL; - nursery_blocks -= bd->blocks; // might be a large block - freeGroup(bd); - bd = next_bd; - } - nursery->blocks = bd; - // might have gone just under, by freeing a large block, so make - // up the difference. - if (nursery_blocks < blocks) { - nursery->blocks = allocNursery(nursery->blocks, blocks-nursery_blocks); - } - } - - nursery->n_blocks = blocks; - ASSERT(countBlocks(nursery->blocks) == nursery->n_blocks); -} - -// +// // Resize each of the nurseries to the specified size. 
// static void resizeNurseriesEach (W_ blocks) { - uint32_t i; + uint32_t i, node; + bdescr *bd; + W_ nursery_blocks; + nursery *nursery; for (i = 0; i < n_nurseries; i++) { - resizeNursery(&nurseries[i], blocks); + nursery = &nurseries[i]; + nursery_blocks = nursery->n_blocks; + if (nursery_blocks == blocks) continue; + + node = capNoToNumaNode(i); + if (nursery_blocks < blocks) { + debugTrace(DEBUG_gc, "increasing size of nursery to %d blocks", + blocks); + nursery->blocks = allocNursery(node, nursery->blocks, + blocks-nursery_blocks); + } + else + { + bdescr *next_bd; + + debugTrace(DEBUG_gc, "decreasing size of nursery to %d blocks", + blocks); + + bd = nursery->blocks; + while (nursery_blocks > blocks) { + next_bd = bd->link; + next_bd->u.back = NULL; + nursery_blocks -= bd->blocks; // might be a large block + freeGroup(bd); + bd = next_bd; + } + nursery->blocks = bd; + // might have gone just under, by freeing a large block, so make + // up the difference. + if (nursery_blocks < blocks) { + nursery->blocks = allocNursery(node, nursery->blocks, + blocks-nursery_blocks); + } + } + nursery->n_blocks = blocks; + ASSERT(countBlocks(nursery->blocks) == nursery->n_blocks); } } @@ -716,7 +737,7 @@ resizeNurseriesFixed (void) resizeNurseriesEach(blocks); } -// +// // Resize the nurseries to the total specified size. // void @@ -731,18 +752,42 @@ rtsBool getNewNursery (Capability *cap) { StgWord i; + uint32_t node = cap->node; + uint32_t n; for(;;) { - i = next_nursery; - if (i >= n_nurseries) { + i = next_nursery[node]; + if (i < n_nurseries) { + if (cas(&next_nursery[node], i, + i+RtsFlags.GcFlags.nNumaNodes) == i) { + assignNurseryToCapability(cap, i); + return rtsTrue; + } + } else if (RtsFlags.GcFlags.nNumaNodes > 1) { + // Try to find an unused nursery chunk on other nodes. We'll get + // remote memory, but the rationale is that avoiding GC is better + // than avoiding remote memory access. + rtsBool lost = rtsFalse; + for (n = 0; n < RtsFlags.GcFlags.nNumaNodes; n++) { + if (n == node) continue; + i = next_nursery[n]; + if (i < n_nurseries) { + if (cas(&next_nursery[n], i, + i+RtsFlags.GcFlags.nNumaNodes) == i) { + assignNurseryToCapability(cap, i); + return rtsTrue; + } else { + lost = rtsTrue; /* lost a race */ + } + } + } + if (!lost) return rtsFalse; + } else { return rtsFalse; } - if (cas(&next_nursery, i, i+1) == i) { - assignNurseryToCapability(cap, i); - return rtsTrue; - } } } + /* ----------------------------------------------------------------------------- move_STACK is called to update the TSO structure after it has been moved from one place to another. @@ -753,8 +798,8 @@ move_STACK (StgStack *src, StgStack *dest) { ptrdiff_t diff; - // relocate the stack pointer... - diff = (StgPtr)dest - (StgPtr)src; // In *words* + // relocate the stack pointer... + diff = (StgPtr)dest - (StgPtr)src; // In *words* dest->sp = (StgPtr)dest->sp + diff; } @@ -818,7 +863,7 @@ allocate (Capability *cap, W_ n) } ACQUIRE_SM_LOCK - bd = allocGroup(req_blocks); + bd = allocGroupOnNode(cap->node,req_blocks); dbl_link_onto(bd, &g0->large_objects); g0->n_large_blocks += bd->blocks; // might be larger than req_blocks g0->n_new_large_words += n; @@ -834,19 +879,19 @@ allocate (Capability *cap, W_ n) bd = cap->r.rCurrentAlloc; if (bd == NULL || bd->free + n > bd->start + BLOCK_SIZE_W) { - + if (bd) finishedNurseryBlock(cap,bd); // The CurrentAlloc block is full, we need to find another // one. 
First, we try taking the next block from the // nursery: bd = cap->r.rCurrentNursery->link; - + if (bd == NULL) { // The nursery is empty: allocate a fresh block (we can't // fail here). ACQUIRE_SM_LOCK; - bd = allocBlock(); + bd = allocBlockOnNode(cap->node); cap->r.rNursery->n_blocks++; RELEASE_SM_LOCK; initBdescr(bd, g0, g0); @@ -944,7 +989,7 @@ allocatePinned (Capability *cap, W_ n) } bd = cap->pinned_object_block; - + // If we don't have a block of pinned objects yet, or the current // one isn't large enough to hold the new object, get a new one. if (bd == NULL || (bd->free + n) > (bd->start + BLOCK_SIZE_W)) { @@ -964,24 +1009,13 @@ allocatePinned (Capability *cap, W_ n) // objects scale really badly if we do this). // // So first, we try taking the next block from the nursery, in - // the same way as allocate(), but note that we can only take - // an *empty* block, because we're about to mark it as - // BF_PINNED | BF_LARGE. + // the same way as allocate(). bd = cap->r.rCurrentNursery->link; - if (bd == NULL) { // must be empty! - // The nursery is empty, or the next block is non-empty: - // allocate a fresh block (we can't fail here). - - // XXX in the case when the next nursery block is - // non-empty we aren't exerting any pressure to GC soon, - // so if this case ever happens then we could in theory - // keep allocating for ever without calling the GC. We - // can't bump g0->n_new_large_words because that will be - // counted towards allocation, and we're already counting - // our pinned obects as allocation in - // collect_pinned_object_blocks in the GC. + if (bd == NULL) { + // The nursery is empty: allocate a fresh block (we can't fail + // here). ACQUIRE_SM_LOCK; - bd = allocBlock(); + bd = allocBlockOnNode(cap->node); RELEASE_SM_LOCK; initBdescr(bd, g0, g0); } else { @@ -1217,13 +1251,13 @@ W_ gcThreadLiveBlocks (uint32_t i, uint32_t g) * that will be collected next time will therefore need twice as many * blocks since all the data will be copied. */ -extern W_ +extern W_ calcNeeded (rtsBool force_major, memcount *blocks_needed) { W_ needed = 0, blocks; uint32_t g, N; generation *gen; - + if (force_major) { N = RtsFlags.GcFlags.generations - 1; } else { @@ -1238,7 +1272,7 @@ calcNeeded (rtsBool force_major, memcount *blocks_needed) // we need at least this much space needed += blocks; - + // are we collecting this gen? 
if (g == 0 || // always collect gen 0 blocks > gen->max_blocks) diff --git a/rts/win32/OSMem.c b/rts/win32/OSMem.c index f0680e953a..3450267d9c 100644 --- a/rts/win32/OSMem.c +++ b/rts/win32/OSMem.c @@ -482,3 +482,25 @@ void osReleaseHeapMemory (void) } #endif + +rtsBool osNumaAvailable(void) +{ + return rtsFalse; +} + +uint32_t osNumaNodes(void) +{ + return 1; +} + +StgWord osNumaMask(void) +{ + return 1; +} + +void osBindMBlocksToNode( + void *addr STG_UNUSED, + StgWord size STG_UNUSED, + uint32_t node STG_UNUSED) +{ +} diff --git a/rts/win32/OSThreads.c b/rts/win32/OSThreads.c index 7a51ec5191..78fe29784b 100644 --- a/rts/win32/OSThreads.c +++ b/rts/win32/OSThreads.c @@ -300,6 +300,9 @@ interruptOSThread (OSThreadId id) CloseHandle(hdl); } +void setThreadNode (uint32_t node STG_UNUSED) { /* nothing */ } +void releaseThreadNode (void) { /* nothing */ } + #else /* !defined(THREADED_RTS) */ int diff --git a/testsuite/config/ghc b/testsuite/config/ghc index 355535153c..678cc56bc5 100644 --- a/testsuite/config/ghc +++ b/testsuite/config/ghc @@ -26,7 +26,7 @@ config.run_ways = ['normal', 'hpc'] config.other_ways = ['prof', 'normal_h', 'prof_hc_hb','prof_hb', 'prof_hd','prof_hy','prof_hr', - 'threaded1_ls', 'threaded2_hT', + 'threaded1_ls', 'threaded2_hT', 'debug_numa', 'llvm', 'debugllvm', 'profllvm', 'profoptllvm', 'profthreadedllvm', 'debug', @@ -87,6 +87,8 @@ config.way_flags = lambda name : { 'normal' : [], 'normal_h' : [], 'g1' : [], + 'nursery_chunks' : [], + 'debug_numa' : ['-threaded', '-debug'], 'optasm' : ['-O', '-fasm'], 'llvm' : ['-fllvm'], 'optllvm' : ['-O', '-fllvm'], @@ -119,6 +121,8 @@ config.way_rts_flags = { 'normal' : [], 'normal_h' : ['-h'], # works without -prof 'g1' : ['-G1'], + 'nursery_chunks' : ['-n32k'], + 'debug_numa' : ['-N2', '--debug-numa=2'], 'optasm' : [], 'llvm' : [], 'optllvm' : [], diff --git a/testsuite/tests/codeGen/should_run/all.T b/testsuite/tests/codeGen/should_run/all.T index b8bc4f60c2..42ec7d3e7f 100644 --- a/testsuite/tests/codeGen/should_run/all.T +++ b/testsuite/tests/codeGen/should_run/all.T @@ -21,7 +21,10 @@ test('cgrun017', normal, compile_and_run, ['']) test('cgrun018', normal, compile_and_run, ['']) test('cgrun019', normal, compile_and_run, ['']) test('cgrun020', normal, compile_and_run, ['']) -test('cgrun021', normal, compile_and_run, ['']) + +# cgrun021 does some GC, so let's use it to test GC parameters +test('cgrun021', extra_ways(['nursery_chunks']), compile_and_run, ['']) + test('cgrun022', normal, compile_and_run, ['']) test('cgrun024', normal, compile_and_run, ['']) test('cgrun025', diff --git a/testsuite/tests/concurrent/prog001/all.T b/testsuite/tests/concurrent/prog001/all.T index a3ba7b61f1..b5d4b1c4eb 100644 --- a/testsuite/tests/concurrent/prog001/all.T +++ b/testsuite/tests/concurrent/prog001/all.T @@ -4,7 +4,7 @@ # Also tests for bug #1466. # NB. This is a VERY IMPORTANT test! It is the only good test we have -# for throwTo. It has shown up several bugs that were not caught by the +# for throwTo. It has shown up several bugs that were not caught by the # other concurrency tests. 
# The program appears to be sensitive to scheduling, and can diverge diff --git a/testsuite/tests/concurrent/should_run/all.T b/testsuite/tests/concurrent/should_run/all.T index a451dfeb80..a974f6a304 100644 --- a/testsuite/tests/concurrent/should_run/all.T +++ b/testsuite/tests/concurrent/should_run/all.T @@ -35,7 +35,9 @@ test('T2910a', normal, compile_and_run, ['']) test('T3279', normal, compile_and_run, ['']) # This test takes a long time with the default context switch interval -test('T3429', extra_run_opts('+RTS -C0.001 -RTS'), compile_and_run, ['']) +test('T3429', [ extra_run_opts('+RTS -C0.001 -RTS'), + extra_ways(['debug_numa']) ], + compile_and_run, ['']) # without -O, goes into an infinite loop # GHCi does not detect the infinite loop. We should really fix this. @@ -144,7 +146,7 @@ test('conc020', normal, compile_and_run, ['']) test('conc021', [ omit_ways(['ghci']), exit_code(1) ], compile_and_run, ['']) test('conc022', normal, compile_and_run, ['']) -# On Windows, the non-threaded RTS creates a real OS thread for each +# On Windows, the non-threaded RTS creates a real OS thread for each # threadDelay. conc023 creates 5000 concurrent threadDelays, and the # resulting creation of OS threads seems to cause the system to run # out of memory sometimes (I'm not sure exactly how/why this happens, @@ -207,7 +209,7 @@ test('conc037', only_ways(['threaded1','threaded2']), compile_and_run, ['']) test('conc038', only_ways(['threaded1','threaded2']), compile_and_run, ['']) # Omit for GHCi, uses foreign export -# Omit for the threaded ways, because in this case the main thread is allowed to +# Omit for the threaded ways, because in this case the main thread is allowed to # complete, which causes the child thread to be interrupted. test('conc039', omit_ways(['ghci'] + threaded_ways), compile_and_run, ['']) @@ -253,4 +255,3 @@ test('setnumcapabilities001', # omit ghci, which can't handle unboxed tuples: test('compareAndSwap', [omit_ways(['ghci','hpc']), reqlib('primitive')], compile_and_run, ['']) - -- cgit v1.2.1