23 files changed, 54 insertions, 1010 deletions
diff --git a/compiler/cmm/CmmMachOp.hs b/compiler/cmm/CmmMachOp.hs
index d8ce492de1..c4ec393ad6 100644
--- a/compiler/cmm/CmmMachOp.hs
+++ b/compiler/cmm/CmmMachOp.hs
@@ -19,9 +19,6 @@ module CmmMachOp
     -- CallishMachOp
     , CallishMachOp(..), callishMachOpHints
     , pprCallishMachOp
-
-    -- Atomic read-modify-write
-    , AtomicMachOp(..)
    )
 where
 
@@ -550,24 +547,8 @@ data CallishMachOp
 
   | MO_PopCnt Width
   | MO_BSwap Width
-
-  -- Atomic read-modify-write.
-  | MO_AtomicRMW Width AtomicMachOp
-  | MO_AtomicRead Width
-  | MO_AtomicWrite Width
-  | MO_Cmpxchg Width
   deriving (Eq, Show)
 
--- | The operation to perform atomically.
-data AtomicMachOp =
-      AMO_Add
-    | AMO_Sub
-    | AMO_And
-    | AMO_Nand
-    | AMO_Or
-    | AMO_Xor
-      deriving (Eq, Show)
-
 pprCallishMachOp :: CallishMachOp -> SDoc
 pprCallishMachOp mo = text (show mo)
 
diff --git a/compiler/cmm/CmmSink.hs b/compiler/cmm/CmmSink.hs
index 4dced9afd2..4c025425ab 100644
--- a/compiler/cmm/CmmSink.hs
+++ b/compiler/cmm/CmmSink.hs
@@ -650,10 +650,6 @@ data AbsMem
 -- perhaps we ought to have a special annotation for calls that can
 -- modify heap/stack memory.  For now we just use the conservative
 -- definition here.
---
--- Some CallishMachOp imply a memory barrier e.g. AtomicRMW and
--- therefore we should never float any memory operations across one of
--- these calls.
 
 
 bothMems :: AbsMem -> AbsMem -> AbsMem
diff --git a/compiler/cmm/PprC.hs b/compiler/cmm/PprC.hs
index 455c79ba02..47b247e278 100644
--- a/compiler/cmm/PprC.hs
+++ b/compiler/cmm/PprC.hs
@@ -753,10 +753,6 @@ pprCallishMachOp_for_C mop
         MO_Memmove      -> ptext (sLit "memmove")
         (MO_BSwap w)    -> ptext (sLit $ bSwapLabel w)
         (MO_PopCnt w)   -> ptext (sLit $ popCntLabel w)
-        (MO_AtomicRMW w amop) -> ptext (sLit $ atomicRMWLabel w amop)
-        (MO_Cmpxchg w)  -> ptext (sLit $ cmpxchgLabel w)
-        (MO_AtomicRead w)  -> ptext (sLit $ atomicReadLabel w)
-        (MO_AtomicWrite w) -> ptext (sLit $ atomicWriteLabel w)
         (MO_UF_Conv w)  -> ptext (sLit $ word2FloatLabel w)
 
         MO_S_QuotRem  {} -> unsupported
diff --git a/compiler/codeGen/StgCmmPrim.hs b/compiler/codeGen/StgCmmPrim.hs
index e4c682bf02..40a5e3649b 100644
--- a/compiler/codeGen/StgCmmPrim.hs
+++ b/compiler/codeGen/StgCmmPrim.hs
@@ -769,25 +769,6 @@ emitPrimOp _ res PrefetchByteArrayOp0        args = doPrefetchByteArrayOp 0 res
 emitPrimOp _ res PrefetchMutableByteArrayOp0 args = doPrefetchByteArrayOp 0 res args
 emitPrimOp _ res PrefetchAddrOp0             args = doPrefetchAddrOp 0 res args
 
--- Atomic read-modify-write
-emitPrimOp dflags [res] FetchAddByteArrayOp_Int [mba, ix, n] =
-    doAtomicRMW res AMO_Add mba ix (bWord dflags) n
-emitPrimOp dflags [res] FetchSubByteArrayOp_Int [mba, ix, n] =
-    doAtomicRMW res AMO_Sub mba ix (bWord dflags) n
-emitPrimOp dflags [res] FetchAndByteArrayOp_Int [mba, ix, n] =
-    doAtomicRMW res AMO_And mba ix (bWord dflags) n
-emitPrimOp dflags [res] FetchNandByteArrayOp_Int [mba, ix, n] =
-    doAtomicRMW res AMO_Nand mba ix (bWord dflags) n
-emitPrimOp dflags [res] FetchOrByteArrayOp_Int [mba, ix, n] =
-    doAtomicRMW res AMO_Or mba ix (bWord dflags) n
-emitPrimOp dflags [res] FetchXorByteArrayOp_Int [mba, ix, n] =
-    doAtomicRMW res AMO_Xor mba ix (bWord dflags) n
-emitPrimOp dflags [res] AtomicReadByteArrayOp_Int [mba, ix] =
-    doAtomicReadByteArray res mba ix (bWord dflags)
-emitPrimOp dflags [] AtomicWriteByteArrayOp_Int [mba, ix, val] =
-    doAtomicWriteByteArray mba ix (bWord dflags) val
-emitPrimOp dflags [res] CasByteArrayOp_Int [mba, ix, old, new] =
-    doCasByteArray res mba ix (bWord dflags) old new
 
 -- The rest just translate straightforwardly
 emitPrimOp dflags [res] op [arg]
@@ -1952,81 +1933,6 @@ doWriteSmallPtrArrayOp addr idx val = do
     emit (setInfo addr (CmmLit (CmmLabel mkSMAP_DIRTY_infoLabel)))
 
 ------------------------------------------------------------------------------
--- Atomic read-modify-write
-
--- | Emit an atomic modification to a byte array element. The result
--- reg contains that previous value of the element. Implies a full
--- memory barrier.
-doAtomicRMW :: LocalReg      -- ^ Result reg
-            -> AtomicMachOp  -- ^ Atomic op (e.g. add)
-            -> CmmExpr       -- ^ MutableByteArray#
-            -> CmmExpr       -- ^ Index
-            -> CmmType       -- ^ Type of element by which we are indexing
-            -> CmmExpr       -- ^ Op argument (e.g. amount to add)
-            -> FCode ()
-doAtomicRMW res amop mba idx idx_ty n = do
-    dflags <- getDynFlags
-    let width = typeWidth idx_ty
-        addr  = cmmIndexOffExpr dflags (arrWordsHdrSize dflags)
-                width mba idx
-    emitPrimCall
-        [ res ]
-        (MO_AtomicRMW width amop)
-        [ addr, n ]
-
--- | Emit an atomic read to a byte array that acts as a memory barrier.
-doAtomicReadByteArray
-    :: LocalReg  -- ^ Result reg
-    -> CmmExpr   -- ^ MutableByteArray#
-    -> CmmExpr   -- ^ Index
-    -> CmmType   -- ^ Type of element by which we are indexing
-    -> FCode ()
-doAtomicReadByteArray res mba idx idx_ty = do
-    dflags <- getDynFlags
-    let width = typeWidth idx_ty
-        addr  = cmmIndexOffExpr dflags (arrWordsHdrSize dflags)
-                width mba idx
-    emitPrimCall
-        [ res ]
-        (MO_AtomicRead width)
-        [ addr ]
-
--- | Emit an atomic write to a byte array that acts as a memory barrier.
-doAtomicWriteByteArray
-    :: CmmExpr   -- ^ MutableByteArray#
-    -> CmmExpr   -- ^ Index
-    -> CmmType   -- ^ Type of element by which we are indexing
-    -> CmmExpr   -- ^ Value to write
-    -> FCode ()
-doAtomicWriteByteArray mba idx idx_ty val = do
-    dflags <- getDynFlags
-    let width = typeWidth idx_ty
-        addr  = cmmIndexOffExpr dflags (arrWordsHdrSize dflags)
-                width mba idx
-    emitPrimCall
-        [ {- no results -} ]
-        (MO_AtomicWrite width)
-        [ addr, val ]
-
-doCasByteArray
-    :: LocalReg  -- ^ Result reg
-    -> CmmExpr   -- ^ MutableByteArray#
-    -> CmmExpr   -- ^ Index
-    -> CmmType   -- ^ Type of element by which we are indexing
-    -> CmmExpr   -- ^ Old value
-    -> CmmExpr   -- ^ New value
-    -> FCode ()
-doCasByteArray res mba idx idx_ty old new = do
-    dflags <- getDynFlags
-    let width = (typeWidth idx_ty)
-        addr = cmmIndexOffExpr dflags (arrWordsHdrSize dflags)
-               width mba idx
-    emitPrimCall
-        [ res ]
-        (MO_Cmpxchg width)
-        [ addr, old, new ]
-
-------------------------------------------------------------------------------
 -- Helpers for emitting function calls
 
 -- | Emit a call to @memcpy@.
diff --git a/compiler/llvmGen/Llvm/AbsSyn.hs b/compiler/llvmGen/Llvm/AbsSyn.hs
index 24d0856ea3..f92bd89c5c 100644
--- a/compiler/llvmGen/Llvm/AbsSyn.hs
+++ b/compiler/llvmGen/Llvm/AbsSyn.hs
@@ -65,8 +65,6 @@ data LlvmFunction = LlvmFunction {
 
 type LlvmFunctions = [LlvmFunction]
 
-type SingleThreaded = Bool
-
 -- | LLVM ordering types for synchronization purposes. (Introduced in LLVM
 -- 3.0). Please see the LLVM documentation for a better description.
 data LlvmSyncOrdering
@@ -226,11 +224,6 @@ data LlvmExpression
   | Load LlvmVar
 
   {- |
-    Atomic load of the value at location ptr
-  -}
-  | ALoad LlvmSyncOrdering SingleThreaded LlvmVar
-
-  {- |
     Navigate in an structure, selecting elements
       * inbound: Is the pointer inbounds? (computed pointer doesn't overflow)
       * ptr:     Location of the structure
diff --git a/compiler/llvmGen/Llvm/PpLlvm.hs b/compiler/llvmGen/Llvm/PpLlvm.hs
index 73077257f8..025078226d 100644
--- a/compiler/llvmGen/Llvm/PpLlvm.hs
+++ b/compiler/llvmGen/Llvm/PpLlvm.hs
@@ -239,7 +239,6 @@ ppLlvmExpression expr
         Insert     vec elt idx      -> ppInsert vec elt idx
         GetElemPtr inb ptr indexes  -> ppGetElementPtr inb ptr indexes
         Load       ptr              -> ppLoad ptr
-        ALoad      ord st ptr       -> ppALoad ord st ptr
         Malloc     tp amount        -> ppMalloc tp amount
         Phi        tp precessors    -> ppPhi tp precessors
         Asm        asm c ty v se sk -> ppAsm asm c ty v se sk
@@ -328,18 +327,13 @@ ppSyncOrdering SyncSeqCst    = text "seq_cst"
 -- of specifying alignment.
 
 ppLoad :: LlvmVar -> SDoc
-ppLoad var = text "load" <+> ppr var <> align
+ppLoad var
+    | isVecPtrVar var = text "load" <+> ppr var <>
+                        comma <+> text "align 1"
+    | otherwise       = text "load" <+> ppr var
   where
-    align | isVector . pLower . getVarType $ var = text ", align 1"
-          | otherwise = empty
-
-ppALoad :: LlvmSyncOrdering -> SingleThreaded -> LlvmVar -> SDoc
-ppALoad ord st var = sdocWithDynFlags $ \dflags ->
-  let alignment = (llvmWidthInBits dflags $ getVarType var) `quot` 8
-      align     = text ", align" <+> ppr alignment
-      sThreaded | st        = text " singlethread"
-                | otherwise = empty
-  in text "load atomic" <+> ppr var <> sThreaded <+> ppSyncOrdering ord <> align
+    isVecPtrVar :: LlvmVar -> Bool
+    isVecPtrVar = isVector . pLower . getVarType
 
 ppStore :: LlvmVar -> LlvmVar -> SDoc
 ppStore val dst
diff --git a/compiler/llvmGen/LlvmCodeGen/CodeGen.hs b/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
index 4a56600937..517553516b 100644
--- a/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
+++ b/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
@@ -15,7 +15,6 @@ import BlockId
 import CodeGen.Platform ( activeStgRegs, callerSaves )
 import CLabel
 import Cmm
-import CPrim
 import PprCmm
 import CmmUtils
 import Hoopl
@@ -33,7 +32,6 @@ import Unique
 import Data.List ( nub )
 import Data.Maybe ( catMaybes )
 
-type Atomic = Bool
 type LlvmStatements = OrdList LlvmStatement
 
 -- -----------------------------------------------------------------------------
@@ -230,17 +228,6 @@ genCall t@(PrimTarget (MO_PopCnt w)) dsts args =
 genCall t@(PrimTarget (MO_BSwap w)) dsts args =
     genCallSimpleCast w t dsts args
 
-genCall (PrimTarget (MO_AtomicRead _)) [dst] [addr] = do
-  dstV <- getCmmReg (CmmLocal dst)
-  (v1, stmts, top) <- genLoad True addr (localRegType dst)
-  let stmt1 = Store v1 dstV
-  return (stmts `snocOL` stmt1, top)
-
--- TODO: implement these properly rather than calling to RTS functions.
--- genCall t@(PrimTarget (MO_AtomicWrite width)) [] [addr, val] = undefined
--- genCall t@(PrimTarget (MO_AtomicRMW width amop)) [dst] [addr, n] = undefined
--- genCall t@(PrimTarget (MO_Cmpxchg width)) [dst] [addr, old, new] = undefined
-
 -- Handle memcpy function specifically since llvm's intrinsic version takes
 -- some extra parameters.
 genCall t@(PrimTarget op) [] args'
@@ -561,6 +548,7 @@ cmmPrimOpFunctions mop = do
 
     (MO_Prefetch_Data _ )-> fsLit "llvm.prefetch"
 
+
     MO_S_QuotRem {}  -> unsupported
     MO_U_QuotRem {}  -> unsupported
     MO_U_QuotRem2 {} -> unsupported
@@ -570,12 +558,6 @@ cmmPrimOpFunctions mop = do
     MO_Touch         -> unsupported
     MO_UF_Conv _     -> unsupported
 
-    MO_AtomicRead _  -> unsupported
-
-    MO_AtomicRMW w amop -> fsLit $ atomicRMWLabel w amop
-    MO_Cmpxchg w        -> fsLit $ cmpxchgLabel w
-    MO_AtomicWrite w    -> fsLit $ atomicWriteLabel w
-
 -- | Tail function calls
 genJump :: CmmExpr -> [GlobalReg] -> LlvmM StmtData
 
@@ -867,7 +849,7 @@ exprToVarOpt opt e = case e of
         -> genLit opt lit
 
     CmmLoad e' ty
-        -> genLoad False e' ty
+        -> genLoad e' ty
 
     -- Cmmreg in expression is the value, so must load. If you want actual
     -- reg pointer, call getCmmReg directly.
@@ -1286,41 +1268,41 @@ genMachOp_slow _ _ _ = panic "genMachOp: More then 2 expressions in MachOp!"
 
 
 -- | Handle CmmLoad expression.
-genLoad :: Atomic -> CmmExpr -> CmmType -> LlvmM ExprData
+genLoad :: CmmExpr -> CmmType -> LlvmM ExprData
 
 -- First we try to detect a few common cases and produce better code for
 -- these then the default case. We are mostly trying to detect Cmm code
 -- like I32[Sp + n] and use 'getelementptr' operations instead of the
 -- generic case that uses casts and pointer arithmetic
-genLoad atomic e@(CmmReg (CmmGlobal r)) ty
-    = genLoad_fast atomic e r 0 ty
+genLoad e@(CmmReg (CmmGlobal r)) ty
+    = genLoad_fast e r 0 ty
 
-genLoad atomic e@(CmmRegOff (CmmGlobal r) n) ty
-    = genLoad_fast atomic e r n ty
+genLoad e@(CmmRegOff (CmmGlobal r) n) ty
+    = genLoad_fast e r n ty
 
-genLoad atomic e@(CmmMachOp (MO_Add _) [
+genLoad e@(CmmMachOp (MO_Add _) [
                             (CmmReg (CmmGlobal r)),
                             (CmmLit (CmmInt n _))])
                 ty
-    = genLoad_fast atomic e r (fromInteger n) ty
+    = genLoad_fast e r (fromInteger n) ty
 
-genLoad atomic e@(CmmMachOp (MO_Sub _) [
+genLoad e@(CmmMachOp (MO_Sub _) [
                             (CmmReg (CmmGlobal r)),
                             (CmmLit (CmmInt n _))])
                 ty
-    = genLoad_fast atomic e r (negate $ fromInteger n) ty
+    = genLoad_fast e r (negate $ fromInteger n) ty
 
 -- generic case
-genLoad atomic e ty
+genLoad e ty
     = do other <- getTBAAMeta otherN
-         genLoad_slow atomic e ty other
+         genLoad_slow e ty other
 
 -- | Handle CmmLoad expression.
 -- This is a special case for loading from a global register pointer
 -- offset such as I32[Sp+8].
-genLoad_fast :: Atomic -> CmmExpr -> GlobalReg -> Int -> CmmType
-             -> LlvmM ExprData
-genLoad_fast atomic e r n ty = do
+genLoad_fast :: CmmExpr -> GlobalReg -> Int -> CmmType
+                -> LlvmM ExprData
+genLoad_fast e r n ty = do
     dflags <- getDynFlags
     (gv, grt, s1) <- getCmmRegVal (CmmGlobal r)
     meta          <- getTBAARegMeta r
@@ -1333,7 +1315,7 @@ genLoad_fast atomic e r n ty = do
                 case grt == ty' of
                      -- were fine
                      True -> do
-                         (var, s3) <- doExpr ty' (MExpr meta $ loadInstr ptr)
+                         (var, s3) <- doExpr ty' (MExpr meta $ Load ptr)
                          return (var, s1 `snocOL` s2 `snocOL` s3,
                                      [])
 
@@ -1341,34 +1323,32 @@ genLoad_fast atomic e r n ty = do
                      False -> do
                          let pty = pLift ty'
                          (ptr', s3) <- doExpr pty $ Cast LM_Bitcast ptr pty
-                         (var, s4) <- doExpr ty' (MExpr meta $ loadInstr ptr')
+                         (var, s4) <- doExpr ty' (MExpr meta $ Load ptr')
                          return (var, s1 `snocOL` s2 `snocOL` s3
                                     `snocOL` s4, [])
 
             -- If its a bit type then we use the slow method since
             -- we can't avoid casting anyway.
-            False -> genLoad_slow atomic  e ty meta
-  where
-    loadInstr ptr | atomic    = ALoad SyncSeqCst False ptr
-                  | otherwise = Load ptr
+            False -> genLoad_slow e ty meta
+
 
 -- | Handle Cmm load expression.
 -- Generic case. Uses casts and pointer arithmetic if needed.
-genLoad_slow :: Atomic -> CmmExpr -> CmmType -> [MetaAnnot] -> LlvmM ExprData
-genLoad_slow atomic e ty meta = do
+genLoad_slow :: CmmExpr -> CmmType -> [MetaAnnot] -> LlvmM ExprData
+genLoad_slow e ty meta = do
     (iptr, stmts, tops) <- exprToVar e
     dflags <- getDynFlags
     case getVarType iptr of
          LMPointer _ -> do
                     (dvar, load) <- doExpr (cmmToLlvmType ty)
-                                           (MExpr meta $ loadInstr iptr)
+                                           (MExpr meta $ Load iptr)
                     return (dvar, stmts `snocOL` load, tops)
 
          i@(LMInt _) | i == llvmWord dflags -> do
                     let pty = LMPointer $ cmmToLlvmType ty
                     (ptr, cast)  <- doExpr pty $ Cast LM_Inttoptr iptr pty
                     (dvar, load) <- doExpr (cmmToLlvmType ty)
-                                           (MExpr meta $ loadInstr ptr)
+                                           (MExpr meta $ Load ptr)
                     return (dvar, stmts `snocOL` cast `snocOL` load, tops)
 
          other -> do dflags <- getDynFlags
@@ -1377,9 +1357,6 @@ genLoad_slow atomic e ty meta = do
                             "Size of Ptr: " ++ show (llvmPtrBits dflags) ++
                             ", Size of var: " ++ show (llvmWidthInBits dflags other) ++
                             ", Var: " ++ showSDoc dflags (ppr iptr)))
-  where
-    loadInstr ptr | atomic    = ALoad SyncSeqCst False ptr
-                  | otherwise = Load ptr
 
 
 -- | Handle CmmReg expression. This will return a pointer to the stack
diff --git a/compiler/nativeGen/CPrim.hs b/compiler/nativeGen/CPrim.hs
index 34782dfc1c..a6f4cab7bd 100644
--- a/compiler/nativeGen/CPrim.hs
+++ b/compiler/nativeGen/CPrim.hs
@@ -1,16 +1,11 @@
 -- | Generating C symbol names emitted by the compiler.
 module CPrim
-    ( atomicReadLabel
-    , atomicWriteLabel
-    , atomicRMWLabel
-    , cmpxchgLabel
-    , popCntLabel
+    ( popCntLabel
     , bSwapLabel
     , word2FloatLabel
     ) where
 
 import CmmType
-import CmmMachOp
 import Outputable
 
 popCntLabel :: Width -> String
@@ -36,46 +31,3 @@ word2FloatLabel w = "hs_word2float" ++ pprWidth w
     pprWidth W32 = "32"
     pprWidth W64 = "64"
     pprWidth w   = pprPanic "word2FloatLabel: Unsupported word width " (ppr w)
-
-atomicRMWLabel :: Width -> AtomicMachOp -> String
-atomicRMWLabel w amop = "hs_atomic_" ++ pprFunName amop ++ pprWidth w
-  where
-    pprWidth W8  = "8"
-    pprWidth W16 = "16"
-    pprWidth W32 = "32"
-    pprWidth W64 = "64"
-    pprWidth w   = pprPanic "atomicRMWLabel: Unsupported word width " (ppr w)
-
-    pprFunName AMO_Add  = "add"
-    pprFunName AMO_Sub  = "sub"
-    pprFunName AMO_And  = "and"
-    pprFunName AMO_Nand = "nand"
-    pprFunName AMO_Or   = "or"
-    pprFunName AMO_Xor  = "xor"
-
-cmpxchgLabel :: Width -> String
-cmpxchgLabel w = "hs_cmpxchg" ++ pprWidth w
-  where
-    pprWidth W8  = "8"
-    pprWidth W16 = "16"
-    pprWidth W32 = "32"
-    pprWidth W64 = "64"
-    pprWidth w   = pprPanic "cmpxchgLabel: Unsupported word width " (ppr w)
-
-atomicReadLabel :: Width -> String
-atomicReadLabel w = "hs_atomicread" ++ pprWidth w
-  where
-    pprWidth W8  = "8"
-    pprWidth W16 = "16"
-    pprWidth W32 = "32"
-    pprWidth W64 = "64"
-    pprWidth w   = pprPanic "atomicReadLabel: Unsupported word width " (ppr w)
-
-atomicWriteLabel :: Width -> String
-atomicWriteLabel w = "hs_atomicwrite" ++ pprWidth w
-  where
-    pprWidth W8  = "8"
-    pprWidth W16 = "16"
-    pprWidth W32 = "32"
-    pprWidth W64 = "64"
-    pprWidth w   = pprPanic "atomicWriteLabel: Unsupported word width " (ppr w)
diff --git a/compiler/nativeGen/PPC/CodeGen.hs b/compiler/nativeGen/PPC/CodeGen.hs
index 22a2c7cb6a..91651e6065 100644
--- a/compiler/nativeGen/PPC/CodeGen.hs
+++ b/compiler/nativeGen/PPC/CodeGen.hs
@@ -1160,10 +1160,6 @@ genCCall' dflags gcp target dest_regs args0
 
                     MO_BSwap w   -> (fsLit $ bSwapLabel w, False)
                     MO_PopCnt w  -> (fsLit $ popCntLabel w, False)
-                    MO_AtomicRMW w amop -> (fsLit $ atomicRMWLabel w amop, False)
-                    MO_Cmpxchg w -> (fsLit $ cmpxchgLabel w, False)
-                    MO_AtomicRead w  -> (fsLit $ atomicReadLabel w, False)
-                    MO_AtomicWrite w -> (fsLit $ atomicWriteLabel w, False)
 
                     MO_S_QuotRem {}  -> unsupported
                     MO_U_QuotRem {}  -> unsupported
diff --git a/compiler/nativeGen/SPARC/CodeGen.hs b/compiler/nativeGen/SPARC/CodeGen.hs
index 51f89d629f..f5e61d0a8f 100644
--- a/compiler/nativeGen/SPARC/CodeGen.hs
+++ b/compiler/nativeGen/SPARC/CodeGen.hs
@@ -654,10 +654,6 @@ outOfLineMachOp_table mop
 
         MO_BSwap w   -> fsLit $ bSwapLabel w
         MO_PopCnt w  -> fsLit $ popCntLabel w
-        MO_AtomicRMW w amop -> fsLit $ atomicRMWLabel w amop
-        MO_Cmpxchg w -> fsLit $ cmpxchgLabel w
-        MO_AtomicRead w -> fsLit $ atomicReadLabel w
-        MO_AtomicWrite w -> fsLit $ atomicWriteLabel w
 
         MO_S_QuotRem {}  -> unsupported
         MO_U_QuotRem {}  -> unsupported
diff --git a/compiler/nativeGen/X86/CodeGen.hs b/compiler/nativeGen/X86/CodeGen.hs
index 140e8b2992..fa93767fa3 100644
--- a/compiler/nativeGen/X86/CodeGen.hs
+++ b/compiler/nativeGen/X86/CodeGen.hs
@@ -1761,93 +1761,6 @@ genCCall dflags is32Bit (PrimTarget (MO_UF_Conv width)) dest_regs args = do
   where
     lbl = mkCmmCodeLabel primPackageId (fsLit (word2FloatLabel width))
 
-genCCall dflags _ (PrimTarget (MO_AtomicRMW width amop)) [dst] [addr, n] = do
-    Amode amode addr_code <- getAmode addr
-    arg <- getNewRegNat size
-    arg_code <- getAnyReg n
-    use_sse2 <- sse2Enabled
-    let platform = targetPlatform dflags
-        dst_r    = getRegisterReg platform use_sse2 (CmmLocal dst)
-    code <- op_code dst_r arg amode
-    return $ addr_code `appOL` arg_code arg `appOL` code
-  where
-    -- Code for the operation
-    op_code :: Reg       -- ^ The destination reg
-            -> Reg       -- ^ Register containing argument
-            -> AddrMode  -- ^ Address of location to mutate
-            -> NatM (OrdList Instr)
-    op_code dst_r arg amode = case amop of
-        -- In the common case where dst_r is a virtual register the
-        -- final move should go away, because it's the last use of arg
-        -- and the first use of dst_r.
-        AMO_Add  -> return $ toOL [ LOCK
-                                  , XADD size (OpReg arg) (OpAddr amode)
-                                  , MOV size (OpReg arg) (OpReg dst_r)
-                                  ]
-        AMO_Sub  -> return $ toOL [ NEGI size (OpReg arg)
-                                  , LOCK
-                                  , XADD size (OpReg arg) (OpAddr amode)
-                                  , MOV size (OpReg arg) (OpReg dst_r)
-                                  ]
-        AMO_And  -> cmpxchg_code (\ src dst -> unitOL $ AND size src dst)
-        AMO_Nand -> cmpxchg_code (\ src dst -> toOL [ AND size src dst
-                                                    , NOT size dst
-                                                    ])
-        AMO_Or   -> cmpxchg_code (\ src dst -> unitOL $ OR size src dst)
-        AMO_Xor  -> cmpxchg_code (\ src dst -> unitOL $ XOR size src dst)
-      where
-        -- Simulate operation that lacks a dedicated instruction using
-        -- cmpxchg.
-        cmpxchg_code :: (Operand -> Operand -> OrdList Instr)
-                     -> NatM (OrdList Instr)
-        cmpxchg_code instrs = do
-            lbl <- getBlockIdNat
-            tmp <- getNewRegNat size
-            return $ toOL
-                [ MOV size (OpAddr amode) (OpReg eax)
-                , JXX ALWAYS lbl
-                , NEWBLOCK lbl
-                  -- Keep old value so we can return it:
-                , MOV size (OpReg eax) (OpReg dst_r)
-                , MOV size (OpReg eax) (OpReg tmp)
-                ]
-                `appOL` instrs (OpReg arg) (OpReg tmp) `appOL` toOL
-                [ LOCK
-                , CMPXCHG size (OpReg tmp) (OpAddr amode)
-                , JXX NE lbl
-                ]
-
-    size = intSize width
-
-genCCall dflags _ (PrimTarget (MO_AtomicRead width)) [dst] [addr] = do
-  load_code <- intLoadCode (MOV (intSize width)) addr
-  let platform = targetPlatform dflags
-  use_sse2 <- sse2Enabled
-  return (load_code (getRegisterReg platform use_sse2 (CmmLocal dst)))
-
-genCCall _ _ (PrimTarget (MO_AtomicWrite width)) [] [addr, val] = do
-    assignMem_IntCode (intSize width) addr val
-
-genCCall dflags _ (PrimTarget (MO_Cmpxchg width)) [dst] [addr, old, new] = do
-    Amode amode addr_code <- getAmode addr
-    newval <- getNewRegNat size
-    newval_code <- getAnyReg new
-    oldval <- getNewRegNat size
-    oldval_code <- getAnyReg old
-    use_sse2 <- sse2Enabled
-    let platform = targetPlatform dflags
-        dst_r    = getRegisterReg platform use_sse2 (CmmLocal dst)
-        code     = toOL
-                   [ MOV size (OpReg oldval) (OpReg eax)
-                   , LOCK
-                   , CMPXCHG size (OpReg newval) (OpAddr amode)
-                   , MOV size (OpReg eax) (OpReg dst_r)
-                   ]
-    return $ addr_code `appOL` newval_code newval `appOL` oldval_code oldval
-        `appOL` code
-  where
-    size = intSize width
-
 genCCall _ is32Bit target dest_regs args
  | is32Bit   = genCCall32 target dest_regs args
  | otherwise = genCCall64 target dest_regs args
@@ -2472,11 +2385,6 @@ outOfLineCmmOp mop res args
               MO_PopCnt _  -> fsLit "popcnt"
               MO_BSwap _   -> fsLit "bswap"
 
-              MO_AtomicRMW _ _ -> fsLit "atomicrmw"
-              MO_AtomicRead _  -> fsLit "atomicread"
-              MO_AtomicWrite _ -> fsLit "atomicwrite"
-              MO_Cmpxchg _     -> fsLit "cmpxchg"
-
               MO_UF_Conv _ -> unsupported
 
               MO_S_QuotRem {}  -> unsupported
diff --git a/compiler/nativeGen/X86/Instr.hs b/compiler/nativeGen/X86/Instr.hs
index ac91747171..05fff9be96 100644
--- a/compiler/nativeGen/X86/Instr.hs
+++ b/compiler/nativeGen/X86/Instr.hs
@@ -327,10 +327,6 @@ data Instr
         | PREFETCH  PrefetchVariant Size Operand -- prefetch Variant, addr size, address to prefetch
                                         -- variant can be NTA, Lvl0, Lvl1, or Lvl2
 
-        | LOCK  -- lock prefix
-        | XADD        Size Operand Operand  -- src (r), dst (r/m)
-        | CMPXCHG     Size Operand Operand  -- src (r), dst (r/m), eax implicit
-
 data PrefetchVariant = NTA | Lvl0 | Lvl1 | Lvl2
 
 
@@ -341,8 +337,6 @@ data Operand
 
 
 
--- | Returns which registers are read and written as a (read, written)
--- pair.
 x86_regUsageOfInstr :: Platform -> Instr -> RegUsage
 x86_regUsageOfInstr platform instr
  = case instr of
@@ -434,21 +428,10 @@ x86_regUsageOfInstr platform instr
 
     -- note: might be a better way to do this
     PREFETCH _  _ src -> mkRU (use_R src []) []
-    LOCK                -> noUsage
-    XADD _ src dst      -> usageMM src dst
-    CMPXCHG _ src dst   -> usageRMM src dst (OpReg eax)
 
     _other              -> panic "regUsage: unrecognised instr"
- where
-    -- # Definitions
-    --
-    -- Written: If the operand is a register, it's written. If it's an
-    -- address, registers mentioned in the address are read.
-    --
-    -- Modified: If the operand is a register, it's both read and
-    -- written. If it's an address, registers mentioned in the address
-    -- are read.
 
+ where
     -- 2 operand form; first operand Read; second Written
     usageRW :: Operand -> Operand -> RegUsage
     usageRW op (OpReg reg)      = mkRU (use_R op []) [reg]
@@ -461,18 +444,6 @@ x86_regUsageOfInstr platform instr
     usageRM op (OpAddr ea)      = mkRUR (use_R op $! use_EA ea [])
     usageRM _ _                 = panic "X86.RegInfo.usageRM: no match"
 
-    -- 2 operand form; first operand Modified; second Modified
-    usageMM :: Operand -> Operand -> RegUsage
-    usageMM (OpReg src) (OpReg dst) = mkRU [src, dst] [src, dst]
-    usageMM (OpReg src) (OpAddr ea) = mkRU (use_EA ea [src]) [src]
-    usageMM _ _                     = panic "X86.RegInfo.usageMM: no match"
-
-    -- 3 operand form; first operand Read; second Modified; third Modified
-    usageRMM :: Operand -> Operand -> Operand -> RegUsage
-    usageRMM (OpReg src) (OpReg dst) (OpReg reg) = mkRU [src, dst, reg] [dst, reg]
-    usageRMM (OpReg src) (OpAddr ea) (OpReg reg) = mkRU (use_EA ea [src, reg]) [reg]
-    usageRMM _ _ _                               = panic "X86.RegInfo.usageRMM: no match"
-
     -- 1 operand form; operand Modified
     usageM :: Operand -> RegUsage
     usageM (OpReg reg)          = mkRU [reg] [reg]
@@ -505,7 +476,6 @@ x86_regUsageOfInstr platform instr
         where src' = filter (interesting platform) src
               dst' = filter (interesting platform) dst
 
--- | Is this register interesting for the register allocator?
 interesting :: Platform -> Reg -> Bool
 interesting _        (RegVirtual _)              = True
 interesting platform (RegReal (RealRegSingle i)) = isFastTrue (freeReg platform i)
@@ -513,8 +483,6 @@ interesting _        (RegReal (RealRegPair{}))   = panic "X86.interesting: no re
 
 
 
--- | Applies the supplied function to all registers in instructions.
--- Typically used to change virtual registers to real registers.
 x86_patchRegsOfInstr :: Instr -> (Reg -> Reg) -> Instr
 x86_patchRegsOfInstr instr env
  = case instr of
@@ -603,10 +571,6 @@ x86_patchRegsOfInstr instr env
 
     PREFETCH lvl size src -> PREFETCH lvl size (patchOp src)
 
-    LOCK                -> instr
-    XADD sz src dst     -> patch2 (XADD sz) src dst
-    CMPXCHG sz src dst  -> patch2 (CMPXCHG sz) src dst
-
     _other              -> panic "patchRegs: unrecognised instr"
 
   where
diff --git a/compiler/nativeGen/X86/Ppr.hs b/compiler/nativeGen/X86/Ppr.hs
index 7771c02512..459c041ba5 100644
--- a/compiler/nativeGen/X86/Ppr.hs
+++ b/compiler/nativeGen/X86/Ppr.hs
@@ -886,14 +886,6 @@ pprInstr GFREE
             ptext (sLit "\tffree %st(4) ;ffree %st(5)")
           ]
 
--- Atomics
-
-pprInstr LOCK = ptext (sLit "\tlock")
-
-pprInstr (XADD size src dst) = pprSizeOpOp (sLit "xadd") size src dst
-
-pprInstr (CMPXCHG size src dst) = pprSizeOpOp (sLit "cmpxchg") size src dst
-
 pprInstr _
         = panic "X86.Ppr.pprInstr: no match"
 
diff --git a/compiler/prelude/primops.txt.pp b/compiler/prelude/primops.txt.pp
index 4faa585246..4851315eb4 100644
--- a/compiler/prelude/primops.txt.pp
+++ b/compiler/prelude/primops.txt.pp
@@ -1363,79 +1363,19 @@ primop  SetByteArrayOp "setByteArray#" GenPrimOp
   code_size = { primOpCodeSizeForeignCall + 4 }
   can_fail = True
 
--- Atomic operations
-
-primop  AtomicReadByteArrayOp_Int "atomicReadIntArray#" GenPrimOp
-   MutableByteArray# s -> Int# -> State# s -> (# State# s, Int# #)
-   {Given an array and an offset in Int units, read an element. The
-    index is assumed to be in bounds. Implies a full memory barrier.}
-   with has_side_effects = True
-        can_fail = True
-
-primop  AtomicWriteByteArrayOp_Int "atomicWriteIntArray#" GenPrimOp
-   MutableByteArray# s -> Int# -> Int# -> State# s -> State# s
-   {Given an array and an offset in Int units, write an element. The
-    index is assumed to be in bounds. Implies a full memory barrier.}
-   with has_side_effects = True
-        can_fail = True
-
 primop CasByteArrayOp_Int "casIntArray#" GenPrimOp
    MutableByteArray# s -> Int# -> Int# -> Int# -> State# s -> (# State# s, Int# #)
-   {Given an array, an offset in Int units, the expected old value, and
-    the new value, perform an atomic compare and swap i.e. write the new
-    value if the current value matches the provided old value. Returns
-    the value of the element before the operation. Implies a full memory
-    barrier.}
-   with has_side_effects = True
-        can_fail = True
+   {Machine-level atomic compare and swap on a word within a ByteArray.}
+   with
+   out_of_line = True
+   has_side_effects = True
 
 primop FetchAddByteArrayOp_Int "fetchAddIntArray#" GenPrimOp
    MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #)
-   {Given an array, and offset in Int units, and a value to add,
-    atomically add the value to the element. Returns the value of the
-    element before the operation. Implies a full memory barrier.}
-   with has_side_effects = True
-        can_fail = True
-
-primop FetchSubByteArrayOp_Int "fetchSubIntArray#" GenPrimOp
-   MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #)
-   {Given an array, and offset in Int units, and a value to subtract,
-    atomically substract the value to the element. Returns the value of
-    the element before the operation. Implies a full memory barrier.}
-   with has_side_effects = True
-        can_fail = True
-
-primop FetchAndByteArrayOp_Int "fetchAndIntArray#" GenPrimOp
-   MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #)
-   {Given an array, and offset in Int units, and a value to AND,
-    atomically AND the value to the element. Returns the value of the
-    element before the operation. Implies a full memory barrier.}
-   with has_side_effects = True
-        can_fail = True
-
-primop FetchNandByteArrayOp_Int "fetchNandIntArray#" GenPrimOp
-   MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #)
-   {Given an array, and offset in Int units, and a value to NAND,
-    atomically NAND the value to the element. Returns the value of the
-    element before the operation. Implies a full memory barrier.}
-   with has_side_effects = True
-        can_fail = True
-
-primop FetchOrByteArrayOp_Int "fetchOrIntArray#" GenPrimOp
-   MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #)
-   {Given an array, and offset in Int units, and a value to OR,
-    atomically OR the value to the element. Returns the value of the
-    element before the operation. Implies a full memory barrier.}
-   with has_side_effects = True
-        can_fail = True
-
-primop FetchXorByteArrayOp_Int "fetchXorIntArray#" GenPrimOp
-   MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #)
-   {Given an array, and offset in Int units, and a value to XOR,
-    atomically XOR the value to the element. Returns the value of the
-    element before the operation. Implies a full memory barrier.}
-   with has_side_effects = True
-        can_fail = True
+   {Machine-level word-sized fetch-and-add within a ByteArray.}
+   with
+   out_of_line = True
+   has_side_effects = True
 
 
 ------------------------------------------------------------------------
diff --git a/includes/stg/MiscClosures.h b/includes/stg/MiscClosures.h
index ee5a119aa1..0c4d2f9eaf 100644
--- a/includes/stg/MiscClosures.h
+++ b/includes/stg/MiscClosures.h
@@ -348,6 +348,7 @@ RTS_FUN_DECL(stg_newByteArrayzh);
 RTS_FUN_DECL(stg_newPinnedByteArrayzh);
 RTS_FUN_DECL(stg_newAlignedPinnedByteArrayzh);
 RTS_FUN_DECL(stg_casIntArrayzh);
+RTS_FUN_DECL(stg_fetchAddIntArrayzh);
 RTS_FUN_DECL(stg_newArrayzh);
 RTS_FUN_DECL(stg_newArrayArrayzh);
 RTS_FUN_DECL(stg_copyArrayzh);
diff --git a/libraries/ghc-prim/cbits/atomic.c b/libraries/ghc-prim/cbits/atomic.c
deleted file mode 100644
index e3d6cc1e95..0000000000
--- a/libraries/ghc-prim/cbits/atomic.c
+++ /dev/null
@@ -1,306 +0,0 @@
-#include "Rts.h"
-
-// Fallbacks for atomic primops on byte arrays. The builtins used
-// below are supported on both GCC and LLVM.
-//
-// Ideally these function would take StgWord8, StgWord16, etc but
-// older GCC versions incorrectly assume that the register that the
-// argument is passed in has been zero extended, which is incorrect
-// according to the ABI and is not what GHC does when it generates
-// calls to these functions.
-
-// FetchAddByteArrayOp_Int
-
-extern StgWord hs_atomic_add8(volatile StgWord8 *x, StgWord val);
-StgWord
-hs_atomic_add8(volatile StgWord8 *x, StgWord val)
-{
-  return __sync_fetch_and_add(x, (StgWord8) val);
-}
-
-extern StgWord hs_atomic_add16(volatile StgWord16 *x, StgWord val);
-StgWord
-hs_atomic_add16(volatile StgWord16 *x, StgWord val)
-{
-  return __sync_fetch_and_add(x, (StgWord16) val);
-}
-
-extern StgWord hs_atomic_add32(volatile StgWord32 *x, StgWord val);
-StgWord
-hs_atomic_add32(volatile StgWord32 *x, StgWord val)
-{
-  return __sync_fetch_and_add(x, (StgWord32) val);
-}
-
-extern StgWord64 hs_atomic_add64(volatile StgWord64 *x, StgWord64 val);
-StgWord64
-hs_atomic_add64(volatile StgWord64 *x, StgWord64 val)
-{
-  return __sync_fetch_and_add(x, val);
-}
-
-// FetchSubByteArrayOp_Int
-
-extern StgWord hs_atomic_sub8(volatile StgWord8 *x, StgWord val);
-StgWord
-hs_atomic_sub8(volatile StgWord8 *x, StgWord val)
-{
-  return __sync_fetch_and_sub(x, (StgWord8) val);
-}
-
-extern StgWord hs_atomic_sub16(volatile StgWord16 *x, StgWord val);
-StgWord
-hs_atomic_sub16(volatile StgWord16 *x, StgWord val)
-{
-  return __sync_fetch_and_sub(x, (StgWord16) val);
-}
-
-extern StgWord hs_atomic_sub32(volatile StgWord32 *x, StgWord val);
-StgWord
-hs_atomic_sub32(volatile StgWord32 *x, StgWord val)
-{
-  return __sync_fetch_and_sub(x, (StgWord32) val);
-}
-
-extern StgWord64 hs_atomic_sub64(volatile StgWord64 *x, StgWord64 val);
-StgWord64
-hs_atomic_sub64(volatile StgWord64 *x, StgWord64 val)
-{
-  return __sync_fetch_and_sub(x, val);
-}
-
-// FetchAndByteArrayOp_Int
-
-extern StgWord hs_atomic_and8(volatile StgWord8 *x, StgWord val);
-StgWord
-hs_atomic_and8(volatile StgWord8 *x, StgWord val)
-{
-  return __sync_fetch_and_and(x, (StgWord8) val);
-}
-
-extern StgWord hs_atomic_and16(volatile StgWord16 *x, StgWord val);
-StgWord
-hs_atomic_and16(volatile StgWord16 *x, StgWord val)
-{
-  return __sync_fetch_and_and(x, (StgWord16) val);
-}
-
-extern StgWord hs_atomic_and32(volatile StgWord32 *x, StgWord val);
-StgWord
-hs_atomic_and32(volatile StgWord32 *x, StgWord val)
-{
-  return __sync_fetch_and_and(x, (StgWord32) val);
-}
-
-extern StgWord64 hs_atomic_and64(volatile StgWord64 *x, StgWord64 val);
-StgWord64
-hs_atomic_and64(volatile StgWord64 *x, StgWord64 val)
-{
-  return __sync_fetch_and_and(x, val);
-}
-
-// FetchNandByteArrayOp_Int
-
-// Workaround for http://llvm.org/bugs/show_bug.cgi?id=8842
-#define CAS_NAND(x, val)                                            \
-  {                                                                 \
-    __typeof__ (*(x)) tmp = *(x);                                   \
-    while (!__sync_bool_compare_and_swap(x, tmp, ~(tmp & (val)))) { \
-      tmp = *(x);                                                   \
-    }                                                               \
-    return tmp;                                                     \
-  }
-
-extern StgWord hs_atomic_nand8(volatile StgWord8 *x, StgWord val);
-StgWord
-hs_atomic_nand8(volatile StgWord8 *x, StgWord val)
-{
-#ifdef __clang__
-  CAS_NAND(x, (StgWord8) val)
-#else
-  return __sync_fetch_and_nand(x, (StgWord8) val);
-#endif
-}
-
-extern StgWord hs_atomic_nand16(volatile StgWord16 *x, StgWord val);
-StgWord
-hs_atomic_nand16(volatile StgWord16 *x, StgWord val)
-{
-#ifdef __clang__
-  CAS_NAND(x, (StgWord16) val);
-#else
-  return __sync_fetch_and_nand(x, (StgWord16) val);
-#endif
-}
-
-extern StgWord hs_atomic_nand32(volatile StgWord32 *x, StgWord val);
-StgWord
-hs_atomic_nand32(volatile StgWord32 *x, StgWord val)
-{
-#ifdef __clang__
-  CAS_NAND(x, (StgWord32) val);
-#else
-  return __sync_fetch_and_nand(x, (StgWord32) val);
-#endif
-}
-
-extern StgWord64 hs_atomic_nand64(volatile StgWord64 *x, StgWord64 val);
-StgWord64
-hs_atomic_nand64(volatile StgWord64 *x, StgWord64 val)
-{
-#ifdef __clang__
-  CAS_NAND(x, val);
-#else
-  return __sync_fetch_and_nand(x, val);
-#endif
-}
-
-// FetchOrByteArrayOp_Int
-
-extern StgWord hs_atomic_or8(volatile StgWord8 *x, StgWord val);
-StgWord
-hs_atomic_or8(volatile StgWord8 *x, StgWord val)
-{
-  return __sync_fetch_and_or(x, (StgWord8) val);
-}
-
-extern StgWord hs_atomic_or16(volatile StgWord16 *x, StgWord val);
-StgWord
-hs_atomic_or16(volatile StgWord16 *x, StgWord val)
-{
-  return __sync_fetch_and_or(x, (StgWord16) val);
-}
-
-extern StgWord hs_atomic_or32(volatile StgWord32 *x, StgWord val);
-StgWord
-hs_atomic_or32(volatile StgWord32 *x, StgWord val)
-{
-  return __sync_fetch_and_or(x, (StgWord32) val);
-}
-
-extern StgWord64 hs_atomic_or64(volatile StgWord64 *x, StgWord64 val);
-StgWord64
-hs_atomic_or64(volatile StgWord64 *x, StgWord64 val)
-{
-  return __sync_fetch_and_or(x, val);
-}
-
-// FetchXorByteArrayOp_Int
-
-extern StgWord hs_atomic_xor8(volatile StgWord8 *x, StgWord val);
-StgWord
-hs_atomic_xor8(volatile StgWord8 *x, StgWord val)
-{
-  return __sync_fetch_and_xor(x, (StgWord8) val);
-}
-
-extern StgWord hs_atomic_xor16(volatile StgWord16 *x, StgWord val);
-StgWord
-hs_atomic_xor16(volatile StgWord16 *x, StgWord val)
-{
-  return __sync_fetch_and_xor(x, (StgWord16) val);
-}
-
-extern StgWord hs_atomic_xor32(volatile StgWord32 *x, StgWord val);
-StgWord
-hs_atomic_xor32(volatile StgWord32 *x, StgWord val)
-{
-  return __sync_fetch_and_xor(x, (StgWord32) val);
-}
-
-extern StgWord64 hs_atomic_xor64(volatile StgWord64 *x, StgWord64 val);
-StgWord64
-hs_atomic_xor64(volatile StgWord64 *x, StgWord64 val)
-{
-  return __sync_fetch_and_xor(x, val);
-}
-
-// CasByteArrayOp_Int
-
-extern StgWord hs_cmpxchg8(volatile StgWord8 *x, StgWord old, StgWord new);
-StgWord
-hs_cmpxchg8(volatile StgWord8 *x, StgWord old, StgWord new)
-{
-  return __sync_val_compare_and_swap(x, (StgWord8) old, (StgWord8) new);
-}
-
-extern StgWord hs_cmpxchg16(volatile StgWord16 *x, StgWord old, StgWord new);
-StgWord
-hs_cmpxchg16(volatile StgWord16 *x, StgWord old, StgWord new)
-{
-  return __sync_val_compare_and_swap(x, (StgWord16) old, (StgWord16) new);
-}
-
-extern StgWord hs_cmpxchg32(volatile StgWord32 *x, StgWord old, StgWord new);
-StgWord
-hs_cmpxchg32(volatile StgWord32 *x, StgWord old, StgWord new)
-{
-  return __sync_val_compare_and_swap(x, (StgWord32) old, (StgWord32) new);
-}
-
-extern StgWord hs_cmpxchg64(volatile StgWord64 *x, StgWord64 old, StgWord64 new);
-StgWord
-hs_cmpxchg64(volatile StgWord64 *x, StgWord64 old, StgWord64 new)
-{
-  return __sync_val_compare_and_swap(x, old, new);
-}
-
-// AtomicReadByteArrayOp_Int
-
-extern StgWord hs_atomicread8(volatile StgWord8 *x);
-StgWord
-hs_atomicread8(volatile StgWord8 *x)
-{
-  return *x;
-}
-
-extern StgWord hs_atomicread16(volatile StgWord16 *x);
-StgWord
-hs_atomicread16(volatile StgWord16 *x)
-{
-  return *x;
-}
-
-extern StgWord hs_atomicread32(volatile StgWord32 *x);
-StgWord
-hs_atomicread32(volatile StgWord32 *x)
-{
-  return *x;
-}
-
-extern StgWord64 hs_atomicread64(volatile StgWord64 *x);
-StgWord64
-hs_atomicread64(volatile StgWord64 *x)
-{
-  return *x;
-}
-
-// AtomicWriteByteArrayOp_Int
-
-extern void hs_atomicwrite8(volatile StgWord8 *x, StgWord val);
-void
-hs_atomicwrite8(volatile StgWord8 *x, StgWord val)
-{
-  *x = (StgWord8) val;
-}
-
-extern void hs_atomicwrite16(volatile StgWord16 *x, StgWord val);
-void
-hs_atomicwrite16(volatile StgWord16 *x, StgWord val)
-{
-  *x = (StgWord16) val;
-}
-
-extern void hs_atomicwrite32(volatile StgWord32 *x, StgWord val);
-void
-hs_atomicwrite32(volatile StgWord32 *x, StgWord val)
-{
-  *x = (StgWord32) val;
-}
-
-extern void hs_atomicwrite64(volatile StgWord64 *x, StgWord64 val);
-void
-hs_atomicwrite64(volatile StgWord64 *x, StgWord64 val)
-{
-  *x = (StgWord64) val;
-}
diff --git a/libraries/ghc-prim/ghc-prim.cabal b/libraries/ghc-prim/ghc-prim.cabal
index bc9f57126a..c861342b56 100644
--- a/libraries/ghc-prim/ghc-prim.cabal
+++ b/libraries/ghc-prim/ghc-prim.cabal
@@ -52,7 +52,6 @@ Library
         exposed-modules: GHC.Prim
 
     c-sources:
-        cbits/atomic.c
         cbits/bswap.c
         cbits/debug.c
         cbits/longlong.c
diff --git a/rts/Linker.c b/rts/Linker.c
index fb07d58452..f7f554ce6c 100644
--- a/rts/Linker.c
+++ b/rts/Linker.c
@@ -1186,6 +1186,7 @@ typedef struct _RtsSymbolVal {
       SymI_HasProto(stg_newBCOzh)                                       \
       SymI_HasProto(stg_newByteArrayzh)                                 \
       SymI_HasProto(stg_casIntArrayzh)                                  \
+      SymI_HasProto(stg_fetchAddIntArrayzh)                             \
       SymI_HasProto(stg_newMVarzh)                                      \
       SymI_HasProto(stg_newMutVarzh)                                    \
       SymI_HasProto(stg_newTVarzh)                                      \
diff --git a/rts/PrimOps.cmm b/rts/PrimOps.cmm
index 5f04a6d732..4d7baca824 100644
--- a/rts/PrimOps.cmm
+++ b/rts/PrimOps.cmm
@@ -151,6 +151,18 @@ stg_casIntArrayzh( gcptr arr, W_ ind, W_ old, W_ new )
 }
 
 
+stg_fetchAddIntArrayzh( gcptr arr, W_ ind, W_ incr )
+/* MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #) */
+{
+    W_ p, h;
+
+    p = arr + SIZEOF_StgArrWords + WDS(ind);
+    (h) = ccall atomic_inc(p, incr);
+
+    return(h);
+}
+
+
 stg_newArrayzh ( W_ n /* words */, gcptr init )
 {
     W_ words, size, p;
diff --git a/testsuite/tests/concurrent/should_run/.gitignore b/testsuite/tests/concurrent/should_run/.gitignore
index d64f644233..4f0a3175da 100644
--- a/testsuite/tests/concurrent/should_run/.gitignore
+++ b/testsuite/tests/concurrent/should_run/.gitignore
@@ -1,4 +1,3 @@
-AtomicPrimops
 T7970
 compareAndSwap
 readMVar1
diff --git a/testsuite/tests/concurrent/should_run/AtomicPrimops.hs b/testsuite/tests/concurrent/should_run/AtomicPrimops.hs
deleted file mode 100644
index 0c55aba93e..0000000000
--- a/testsuite/tests/concurrent/should_run/AtomicPrimops.hs
+++ /dev/null
@@ -1,245 +0,0 @@
-{-# LANGUAGE MagicHash #-}
-{-# LANGUAGE UnboxedTuples #-}
-
-module Main ( main ) where
-
-import Control.Concurrent
-import Control.Concurrent.MVar
-import Control.Monad (when)
-import Foreign.Storable
-import GHC.Exts
-import GHC.IO
-
--- | Iterations per worker.
-iters :: Int
-iters = 1000000
-
-main :: IO ()
-main = do
-    fetchAddSubTest
-    fetchAndTest
-    fetchNandTest
-    fetchOrTest
-    fetchXorTest
-    casTest
-    readWriteTest
-
--- | Test fetchAddIntArray# by having two threads concurrenctly
--- increment a counter and then checking the sum at the end.
-fetchAddSubTest :: IO ()
-fetchAddSubTest = do
-    tot <- race 0
-        (\ mba -> work fetchAddIntArray mba iters 2)
-        (\ mba -> work fetchSubIntArray mba iters 1)
-    assertEq 1000000 tot "fetchAddSubTest"
-  where
-    work :: (MByteArray -> Int -> Int -> IO ()) -> MByteArray -> Int -> Int
-         -> IO ()
-    work op mba 0 val = return ()
-    work op mba n val = op mba 0 val >> work op mba (n-1) val
-
--- | Test fetchXorIntArray# by having two threads concurrenctly XORing
--- and then checking the result at the end. Works since XOR is
--- commutative.
---
--- Covers the code paths for AND, NAND, and OR as well.
-fetchXorTest :: IO ()
-fetchXorTest = do
-    res <- race n0
-        (\ mba -> work mba iters t1pat)
-        (\ mba -> work mba iters t2pat)
-    assertEq expected res "fetchXorTest"
-  where
-    work :: MByteArray -> Int -> Int -> IO ()
-    work mba 0 val = return ()
-    work mba n val = fetchXorIntArray mba 0 val >> work mba (n-1) val
-
-    -- Initial value is a large prime and the two patterns are 1010...
-    -- and 0101...
-    (n0, t1pat, t2pat)
-        | sizeOf (undefined :: Int) == 8 =
-            (0x00000000ffffffff, 0x5555555555555555, 0x9999999999999999)
-        | otherwise = (0x0000ffff, 0x55555555, 0x99999999)
-    expected
-        | sizeOf (undefined :: Int) == 8 = 4294967295
-        | otherwise = 65535
-
--- The tests for AND, NAND, and OR are trivial for two reasons:
---
---  * The code path is already well exercised by 'fetchXorTest'.
---
---  * It's harder to test these operations, as a long sequence of them
---    convert to a single value but we'd like to write a test in the
---    style of 'fetchXorTest' that applies the operation repeatedly,
---    to make it likely that any race conditions are detected.
---
--- Right now we only test that they return the correct value for a
--- single op on each thread.
-
-fetchOpTest :: (MByteArray -> Int -> Int -> IO ())
-            -> Int -> String -> IO ()
-fetchOpTest op expected name = do
-    res <- race n0
-        (\ mba -> work mba t1pat)
-        (\ mba -> work mba t2pat)
-    assertEq expected res name
-  where
-    work :: MByteArray -> Int -> IO ()
-    work mba val = op mba 0 val
-
-    -- Initial value is a large prime and the two patterns are 1010...
-    -- and 0101...
-    (n0, t1pat, t2pat)
-        | sizeOf (undefined :: Int) == 8 =
-            (0x00000000ffffffff, 0x5555555555555555, 0x9999999999999999)
-        | otherwise = (0x0000ffff, 0x55555555, 0x99999999)
-
-fetchAndTest :: IO ()
-fetchAndTest = fetchOpTest fetchAndIntArray expected "fetchAndTest"
-  where expected
-            | sizeOf (undefined :: Int) == 8 = 286331153
-            | otherwise = 4369
-
-fetchNandTest :: IO ()
-fetchNandTest = fetchOpTest fetchNandIntArray expected "fetchNandTest"
-  where expected
-            | sizeOf (undefined :: Int) == 8 = 7378697629770151799
-            | otherwise = -2576976009
-
-fetchOrTest :: IO ()
-fetchOrTest = fetchOpTest fetchOrIntArray expected "fetchOrTest"
-  where expected
-            | sizeOf (undefined :: Int) == 8 = 15987178197787607039
-            | otherwise = 3722313727
-
--- | Test casIntArray# by using it to emulate fetchAddIntArray# and
--- then having two threads concurrenctly increment a counter,
--- checking the sum at the end.
-casTest :: IO ()
-casTest = do
-    tot <- race 0
-        (\ mba -> work mba iters 1)
-        (\ mba -> work mba iters 2)
-    assertEq 3000000 tot "casTest"
-  where
-    work :: MByteArray -> Int -> Int -> IO ()
-    work mba 0 val = return ()
-    work mba n val = add mba 0 val >> work mba (n-1) val
-
-    -- Fetch-and-add implemented using CAS.
-    add :: MByteArray -> Int -> Int -> IO ()
-    add mba ix n = do
-        old <- readIntArray mba ix
-        old' <- casIntArray mba ix old (old + n)
-        when (old /= old') $ add mba ix n
-
--- | Tests atomic reads and writes by making sure that one thread sees
--- updates that are done on another. This test isn't very good at the
--- moment, as this might work even without atomic ops, but at least it
--- exercises the code.
-readWriteTest :: IO ()
-readWriteTest = do
-    mba <- newByteArray (sizeOf (undefined :: Int))
-    writeIntArray mba 0 0
-    latch <- newEmptyMVar
-    done <- newEmptyMVar
-    forkIO $ do
-        takeMVar latch
-        n <- atomicReadIntArray mba 0
-        assertEq 1 n "readWriteTest"
-        putMVar done ()
-    atomicWriteIntArray mba 0 1
-    putMVar latch ()
-    takeMVar done
-
--- | Create two threads that mutate the byte array passed to them
--- concurrently. The array is one word large.
-race :: Int                    -- ^ Initial value of array element
-     -> (MByteArray -> IO ())  -- ^ Thread 1 action
-     -> (MByteArray -> IO ())  -- ^ Thread 2 action
-     -> IO Int                 -- ^ Final value of array element
-race n0 thread1 thread2 = do
-    done1 <- newEmptyMVar
-    done2 <- newEmptyMVar
-    mba <- newByteArray (sizeOf (undefined :: Int))
-    writeIntArray mba 0 n0
-    forkIO $ thread1 mba >> putMVar done1 ()
-    forkIO $ thread2 mba >> putMVar done2 ()
-    mapM_ takeMVar [done1, done2]
-    readIntArray mba 0
-
-------------------------------------------------------------------------
--- Test helper
-
-assertEq :: (Eq a, Show a) => a -> a -> String -> IO ()
-assertEq expected actual name
-    | expected == actual = putStrLn $ name ++ ": OK"
-    | otherwise = do
-        putStrLn $ name ++ ": FAIL"
-        putStrLn $ "Expected: " ++ show expected
-        putStrLn $ "  Actual: " ++ show actual
-
-------------------------------------------------------------------------
--- Wrappers around MutableByteArray#
-
-data MByteArray = MBA (MutableByteArray# RealWorld)
-
-fetchAddIntArray :: MByteArray -> Int -> Int -> IO ()
-fetchAddIntArray (MBA mba#) (I# ix#) (I# n#) = IO $ \ s# ->
-    case fetchAddIntArray# mba# ix# n# s# of
-        (# s2#, _ #) -> (# s2#, () #)
-
-fetchSubIntArray :: MByteArray -> Int -> Int -> IO ()
-fetchSubIntArray (MBA mba#) (I# ix#) (I# n#) = IO $ \ s# ->
-    case fetchSubIntArray# mba# ix# n# s# of
-        (# s2#, _ #) -> (# s2#, () #)
-
-fetchAndIntArray :: MByteArray -> Int -> Int -> IO ()
-fetchAndIntArray (MBA mba#) (I# ix#) (I# n#) = IO $ \ s# ->
-    case fetchAndIntArray# mba# ix# n# s# of
-        (# s2#, _ #) -> (# s2#, () #)
-
-fetchNandIntArray :: MByteArray -> Int -> Int -> IO ()
-fetchNandIntArray (MBA mba#) (I# ix#) (I# n#) = IO $ \ s# ->
-    case fetchNandIntArray# mba# ix# n# s# of
-        (# s2#, _ #) -> (# s2#, () #)
-
-fetchOrIntArray :: MByteArray -> Int -> Int -> IO ()
-fetchOrIntArray (MBA mba#) (I# ix#) (I# n#) = IO $ \ s# ->
-    case fetchOrIntArray# mba# ix# n# s# of
-        (# s2#, _ #) -> (# s2#, () #)
-
-fetchXorIntArray :: MByteArray -> Int -> Int -> IO ()
-fetchXorIntArray (MBA mba#) (I# ix#) (I# n#) = IO $ \ s# ->
-    case fetchXorIntArray# mba# ix# n# s# of
-        (# s2#, _ #) -> (# s2#, () #)
-
-newByteArray :: Int -> IO MByteArray
-newByteArray (I# n#) = IO $ \ s# ->
-    case newByteArray# n# s# of
-        (# s2#, mba# #) -> (# s2#, MBA mba# #)
-
-writeIntArray :: MByteArray -> Int -> Int -> IO ()
-writeIntArray (MBA mba#) (I# ix#) (I# n#) = IO $ \ s# ->
-    case writeIntArray# mba# ix# n# s# of
-        s2# -> (# s2#, () #)
-
-readIntArray :: MByteArray -> Int -> IO Int
-readIntArray (MBA mba#) (I# ix#) = IO $ \ s# ->
-    case readIntArray# mba# ix# s# of
-        (# s2#, n# #) -> (# s2#, I# n# #)
-
-atomicWriteIntArray :: MByteArray -> Int -> Int -> IO ()
-atomicWriteIntArray (MBA mba#) (I# ix#) (I# n#) = IO $ \ s# ->
-    case atomicWriteIntArray# mba# ix# n# s# of
-        s2# -> (# s2#, () #)
-
-atomicReadIntArray :: MByteArray -> Int -> IO Int
-atomicReadIntArray (MBA mba#) (I# ix#) = IO $ \ s# ->
-    case atomicReadIntArray# mba# ix# s# of
-        (# s2#, n# #) -> (# s2#, I# n# #)
-
-casIntArray :: MByteArray -> Int -> Int -> Int -> IO Int
-casIntArray (MBA mba#) (I# ix#) (I# old#) (I# new#) = IO $ \ s# ->
-    case casIntArray# mba# ix# old# new# s# of
-        (# s2#, old2# #) -> (# s2#, I# old2# #)
diff --git a/testsuite/tests/concurrent/should_run/AtomicPrimops.stdout b/testsuite/tests/concurrent/should_run/AtomicPrimops.stdout
deleted file mode 100644
index c37041a040..0000000000
--- a/testsuite/tests/concurrent/should_run/AtomicPrimops.stdout
+++ /dev/null
@@ -1,7 +0,0 @@
-fetchAddSubTest: OK
-fetchAndTest: OK
-fetchNandTest: OK
-fetchOrTest: OK
-fetchXorTest: OK
-casTest: OK
-readWriteTest: OK
diff --git a/testsuite/tests/concurrent/should_run/all.T b/testsuite/tests/concurrent/should_run/all.T
index 0a66892d82..0b502c3bc7 100644
--- a/testsuite/tests/concurrent/should_run/all.T
+++ b/testsuite/tests/concurrent/should_run/all.T
@@ -81,7 +81,6 @@ test('tryReadMVar1', normal, compile_and_run, [''])
 test('tryReadMVar2', normal, compile_and_run, [''])
 
 test('T7970', normal, compile_and_run, [''])
-test('AtomicPrimops', normal, compile_and_run, [''])
 
 # -----------------------------------------------------------------------------
 # These tests we only do for a full run