| field | value |
|---|---|
| author | Johan Tibell <johan.tibell@gmail.com>, 2014-06-09 11:43:21 +0200 |
| committer | Johan Tibell <johan.tibell@gmail.com>, 2014-06-24 19:06:53 +0200 |
| commit | d8abf85f8ca176854e9d5d0b12371c4bc402aac3 |
| tree | 9542f2b8a7ee9b9759396e4172ad5fc4ce1d2f3c /compiler |
| parent | a4a79b5a04658ac542b1e07a6975b488fd589441 |
| download | haskell-d8abf85f8ca176854e9d5d0b12371c4bc402aac3.tar.gz |
Add more primops for atomic ops on byte arrays
Summary:
Add more primops for atomic ops on byte arrays
Adds the following primops:
* atomicReadIntArray#
* atomicWriteIntArray#
* fetchSubIntArray#
* fetchOrIntArray#
* fetchXorIntArray#
* fetchAndIntArray#
Makes these pre-existing out-of-line primops inline:
* fetchAddIntArray#
* casIntArray#
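
For orientation (not part of the patch itself): the sketch below shows how the new primops could be used from ordinary Haskell through the `GHC.Exts` re-exports, assuming the post-patch signatures in which the fetch-style ops return the element's previous value. The `Counter` wrapper and its function names are purely illustrative, and the `8#` allocation assumes a 64-bit `Int`.

```haskell
{-# LANGUAGE MagicHash, UnboxedTuples #-}
-- Hedged usage sketch: a one-word atomic counter built on the primops
-- added by this patch. Counter/newCounter/incCounter/readCounter are
-- illustrative names, not part of the patch.
module AtomicCounter (Counter, newCounter, incCounter, readCounter) where

import GHC.Exts
import GHC.IO (IO (..))

data Counter = Counter (MutableByteArray# RealWorld)

-- Allocate one word of storage and zero it; 8# assumes a 64-bit Int.
newCounter :: IO Counter
newCounter = IO $ \s0 ->
  case newByteArray# 8# s0 of
    (# s1, mba #) ->
      case atomicWriteIntArray# mba 0# 0# s1 of
        s2 -> (# s2, Counter mba #)

-- Atomically add 1; returns the value the counter held before the add.
incCounter :: Counter -> IO Int
incCounter (Counter mba) = IO $ \s ->
  case fetchAddIntArray# mba 0# 1# s of
    (# s', old #) -> (# s', I# old #)

-- Atomic read, implying a full memory barrier.
readCounter :: Counter -> IO Int
readCounter (Counter mba) = IO $ \s ->
  case atomicReadIntArray# mba 0# s of
    (# s', n #) -> (# s', I# n #)
```

With the native code generator changes below, `incCounter` compiles to an inline `lock xadd` on x86 instead of an out-of-line RTS call.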
Diffstat (limited to 'compiler')
| mode | file | changed lines |
|---|---|---|
| -rw-r--r-- | compiler/cmm/CmmMachOp.hs | 19 |
| -rw-r--r-- | compiler/cmm/CmmSink.hs | 4 |
| -rw-r--r-- | compiler/cmm/PprC.hs | 4 |
| -rw-r--r-- | compiler/codeGen/StgCmmPrim.hs | 94 |
| -rw-r--r-- | compiler/llvmGen/Llvm/AbsSyn.hs | 7 |
| -rw-r--r-- | compiler/llvmGen/Llvm/PpLlvm.hs | 18 |
| -rw-r--r-- | compiler/llvmGen/LlvmCodeGen/CodeGen.hs | 71 |
| -rw-r--r-- | compiler/nativeGen/CPrim.hs | 50 |
| -rw-r--r-- | compiler/nativeGen/PPC/CodeGen.hs | 4 |
| -rw-r--r-- | compiler/nativeGen/SPARC/CodeGen.hs | 4 |
| -rw-r--r-- | compiler/nativeGen/X86/CodeGen.hs | 92 |
| -rw-r--r-- | compiler/nativeGen/X86/Instr.hs | 38 |
| -rw-r--r-- | compiler/nativeGen/X86/Ppr.hs | 8 |
| -rw-r--r-- | compiler/prelude/primops.txt.pp | 76 |
14 files changed, 449 insertions, 40 deletions
diff --git a/compiler/cmm/CmmMachOp.hs b/compiler/cmm/CmmMachOp.hs index c4ec393ad6..d8ce492de1 100644 --- a/compiler/cmm/CmmMachOp.hs +++ b/compiler/cmm/CmmMachOp.hs @@ -19,6 +19,9 @@ module CmmMachOp -- CallishMachOp , CallishMachOp(..), callishMachOpHints , pprCallishMachOp + + -- Atomic read-modify-write + , AtomicMachOp(..) ) where @@ -547,8 +550,24 @@ data CallishMachOp | MO_PopCnt Width | MO_BSwap Width + + -- Atomic read-modify-write. + | MO_AtomicRMW Width AtomicMachOp + | MO_AtomicRead Width + | MO_AtomicWrite Width + | MO_Cmpxchg Width deriving (Eq, Show) +-- | The operation to perform atomically. +data AtomicMachOp = + AMO_Add + | AMO_Sub + | AMO_And + | AMO_Nand + | AMO_Or + | AMO_Xor + deriving (Eq, Show) + pprCallishMachOp :: CallishMachOp -> SDoc pprCallishMachOp mo = text (show mo) diff --git a/compiler/cmm/CmmSink.hs b/compiler/cmm/CmmSink.hs index 4c025425ab..4dced9afd2 100644 --- a/compiler/cmm/CmmSink.hs +++ b/compiler/cmm/CmmSink.hs @@ -650,6 +650,10 @@ data AbsMem -- perhaps we ought to have a special annotation for calls that can -- modify heap/stack memory. For now we just use the conservative -- definition here. +-- +-- Some CallishMachOp imply a memory barrier e.g. AtomicRMW and +-- therefore we should never float any memory operations across one of +-- these calls. bothMems :: AbsMem -> AbsMem -> AbsMem diff --git a/compiler/cmm/PprC.hs b/compiler/cmm/PprC.hs index 47b247e278..455c79ba02 100644 --- a/compiler/cmm/PprC.hs +++ b/compiler/cmm/PprC.hs @@ -753,6 +753,10 @@ pprCallishMachOp_for_C mop MO_Memmove -> ptext (sLit "memmove") (MO_BSwap w) -> ptext (sLit $ bSwapLabel w) (MO_PopCnt w) -> ptext (sLit $ popCntLabel w) + (MO_AtomicRMW w amop) -> ptext (sLit $ atomicRMWLabel w amop) + (MO_Cmpxchg w) -> ptext (sLit $ cmpxchgLabel w) + (MO_AtomicRead w) -> ptext (sLit $ atomicReadLabel w) + (MO_AtomicWrite w) -> ptext (sLit $ atomicWriteLabel w) (MO_UF_Conv w) -> ptext (sLit $ word2FloatLabel w) MO_S_QuotRem {} -> unsupported diff --git a/compiler/codeGen/StgCmmPrim.hs b/compiler/codeGen/StgCmmPrim.hs index 40a5e3649b..e4c682bf02 100644 --- a/compiler/codeGen/StgCmmPrim.hs +++ b/compiler/codeGen/StgCmmPrim.hs @@ -769,6 +769,25 @@ emitPrimOp _ res PrefetchByteArrayOp0 args = doPrefetchByteArrayOp 0 res emitPrimOp _ res PrefetchMutableByteArrayOp0 args = doPrefetchByteArrayOp 0 res args emitPrimOp _ res PrefetchAddrOp0 args = doPrefetchAddrOp 0 res args +-- Atomic read-modify-write +emitPrimOp dflags [res] FetchAddByteArrayOp_Int [mba, ix, n] = + doAtomicRMW res AMO_Add mba ix (bWord dflags) n +emitPrimOp dflags [res] FetchSubByteArrayOp_Int [mba, ix, n] = + doAtomicRMW res AMO_Sub mba ix (bWord dflags) n +emitPrimOp dflags [res] FetchAndByteArrayOp_Int [mba, ix, n] = + doAtomicRMW res AMO_And mba ix (bWord dflags) n +emitPrimOp dflags [res] FetchNandByteArrayOp_Int [mba, ix, n] = + doAtomicRMW res AMO_Nand mba ix (bWord dflags) n +emitPrimOp dflags [res] FetchOrByteArrayOp_Int [mba, ix, n] = + doAtomicRMW res AMO_Or mba ix (bWord dflags) n +emitPrimOp dflags [res] FetchXorByteArrayOp_Int [mba, ix, n] = + doAtomicRMW res AMO_Xor mba ix (bWord dflags) n +emitPrimOp dflags [res] AtomicReadByteArrayOp_Int [mba, ix] = + doAtomicReadByteArray res mba ix (bWord dflags) +emitPrimOp dflags [] AtomicWriteByteArrayOp_Int [mba, ix, val] = + doAtomicWriteByteArray mba ix (bWord dflags) val +emitPrimOp dflags [res] CasByteArrayOp_Int [mba, ix, old, new] = + doCasByteArray res mba ix (bWord dflags) old new -- The rest just translate straightforwardly emitPrimOp dflags [res] op 
[arg] @@ -1933,6 +1952,81 @@ doWriteSmallPtrArrayOp addr idx val = do emit (setInfo addr (CmmLit (CmmLabel mkSMAP_DIRTY_infoLabel))) ------------------------------------------------------------------------------ +-- Atomic read-modify-write + +-- | Emit an atomic modification to a byte array element. The result +-- reg contains that previous value of the element. Implies a full +-- memory barrier. +doAtomicRMW :: LocalReg -- ^ Result reg + -> AtomicMachOp -- ^ Atomic op (e.g. add) + -> CmmExpr -- ^ MutableByteArray# + -> CmmExpr -- ^ Index + -> CmmType -- ^ Type of element by which we are indexing + -> CmmExpr -- ^ Op argument (e.g. amount to add) + -> FCode () +doAtomicRMW res amop mba idx idx_ty n = do + dflags <- getDynFlags + let width = typeWidth idx_ty + addr = cmmIndexOffExpr dflags (arrWordsHdrSize dflags) + width mba idx + emitPrimCall + [ res ] + (MO_AtomicRMW width amop) + [ addr, n ] + +-- | Emit an atomic read to a byte array that acts as a memory barrier. +doAtomicReadByteArray + :: LocalReg -- ^ Result reg + -> CmmExpr -- ^ MutableByteArray# + -> CmmExpr -- ^ Index + -> CmmType -- ^ Type of element by which we are indexing + -> FCode () +doAtomicReadByteArray res mba idx idx_ty = do + dflags <- getDynFlags + let width = typeWidth idx_ty + addr = cmmIndexOffExpr dflags (arrWordsHdrSize dflags) + width mba idx + emitPrimCall + [ res ] + (MO_AtomicRead width) + [ addr ] + +-- | Emit an atomic write to a byte array that acts as a memory barrier. +doAtomicWriteByteArray + :: CmmExpr -- ^ MutableByteArray# + -> CmmExpr -- ^ Index + -> CmmType -- ^ Type of element by which we are indexing + -> CmmExpr -- ^ Value to write + -> FCode () +doAtomicWriteByteArray mba idx idx_ty val = do + dflags <- getDynFlags + let width = typeWidth idx_ty + addr = cmmIndexOffExpr dflags (arrWordsHdrSize dflags) + width mba idx + emitPrimCall + [ {- no results -} ] + (MO_AtomicWrite width) + [ addr, val ] + +doCasByteArray + :: LocalReg -- ^ Result reg + -> CmmExpr -- ^ MutableByteArray# + -> CmmExpr -- ^ Index + -> CmmType -- ^ Type of element by which we are indexing + -> CmmExpr -- ^ Old value + -> CmmExpr -- ^ New value + -> FCode () +doCasByteArray res mba idx idx_ty old new = do + dflags <- getDynFlags + let width = (typeWidth idx_ty) + addr = cmmIndexOffExpr dflags (arrWordsHdrSize dflags) + width mba idx + emitPrimCall + [ res ] + (MO_Cmpxchg width) + [ addr, old, new ] + +------------------------------------------------------------------------------ -- Helpers for emitting function calls -- | Emit a call to @memcpy@. diff --git a/compiler/llvmGen/Llvm/AbsSyn.hs b/compiler/llvmGen/Llvm/AbsSyn.hs index f92bd89c5c..24d0856ea3 100644 --- a/compiler/llvmGen/Llvm/AbsSyn.hs +++ b/compiler/llvmGen/Llvm/AbsSyn.hs @@ -65,6 +65,8 @@ data LlvmFunction = LlvmFunction { type LlvmFunctions = [LlvmFunction] +type SingleThreaded = Bool + -- | LLVM ordering types for synchronization purposes. (Introduced in LLVM -- 3.0). Please see the LLVM documentation for a better description. data LlvmSyncOrdering @@ -224,6 +226,11 @@ data LlvmExpression | Load LlvmVar {- | + Atomic load of the value at location ptr + -} + | ALoad LlvmSyncOrdering SingleThreaded LlvmVar + + {- | Navigate in an structure, selecting elements * inbound: Is the pointer inbounds? 
(computed pointer doesn't overflow) * ptr: Location of the structure diff --git a/compiler/llvmGen/Llvm/PpLlvm.hs b/compiler/llvmGen/Llvm/PpLlvm.hs index 025078226d..73077257f8 100644 --- a/compiler/llvmGen/Llvm/PpLlvm.hs +++ b/compiler/llvmGen/Llvm/PpLlvm.hs @@ -239,6 +239,7 @@ ppLlvmExpression expr Insert vec elt idx -> ppInsert vec elt idx GetElemPtr inb ptr indexes -> ppGetElementPtr inb ptr indexes Load ptr -> ppLoad ptr + ALoad ord st ptr -> ppALoad ord st ptr Malloc tp amount -> ppMalloc tp amount Phi tp precessors -> ppPhi tp precessors Asm asm c ty v se sk -> ppAsm asm c ty v se sk @@ -327,13 +328,18 @@ ppSyncOrdering SyncSeqCst = text "seq_cst" -- of specifying alignment. ppLoad :: LlvmVar -> SDoc -ppLoad var - | isVecPtrVar var = text "load" <+> ppr var <> - comma <+> text "align 1" - | otherwise = text "load" <+> ppr var +ppLoad var = text "load" <+> ppr var <> align where - isVecPtrVar :: LlvmVar -> Bool - isVecPtrVar = isVector . pLower . getVarType + align | isVector . pLower . getVarType $ var = text ", align 1" + | otherwise = empty + +ppALoad :: LlvmSyncOrdering -> SingleThreaded -> LlvmVar -> SDoc +ppALoad ord st var = sdocWithDynFlags $ \dflags -> + let alignment = (llvmWidthInBits dflags $ getVarType var) `quot` 8 + align = text ", align" <+> ppr alignment + sThreaded | st = text " singlethread" + | otherwise = empty + in text "load atomic" <+> ppr var <> sThreaded <+> ppSyncOrdering ord <> align ppStore :: LlvmVar -> LlvmVar -> SDoc ppStore val dst diff --git a/compiler/llvmGen/LlvmCodeGen/CodeGen.hs b/compiler/llvmGen/LlvmCodeGen/CodeGen.hs index 517553516b..4a56600937 100644 --- a/compiler/llvmGen/LlvmCodeGen/CodeGen.hs +++ b/compiler/llvmGen/LlvmCodeGen/CodeGen.hs @@ -15,6 +15,7 @@ import BlockId import CodeGen.Platform ( activeStgRegs, callerSaves ) import CLabel import Cmm +import CPrim import PprCmm import CmmUtils import Hoopl @@ -32,6 +33,7 @@ import Unique import Data.List ( nub ) import Data.Maybe ( catMaybes ) +type Atomic = Bool type LlvmStatements = OrdList LlvmStatement -- ----------------------------------------------------------------------------- @@ -228,6 +230,17 @@ genCall t@(PrimTarget (MO_PopCnt w)) dsts args = genCall t@(PrimTarget (MO_BSwap w)) dsts args = genCallSimpleCast w t dsts args +genCall (PrimTarget (MO_AtomicRead _)) [dst] [addr] = do + dstV <- getCmmReg (CmmLocal dst) + (v1, stmts, top) <- genLoad True addr (localRegType dst) + let stmt1 = Store v1 dstV + return (stmts `snocOL` stmt1, top) + +-- TODO: implement these properly rather than calling to RTS functions. +-- genCall t@(PrimTarget (MO_AtomicWrite width)) [] [addr, val] = undefined +-- genCall t@(PrimTarget (MO_AtomicRMW width amop)) [dst] [addr, n] = undefined +-- genCall t@(PrimTarget (MO_Cmpxchg width)) [dst] [addr, old, new] = undefined + -- Handle memcpy function specifically since llvm's intrinsic version takes -- some extra parameters. 
genCall t@(PrimTarget op) [] args' @@ -548,7 +561,6 @@ cmmPrimOpFunctions mop = do (MO_Prefetch_Data _ )-> fsLit "llvm.prefetch" - MO_S_QuotRem {} -> unsupported MO_U_QuotRem {} -> unsupported MO_U_QuotRem2 {} -> unsupported @@ -558,6 +570,12 @@ cmmPrimOpFunctions mop = do MO_Touch -> unsupported MO_UF_Conv _ -> unsupported + MO_AtomicRead _ -> unsupported + + MO_AtomicRMW w amop -> fsLit $ atomicRMWLabel w amop + MO_Cmpxchg w -> fsLit $ cmpxchgLabel w + MO_AtomicWrite w -> fsLit $ atomicWriteLabel w + -- | Tail function calls genJump :: CmmExpr -> [GlobalReg] -> LlvmM StmtData @@ -849,7 +867,7 @@ exprToVarOpt opt e = case e of -> genLit opt lit CmmLoad e' ty - -> genLoad e' ty + -> genLoad False e' ty -- Cmmreg in expression is the value, so must load. If you want actual -- reg pointer, call getCmmReg directly. @@ -1268,41 +1286,41 @@ genMachOp_slow _ _ _ = panic "genMachOp: More then 2 expressions in MachOp!" -- | Handle CmmLoad expression. -genLoad :: CmmExpr -> CmmType -> LlvmM ExprData +genLoad :: Atomic -> CmmExpr -> CmmType -> LlvmM ExprData -- First we try to detect a few common cases and produce better code for -- these then the default case. We are mostly trying to detect Cmm code -- like I32[Sp + n] and use 'getelementptr' operations instead of the -- generic case that uses casts and pointer arithmetic -genLoad e@(CmmReg (CmmGlobal r)) ty - = genLoad_fast e r 0 ty +genLoad atomic e@(CmmReg (CmmGlobal r)) ty + = genLoad_fast atomic e r 0 ty -genLoad e@(CmmRegOff (CmmGlobal r) n) ty - = genLoad_fast e r n ty +genLoad atomic e@(CmmRegOff (CmmGlobal r) n) ty + = genLoad_fast atomic e r n ty -genLoad e@(CmmMachOp (MO_Add _) [ +genLoad atomic e@(CmmMachOp (MO_Add _) [ (CmmReg (CmmGlobal r)), (CmmLit (CmmInt n _))]) ty - = genLoad_fast e r (fromInteger n) ty + = genLoad_fast atomic e r (fromInteger n) ty -genLoad e@(CmmMachOp (MO_Sub _) [ +genLoad atomic e@(CmmMachOp (MO_Sub _) [ (CmmReg (CmmGlobal r)), (CmmLit (CmmInt n _))]) ty - = genLoad_fast e r (negate $ fromInteger n) ty + = genLoad_fast atomic e r (negate $ fromInteger n) ty -- generic case -genLoad e ty +genLoad atomic e ty = do other <- getTBAAMeta otherN - genLoad_slow e ty other + genLoad_slow atomic e ty other -- | Handle CmmLoad expression. -- This is a special case for loading from a global register pointer -- offset such as I32[Sp+8]. -genLoad_fast :: CmmExpr -> GlobalReg -> Int -> CmmType - -> LlvmM ExprData -genLoad_fast e r n ty = do +genLoad_fast :: Atomic -> CmmExpr -> GlobalReg -> Int -> CmmType + -> LlvmM ExprData +genLoad_fast atomic e r n ty = do dflags <- getDynFlags (gv, grt, s1) <- getCmmRegVal (CmmGlobal r) meta <- getTBAARegMeta r @@ -1315,7 +1333,7 @@ genLoad_fast e r n ty = do case grt == ty' of -- were fine True -> do - (var, s3) <- doExpr ty' (MExpr meta $ Load ptr) + (var, s3) <- doExpr ty' (MExpr meta $ loadInstr ptr) return (var, s1 `snocOL` s2 `snocOL` s3, []) @@ -1323,32 +1341,34 @@ genLoad_fast e r n ty = do False -> do let pty = pLift ty' (ptr', s3) <- doExpr pty $ Cast LM_Bitcast ptr pty - (var, s4) <- doExpr ty' (MExpr meta $ Load ptr') + (var, s4) <- doExpr ty' (MExpr meta $ loadInstr ptr') return (var, s1 `snocOL` s2 `snocOL` s3 `snocOL` s4, []) -- If its a bit type then we use the slow method since -- we can't avoid casting anyway. - False -> genLoad_slow e ty meta - + False -> genLoad_slow atomic e ty meta + where + loadInstr ptr | atomic = ALoad SyncSeqCst False ptr + | otherwise = Load ptr -- | Handle Cmm load expression. -- Generic case. Uses casts and pointer arithmetic if needed. 
-genLoad_slow :: CmmExpr -> CmmType -> [MetaAnnot] -> LlvmM ExprData -genLoad_slow e ty meta = do +genLoad_slow :: Atomic -> CmmExpr -> CmmType -> [MetaAnnot] -> LlvmM ExprData +genLoad_slow atomic e ty meta = do (iptr, stmts, tops) <- exprToVar e dflags <- getDynFlags case getVarType iptr of LMPointer _ -> do (dvar, load) <- doExpr (cmmToLlvmType ty) - (MExpr meta $ Load iptr) + (MExpr meta $ loadInstr iptr) return (dvar, stmts `snocOL` load, tops) i@(LMInt _) | i == llvmWord dflags -> do let pty = LMPointer $ cmmToLlvmType ty (ptr, cast) <- doExpr pty $ Cast LM_Inttoptr iptr pty (dvar, load) <- doExpr (cmmToLlvmType ty) - (MExpr meta $ Load ptr) + (MExpr meta $ loadInstr ptr) return (dvar, stmts `snocOL` cast `snocOL` load, tops) other -> do dflags <- getDynFlags @@ -1357,6 +1377,9 @@ genLoad_slow e ty meta = do "Size of Ptr: " ++ show (llvmPtrBits dflags) ++ ", Size of var: " ++ show (llvmWidthInBits dflags other) ++ ", Var: " ++ showSDoc dflags (ppr iptr))) + where + loadInstr ptr | atomic = ALoad SyncSeqCst False ptr + | otherwise = Load ptr -- | Handle CmmReg expression. This will return a pointer to the stack diff --git a/compiler/nativeGen/CPrim.hs b/compiler/nativeGen/CPrim.hs index a6f4cab7bd..34782dfc1c 100644 --- a/compiler/nativeGen/CPrim.hs +++ b/compiler/nativeGen/CPrim.hs @@ -1,11 +1,16 @@ -- | Generating C symbol names emitted by the compiler. module CPrim - ( popCntLabel + ( atomicReadLabel + , atomicWriteLabel + , atomicRMWLabel + , cmpxchgLabel + , popCntLabel , bSwapLabel , word2FloatLabel ) where import CmmType +import CmmMachOp import Outputable popCntLabel :: Width -> String @@ -31,3 +36,46 @@ word2FloatLabel w = "hs_word2float" ++ pprWidth w pprWidth W32 = "32" pprWidth W64 = "64" pprWidth w = pprPanic "word2FloatLabel: Unsupported word width " (ppr w) + +atomicRMWLabel :: Width -> AtomicMachOp -> String +atomicRMWLabel w amop = "hs_atomic_" ++ pprFunName amop ++ pprWidth w + where + pprWidth W8 = "8" + pprWidth W16 = "16" + pprWidth W32 = "32" + pprWidth W64 = "64" + pprWidth w = pprPanic "atomicRMWLabel: Unsupported word width " (ppr w) + + pprFunName AMO_Add = "add" + pprFunName AMO_Sub = "sub" + pprFunName AMO_And = "and" + pprFunName AMO_Nand = "nand" + pprFunName AMO_Or = "or" + pprFunName AMO_Xor = "xor" + +cmpxchgLabel :: Width -> String +cmpxchgLabel w = "hs_cmpxchg" ++ pprWidth w + where + pprWidth W8 = "8" + pprWidth W16 = "16" + pprWidth W32 = "32" + pprWidth W64 = "64" + pprWidth w = pprPanic "cmpxchgLabel: Unsupported word width " (ppr w) + +atomicReadLabel :: Width -> String +atomicReadLabel w = "hs_atomicread" ++ pprWidth w + where + pprWidth W8 = "8" + pprWidth W16 = "16" + pprWidth W32 = "32" + pprWidth W64 = "64" + pprWidth w = pprPanic "atomicReadLabel: Unsupported word width " (ppr w) + +atomicWriteLabel :: Width -> String +atomicWriteLabel w = "hs_atomicwrite" ++ pprWidth w + where + pprWidth W8 = "8" + pprWidth W16 = "16" + pprWidth W32 = "32" + pprWidth W64 = "64" + pprWidth w = pprPanic "atomicWriteLabel: Unsupported word width " (ppr w) diff --git a/compiler/nativeGen/PPC/CodeGen.hs b/compiler/nativeGen/PPC/CodeGen.hs index 91651e6065..22a2c7cb6a 100644 --- a/compiler/nativeGen/PPC/CodeGen.hs +++ b/compiler/nativeGen/PPC/CodeGen.hs @@ -1160,6 +1160,10 @@ genCCall' dflags gcp target dest_regs args0 MO_BSwap w -> (fsLit $ bSwapLabel w, False) MO_PopCnt w -> (fsLit $ popCntLabel w, False) + MO_AtomicRMW w amop -> (fsLit $ atomicRMWLabel w amop, False) + MO_Cmpxchg w -> (fsLit $ cmpxchgLabel w, False) + MO_AtomicRead w -> (fsLit $ 
atomicReadLabel w, False) + MO_AtomicWrite w -> (fsLit $ atomicWriteLabel w, False) MO_S_QuotRem {} -> unsupported MO_U_QuotRem {} -> unsupported diff --git a/compiler/nativeGen/SPARC/CodeGen.hs b/compiler/nativeGen/SPARC/CodeGen.hs index f5e61d0a8f..51f89d629f 100644 --- a/compiler/nativeGen/SPARC/CodeGen.hs +++ b/compiler/nativeGen/SPARC/CodeGen.hs @@ -654,6 +654,10 @@ outOfLineMachOp_table mop MO_BSwap w -> fsLit $ bSwapLabel w MO_PopCnt w -> fsLit $ popCntLabel w + MO_AtomicRMW w amop -> fsLit $ atomicRMWLabel w amop + MO_Cmpxchg w -> fsLit $ cmpxchgLabel w + MO_AtomicRead w -> fsLit $ atomicReadLabel w + MO_AtomicWrite w -> fsLit $ atomicWriteLabel w MO_S_QuotRem {} -> unsupported MO_U_QuotRem {} -> unsupported diff --git a/compiler/nativeGen/X86/CodeGen.hs b/compiler/nativeGen/X86/CodeGen.hs index fa93767fa3..140e8b2992 100644 --- a/compiler/nativeGen/X86/CodeGen.hs +++ b/compiler/nativeGen/X86/CodeGen.hs @@ -1761,6 +1761,93 @@ genCCall dflags is32Bit (PrimTarget (MO_UF_Conv width)) dest_regs args = do where lbl = mkCmmCodeLabel primPackageId (fsLit (word2FloatLabel width)) +genCCall dflags _ (PrimTarget (MO_AtomicRMW width amop)) [dst] [addr, n] = do + Amode amode addr_code <- getAmode addr + arg <- getNewRegNat size + arg_code <- getAnyReg n + use_sse2 <- sse2Enabled + let platform = targetPlatform dflags + dst_r = getRegisterReg platform use_sse2 (CmmLocal dst) + code <- op_code dst_r arg amode + return $ addr_code `appOL` arg_code arg `appOL` code + where + -- Code for the operation + op_code :: Reg -- ^ The destination reg + -> Reg -- ^ Register containing argument + -> AddrMode -- ^ Address of location to mutate + -> NatM (OrdList Instr) + op_code dst_r arg amode = case amop of + -- In the common case where dst_r is a virtual register the + -- final move should go away, because it's the last use of arg + -- and the first use of dst_r. + AMO_Add -> return $ toOL [ LOCK + , XADD size (OpReg arg) (OpAddr amode) + , MOV size (OpReg arg) (OpReg dst_r) + ] + AMO_Sub -> return $ toOL [ NEGI size (OpReg arg) + , LOCK + , XADD size (OpReg arg) (OpAddr amode) + , MOV size (OpReg arg) (OpReg dst_r) + ] + AMO_And -> cmpxchg_code (\ src dst -> unitOL $ AND size src dst) + AMO_Nand -> cmpxchg_code (\ src dst -> toOL [ AND size src dst + , NOT size dst + ]) + AMO_Or -> cmpxchg_code (\ src dst -> unitOL $ OR size src dst) + AMO_Xor -> cmpxchg_code (\ src dst -> unitOL $ XOR size src dst) + where + -- Simulate operation that lacks a dedicated instruction using + -- cmpxchg. 
+ cmpxchg_code :: (Operand -> Operand -> OrdList Instr) + -> NatM (OrdList Instr) + cmpxchg_code instrs = do + lbl <- getBlockIdNat + tmp <- getNewRegNat size + return $ toOL + [ MOV size (OpAddr amode) (OpReg eax) + , JXX ALWAYS lbl + , NEWBLOCK lbl + -- Keep old value so we can return it: + , MOV size (OpReg eax) (OpReg dst_r) + , MOV size (OpReg eax) (OpReg tmp) + ] + `appOL` instrs (OpReg arg) (OpReg tmp) `appOL` toOL + [ LOCK + , CMPXCHG size (OpReg tmp) (OpAddr amode) + , JXX NE lbl + ] + + size = intSize width + +genCCall dflags _ (PrimTarget (MO_AtomicRead width)) [dst] [addr] = do + load_code <- intLoadCode (MOV (intSize width)) addr + let platform = targetPlatform dflags + use_sse2 <- sse2Enabled + return (load_code (getRegisterReg platform use_sse2 (CmmLocal dst))) + +genCCall _ _ (PrimTarget (MO_AtomicWrite width)) [] [addr, val] = do + assignMem_IntCode (intSize width) addr val + +genCCall dflags _ (PrimTarget (MO_Cmpxchg width)) [dst] [addr, old, new] = do + Amode amode addr_code <- getAmode addr + newval <- getNewRegNat size + newval_code <- getAnyReg new + oldval <- getNewRegNat size + oldval_code <- getAnyReg old + use_sse2 <- sse2Enabled + let platform = targetPlatform dflags + dst_r = getRegisterReg platform use_sse2 (CmmLocal dst) + code = toOL + [ MOV size (OpReg oldval) (OpReg eax) + , LOCK + , CMPXCHG size (OpReg newval) (OpAddr amode) + , MOV size (OpReg eax) (OpReg dst_r) + ] + return $ addr_code `appOL` newval_code newval `appOL` oldval_code oldval + `appOL` code + where + size = intSize width + genCCall _ is32Bit target dest_regs args | is32Bit = genCCall32 target dest_regs args | otherwise = genCCall64 target dest_regs args @@ -2385,6 +2472,11 @@ outOfLineCmmOp mop res args MO_PopCnt _ -> fsLit "popcnt" MO_BSwap _ -> fsLit "bswap" + MO_AtomicRMW _ _ -> fsLit "atomicrmw" + MO_AtomicRead _ -> fsLit "atomicread" + MO_AtomicWrite _ -> fsLit "atomicwrite" + MO_Cmpxchg _ -> fsLit "cmpxchg" + MO_UF_Conv _ -> unsupported MO_S_QuotRem {} -> unsupported diff --git a/compiler/nativeGen/X86/Instr.hs b/compiler/nativeGen/X86/Instr.hs index 05fff9be96..ac91747171 100644 --- a/compiler/nativeGen/X86/Instr.hs +++ b/compiler/nativeGen/X86/Instr.hs @@ -327,6 +327,10 @@ data Instr | PREFETCH PrefetchVariant Size Operand -- prefetch Variant, addr size, address to prefetch -- variant can be NTA, Lvl0, Lvl1, or Lvl2 + | LOCK -- lock prefix + | XADD Size Operand Operand -- src (r), dst (r/m) + | CMPXCHG Size Operand Operand -- src (r), dst (r/m), eax implicit + data PrefetchVariant = NTA | Lvl0 | Lvl1 | Lvl2 @@ -337,6 +341,8 @@ data Operand +-- | Returns which registers are read and written as a (read, written) +-- pair. x86_regUsageOfInstr :: Platform -> Instr -> RegUsage x86_regUsageOfInstr platform instr = case instr of @@ -428,10 +434,21 @@ x86_regUsageOfInstr platform instr -- note: might be a better way to do this PREFETCH _ _ src -> mkRU (use_R src []) [] + LOCK -> noUsage + XADD _ src dst -> usageMM src dst + CMPXCHG _ src dst -> usageRMM src dst (OpReg eax) _other -> panic "regUsage: unrecognised instr" - where + -- # Definitions + -- + -- Written: If the operand is a register, it's written. If it's an + -- address, registers mentioned in the address are read. + -- + -- Modified: If the operand is a register, it's both read and + -- written. If it's an address, registers mentioned in the address + -- are read. 
+ -- 2 operand form; first operand Read; second Written usageRW :: Operand -> Operand -> RegUsage usageRW op (OpReg reg) = mkRU (use_R op []) [reg] @@ -444,6 +461,18 @@ x86_regUsageOfInstr platform instr usageRM op (OpAddr ea) = mkRUR (use_R op $! use_EA ea []) usageRM _ _ = panic "X86.RegInfo.usageRM: no match" + -- 2 operand form; first operand Modified; second Modified + usageMM :: Operand -> Operand -> RegUsage + usageMM (OpReg src) (OpReg dst) = mkRU [src, dst] [src, dst] + usageMM (OpReg src) (OpAddr ea) = mkRU (use_EA ea [src]) [src] + usageMM _ _ = panic "X86.RegInfo.usageMM: no match" + + -- 3 operand form; first operand Read; second Modified; third Modified + usageRMM :: Operand -> Operand -> Operand -> RegUsage + usageRMM (OpReg src) (OpReg dst) (OpReg reg) = mkRU [src, dst, reg] [dst, reg] + usageRMM (OpReg src) (OpAddr ea) (OpReg reg) = mkRU (use_EA ea [src, reg]) [reg] + usageRMM _ _ _ = panic "X86.RegInfo.usageRMM: no match" + -- 1 operand form; operand Modified usageM :: Operand -> RegUsage usageM (OpReg reg) = mkRU [reg] [reg] @@ -476,6 +505,7 @@ x86_regUsageOfInstr platform instr where src' = filter (interesting platform) src dst' = filter (interesting platform) dst +-- | Is this register interesting for the register allocator? interesting :: Platform -> Reg -> Bool interesting _ (RegVirtual _) = True interesting platform (RegReal (RealRegSingle i)) = isFastTrue (freeReg platform i) @@ -483,6 +513,8 @@ interesting _ (RegReal (RealRegPair{})) = panic "X86.interesting: no re +-- | Applies the supplied function to all registers in instructions. +-- Typically used to change virtual registers to real registers. x86_patchRegsOfInstr :: Instr -> (Reg -> Reg) -> Instr x86_patchRegsOfInstr instr env = case instr of @@ -571,6 +603,10 @@ x86_patchRegsOfInstr instr env PREFETCH lvl size src -> PREFETCH lvl size (patchOp src) + LOCK -> instr + XADD sz src dst -> patch2 (XADD sz) src dst + CMPXCHG sz src dst -> patch2 (CMPXCHG sz) src dst + _other -> panic "patchRegs: unrecognised instr" where diff --git a/compiler/nativeGen/X86/Ppr.hs b/compiler/nativeGen/X86/Ppr.hs index 459c041ba5..7771c02512 100644 --- a/compiler/nativeGen/X86/Ppr.hs +++ b/compiler/nativeGen/X86/Ppr.hs @@ -886,6 +886,14 @@ pprInstr GFREE ptext (sLit "\tffree %st(4) ;ffree %st(5)") ] +-- Atomics + +pprInstr LOCK = ptext (sLit "\tlock") + +pprInstr (XADD size src dst) = pprSizeOpOp (sLit "xadd") size src dst + +pprInstr (CMPXCHG size src dst) = pprSizeOpOp (sLit "cmpxchg") size src dst + pprInstr _ = panic "X86.Ppr.pprInstr: no match" diff --git a/compiler/prelude/primops.txt.pp b/compiler/prelude/primops.txt.pp index 4851315eb4..4faa585246 100644 --- a/compiler/prelude/primops.txt.pp +++ b/compiler/prelude/primops.txt.pp @@ -1363,19 +1363,79 @@ primop SetByteArrayOp "setByteArray#" GenPrimOp code_size = { primOpCodeSizeForeignCall + 4 } can_fail = True +-- Atomic operations + +primop AtomicReadByteArrayOp_Int "atomicReadIntArray#" GenPrimOp + MutableByteArray# s -> Int# -> State# s -> (# State# s, Int# #) + {Given an array and an offset in Int units, read an element. The + index is assumed to be in bounds. Implies a full memory barrier.} + with has_side_effects = True + can_fail = True + +primop AtomicWriteByteArrayOp_Int "atomicWriteIntArray#" GenPrimOp + MutableByteArray# s -> Int# -> Int# -> State# s -> State# s + {Given an array and an offset in Int units, write an element. The + index is assumed to be in bounds. 
Implies a full memory barrier.} + with has_side_effects = True + can_fail = True + primop CasByteArrayOp_Int "casIntArray#" GenPrimOp MutableByteArray# s -> Int# -> Int# -> Int# -> State# s -> (# State# s, Int# #) - {Machine-level atomic compare and swap on a word within a ByteArray.} - with - out_of_line = True - has_side_effects = True + {Given an array, an offset in Int units, the expected old value, and + the new value, perform an atomic compare and swap i.e. write the new + value if the current value matches the provided old value. Returns + the value of the element before the operation. Implies a full memory + barrier.} + with has_side_effects = True + can_fail = True primop FetchAddByteArrayOp_Int "fetchAddIntArray#" GenPrimOp MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #) - {Machine-level word-sized fetch-and-add within a ByteArray.} - with - out_of_line = True - has_side_effects = True + {Given an array, and offset in Int units, and a value to add, + atomically add the value to the element. Returns the value of the + element before the operation. Implies a full memory barrier.} + with has_side_effects = True + can_fail = True + +primop FetchSubByteArrayOp_Int "fetchSubIntArray#" GenPrimOp + MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #) + {Given an array, and offset in Int units, and a value to subtract, + atomically substract the value to the element. Returns the value of + the element before the operation. Implies a full memory barrier.} + with has_side_effects = True + can_fail = True + +primop FetchAndByteArrayOp_Int "fetchAndIntArray#" GenPrimOp + MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #) + {Given an array, and offset in Int units, and a value to AND, + atomically AND the value to the element. Returns the value of the + element before the operation. Implies a full memory barrier.} + with has_side_effects = True + can_fail = True + +primop FetchNandByteArrayOp_Int "fetchNandIntArray#" GenPrimOp + MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #) + {Given an array, and offset in Int units, and a value to NAND, + atomically NAND the value to the element. Returns the value of the + element before the operation. Implies a full memory barrier.} + with has_side_effects = True + can_fail = True + +primop FetchOrByteArrayOp_Int "fetchOrIntArray#" GenPrimOp + MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #) + {Given an array, and offset in Int units, and a value to OR, + atomically OR the value to the element. Returns the value of the + element before the operation. Implies a full memory barrier.} + with has_side_effects = True + can_fail = True + +primop FetchXorByteArrayOp_Int "fetchXorIntArray#" GenPrimOp + MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #) + {Given an array, and offset in Int units, and a value to XOR, + atomically XOR the value to the element. Returns the value of the + element before the operation. Implies a full memory barrier.} + with has_side_effects = True + can_fail = True ------------------------------------------------------------------------ |
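
To make the compare-and-swap convention documented above concrete, here is a hedged sketch of a retry loop built on `casIntArray#` and `atomicReadIntArray#`: the primop returns the element's previous value, so the caller detects success by comparing that result against the expected old value. `atomicModifyIntArray` is a hypothetical helper, not something this patch introduces.

```haskell
{-# LANGUAGE MagicHash, UnboxedTuples #-}
-- Hedged sketch of a CAS retry loop over a MutableByteArray# element.
module CasLoop (atomicModifyIntArray) where

import GHC.Exts
import GHC.IO (IO (..))

-- Apply a pure function to the Int element at the given index,
-- retrying until the CAS succeeds; returns the value that was replaced.
atomicModifyIntArray :: MutableByteArray# RealWorld -> Int -> (Int -> Int) -> IO Int
atomicModifyIntArray mba (I# ix) f = IO loop
  where
    loop s0 =
      case atomicReadIntArray# mba ix s0 of          -- read the current value
        (# s1, old #) -> case f (I# old) of
          I# new ->
            case casIntArray# mba ix old new s1 of   -- try to install the new value
              (# s2, prev #)
                | isTrue# (prev ==# old) -> (# s2, I# old #)  -- CAS succeeded
                | otherwise              -> loop s2           -- lost a race: retry
```

With the X86 changes above, each `casIntArray#` here becomes an inline `lock cmpxchg` rather than a call to an out-of-line primop.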
