summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGeoffrey Mainland <gmainlan@microsoft.com>2012-10-19 09:06:17 +0100
committerGeoffrey Mainland <gmainlan@microsoft.com>2013-02-01 22:00:24 +0000
commit4af62075bbe9e96a3678fc90288496e0c4c7c17d (patch)
tree3df4fa03089310cd66678681a4ce78dd39bea25f
parent6480a35c15717025c169980b1cc763a7e6f36056 (diff)
downloadhaskell-4af62075bbe9e96a3678fc90288496e0c4c7c17d.tar.gz
Add the Float32X4# primitive type and associated primops.
This patch lays the groundwork needed for primop support for SIMD vectors. In addition to the groundwork, we add support for the FloatX4# primitive type and associated primops. * Add the FloatX4# primitive type and associated primops. * Add CodeGen support for Float vectors. * Compile vector operations to LLVM vector operations in the LLVM code generator. * Make the x86 native backend fail gracefully when encountering vector primops. * Only generate primop wrappers for vector primops when using LLVM.
-rw-r--r--compiler/cmm/CmmMachOp.hs29
-rw-r--r--compiler/cmm/PprC.hs30
-rw-r--r--compiler/codeGen/StgCmmPrim.hs474
-rw-r--r--compiler/llvmGen/LlvmCodeGen/CodeGen.hs43
-rw-r--r--compiler/nativeGen/X86/CodeGen.hs36
-rw-r--r--compiler/prelude/PrelNames.lhs5
-rw-r--r--compiler/prelude/TysPrim.lhs23
-rw-r--r--compiler/prelude/primops.txt.pp95
-rw-r--r--utils/genprimopcode/Main.hs28
9 files changed, 618 insertions, 145 deletions
diff --git a/compiler/cmm/CmmMachOp.hs b/compiler/cmm/CmmMachOp.hs
index a6c9beebc4..304dfb0938 100644
--- a/compiler/cmm/CmmMachOp.hs
+++ b/compiler/cmm/CmmMachOp.hs
@@ -103,6 +103,17 @@ data MachOp
| MO_SS_Conv Width Width -- Signed int -> Signed int
| MO_UU_Conv Width Width -- unsigned int -> unsigned int
| MO_FF_Conv Width Width -- Float -> Float
+
+ -- Vector element insertion and extraction operations
+ | MO_V_Insert Length Width -- Insert scalar into vector
+ | MO_V_Extract Length Width -- Extract scalar from vector
+
+ -- Floating point vector operations
+ | MO_VF_Add Length Width
+ | MO_VF_Sub Length Width
+ | MO_VF_Neg Length Width -- unary -
+ | MO_VF_Mul Length Width
+ | MO_VF_Quot Length Width
deriving (Eq, Show)
pprMachOp :: MachOp -> SDoc
@@ -338,6 +349,15 @@ machOpResultType dflags mop tys =
MO_FS_Conv _ to -> cmmBits to
MO_SF_Conv _ to -> cmmFloat to
MO_FF_Conv _ to -> cmmFloat to
+
+ MO_V_Insert {} -> ty1
+ MO_V_Extract {} -> vecElemType ty1
+
+ MO_VF_Add {} -> ty1
+ MO_VF_Sub {} -> ty1
+ MO_VF_Mul {} -> ty1
+ MO_VF_Quot {} -> ty1
+ MO_VF_Neg {} -> ty1
where
(ty1:_) = tys
@@ -405,6 +425,15 @@ machOpArgReps dflags op =
MO_FS_Conv from _ -> [from]
MO_FF_Conv from _ -> [from]
+ MO_V_Insert l r -> [typeWidth (vec l (cmmFloat r)),r,wordWidth dflags]
+ MO_V_Extract l r -> [typeWidth (vec l (cmmFloat r)),wordWidth dflags]
+
+ MO_VF_Add _ r -> [r,r]
+ MO_VF_Sub _ r -> [r,r]
+ MO_VF_Mul _ r -> [r,r]
+ MO_VF_Quot _ r -> [r,r]
+ MO_VF_Neg _ r -> [r]
+
-----------------------------------------------------------------------------
-- CallishMachOp
-----------------------------------------------------------------------------
diff --git a/compiler/cmm/PprC.hs b/compiler/cmm/PprC.hs
index 2ca8b67d72..b714e834b6 100644
--- a/compiler/cmm/PprC.hs
+++ b/compiler/cmm/PprC.hs
@@ -626,6 +626,36 @@ pprMachOp_for_C mop = case mop of
(panic $ "PprC.pprMachOp_for_C: MO_U_MulMayOflo"
++ " should have been handled earlier!")
+ MO_V_Insert {} -> pprTrace "offending mop:"
+ (ptext $ sLit "MO_V_Insert")
+ (panic $ "PprC.pprMachOp_for_C: MO_V_Insert"
+ ++ " should have been handled earlier!")
+ MO_V_Extract {} -> pprTrace "offending mop:"
+ (ptext $ sLit "MO_V_Extract")
+ (panic $ "PprC.pprMachOp_for_C: MO_V_Extract"
+ ++ " should have been handled earlier!")
+
+ MO_VF_Add {} -> pprTrace "offending mop:"
+ (ptext $ sLit "MO_VF_Add")
+ (panic $ "PprC.pprMachOp_for_C: MO_VF_Add"
+ ++ " should have been handled earlier!")
+ MO_VF_Sub {} -> pprTrace "offending mop:"
+ (ptext $ sLit "MO_VF_Sub")
+ (panic $ "PprC.pprMachOp_for_C: MO_VF_Sub"
+ ++ " should have been handled earlier!")
+ MO_VF_Neg {} -> pprTrace "offending mop:"
+ (ptext $ sLit "MO_VF_Neg")
+ (panic $ "PprC.pprMachOp_for_C: MO_VF_Neg"
+ ++ " should have been handled earlier!")
+ MO_VF_Mul {} -> pprTrace "offending mop:"
+ (ptext $ sLit "MO_VF_Mul")
+ (panic $ "PprC.pprMachOp_for_C: MO_VF_Mul"
+ ++ " should have been handled earlier!")
+ MO_VF_Quot {} -> pprTrace "offending mop:"
+ (ptext $ sLit "MO_VF_Quot")
+ (panic $ "PprC.pprMachOp_for_C: MO_VF_Quot"
+ ++ " should have been handled earlier!")
+
signedOp :: MachOp -> Bool -- Argument type(s) are signed ints
signedOp (MO_S_Quot _) = True
signedOp (MO_S_Rem _) = True
diff --git a/compiler/codeGen/StgCmmPrim.hs b/compiler/codeGen/StgCmmPrim.hs
index 986286647b..9a583b8354 100644
--- a/compiler/codeGen/StgCmmPrim.hs
+++ b/compiler/codeGen/StgCmmPrim.hs
@@ -365,117 +365,129 @@ emitPrimOp dflags [res] SizeofMutableArrayArrayOp [arg]
-- IndexXXXoffAddr
-emitPrimOp dflags res IndexOffAddrOp_Char args = doIndexOffAddrOp (Just (mo_u_8ToWord dflags)) b8 res args
-emitPrimOp dflags res IndexOffAddrOp_WideChar args = doIndexOffAddrOp (Just (mo_u_32ToWord dflags)) b32 res args
-emitPrimOp dflags res IndexOffAddrOp_Int args = doIndexOffAddrOp Nothing (bWord dflags) res args
-emitPrimOp dflags res IndexOffAddrOp_Word args = doIndexOffAddrOp Nothing (bWord dflags) res args
-emitPrimOp dflags res IndexOffAddrOp_Addr args = doIndexOffAddrOp Nothing (bWord dflags) res args
-emitPrimOp _ res IndexOffAddrOp_Float args = doIndexOffAddrOp Nothing f32 res args
-emitPrimOp _ res IndexOffAddrOp_Double args = doIndexOffAddrOp Nothing f64 res args
-emitPrimOp dflags res IndexOffAddrOp_StablePtr args = doIndexOffAddrOp Nothing (bWord dflags) res args
-emitPrimOp dflags res IndexOffAddrOp_Int8 args = doIndexOffAddrOp (Just (mo_s_8ToWord dflags)) b8 res args
-emitPrimOp dflags res IndexOffAddrOp_Int16 args = doIndexOffAddrOp (Just (mo_s_16ToWord dflags)) b16 res args
-emitPrimOp dflags res IndexOffAddrOp_Int32 args = doIndexOffAddrOp (Just (mo_s_32ToWord dflags)) b32 res args
-emitPrimOp _ res IndexOffAddrOp_Int64 args = doIndexOffAddrOp Nothing b64 res args
-emitPrimOp dflags res IndexOffAddrOp_Word8 args = doIndexOffAddrOp (Just (mo_u_8ToWord dflags)) b8 res args
-emitPrimOp dflags res IndexOffAddrOp_Word16 args = doIndexOffAddrOp (Just (mo_u_16ToWord dflags)) b16 res args
-emitPrimOp dflags res IndexOffAddrOp_Word32 args = doIndexOffAddrOp (Just (mo_u_32ToWord dflags)) b32 res args
-emitPrimOp _ res IndexOffAddrOp_Word64 args = doIndexOffAddrOp Nothing b64 res args
+emitPrimOp dflags res IndexOffAddrOp_Char args = doIndexOffAddrOp (Just (mo_u_8ToWord dflags)) b8 res args
+emitPrimOp dflags res IndexOffAddrOp_WideChar args = doIndexOffAddrOp (Just (mo_u_32ToWord dflags)) b32 res args
+emitPrimOp dflags res IndexOffAddrOp_Int args = doIndexOffAddrOp Nothing (bWord dflags) res args
+emitPrimOp dflags res IndexOffAddrOp_Word args = doIndexOffAddrOp Nothing (bWord dflags) res args
+emitPrimOp dflags res IndexOffAddrOp_Addr args = doIndexOffAddrOp Nothing (bWord dflags) res args
+emitPrimOp _ res IndexOffAddrOp_Float args = doIndexOffAddrOp Nothing f32 res args
+emitPrimOp _ res IndexOffAddrOp_Double args = doIndexOffAddrOp Nothing f64 res args
+emitPrimOp dflags res IndexOffAddrOp_StablePtr args = doIndexOffAddrOp Nothing (bWord dflags) res args
+emitPrimOp dflags res IndexOffAddrOp_Int8 args = doIndexOffAddrOp (Just (mo_s_8ToWord dflags)) b8 res args
+emitPrimOp dflags res IndexOffAddrOp_Int16 args = doIndexOffAddrOp (Just (mo_s_16ToWord dflags)) b16 res args
+emitPrimOp dflags res IndexOffAddrOp_Int32 args = doIndexOffAddrOp (Just (mo_s_32ToWord dflags)) b32 res args
+emitPrimOp _ res IndexOffAddrOp_Int64 args = doIndexOffAddrOp Nothing b64 res args
+emitPrimOp dflags res IndexOffAddrOp_Word8 args = doIndexOffAddrOp (Just (mo_u_8ToWord dflags)) b8 res args
+emitPrimOp dflags res IndexOffAddrOp_Word16 args = doIndexOffAddrOp (Just (mo_u_16ToWord dflags)) b16 res args
+emitPrimOp dflags res IndexOffAddrOp_Word32 args = doIndexOffAddrOp (Just (mo_u_32ToWord dflags)) b32 res args
+emitPrimOp _ res IndexOffAddrOp_Word64 args = doIndexOffAddrOp Nothing b64 res args
+emitPrimOp _ res IndexOffAddrOp_FloatX4 args = doIndexOffAddrOp Nothing vec4f32 res args
+emitPrimOp _ res IndexOffAddrOp_FloatAsFloatX4 args = doIndexOffAddrOpAs Nothing vec4f32 f32 res args
-- ReadXXXoffAddr, which are identical, for our purposes, to IndexXXXoffAddr.
-emitPrimOp dflags res ReadOffAddrOp_Char args = doIndexOffAddrOp (Just (mo_u_8ToWord dflags)) b8 res args
-emitPrimOp dflags res ReadOffAddrOp_WideChar args = doIndexOffAddrOp (Just (mo_u_32ToWord dflags)) b32 res args
-emitPrimOp dflags res ReadOffAddrOp_Int args = doIndexOffAddrOp Nothing (bWord dflags) res args
-emitPrimOp dflags res ReadOffAddrOp_Word args = doIndexOffAddrOp Nothing (bWord dflags) res args
-emitPrimOp dflags res ReadOffAddrOp_Addr args = doIndexOffAddrOp Nothing (bWord dflags) res args
-emitPrimOp _ res ReadOffAddrOp_Float args = doIndexOffAddrOp Nothing f32 res args
-emitPrimOp _ res ReadOffAddrOp_Double args = doIndexOffAddrOp Nothing f64 res args
-emitPrimOp dflags res ReadOffAddrOp_StablePtr args = doIndexOffAddrOp Nothing (bWord dflags) res args
-emitPrimOp dflags res ReadOffAddrOp_Int8 args = doIndexOffAddrOp (Just (mo_s_8ToWord dflags)) b8 res args
-emitPrimOp dflags res ReadOffAddrOp_Int16 args = doIndexOffAddrOp (Just (mo_s_16ToWord dflags)) b16 res args
-emitPrimOp dflags res ReadOffAddrOp_Int32 args = doIndexOffAddrOp (Just (mo_s_32ToWord dflags)) b32 res args
-emitPrimOp _ res ReadOffAddrOp_Int64 args = doIndexOffAddrOp Nothing b64 res args
-emitPrimOp dflags res ReadOffAddrOp_Word8 args = doIndexOffAddrOp (Just (mo_u_8ToWord dflags)) b8 res args
-emitPrimOp dflags res ReadOffAddrOp_Word16 args = doIndexOffAddrOp (Just (mo_u_16ToWord dflags)) b16 res args
-emitPrimOp dflags res ReadOffAddrOp_Word32 args = doIndexOffAddrOp (Just (mo_u_32ToWord dflags)) b32 res args
-emitPrimOp _ res ReadOffAddrOp_Word64 args = doIndexOffAddrOp Nothing b64 res args
+emitPrimOp dflags res ReadOffAddrOp_Char args = doIndexOffAddrOp (Just (mo_u_8ToWord dflags)) b8 res args
+emitPrimOp dflags res ReadOffAddrOp_WideChar args = doIndexOffAddrOp (Just (mo_u_32ToWord dflags)) b32 res args
+emitPrimOp dflags res ReadOffAddrOp_Int args = doIndexOffAddrOp Nothing (bWord dflags) res args
+emitPrimOp dflags res ReadOffAddrOp_Word args = doIndexOffAddrOp Nothing (bWord dflags) res args
+emitPrimOp dflags res ReadOffAddrOp_Addr args = doIndexOffAddrOp Nothing (bWord dflags) res args
+emitPrimOp _ res ReadOffAddrOp_Float args = doIndexOffAddrOp Nothing f32 res args
+emitPrimOp _ res ReadOffAddrOp_Double args = doIndexOffAddrOp Nothing f64 res args
+emitPrimOp dflags res ReadOffAddrOp_StablePtr args = doIndexOffAddrOp Nothing (bWord dflags) res args
+emitPrimOp dflags res ReadOffAddrOp_Int8 args = doIndexOffAddrOp (Just (mo_s_8ToWord dflags)) b8 res args
+emitPrimOp dflags res ReadOffAddrOp_Int16 args = doIndexOffAddrOp (Just (mo_s_16ToWord dflags)) b16 res args
+emitPrimOp dflags res ReadOffAddrOp_Int32 args = doIndexOffAddrOp (Just (mo_s_32ToWord dflags)) b32 res args
+emitPrimOp _ res ReadOffAddrOp_Int64 args = doIndexOffAddrOp Nothing b64 res args
+emitPrimOp dflags res ReadOffAddrOp_Word8 args = doIndexOffAddrOp (Just (mo_u_8ToWord dflags)) b8 res args
+emitPrimOp dflags res ReadOffAddrOp_Word16 args = doIndexOffAddrOp (Just (mo_u_16ToWord dflags)) b16 res args
+emitPrimOp dflags res ReadOffAddrOp_Word32 args = doIndexOffAddrOp (Just (mo_u_32ToWord dflags)) b32 res args
+emitPrimOp _ res ReadOffAddrOp_Word64 args = doIndexOffAddrOp Nothing b64 res args
+emitPrimOp _ res ReadOffAddrOp_FloatX4 args = doIndexOffAddrOp Nothing vec4f32 res args
+emitPrimOp _ res ReadOffAddrOp_FloatAsFloatX4 args = doIndexOffAddrOpAs Nothing vec4f32 b32 res args
-- IndexXXXArray
-emitPrimOp dflags res IndexByteArrayOp_Char args = doIndexByteArrayOp (Just (mo_u_8ToWord dflags)) b8 res args
-emitPrimOp dflags res IndexByteArrayOp_WideChar args = doIndexByteArrayOp (Just (mo_u_32ToWord dflags)) b32 res args
-emitPrimOp dflags res IndexByteArrayOp_Int args = doIndexByteArrayOp Nothing (bWord dflags) res args
-emitPrimOp dflags res IndexByteArrayOp_Word args = doIndexByteArrayOp Nothing (bWord dflags) res args
-emitPrimOp dflags res IndexByteArrayOp_Addr args = doIndexByteArrayOp Nothing (bWord dflags) res args
-emitPrimOp _ res IndexByteArrayOp_Float args = doIndexByteArrayOp Nothing f32 res args
-emitPrimOp _ res IndexByteArrayOp_Double args = doIndexByteArrayOp Nothing f64 res args
-emitPrimOp dflags res IndexByteArrayOp_StablePtr args = doIndexByteArrayOp Nothing (bWord dflags) res args
-emitPrimOp dflags res IndexByteArrayOp_Int8 args = doIndexByteArrayOp (Just (mo_s_8ToWord dflags)) b8 res args
-emitPrimOp dflags res IndexByteArrayOp_Int16 args = doIndexByteArrayOp (Just (mo_s_16ToWord dflags)) b16 res args
-emitPrimOp dflags res IndexByteArrayOp_Int32 args = doIndexByteArrayOp (Just (mo_s_32ToWord dflags)) b32 res args
-emitPrimOp _ res IndexByteArrayOp_Int64 args = doIndexByteArrayOp Nothing b64 res args
-emitPrimOp dflags res IndexByteArrayOp_Word8 args = doIndexByteArrayOp (Just (mo_u_8ToWord dflags)) b8 res args
-emitPrimOp dflags res IndexByteArrayOp_Word16 args = doIndexByteArrayOp (Just (mo_u_16ToWord dflags)) b16 res args
-emitPrimOp dflags res IndexByteArrayOp_Word32 args = doIndexByteArrayOp (Just (mo_u_32ToWord dflags)) b32 res args
-emitPrimOp _ res IndexByteArrayOp_Word64 args = doIndexByteArrayOp Nothing b64 res args
+emitPrimOp dflags res IndexByteArrayOp_Char args = doIndexByteArrayOp (Just (mo_u_8ToWord dflags)) b8 res args
+emitPrimOp dflags res IndexByteArrayOp_WideChar args = doIndexByteArrayOp (Just (mo_u_32ToWord dflags)) b32 res args
+emitPrimOp dflags res IndexByteArrayOp_Int args = doIndexByteArrayOp Nothing (bWord dflags) res args
+emitPrimOp dflags res IndexByteArrayOp_Word args = doIndexByteArrayOp Nothing (bWord dflags) res args
+emitPrimOp dflags res IndexByteArrayOp_Addr args = doIndexByteArrayOp Nothing (bWord dflags) res args
+emitPrimOp _ res IndexByteArrayOp_Float args = doIndexByteArrayOp Nothing f32 res args
+emitPrimOp _ res IndexByteArrayOp_Double args = doIndexByteArrayOp Nothing f64 res args
+emitPrimOp dflags res IndexByteArrayOp_StablePtr args = doIndexByteArrayOp Nothing (bWord dflags) res args
+emitPrimOp dflags res IndexByteArrayOp_Int8 args = doIndexByteArrayOp (Just (mo_s_8ToWord dflags)) b8 res args
+emitPrimOp dflags res IndexByteArrayOp_Int16 args = doIndexByteArrayOp (Just (mo_s_16ToWord dflags)) b16 res args
+emitPrimOp dflags res IndexByteArrayOp_Int32 args = doIndexByteArrayOp (Just (mo_s_32ToWord dflags)) b32 res args
+emitPrimOp _ res IndexByteArrayOp_Int64 args = doIndexByteArrayOp Nothing b64 res args
+emitPrimOp dflags res IndexByteArrayOp_Word8 args = doIndexByteArrayOp (Just (mo_u_8ToWord dflags)) b8 res args
+emitPrimOp dflags res IndexByteArrayOp_Word16 args = doIndexByteArrayOp (Just (mo_u_16ToWord dflags)) b16 res args
+emitPrimOp dflags res IndexByteArrayOp_Word32 args = doIndexByteArrayOp (Just (mo_u_32ToWord dflags)) b32 res args
+emitPrimOp _ res IndexByteArrayOp_Word64 args = doIndexByteArrayOp Nothing b64 res args
+emitPrimOp _ res IndexByteArrayOp_FloatX4 args = doIndexByteArrayOp Nothing vec4f32 res args
+emitPrimOp _ res IndexByteArrayOp_FloatAsFloatX4 args = doIndexByteArrayOpAs Nothing vec4f32 f32 res args
-- ReadXXXArray, identical to IndexXXXArray.
-emitPrimOp dflags res ReadByteArrayOp_Char args = doIndexByteArrayOp (Just (mo_u_8ToWord dflags)) b8 res args
-emitPrimOp dflags res ReadByteArrayOp_WideChar args = doIndexByteArrayOp (Just (mo_u_32ToWord dflags)) b32 res args
-emitPrimOp dflags res ReadByteArrayOp_Int args = doIndexByteArrayOp Nothing (bWord dflags) res args
-emitPrimOp dflags res ReadByteArrayOp_Word args = doIndexByteArrayOp Nothing (bWord dflags) res args
-emitPrimOp dflags res ReadByteArrayOp_Addr args = doIndexByteArrayOp Nothing (bWord dflags) res args
-emitPrimOp _ res ReadByteArrayOp_Float args = doIndexByteArrayOp Nothing f32 res args
-emitPrimOp _ res ReadByteArrayOp_Double args = doIndexByteArrayOp Nothing f64 res args
-emitPrimOp dflags res ReadByteArrayOp_StablePtr args = doIndexByteArrayOp Nothing (bWord dflags) res args
-emitPrimOp dflags res ReadByteArrayOp_Int8 args = doIndexByteArrayOp (Just (mo_s_8ToWord dflags)) b8 res args
-emitPrimOp dflags res ReadByteArrayOp_Int16 args = doIndexByteArrayOp (Just (mo_s_16ToWord dflags)) b16 res args
-emitPrimOp dflags res ReadByteArrayOp_Int32 args = doIndexByteArrayOp (Just (mo_s_32ToWord dflags)) b32 res args
-emitPrimOp _ res ReadByteArrayOp_Int64 args = doIndexByteArrayOp Nothing b64 res args
-emitPrimOp dflags res ReadByteArrayOp_Word8 args = doIndexByteArrayOp (Just (mo_u_8ToWord dflags)) b8 res args
-emitPrimOp dflags res ReadByteArrayOp_Word16 args = doIndexByteArrayOp (Just (mo_u_16ToWord dflags)) b16 res args
-emitPrimOp dflags res ReadByteArrayOp_Word32 args = doIndexByteArrayOp (Just (mo_u_32ToWord dflags)) b32 res args
-emitPrimOp _ res ReadByteArrayOp_Word64 args = doIndexByteArrayOp Nothing b64 res args
+emitPrimOp dflags res ReadByteArrayOp_Char args = doIndexByteArrayOp (Just (mo_u_8ToWord dflags)) b8 res args
+emitPrimOp dflags res ReadByteArrayOp_WideChar args = doIndexByteArrayOp (Just (mo_u_32ToWord dflags)) b32 res args
+emitPrimOp dflags res ReadByteArrayOp_Int args = doIndexByteArrayOp Nothing (bWord dflags) res args
+emitPrimOp dflags res ReadByteArrayOp_Word args = doIndexByteArrayOp Nothing (bWord dflags) res args
+emitPrimOp dflags res ReadByteArrayOp_Addr args = doIndexByteArrayOp Nothing (bWord dflags) res args
+emitPrimOp _ res ReadByteArrayOp_Float args = doIndexByteArrayOp Nothing f32 res args
+emitPrimOp _ res ReadByteArrayOp_Double args = doIndexByteArrayOp Nothing f64 res args
+emitPrimOp dflags res ReadByteArrayOp_StablePtr args = doIndexByteArrayOp Nothing (bWord dflags) res args
+emitPrimOp dflags res ReadByteArrayOp_Int8 args = doIndexByteArrayOp (Just (mo_s_8ToWord dflags)) b8 res args
+emitPrimOp dflags res ReadByteArrayOp_Int16 args = doIndexByteArrayOp (Just (mo_s_16ToWord dflags)) b16 res args
+emitPrimOp dflags res ReadByteArrayOp_Int32 args = doIndexByteArrayOp (Just (mo_s_32ToWord dflags)) b32 res args
+emitPrimOp _ res ReadByteArrayOp_Int64 args = doIndexByteArrayOp Nothing b64 res args
+emitPrimOp dflags res ReadByteArrayOp_Word8 args = doIndexByteArrayOp (Just (mo_u_8ToWord dflags)) b8 res args
+emitPrimOp dflags res ReadByteArrayOp_Word16 args = doIndexByteArrayOp (Just (mo_u_16ToWord dflags)) b16 res args
+emitPrimOp dflags res ReadByteArrayOp_Word32 args = doIndexByteArrayOp (Just (mo_u_32ToWord dflags)) b32 res args
+emitPrimOp _ res ReadByteArrayOp_Word64 args = doIndexByteArrayOp Nothing b64 res args
+emitPrimOp _ res ReadByteArrayOp_FloatX4 args = doIndexByteArrayOp Nothing vec4f32 res args
+emitPrimOp _ res ReadByteArrayOp_FloatAsFloatX4 args = doIndexByteArrayOpAs Nothing vec4f32 f32 res args
-- WriteXXXoffAddr
-emitPrimOp dflags res WriteOffAddrOp_Char args = doWriteOffAddrOp (Just (mo_WordTo8 dflags)) res args
-emitPrimOp dflags res WriteOffAddrOp_WideChar args = doWriteOffAddrOp (Just (mo_WordTo32 dflags)) res args
-emitPrimOp _ res WriteOffAddrOp_Int args = doWriteOffAddrOp Nothing res args
-emitPrimOp _ res WriteOffAddrOp_Word args = doWriteOffAddrOp Nothing res args
-emitPrimOp _ res WriteOffAddrOp_Addr args = doWriteOffAddrOp Nothing res args
-emitPrimOp _ res WriteOffAddrOp_Float args = doWriteOffAddrOp Nothing res args
-emitPrimOp _ res WriteOffAddrOp_Double args = doWriteOffAddrOp Nothing res args
-emitPrimOp _ res WriteOffAddrOp_StablePtr args = doWriteOffAddrOp Nothing res args
-emitPrimOp dflags res WriteOffAddrOp_Int8 args = doWriteOffAddrOp (Just (mo_WordTo8 dflags)) res args
-emitPrimOp dflags res WriteOffAddrOp_Int16 args = doWriteOffAddrOp (Just (mo_WordTo16 dflags)) res args
-emitPrimOp dflags res WriteOffAddrOp_Int32 args = doWriteOffAddrOp (Just (mo_WordTo32 dflags)) res args
-emitPrimOp _ res WriteOffAddrOp_Int64 args = doWriteOffAddrOp Nothing res args
-emitPrimOp dflags res WriteOffAddrOp_Word8 args = doWriteOffAddrOp (Just (mo_WordTo8 dflags)) res args
-emitPrimOp dflags res WriteOffAddrOp_Word16 args = doWriteOffAddrOp (Just (mo_WordTo16 dflags)) res args
-emitPrimOp dflags res WriteOffAddrOp_Word32 args = doWriteOffAddrOp (Just (mo_WordTo32 dflags)) res args
-emitPrimOp _ res WriteOffAddrOp_Word64 args = doWriteOffAddrOp Nothing res args
+emitPrimOp dflags res WriteOffAddrOp_Char args = doWriteOffAddrOp (Just (mo_WordTo8 dflags)) b8 res args
+emitPrimOp dflags res WriteOffAddrOp_WideChar args = doWriteOffAddrOp (Just (mo_WordTo32 dflags)) b32 res args
+emitPrimOp dflags res WriteOffAddrOp_Int args = doWriteOffAddrOp Nothing (bWord dflags) res args
+emitPrimOp dflags res WriteOffAddrOp_Word args = doWriteOffAddrOp Nothing (bWord dflags) res args
+emitPrimOp dflags res WriteOffAddrOp_Addr args = doWriteOffAddrOp Nothing (bWord dflags) res args
+emitPrimOp _ res WriteOffAddrOp_Float args = doWriteOffAddrOp Nothing f32 res args
+emitPrimOp _ res WriteOffAddrOp_Double args = doWriteOffAddrOp Nothing f64 res args
+emitPrimOp dflags res WriteOffAddrOp_StablePtr args = doWriteOffAddrOp Nothing (bWord dflags) res args
+emitPrimOp dflags res WriteOffAddrOp_Int8 args = doWriteOffAddrOp (Just (mo_WordTo8 dflags)) b8 res args
+emitPrimOp dflags res WriteOffAddrOp_Int16 args = doWriteOffAddrOp (Just (mo_WordTo16 dflags)) b16 res args
+emitPrimOp dflags res WriteOffAddrOp_Int32 args = doWriteOffAddrOp (Just (mo_WordTo32 dflags)) b32 res args
+emitPrimOp _ res WriteOffAddrOp_Int64 args = doWriteOffAddrOp Nothing b64 res args
+emitPrimOp dflags res WriteOffAddrOp_Word8 args = doWriteOffAddrOp (Just (mo_WordTo8 dflags)) b8 res args
+emitPrimOp dflags res WriteOffAddrOp_Word16 args = doWriteOffAddrOp (Just (mo_WordTo16 dflags)) b16 res args
+emitPrimOp dflags res WriteOffAddrOp_Word32 args = doWriteOffAddrOp (Just (mo_WordTo32 dflags)) b32 res args
+emitPrimOp _ res WriteOffAddrOp_Word64 args = doWriteOffAddrOp Nothing b64 res args
+emitPrimOp _ res WriteOffAddrOp_FloatX4 args = doWriteOffAddrOp Nothing vec4f32 res args
+emitPrimOp _ res WriteOffAddrOp_FloatAsFloatX4 args = doWriteOffAddrOp Nothing f32 res args
-- WriteXXXArray
-emitPrimOp dflags res WriteByteArrayOp_Char args = doWriteByteArrayOp (Just (mo_WordTo8 dflags)) res args
-emitPrimOp dflags res WriteByteArrayOp_WideChar args = doWriteByteArrayOp (Just (mo_WordTo32 dflags)) res args
-emitPrimOp _ res WriteByteArrayOp_Int args = doWriteByteArrayOp Nothing res args
-emitPrimOp _ res WriteByteArrayOp_Word args = doWriteByteArrayOp Nothing res args
-emitPrimOp _ res WriteByteArrayOp_Addr args = doWriteByteArrayOp Nothing res args
-emitPrimOp _ res WriteByteArrayOp_Float args = doWriteByteArrayOp Nothing res args
-emitPrimOp _ res WriteByteArrayOp_Double args = doWriteByteArrayOp Nothing res args
-emitPrimOp _ res WriteByteArrayOp_StablePtr args = doWriteByteArrayOp Nothing res args
-emitPrimOp dflags res WriteByteArrayOp_Int8 args = doWriteByteArrayOp (Just (mo_WordTo8 dflags)) res args
-emitPrimOp dflags res WriteByteArrayOp_Int16 args = doWriteByteArrayOp (Just (mo_WordTo16 dflags)) res args
-emitPrimOp dflags res WriteByteArrayOp_Int32 args = doWriteByteArrayOp (Just (mo_WordTo32 dflags)) res args
-emitPrimOp _ res WriteByteArrayOp_Int64 args = doWriteByteArrayOp Nothing res args
-emitPrimOp dflags res WriteByteArrayOp_Word8 args = doWriteByteArrayOp (Just (mo_WordTo8 dflags)) res args
-emitPrimOp dflags res WriteByteArrayOp_Word16 args = doWriteByteArrayOp (Just (mo_WordTo16 dflags)) res args
-emitPrimOp dflags res WriteByteArrayOp_Word32 args = doWriteByteArrayOp (Just (mo_WordTo32 dflags)) res args
-emitPrimOp _ res WriteByteArrayOp_Word64 args = doWriteByteArrayOp Nothing res args
+emitPrimOp dflags res WriteByteArrayOp_Char args = doWriteByteArrayOp (Just (mo_WordTo8 dflags)) b8 res args
+emitPrimOp dflags res WriteByteArrayOp_WideChar args = doWriteByteArrayOp (Just (mo_WordTo32 dflags)) b32 res args
+emitPrimOp dflags res WriteByteArrayOp_Int args = doWriteByteArrayOp Nothing (bWord dflags) res args
+emitPrimOp dflags res WriteByteArrayOp_Word args = doWriteByteArrayOp Nothing (bWord dflags) res args
+emitPrimOp dflags res WriteByteArrayOp_Addr args = doWriteByteArrayOp Nothing (bWord dflags) res args
+emitPrimOp _ res WriteByteArrayOp_Float args = doWriteByteArrayOp Nothing f32 res args
+emitPrimOp _ res WriteByteArrayOp_Double args = doWriteByteArrayOp Nothing f64 res args
+emitPrimOp dflags res WriteByteArrayOp_StablePtr args = doWriteByteArrayOp Nothing (bWord dflags) res args
+emitPrimOp dflags res WriteByteArrayOp_Int8 args = doWriteByteArrayOp (Just (mo_WordTo8 dflags)) b8 res args
+emitPrimOp dflags res WriteByteArrayOp_Int16 args = doWriteByteArrayOp (Just (mo_WordTo16 dflags)) b16 res args
+emitPrimOp dflags res WriteByteArrayOp_Int32 args = doWriteByteArrayOp (Just (mo_WordTo32 dflags)) b32 res args
+emitPrimOp _ res WriteByteArrayOp_Int64 args = doWriteByteArrayOp Nothing b64 res args
+emitPrimOp dflags res WriteByteArrayOp_Word8 args = doWriteByteArrayOp (Just (mo_WordTo8 dflags)) b8 res args
+emitPrimOp dflags res WriteByteArrayOp_Word16 args = doWriteByteArrayOp (Just (mo_WordTo16 dflags)) b16 res args
+emitPrimOp dflags res WriteByteArrayOp_Word32 args = doWriteByteArrayOp (Just (mo_WordTo32 dflags)) b32 res args
+emitPrimOp _ res WriteByteArrayOp_Word64 args = doWriteByteArrayOp Nothing b64 res args
+emitPrimOp _ res WriteByteArrayOp_FloatX4 args = doWriteByteArrayOp Nothing vec4f32 res args
+emitPrimOp _ res WriteByteArrayOp_FloatAsFloatX4 args = doWriteByteArrayOp Nothing f32 res args
-- Copying and setting byte arrays
emitPrimOp _ [] CopyByteArrayOp [src,src_off,dst,dst_off,n] =
@@ -498,6 +510,25 @@ emitPrimOp _ [res] Word2FloatOp [w] = emitPrimCall [res]
emitPrimOp _ [res] Word2DoubleOp [w] = emitPrimCall [res]
(MO_UF_Conv W64) [w]
+-- SIMD vector packing and unpacking
+emitPrimOp _ [res] FloatToFloatX4Op [e] =
+ doVecPackOp Nothing vec4f32 zero [e,e,e,e] res
+ where
+ zero :: CmmExpr
+ zero = CmmLit $ CmmVec (replicate 4 (CmmFloat 0 W32))
+
+emitPrimOp _ [res] FloatX4PackOp es@[_,_,_,_] =
+ doVecPackOp Nothing vec4f32 zero es res
+ where
+ zero :: CmmExpr
+ zero = CmmLit $ CmmVec (replicate 4 (CmmFloat 0 W32))
+
+emitPrimOp _ res@[_,_,_,_] FloatX4UnpackOp [arg] =
+ doVecUnpackOp Nothing vec4f32 arg res
+
+emitPrimOp _ [res] FloatX4InsertOp [v,e,i] =
+ doVecInsertOp Nothing vec4f32 v e i res
+
-- The rest just translate straightforwardly
emitPrimOp dflags [res] op [arg]
| nopOp op
@@ -804,6 +835,14 @@ translateOp _ FloatMulOp = Just (MO_F_Mul W32)
translateOp _ FloatDivOp = Just (MO_F_Quot W32)
translateOp _ FloatNegOp = Just (MO_F_Neg W32)
+-- Floating point vector ops
+
+translateOp _ FloatX4AddOp = Just (MO_VF_Add 4 W32)
+translateOp _ FloatX4SubOp = Just (MO_VF_Sub 4 W32)
+translateOp _ FloatX4MulOp = Just (MO_VF_Mul 4 W32)
+translateOp _ FloatX4DivOp = Just (MO_VF_Quot 4 W32)
+translateOp _ FloatX4NegOp = Just (MO_VF_Neg 4 W32)
+
-- Conversions
translateOp dflags Int2DoubleOp = Just (MO_SF_Conv (wordWidth dflags) W64)
@@ -864,42 +903,87 @@ callishOp _ = Nothing
------------------------------------------------------------------------------
-- Helpers for translating various minor variants of array indexing.
-doIndexOffAddrOp :: Maybe MachOp -> CmmType -> [LocalReg] -> [CmmExpr] -> FCode ()
+doIndexOffAddrOp :: Maybe MachOp
+ -> CmmType
+ -> [LocalReg]
+ -> [CmmExpr]
+ -> FCode ()
doIndexOffAddrOp maybe_post_read_cast rep [res] [addr,idx]
- = mkBasicIndexedRead 0 maybe_post_read_cast rep res addr idx
+ = mkBasicIndexedRead 0 maybe_post_read_cast rep res addr rep idx
doIndexOffAddrOp _ _ _ _
- = panic "CgPrimOp: doIndexOffAddrOp"
-
-doIndexByteArrayOp :: Maybe MachOp -> CmmType -> [LocalReg] -> [CmmExpr] -> FCode ()
+ = panic "StgCmmPrim: doIndexOffAddrOp"
+
+doIndexOffAddrOpAs :: Maybe MachOp
+ -> CmmType
+ -> CmmType
+ -> [LocalReg]
+ -> [CmmExpr]
+ -> FCode ()
+doIndexOffAddrOpAs maybe_post_read_cast rep idx_rep [res] [addr,idx]
+ = mkBasicIndexedRead 0 maybe_post_read_cast rep res addr idx_rep idx
+doIndexOffAddrOpAs _ _ _ _ _
+ = panic "StgCmmPrim: doIndexOffAddrOpAs"
+
+doIndexByteArrayOp :: Maybe MachOp
+ -> CmmType
+ -> [LocalReg]
+ -> [CmmExpr]
+ -> FCode ()
doIndexByteArrayOp maybe_post_read_cast rep [res] [addr,idx]
= do dflags <- getDynFlags
- mkBasicIndexedRead (arrWordsHdrSize dflags) maybe_post_read_cast rep res addr idx
-doIndexByteArrayOp _ _ _ _
- = panic "CgPrimOp: doIndexByteArrayOp"
+ mkBasicIndexedRead (arrWordsHdrSize dflags) maybe_post_read_cast rep res addr rep idx
+doIndexByteArrayOp _ _ _ _
+ = panic "StgCmmPrim: doIndexByteArrayOp"
+
+doIndexByteArrayOpAs :: Maybe MachOp
+ -> CmmType
+ -> CmmType
+ -> [LocalReg]
+ -> [CmmExpr]
+ -> FCode ()
+doIndexByteArrayOpAs maybe_post_read_cast rep idx_rep [res] [addr,idx]
+ = do dflags <- getDynFlags
+ mkBasicIndexedRead (arrWordsHdrSize dflags) maybe_post_read_cast rep res addr idx_rep idx
+doIndexByteArrayOpAs _ _ _ _ _
+ = panic "StgCmmPrim: doIndexByteArrayOpAs"
-doReadPtrArrayOp :: LocalReg -> CmmExpr -> CmmExpr -> FCode ()
+doReadPtrArrayOp :: LocalReg
+ -> CmmExpr
+ -> CmmExpr
+ -> FCode ()
doReadPtrArrayOp res addr idx
= do dflags <- getDynFlags
- mkBasicIndexedRead (arrPtrsHdrSize dflags) Nothing (gcWord dflags) res addr idx
-
-
-doWriteOffAddrOp :: Maybe MachOp -> [LocalReg] -> [CmmExpr] -> FCode ()
-doWriteOffAddrOp maybe_pre_write_cast [] [addr,idx,val]
- = mkBasicIndexedWrite 0 maybe_pre_write_cast addr idx val
-doWriteOffAddrOp _ _ _
- = panic "CgPrimOp: doWriteOffAddrOp"
+ mkBasicIndexedRead (arrPtrsHdrSize dflags) Nothing (gcWord dflags) res addr (gcWord dflags) idx
-doWriteByteArrayOp :: Maybe MachOp -> [LocalReg] -> [CmmExpr] -> FCode ()
-doWriteByteArrayOp maybe_pre_write_cast [] [addr,idx,val]
+doWriteOffAddrOp :: Maybe MachOp
+ -> CmmType
+ -> [LocalReg]
+ -> [CmmExpr]
+ -> FCode ()
+doWriteOffAddrOp maybe_pre_write_cast idx_ty [] [addr,idx,val]
+ = mkBasicIndexedWrite 0 maybe_pre_write_cast addr idx_ty idx val
+doWriteOffAddrOp _ _ _ _
+ = panic "StgCmmPrim: doWriteOffAddrOp"
+
+doWriteByteArrayOp :: Maybe MachOp
+ -> CmmType
+ -> [LocalReg]
+ -> [CmmExpr]
+ -> FCode ()
+doWriteByteArrayOp maybe_pre_write_cast idx_ty [] [addr,idx,val]
= do dflags <- getDynFlags
- mkBasicIndexedWrite (arrWordsHdrSize dflags) maybe_pre_write_cast addr idx val
-doWriteByteArrayOp _ _ _
- = panic "CgPrimOp: doWriteByteArrayOp"
+ mkBasicIndexedWrite (arrWordsHdrSize dflags) maybe_pre_write_cast addr idx_ty idx val
+doWriteByteArrayOp _ _ _ _
+ = panic "StgCmmPrim: doWriteByteArrayOp"
-doWritePtrArrayOp :: CmmExpr -> CmmExpr -> CmmExpr -> FCode ()
+doWritePtrArrayOp :: CmmExpr
+ -> CmmExpr
+ -> CmmExpr
+ -> FCode ()
doWritePtrArrayOp addr idx val
= do dflags <- getDynFlags
- mkBasicIndexedWrite (arrPtrsHdrSize dflags) Nothing addr idx val
+ let ty = cmmExprType dflags val
+ mkBasicIndexedWrite (arrPtrsHdrSize dflags) Nothing addr ty idx val
emit (setInfo addr (CmmLit (CmmLabel mkMAP_DIRTY_infoLabel)))
-- the write barrier. We must write a byte into the mark table:
-- bits8[a + header_size + StgMutArrPtrs_size(a) + x >> N]
@@ -915,38 +999,154 @@ loadArrPtrsSize :: DynFlags -> CmmExpr -> CmmExpr
loadArrPtrsSize dflags addr = CmmLoad (cmmOffsetB dflags addr off) (bWord dflags)
where off = fixedHdrSize dflags * wORD_SIZE dflags + oFFSET_StgMutArrPtrs_ptrs dflags
-mkBasicIndexedRead :: ByteOff -> Maybe MachOp -> CmmType
- -> LocalReg -> CmmExpr -> CmmExpr -> FCode ()
-mkBasicIndexedRead off Nothing read_rep res base idx
+mkBasicIndexedRead :: ByteOff -- Initial offset in bytes
+ -> Maybe MachOp -- Optional result cast
+ -> CmmType -- Type of element we are accessing
+ -> LocalReg -- Destination
+ -> CmmExpr -- Base address
+ -> CmmType -- Type of element by which we are indexing
+ -> CmmExpr -- Index
+ -> FCode ()
+mkBasicIndexedRead off Nothing ty res base idx_ty idx
= do dflags <- getDynFlags
- emitAssign (CmmLocal res) (cmmLoadIndexOffExpr dflags off read_rep base idx)
-mkBasicIndexedRead off (Just cast) read_rep res base idx
+ emitAssign (CmmLocal res) (cmmLoadIndexOffExpr dflags off ty base idx_ty idx)
+mkBasicIndexedRead off (Just cast) ty res base idx_ty idx
= do dflags <- getDynFlags
emitAssign (CmmLocal res) (CmmMachOp cast [
- cmmLoadIndexOffExpr dflags off read_rep base idx])
-
-mkBasicIndexedWrite :: ByteOff -> Maybe MachOp
- -> CmmExpr -> CmmExpr -> CmmExpr -> FCode ()
-mkBasicIndexedWrite off Nothing base idx val
+ cmmLoadIndexOffExpr dflags off ty base idx_ty idx])
+
+mkBasicIndexedWrite :: ByteOff -- Initial offset in bytes
+ -> Maybe MachOp -- Optional value cast
+ -> CmmExpr -- Base address
+ -> CmmType -- Type of element by which we are indexing
+ -> CmmExpr -- Index
+ -> CmmExpr -- Value to write
+ -> FCode ()
+mkBasicIndexedWrite off Nothing base idx_ty idx val
= do dflags <- getDynFlags
- emitStore (cmmIndexOffExpr dflags off (typeWidth (cmmExprType dflags val)) base idx) val
-mkBasicIndexedWrite off (Just cast) base idx val
- = mkBasicIndexedWrite off Nothing base idx (CmmMachOp cast [val])
+ emitStore (cmmIndexOffExpr dflags off (typeWidth idx_ty) base idx) val
+mkBasicIndexedWrite off (Just cast) base idx_ty idx val
+ = mkBasicIndexedWrite off Nothing base idx_ty idx (CmmMachOp cast [val])
-- ----------------------------------------------------------------------------
-- Misc utils
-cmmIndexOffExpr :: DynFlags -> ByteOff -> Width -> CmmExpr -> CmmExpr -> CmmExpr
+cmmIndexOffExpr :: DynFlags
+ -> ByteOff -- Initial offset in bytes
+ -> Width -- Width of element by which we are indexing
+ -> CmmExpr -- Base address
+ -> CmmExpr -- Index
+ -> CmmExpr
cmmIndexOffExpr dflags off width base idx
= cmmIndexExpr dflags width (cmmOffsetB dflags base off) idx
-cmmLoadIndexOffExpr :: DynFlags -> ByteOff -> CmmType -> CmmExpr -> CmmExpr -> CmmExpr
-cmmLoadIndexOffExpr dflags off ty base idx
- = CmmLoad (cmmIndexOffExpr dflags off (typeWidth ty) base idx) ty
+cmmLoadIndexOffExpr :: DynFlags
+ -> ByteOff -- Initial offset in bytes
+ -> CmmType -- Type of element we are accessing
+ -> CmmExpr -- Base address
+ -> CmmType -- Type of element by which we are indexing
+ -> CmmExpr -- Index
+ -> CmmExpr
+cmmLoadIndexOffExpr dflags off ty base idx_ty idx
+ = CmmLoad (cmmIndexOffExpr dflags off (typeWidth idx_ty) base idx) ty
setInfo :: CmmExpr -> CmmExpr -> CmmAGraph
setInfo closure_ptr info_ptr = mkStore closure_ptr info_ptr
+------------------------------------------------------------------------------
+-- Helpers for translating vector packing and unpacking.
+
+doVecPackOp :: Maybe MachOp -- Cast from element to vector component
+ -> CmmType -- Type of vector
+ -> CmmExpr -- Initial vector
+ -> [CmmExpr] -- Elements
+ -> CmmFormal -- Destination for result
+ -> FCode ()
+doVecPackOp maybe_pre_write_cast ty z es res = do
+ dst <- newTemp ty
+ emitAssign (CmmLocal dst) z
+ vecPack dst es 0
+ where
+ vecPack :: CmmFormal -> [CmmExpr] -> Int -> FCode ()
+ vecPack src [] _ =
+ emitAssign (CmmLocal res) (CmmReg (CmmLocal src))
+
+ vecPack src (e : es) i = do
+ dst <- newTemp ty
+ emitAssign (CmmLocal dst) (CmmMachOp (MO_V_Insert len wid)
+ [CmmReg (CmmLocal src), cast e, iLit])
+ vecPack dst es (i + 1)
+ where
+ -- vector indices are always 32-bits
+ iLit = CmmLit (CmmInt (toInteger i) W32)
+
+ cast :: CmmExpr -> CmmExpr
+ cast val = case maybe_pre_write_cast of
+ Nothing -> val
+ Just cast -> CmmMachOp cast [val]
+
+ len :: Length
+ len = vecLength ty
+
+ wid :: Width
+ wid = typeWidth (vecElemType ty)
+
+doVecUnpackOp :: Maybe MachOp -- Cast from vector component to element result
+ -> CmmType -- Type of vector
+ -> CmmExpr -- Vector
+ -> [CmmFormal] -- Element results
+ -> FCode ()
+doVecUnpackOp maybe_post_read_cast ty e res =
+ vecUnpack res 0
+ where
+ vecUnpack :: [CmmFormal] -> Int -> FCode ()
+ vecUnpack [] _ =
+ return ()
+
+ vecUnpack (r : rs) i = do
+ emitAssign (CmmLocal r) (cast (CmmMachOp (MO_V_Extract len wid)
+ [e, iLit]))
+ vecUnpack rs (i + 1)
+ where
+ -- vector indices are always 32-bits
+ iLit = CmmLit (CmmInt (toInteger i) W32)
+
+ cast :: CmmExpr -> CmmExpr
+ cast val = case maybe_post_read_cast of
+ Nothing -> val
+ Just cast -> CmmMachOp cast [val]
+
+ len :: Length
+ len = vecLength ty
+
+ wid :: Width
+ wid = typeWidth (vecElemType ty)
+
+doVecInsertOp :: Maybe MachOp -- Cast from element to vector component
+ -> CmmType -- Vector type
+ -> CmmExpr -- Source vector
+ -> CmmExpr -- Element
+ -> CmmExpr -- Index at which to insert element
+ -> CmmFormal -- Destination for result
+ -> FCode ()
+doVecInsertOp maybe_pre_write_cast ty src e idx res = do
+ dflags <- getDynFlags
+ -- vector indices are always 32-bits
+ let idx' :: CmmExpr
+ idx' = CmmMachOp (MO_SS_Conv (wordWidth dflags) W32) [idx]
+ emitAssign (CmmLocal res) (CmmMachOp (MO_V_Insert len wid) [src, cast e, idx'])
+ where
+ cast :: CmmExpr -> CmmExpr
+ cast val = case maybe_pre_write_cast of
+ Nothing -> val
+ Just cast -> CmmMachOp cast [val]
+
+ len :: Length
+ len = vecLength ty
+
+ wid :: Width
+ wid = typeWidth (vecElemType ty)
+
-- ----------------------------------------------------------------------------
-- Copying byte arrays
diff --git a/compiler/llvmGen/LlvmCodeGen/CodeGen.hs b/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
index b7e085116d..cd864ca1a2 100644
--- a/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
+++ b/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
@@ -879,6 +879,13 @@ genMachOp env _ op [x] = case op of
MO_FF_Conv from to
-> sameConv from (widthToLlvmFloat to) LM_Fptrunc LM_Fpext
+ MO_VF_Neg len w ->
+ let ty = widthToLlvmFloat w
+ vecty = LMVector len ty
+ all0 = LMFloatLit (-0) ty
+ all0s = LMLitVar $ LMVectorLit (replicate len all0)
+ in negate vecty all0s LM_MO_FSub
+
-- Handle unsupported cases explicitly so we get a warning
-- of missing case when new MachOps added
MO_Add _ -> panicOp
@@ -919,6 +926,14 @@ genMachOp env _ op [x] = case op of
MO_Shl _ -> panicOp
MO_U_Shr _ -> panicOp
MO_S_Shr _ -> panicOp
+
+ MO_V_Insert _ _ -> panicOp
+ MO_V_Extract _ _ -> panicOp
+
+ MO_VF_Add _ _ -> panicOp
+ MO_VF_Sub _ _ -> panicOp
+ MO_VF_Mul _ _ -> panicOp
+ MO_VF_Quot _ _ -> panicOp
where
dflags = getDflags env
@@ -984,6 +999,24 @@ genMachOp_fast env opt op r n e
-- This handles all the cases not handle by the specialised genMachOp_fast.
genMachOp_slow :: LlvmEnv -> EOption -> MachOp -> [CmmExpr] -> UniqSM ExprData
+-- Element extraction
+genMachOp_slow env _ (MO_V_Extract {}) [val, idx] = do
+ (env1, vval, stmts1, top1) <- exprToVar env val
+ (env2, vidx, stmts2, top2) <- exprToVar env1 idx
+ let (LMVector _ ty) = getVarType vval
+ (v1, s1) <- doExpr ty $ Extract vval vidx
+ return (env2, v1, stmts1 `appOL` stmts2 `snocOL` s1, top1 ++ top2)
+
+-- Element insertion
+genMachOp_slow env _ (MO_V_Insert {}) [val, elt, idx] = do
+ (env1, vval, stmts1, top1) <- exprToVar env val
+ (env2, velt, stmts2, top2) <- exprToVar env1 elt
+ (env3, vidx, stmts3, top3) <- exprToVar env2 idx
+ let ty = getVarType vval
+ (v1, s1) <- doExpr ty $ Insert vval velt vidx
+ return (env3, v1, stmts1 `appOL` stmts2 `appOL` stmts3 `snocOL` s1,
+ top1 ++ top2 ++ top3)
+
-- Binary MachOp
genMachOp_slow env opt op [x, y] = case op of
@@ -1032,6 +1065,11 @@ genMachOp_slow env opt op [x, y] = case op of
MO_Shl _ -> genBinMach LM_MO_Shl
MO_U_Shr _ -> genBinMach LM_MO_LShr
MO_S_Shr _ -> genBinMach LM_MO_AShr
+
+ MO_VF_Add _ _ -> genBinMach LM_MO_FAdd
+ MO_VF_Sub _ _ -> genBinMach LM_MO_FSub
+ MO_VF_Mul _ _ -> genBinMach LM_MO_FMul
+ MO_VF_Quot _ _ -> genBinMach LM_MO_FDiv
MO_Not _ -> panicOp
MO_S_Neg _ -> panicOp
@@ -1043,6 +1081,11 @@ genMachOp_slow env opt op [x, y] = case op of
MO_UU_Conv _ _ -> panicOp
MO_FF_Conv _ _ -> panicOp
+ MO_V_Insert {} -> panicOp
+ MO_V_Extract {} -> panicOp
+
+ MO_VF_Neg {} -> panicOp
+
where
dflags = getDflags env
diff --git a/compiler/nativeGen/X86/CodeGen.hs b/compiler/nativeGen/X86/CodeGen.hs
index 5d904204ac..46e8e9b81f 100644
--- a/compiler/nativeGen/X86/CodeGen.hs
+++ b/compiler/nativeGen/X86/CodeGen.hs
@@ -602,6 +602,14 @@ getRegister' dflags is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
MO_FS_Conv from to -> coerceFP2Int from to x
MO_SF_Conv from to -> coerceInt2FP from to x
+ MO_V_Insert {} -> needLlvm
+ MO_V_Extract {} -> needLlvm
+ MO_VF_Add {} -> needLlvm
+ MO_VF_Sub {} -> needLlvm
+ MO_VF_Mul {} -> needLlvm
+ MO_VF_Quot {} -> needLlvm
+ MO_VF_Neg {} -> needLlvm
+
_other -> pprPanic "getRegister" (pprMachOp mop)
where
triv_ucode :: (Size -> Operand -> Instr) -> Size -> NatM Register
@@ -694,6 +702,14 @@ getRegister' _ is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
MO_U_Shr rep -> shift_code rep SHR x y {-False-}
MO_S_Shr rep -> shift_code rep SAR x y {-False-}
+ MO_V_Insert {} -> needLlvm
+ MO_V_Extract {} -> needLlvm
+ MO_VF_Add {} -> needLlvm
+ MO_VF_Sub {} -> needLlvm
+ MO_VF_Mul {} -> needLlvm
+ MO_VF_Quot {} -> needLlvm
+ MO_VF_Neg {} -> needLlvm
+
_other -> pprPanic "getRegister(x86) - binary CmmMachOp (1)" (pprMachOp mop)
where
--------------------
@@ -884,7 +900,9 @@ getRegister' dflags _ (CmmLit lit)
code dst = unitOL (MOV size (OpImm imm) (OpReg dst))
return (Any size code)
-getRegister' _ _ other = pprPanic "getRegister(x86)" (ppr other)
+getRegister' _ _ other
+ | isVecExpr other = needLlvm
+ | otherwise = pprPanic "getRegister(x86)" (ppr other)
intLoadCode :: (Operand -> Operand -> Instr) -> CmmExpr
@@ -2690,3 +2708,19 @@ sse2NegCode w x = do
]
--
return (Any sz code)
+
+isVecExpr :: CmmExpr -> Bool
+isVecExpr (CmmMachOp (MO_V_Insert {}) _) = True
+isVecExpr (CmmMachOp (MO_V_Extract {}) _) = True
+isVecExpr (CmmMachOp (MO_VF_Add {}) _) = True
+isVecExpr (CmmMachOp (MO_VF_Sub {}) _) = True
+isVecExpr (CmmMachOp (MO_VF_Mul {}) _) = True
+isVecExpr (CmmMachOp (MO_VF_Quot {}) _) = True
+isVecExpr (CmmMachOp (MO_VF_Neg {}) _) = True
+isVecExpr (CmmMachOp _ [e]) = isVecExpr e
+isVecExpr _ = False
+
+needLlvm :: NatM a
+needLlvm =
+ sorry $ unlines ["The native code generator does not support vector"
+ ,"instructions. Please use -fllvm."]
diff --git a/compiler/prelude/PrelNames.lhs b/compiler/prelude/PrelNames.lhs
index 261d10295f..961a823436 100644
--- a/compiler/prelude/PrelNames.lhs
+++ b/compiler/prelude/PrelNames.lhs
@@ -1420,6 +1420,11 @@ typeNatAddTyFamNameKey = mkPreludeTyConUnique 162
typeNatMulTyFamNameKey = mkPreludeTyConUnique 163
typeNatExpTyFamNameKey = mkPreludeTyConUnique 164
+-- SIMD vector types (Unique keys)
+floatX4PrimTyConKey :: Unique
+
+floatX4PrimTyConKey = mkPreludeTyConUnique 170
+
---------------- Template Haskell -------------------
-- USES TyConUniques 200-299
-----------------------------------------------------
diff --git a/compiler/prelude/TysPrim.lhs b/compiler/prelude/TysPrim.lhs
index 44ba035dc0..960a27b276 100644
--- a/compiler/prelude/TysPrim.lhs
+++ b/compiler/prelude/TysPrim.lhs
@@ -73,7 +73,10 @@ module TysPrim(
eqPrimTyCon, -- ty1 ~# ty2
-- * Any
- anyTy, anyTyCon, anyTypeOfKind
+ anyTy, anyTyCon, anyTypeOfKind,
+
+ -- * SIMD
+ floatX4PrimTyCon, floatX4PrimTy
) where
#include "HsVersions.h"
@@ -135,6 +138,8 @@ primTyCons
, constraintKindTyCon
, superKindTyCon
, anyKindTyCon
+
+ , floatX4PrimTyCon
]
mkPrimTc :: FastString -> Unique -> TyCon -> Name
@@ -144,7 +149,7 @@ mkPrimTc fs unique tycon
(ATyCon tycon) -- Relevant TyCon
UserSyntax -- None are built-in syntax
-charPrimTyConName, intPrimTyConName, int32PrimTyConName, int64PrimTyConName, wordPrimTyConName, word32PrimTyConName, word64PrimTyConName, addrPrimTyConName, floatPrimTyConName, doublePrimTyConName, statePrimTyConName, realWorldTyConName, arrayPrimTyConName, arrayArrayPrimTyConName, byteArrayPrimTyConName, mutableArrayPrimTyConName, mutableByteArrayPrimTyConName, mutableArrayArrayPrimTyConName, mutVarPrimTyConName, mVarPrimTyConName, tVarPrimTyConName, stablePtrPrimTyConName, stableNamePrimTyConName, bcoPrimTyConName, weakPrimTyConName, threadIdPrimTyConName, eqPrimTyConName :: Name
+charPrimTyConName, intPrimTyConName, int32PrimTyConName, int64PrimTyConName, wordPrimTyConName, word32PrimTyConName, word64PrimTyConName, addrPrimTyConName, floatPrimTyConName, doublePrimTyConName, statePrimTyConName, realWorldTyConName, arrayPrimTyConName, arrayArrayPrimTyConName, byteArrayPrimTyConName, mutableArrayPrimTyConName, mutableByteArrayPrimTyConName, mutableArrayArrayPrimTyConName, mutVarPrimTyConName, mVarPrimTyConName, tVarPrimTyConName, stablePtrPrimTyConName, stableNamePrimTyConName, bcoPrimTyConName, weakPrimTyConName, threadIdPrimTyConName, eqPrimTyConName, floatX4PrimTyConName :: Name
charPrimTyConName = mkPrimTc (fsLit "Char#") charPrimTyConKey charPrimTyCon
intPrimTyConName = mkPrimTc (fsLit "Int#") intPrimTyConKey intPrimTyCon
int32PrimTyConName = mkPrimTc (fsLit "Int32#") int32PrimTyConKey int32PrimTyCon
@@ -172,6 +177,7 @@ stableNamePrimTyConName = mkPrimTc (fsLit "StableName#") stableNamePrimTyC
bcoPrimTyConName = mkPrimTc (fsLit "BCO#") bcoPrimTyConKey bcoPrimTyCon
weakPrimTyConName = mkPrimTc (fsLit "Weak#") weakPrimTyConKey weakPrimTyCon
threadIdPrimTyConName = mkPrimTc (fsLit "ThreadId#") threadIdPrimTyConKey threadIdPrimTyCon
+floatX4PrimTyConName = mkPrimTc (fsLit "FloatX4#") floatX4PrimTyConKey floatX4PrimTyCon
\end{code}
%************************************************************************
@@ -729,3 +735,16 @@ anyTyCon = mkSynTyCon anyTyConName kind [kKiVar]
anyTypeOfKind :: Kind -> Type
anyTypeOfKind kind = mkNakedTyConApp anyTyCon [kind]
\end{code}
+
+%************************************************************************
+%* *
+\subsection{SIMD vector type}
+%* *
+%************************************************************************
+
+\begin{code}
+floatX4PrimTy :: Type
+floatX4PrimTy = mkTyConTy floatX4PrimTyCon
+floatX4PrimTyCon :: TyCon
+floatX4PrimTyCon = pcPrimTyCon0 floatX4PrimTyConName (VecRep 4 FloatElemRep)
+\end{code}
diff --git a/compiler/prelude/primops.txt.pp b/compiler/prelude/primops.txt.pp
index 6d551d90e5..9cdda0ec8f 100644
--- a/compiler/prelude/primops.txt.pp
+++ b/compiler/prelude/primops.txt.pp
@@ -2202,6 +2202,101 @@ primop TraceMarkerOp "traceMarker#" GenPrimOp
------------------------------------------------------------------------
+section "Float SIMD Vectors"
+ {Operations on SIMD vectors of 4 single-precision (32-bit)
+ floating-point numbers.}
+------------------------------------------------------------------------
+
+primtype FloatX4#
+
+primop FloatToFloatX4Op "floatToFloatX4#" GenPrimOp
+ Float# -> FloatX4#
+
+primop FloatX4PackOp "packFloatX4#" GenPrimOp
+ Float# -> Float# -> Float# -> Float# -> FloatX4#
+
+primop FloatX4UnpackOp "unpackFloatX4#" GenPrimOp
+ FloatX4# -> (# Float#, Float#, Float#, Float# #)
+
+primop FloatX4InsertOp "insertFloatX4#" GenPrimOp
+ FloatX4# -> Float# -> Int# -> FloatX4#
+ with can_fail = True
+
+primop FloatX4AddOp "plusFloatX4#" Dyadic
+ FloatX4# -> FloatX4# -> FloatX4#
+ with commutable = True
+
+primop FloatX4SubOp "minusFloatX4#" Dyadic
+ FloatX4# -> FloatX4# -> FloatX4#
+
+primop FloatX4MulOp "timesFloatX4#" Dyadic
+ FloatX4# -> FloatX4# -> FloatX4#
+ with commutable = True
+
+primop FloatX4DivOp "divideFloatX4#" Dyadic
+ FloatX4# -> FloatX4# -> FloatX4#
+ with can_fail = True
+
+primop FloatX4NegOp "negateFloatX4#" Monadic
+ FloatX4# -> FloatX4#
+
+primop IndexByteArrayOp_FloatX4 "indexFloatX4Array#" GenPrimOp
+ ByteArray# -> Int# -> FloatX4#
+ with can_fail = True
+
+primop ReadByteArrayOp_FloatX4 "readFloatX4Array#" GenPrimOp
+ MutableByteArray# s -> Int# -> State# s -> (# State# s, FloatX4# #)
+ with has_side_effects = True
+ can_fail = True
+
+primop WriteByteArrayOp_FloatX4 "writeFloatX4Array#" GenPrimOp
+ MutableByteArray# s -> Int# -> FloatX4# -> State# s -> State# s
+ with has_side_effects = True
+ can_fail = True
+
+primop IndexOffAddrOp_FloatX4 "indexFloatX4OffAddr#" GenPrimOp
+ Addr# -> Int# -> FloatX4#
+ with can_fail = True
+
+primop ReadOffAddrOp_FloatX4 "readFloatX4OffAddr#" GenPrimOp
+ Addr# -> Int# -> State# s -> (# State# s, FloatX4# #)
+ with has_side_effects = True
+ can_fail = True
+
+primop WriteOffAddrOp_FloatX4 "writeFloatX4OffAddr#" GenPrimOp
+ Addr# -> Int# -> FloatX4# -> State# s -> State# s
+ with has_side_effects = True
+ can_fail = True
+
+primop IndexByteArrayOp_FloatAsFloatX4 "indexFloatArrayAsFloatX4#" GenPrimOp
+ ByteArray# -> Int# -> FloatX4#
+ with can_fail = True
+
+primop ReadByteArrayOp_FloatAsFloatX4 "readFloatArrayAsFloatX4#" GenPrimOp
+ MutableByteArray# s -> Int# -> State# s -> (# State# s, FloatX4# #)
+ with has_side_effects = True
+ can_fail = True
+
+primop WriteByteArrayOp_FloatAsFloatX4 "writeFloatArrayAsFloatX4#" GenPrimOp
+ MutableByteArray# s -> Int# -> FloatX4# -> State# s -> State# s
+ with has_side_effects = True
+ can_fail = True
+
+primop IndexOffAddrOp_FloatAsFloatX4 "indexFloatOffAddrAsFloatX4#" GenPrimOp
+ Addr# -> Int# -> FloatX4#
+ with can_fail = True
+
+primop ReadOffAddrOp_FloatAsFloatX4 "readFloatOffAddrAsFloatX4#" GenPrimOp
+ Addr# -> Int# -> State# s -> (# State# s, FloatX4# #)
+ with has_side_effects = True
+ can_fail = True
+
+primop WriteOffAddrOp_FloatAsFloatX4 "writeFloatOffAddrAsFloatX4#" GenPrimOp
+ Addr# -> Int# -> FloatX4# -> State# s -> State# s
+ with has_side_effects = True
+ can_fail = True
+
+------------------------------------------------------------------------
--- ---
------------------------------------------------------------------------
diff --git a/utils/genprimopcode/Main.hs b/utils/genprimopcode/Main.hs
index debdd27102..27368f3ae7 100644
--- a/utils/genprimopcode/Main.hs
+++ b/utils/genprimopcode/Main.hs
@@ -502,20 +502,27 @@ gen_latex_doc (Info defaults entries)
gen_wrappers :: Info -> String
gen_wrappers (Info _ entries)
- = "{-# LANGUAGE NoImplicitPrelude, UnboxedTuples #-}\n"
+ = "{-# LANGUAGE CPP, NoImplicitPrelude, UnboxedTuples #-}\n"
-- Dependencies on Prelude must be explicit in libraries/base, but we
-- don't need the Prelude here so we add NoImplicitPrelude.
++ "module GHC.PrimopWrappers where\n"
++ "import qualified GHC.Prim\n"
++ "import GHC.Types (Bool)\n"
++ "import GHC.Tuple ()\n"
- ++ "import GHC.Prim (" ++ types ++ ")\n"
- ++ unlines (concatMap f specs)
+ ++ "import GHC.Prim (" ++ concat (intersperse ", " othertycons) ++ ")\n"
+ ++ "#if defined (__GLASGOW_HASKELL_LLVM__)\n"
+ ++ "import GHC.Prim (" ++ concat (intersperse ", " vectycons) ++ ")\n"
+ ++ "#endif /* defined (__GLASGOW_HASKELL_LLVM__) */\n"
+ ++ unlines (concatMap f otherspecs)
+ ++ "#if defined (__GLASGOW_HASKELL_LLVM__)\n"
+ ++ unlines (concatMap f vecspecs)
+ ++ "#endif /* defined (__GLASGOW_HASKELL_LLVM__) */\n"
where
specs = filter (not.dodgy) (filter is_primop entries)
+ (vecspecs, otherspecs) = partition (llvmOnlyTy . ty) specs
tycons = foldr union [] $ map (tyconsIn . ty) specs
- tycons' = filter (`notElem` ["()", "Bool"]) tycons
- types = concat $ intersperse ", " tycons'
+ (vectycons, othertycons) =
+ (partition llvmOnlyTyCon . filter (`notElem` ["()", "Bool"])) tycons
f spec = let args = map (\n -> "a" ++ show n) [1 .. arity (ty spec)]
src_name = wrap (name spec)
lhs = src_name ++ " " ++ unwords args
@@ -536,6 +543,16 @@ gen_wrappers (Info _ entries)
"parAtAbs#", "parAtRel#", "parAtForNow#"
]
+ llvmOnlyTy :: Ty -> Bool
+ llvmOnlyTy (TyF ty1 ty2) = llvmOnlyTy ty1 || llvmOnlyTy ty2
+ llvmOnlyTy (TyApp tycon tys) = llvmOnlyTyCon tycon || any llvmOnlyTy tys
+ llvmOnlyTy (TyVar _) = False
+ llvmOnlyTy (TyUTup tys) = any llvmOnlyTy tys
+
+ llvmOnlyTyCon :: TyCon -> Bool
+ llvmOnlyTyCon "FloatX4#" = True
+ llvmOnlyTyCon _ = False
+
gen_primop_list :: Info -> String
gen_primop_list (Info _ entries)
= unlines (
@@ -653,6 +670,7 @@ ppType (TyApp "Word64#" []) = "word64PrimTy"
ppType (TyApp "Addr#" []) = "addrPrimTy"
ppType (TyApp "Float#" []) = "floatPrimTy"
ppType (TyApp "Double#" []) = "doublePrimTy"
+ppType (TyApp "FloatX4#" []) = "floatX4PrimTy"
ppType (TyApp "ByteArray#" []) = "byteArrayPrimTy"
ppType (TyApp "RealWorld" []) = "realWorldTy"
ppType (TyApp "ThreadId#" []) = "threadIdPrimTy"