diff options
author | Geoffrey Mainland <gmainlan@microsoft.com> | 2012-10-19 09:06:17 +0100 |
---|---|---|
committer | Geoffrey Mainland <gmainlan@microsoft.com> | 2013-02-01 22:00:24 +0000 |
commit | 4af62075bbe9e96a3678fc90288496e0c4c7c17d (patch) | |
tree | 3df4fa03089310cd66678681a4ce78dd39bea25f | |
parent | 6480a35c15717025c169980b1cc763a7e6f36056 (diff) | |
download | haskell-4af62075bbe9e96a3678fc90288496e0c4c7c17d.tar.gz |
Add the Float32X4# primitive type and associated primops.
This patch lays the groundwork needed for primop support for SIMD vectors. In
addition to the groundwork, we add support for the FloatX4# primitive type and
associated primops.
* Add the FloatX4# primitive type and associated primops.
* Add CodeGen support for Float vectors.
* Compile vector operations to LLVM vector operations in the LLVM code
generator.
* Make the x86 native backend fail gracefully when encountering vector primops.
* Only generate primop wrappers for vector primops when using LLVM.
-rw-r--r-- | compiler/cmm/CmmMachOp.hs | 29 | ||||
-rw-r--r-- | compiler/cmm/PprC.hs | 30 | ||||
-rw-r--r-- | compiler/codeGen/StgCmmPrim.hs | 474 | ||||
-rw-r--r-- | compiler/llvmGen/LlvmCodeGen/CodeGen.hs | 43 | ||||
-rw-r--r-- | compiler/nativeGen/X86/CodeGen.hs | 36 | ||||
-rw-r--r-- | compiler/prelude/PrelNames.lhs | 5 | ||||
-rw-r--r-- | compiler/prelude/TysPrim.lhs | 23 | ||||
-rw-r--r-- | compiler/prelude/primops.txt.pp | 95 | ||||
-rw-r--r-- | utils/genprimopcode/Main.hs | 28 |
9 files changed, 618 insertions, 145 deletions
diff --git a/compiler/cmm/CmmMachOp.hs b/compiler/cmm/CmmMachOp.hs index a6c9beebc4..304dfb0938 100644 --- a/compiler/cmm/CmmMachOp.hs +++ b/compiler/cmm/CmmMachOp.hs @@ -103,6 +103,17 @@ data MachOp | MO_SS_Conv Width Width -- Signed int -> Signed int | MO_UU_Conv Width Width -- unsigned int -> unsigned int | MO_FF_Conv Width Width -- Float -> Float + + -- Vector element insertion and extraction operations + | MO_V_Insert Length Width -- Insert scalar into vector + | MO_V_Extract Length Width -- Extract scalar from vector + + -- Floating point vector operations + | MO_VF_Add Length Width + | MO_VF_Sub Length Width + | MO_VF_Neg Length Width -- unary - + | MO_VF_Mul Length Width + | MO_VF_Quot Length Width deriving (Eq, Show) pprMachOp :: MachOp -> SDoc @@ -338,6 +349,15 @@ machOpResultType dflags mop tys = MO_FS_Conv _ to -> cmmBits to MO_SF_Conv _ to -> cmmFloat to MO_FF_Conv _ to -> cmmFloat to + + MO_V_Insert {} -> ty1 + MO_V_Extract {} -> vecElemType ty1 + + MO_VF_Add {} -> ty1 + MO_VF_Sub {} -> ty1 + MO_VF_Mul {} -> ty1 + MO_VF_Quot {} -> ty1 + MO_VF_Neg {} -> ty1 where (ty1:_) = tys @@ -405,6 +425,15 @@ machOpArgReps dflags op = MO_FS_Conv from _ -> [from] MO_FF_Conv from _ -> [from] + MO_V_Insert l r -> [typeWidth (vec l (cmmFloat r)),r,wordWidth dflags] + MO_V_Extract l r -> [typeWidth (vec l (cmmFloat r)),wordWidth dflags] + + MO_VF_Add _ r -> [r,r] + MO_VF_Sub _ r -> [r,r] + MO_VF_Mul _ r -> [r,r] + MO_VF_Quot _ r -> [r,r] + MO_VF_Neg _ r -> [r] + ----------------------------------------------------------------------------- -- CallishMachOp ----------------------------------------------------------------------------- diff --git a/compiler/cmm/PprC.hs b/compiler/cmm/PprC.hs index 2ca8b67d72..b714e834b6 100644 --- a/compiler/cmm/PprC.hs +++ b/compiler/cmm/PprC.hs @@ -626,6 +626,36 @@ pprMachOp_for_C mop = case mop of (panic $ "PprC.pprMachOp_for_C: MO_U_MulMayOflo" ++ " should have been handled earlier!") + MO_V_Insert {} -> pprTrace "offending mop:" + (ptext $ sLit "MO_V_Insert") + (panic $ "PprC.pprMachOp_for_C: MO_V_Insert" + ++ " should have been handled earlier!") + MO_V_Extract {} -> pprTrace "offending mop:" + (ptext $ sLit "MO_V_Extract") + (panic $ "PprC.pprMachOp_for_C: MO_V_Extract" + ++ " should have been handled earlier!") + + MO_VF_Add {} -> pprTrace "offending mop:" + (ptext $ sLit "MO_VF_Add") + (panic $ "PprC.pprMachOp_for_C: MO_VF_Add" + ++ " should have been handled earlier!") + MO_VF_Sub {} -> pprTrace "offending mop:" + (ptext $ sLit "MO_VF_Sub") + (panic $ "PprC.pprMachOp_for_C: MO_VF_Sub" + ++ " should have been handled earlier!") + MO_VF_Neg {} -> pprTrace "offending mop:" + (ptext $ sLit "MO_VF_Neg") + (panic $ "PprC.pprMachOp_for_C: MO_VF_Neg" + ++ " should have been handled earlier!") + MO_VF_Mul {} -> pprTrace "offending mop:" + (ptext $ sLit "MO_VF_Mul") + (panic $ "PprC.pprMachOp_for_C: MO_VF_Mul" + ++ " should have been handled earlier!") + MO_VF_Quot {} -> pprTrace "offending mop:" + (ptext $ sLit "MO_VF_Quot") + (panic $ "PprC.pprMachOp_for_C: MO_VF_Quot" + ++ " should have been handled earlier!") + signedOp :: MachOp -> Bool -- Argument type(s) are signed ints signedOp (MO_S_Quot _) = True signedOp (MO_S_Rem _) = True diff --git a/compiler/codeGen/StgCmmPrim.hs b/compiler/codeGen/StgCmmPrim.hs index 986286647b..9a583b8354 100644 --- a/compiler/codeGen/StgCmmPrim.hs +++ b/compiler/codeGen/StgCmmPrim.hs @@ -365,117 +365,129 @@ emitPrimOp dflags [res] SizeofMutableArrayArrayOp [arg] -- IndexXXXoffAddr -emitPrimOp dflags res IndexOffAddrOp_Char args = doIndexOffAddrOp (Just (mo_u_8ToWord dflags)) b8 res args -emitPrimOp dflags res IndexOffAddrOp_WideChar args = doIndexOffAddrOp (Just (mo_u_32ToWord dflags)) b32 res args -emitPrimOp dflags res IndexOffAddrOp_Int args = doIndexOffAddrOp Nothing (bWord dflags) res args -emitPrimOp dflags res IndexOffAddrOp_Word args = doIndexOffAddrOp Nothing (bWord dflags) res args -emitPrimOp dflags res IndexOffAddrOp_Addr args = doIndexOffAddrOp Nothing (bWord dflags) res args -emitPrimOp _ res IndexOffAddrOp_Float args = doIndexOffAddrOp Nothing f32 res args -emitPrimOp _ res IndexOffAddrOp_Double args = doIndexOffAddrOp Nothing f64 res args -emitPrimOp dflags res IndexOffAddrOp_StablePtr args = doIndexOffAddrOp Nothing (bWord dflags) res args -emitPrimOp dflags res IndexOffAddrOp_Int8 args = doIndexOffAddrOp (Just (mo_s_8ToWord dflags)) b8 res args -emitPrimOp dflags res IndexOffAddrOp_Int16 args = doIndexOffAddrOp (Just (mo_s_16ToWord dflags)) b16 res args -emitPrimOp dflags res IndexOffAddrOp_Int32 args = doIndexOffAddrOp (Just (mo_s_32ToWord dflags)) b32 res args -emitPrimOp _ res IndexOffAddrOp_Int64 args = doIndexOffAddrOp Nothing b64 res args -emitPrimOp dflags res IndexOffAddrOp_Word8 args = doIndexOffAddrOp (Just (mo_u_8ToWord dflags)) b8 res args -emitPrimOp dflags res IndexOffAddrOp_Word16 args = doIndexOffAddrOp (Just (mo_u_16ToWord dflags)) b16 res args -emitPrimOp dflags res IndexOffAddrOp_Word32 args = doIndexOffAddrOp (Just (mo_u_32ToWord dflags)) b32 res args -emitPrimOp _ res IndexOffAddrOp_Word64 args = doIndexOffAddrOp Nothing b64 res args +emitPrimOp dflags res IndexOffAddrOp_Char args = doIndexOffAddrOp (Just (mo_u_8ToWord dflags)) b8 res args +emitPrimOp dflags res IndexOffAddrOp_WideChar args = doIndexOffAddrOp (Just (mo_u_32ToWord dflags)) b32 res args +emitPrimOp dflags res IndexOffAddrOp_Int args = doIndexOffAddrOp Nothing (bWord dflags) res args +emitPrimOp dflags res IndexOffAddrOp_Word args = doIndexOffAddrOp Nothing (bWord dflags) res args +emitPrimOp dflags res IndexOffAddrOp_Addr args = doIndexOffAddrOp Nothing (bWord dflags) res args +emitPrimOp _ res IndexOffAddrOp_Float args = doIndexOffAddrOp Nothing f32 res args +emitPrimOp _ res IndexOffAddrOp_Double args = doIndexOffAddrOp Nothing f64 res args +emitPrimOp dflags res IndexOffAddrOp_StablePtr args = doIndexOffAddrOp Nothing (bWord dflags) res args +emitPrimOp dflags res IndexOffAddrOp_Int8 args = doIndexOffAddrOp (Just (mo_s_8ToWord dflags)) b8 res args +emitPrimOp dflags res IndexOffAddrOp_Int16 args = doIndexOffAddrOp (Just (mo_s_16ToWord dflags)) b16 res args +emitPrimOp dflags res IndexOffAddrOp_Int32 args = doIndexOffAddrOp (Just (mo_s_32ToWord dflags)) b32 res args +emitPrimOp _ res IndexOffAddrOp_Int64 args = doIndexOffAddrOp Nothing b64 res args +emitPrimOp dflags res IndexOffAddrOp_Word8 args = doIndexOffAddrOp (Just (mo_u_8ToWord dflags)) b8 res args +emitPrimOp dflags res IndexOffAddrOp_Word16 args = doIndexOffAddrOp (Just (mo_u_16ToWord dflags)) b16 res args +emitPrimOp dflags res IndexOffAddrOp_Word32 args = doIndexOffAddrOp (Just (mo_u_32ToWord dflags)) b32 res args +emitPrimOp _ res IndexOffAddrOp_Word64 args = doIndexOffAddrOp Nothing b64 res args +emitPrimOp _ res IndexOffAddrOp_FloatX4 args = doIndexOffAddrOp Nothing vec4f32 res args +emitPrimOp _ res IndexOffAddrOp_FloatAsFloatX4 args = doIndexOffAddrOpAs Nothing vec4f32 f32 res args -- ReadXXXoffAddr, which are identical, for our purposes, to IndexXXXoffAddr. -emitPrimOp dflags res ReadOffAddrOp_Char args = doIndexOffAddrOp (Just (mo_u_8ToWord dflags)) b8 res args -emitPrimOp dflags res ReadOffAddrOp_WideChar args = doIndexOffAddrOp (Just (mo_u_32ToWord dflags)) b32 res args -emitPrimOp dflags res ReadOffAddrOp_Int args = doIndexOffAddrOp Nothing (bWord dflags) res args -emitPrimOp dflags res ReadOffAddrOp_Word args = doIndexOffAddrOp Nothing (bWord dflags) res args -emitPrimOp dflags res ReadOffAddrOp_Addr args = doIndexOffAddrOp Nothing (bWord dflags) res args -emitPrimOp _ res ReadOffAddrOp_Float args = doIndexOffAddrOp Nothing f32 res args -emitPrimOp _ res ReadOffAddrOp_Double args = doIndexOffAddrOp Nothing f64 res args -emitPrimOp dflags res ReadOffAddrOp_StablePtr args = doIndexOffAddrOp Nothing (bWord dflags) res args -emitPrimOp dflags res ReadOffAddrOp_Int8 args = doIndexOffAddrOp (Just (mo_s_8ToWord dflags)) b8 res args -emitPrimOp dflags res ReadOffAddrOp_Int16 args = doIndexOffAddrOp (Just (mo_s_16ToWord dflags)) b16 res args -emitPrimOp dflags res ReadOffAddrOp_Int32 args = doIndexOffAddrOp (Just (mo_s_32ToWord dflags)) b32 res args -emitPrimOp _ res ReadOffAddrOp_Int64 args = doIndexOffAddrOp Nothing b64 res args -emitPrimOp dflags res ReadOffAddrOp_Word8 args = doIndexOffAddrOp (Just (mo_u_8ToWord dflags)) b8 res args -emitPrimOp dflags res ReadOffAddrOp_Word16 args = doIndexOffAddrOp (Just (mo_u_16ToWord dflags)) b16 res args -emitPrimOp dflags res ReadOffAddrOp_Word32 args = doIndexOffAddrOp (Just (mo_u_32ToWord dflags)) b32 res args -emitPrimOp _ res ReadOffAddrOp_Word64 args = doIndexOffAddrOp Nothing b64 res args +emitPrimOp dflags res ReadOffAddrOp_Char args = doIndexOffAddrOp (Just (mo_u_8ToWord dflags)) b8 res args +emitPrimOp dflags res ReadOffAddrOp_WideChar args = doIndexOffAddrOp (Just (mo_u_32ToWord dflags)) b32 res args +emitPrimOp dflags res ReadOffAddrOp_Int args = doIndexOffAddrOp Nothing (bWord dflags) res args +emitPrimOp dflags res ReadOffAddrOp_Word args = doIndexOffAddrOp Nothing (bWord dflags) res args +emitPrimOp dflags res ReadOffAddrOp_Addr args = doIndexOffAddrOp Nothing (bWord dflags) res args +emitPrimOp _ res ReadOffAddrOp_Float args = doIndexOffAddrOp Nothing f32 res args +emitPrimOp _ res ReadOffAddrOp_Double args = doIndexOffAddrOp Nothing f64 res args +emitPrimOp dflags res ReadOffAddrOp_StablePtr args = doIndexOffAddrOp Nothing (bWord dflags) res args +emitPrimOp dflags res ReadOffAddrOp_Int8 args = doIndexOffAddrOp (Just (mo_s_8ToWord dflags)) b8 res args +emitPrimOp dflags res ReadOffAddrOp_Int16 args = doIndexOffAddrOp (Just (mo_s_16ToWord dflags)) b16 res args +emitPrimOp dflags res ReadOffAddrOp_Int32 args = doIndexOffAddrOp (Just (mo_s_32ToWord dflags)) b32 res args +emitPrimOp _ res ReadOffAddrOp_Int64 args = doIndexOffAddrOp Nothing b64 res args +emitPrimOp dflags res ReadOffAddrOp_Word8 args = doIndexOffAddrOp (Just (mo_u_8ToWord dflags)) b8 res args +emitPrimOp dflags res ReadOffAddrOp_Word16 args = doIndexOffAddrOp (Just (mo_u_16ToWord dflags)) b16 res args +emitPrimOp dflags res ReadOffAddrOp_Word32 args = doIndexOffAddrOp (Just (mo_u_32ToWord dflags)) b32 res args +emitPrimOp _ res ReadOffAddrOp_Word64 args = doIndexOffAddrOp Nothing b64 res args +emitPrimOp _ res ReadOffAddrOp_FloatX4 args = doIndexOffAddrOp Nothing vec4f32 res args +emitPrimOp _ res ReadOffAddrOp_FloatAsFloatX4 args = doIndexOffAddrOpAs Nothing vec4f32 b32 res args -- IndexXXXArray -emitPrimOp dflags res IndexByteArrayOp_Char args = doIndexByteArrayOp (Just (mo_u_8ToWord dflags)) b8 res args -emitPrimOp dflags res IndexByteArrayOp_WideChar args = doIndexByteArrayOp (Just (mo_u_32ToWord dflags)) b32 res args -emitPrimOp dflags res IndexByteArrayOp_Int args = doIndexByteArrayOp Nothing (bWord dflags) res args -emitPrimOp dflags res IndexByteArrayOp_Word args = doIndexByteArrayOp Nothing (bWord dflags) res args -emitPrimOp dflags res IndexByteArrayOp_Addr args = doIndexByteArrayOp Nothing (bWord dflags) res args -emitPrimOp _ res IndexByteArrayOp_Float args = doIndexByteArrayOp Nothing f32 res args -emitPrimOp _ res IndexByteArrayOp_Double args = doIndexByteArrayOp Nothing f64 res args -emitPrimOp dflags res IndexByteArrayOp_StablePtr args = doIndexByteArrayOp Nothing (bWord dflags) res args -emitPrimOp dflags res IndexByteArrayOp_Int8 args = doIndexByteArrayOp (Just (mo_s_8ToWord dflags)) b8 res args -emitPrimOp dflags res IndexByteArrayOp_Int16 args = doIndexByteArrayOp (Just (mo_s_16ToWord dflags)) b16 res args -emitPrimOp dflags res IndexByteArrayOp_Int32 args = doIndexByteArrayOp (Just (mo_s_32ToWord dflags)) b32 res args -emitPrimOp _ res IndexByteArrayOp_Int64 args = doIndexByteArrayOp Nothing b64 res args -emitPrimOp dflags res IndexByteArrayOp_Word8 args = doIndexByteArrayOp (Just (mo_u_8ToWord dflags)) b8 res args -emitPrimOp dflags res IndexByteArrayOp_Word16 args = doIndexByteArrayOp (Just (mo_u_16ToWord dflags)) b16 res args -emitPrimOp dflags res IndexByteArrayOp_Word32 args = doIndexByteArrayOp (Just (mo_u_32ToWord dflags)) b32 res args -emitPrimOp _ res IndexByteArrayOp_Word64 args = doIndexByteArrayOp Nothing b64 res args +emitPrimOp dflags res IndexByteArrayOp_Char args = doIndexByteArrayOp (Just (mo_u_8ToWord dflags)) b8 res args +emitPrimOp dflags res IndexByteArrayOp_WideChar args = doIndexByteArrayOp (Just (mo_u_32ToWord dflags)) b32 res args +emitPrimOp dflags res IndexByteArrayOp_Int args = doIndexByteArrayOp Nothing (bWord dflags) res args +emitPrimOp dflags res IndexByteArrayOp_Word args = doIndexByteArrayOp Nothing (bWord dflags) res args +emitPrimOp dflags res IndexByteArrayOp_Addr args = doIndexByteArrayOp Nothing (bWord dflags) res args +emitPrimOp _ res IndexByteArrayOp_Float args = doIndexByteArrayOp Nothing f32 res args +emitPrimOp _ res IndexByteArrayOp_Double args = doIndexByteArrayOp Nothing f64 res args +emitPrimOp dflags res IndexByteArrayOp_StablePtr args = doIndexByteArrayOp Nothing (bWord dflags) res args +emitPrimOp dflags res IndexByteArrayOp_Int8 args = doIndexByteArrayOp (Just (mo_s_8ToWord dflags)) b8 res args +emitPrimOp dflags res IndexByteArrayOp_Int16 args = doIndexByteArrayOp (Just (mo_s_16ToWord dflags)) b16 res args +emitPrimOp dflags res IndexByteArrayOp_Int32 args = doIndexByteArrayOp (Just (mo_s_32ToWord dflags)) b32 res args +emitPrimOp _ res IndexByteArrayOp_Int64 args = doIndexByteArrayOp Nothing b64 res args +emitPrimOp dflags res IndexByteArrayOp_Word8 args = doIndexByteArrayOp (Just (mo_u_8ToWord dflags)) b8 res args +emitPrimOp dflags res IndexByteArrayOp_Word16 args = doIndexByteArrayOp (Just (mo_u_16ToWord dflags)) b16 res args +emitPrimOp dflags res IndexByteArrayOp_Word32 args = doIndexByteArrayOp (Just (mo_u_32ToWord dflags)) b32 res args +emitPrimOp _ res IndexByteArrayOp_Word64 args = doIndexByteArrayOp Nothing b64 res args +emitPrimOp _ res IndexByteArrayOp_FloatX4 args = doIndexByteArrayOp Nothing vec4f32 res args +emitPrimOp _ res IndexByteArrayOp_FloatAsFloatX4 args = doIndexByteArrayOpAs Nothing vec4f32 f32 res args -- ReadXXXArray, identical to IndexXXXArray. -emitPrimOp dflags res ReadByteArrayOp_Char args = doIndexByteArrayOp (Just (mo_u_8ToWord dflags)) b8 res args -emitPrimOp dflags res ReadByteArrayOp_WideChar args = doIndexByteArrayOp (Just (mo_u_32ToWord dflags)) b32 res args -emitPrimOp dflags res ReadByteArrayOp_Int args = doIndexByteArrayOp Nothing (bWord dflags) res args -emitPrimOp dflags res ReadByteArrayOp_Word args = doIndexByteArrayOp Nothing (bWord dflags) res args -emitPrimOp dflags res ReadByteArrayOp_Addr args = doIndexByteArrayOp Nothing (bWord dflags) res args -emitPrimOp _ res ReadByteArrayOp_Float args = doIndexByteArrayOp Nothing f32 res args -emitPrimOp _ res ReadByteArrayOp_Double args = doIndexByteArrayOp Nothing f64 res args -emitPrimOp dflags res ReadByteArrayOp_StablePtr args = doIndexByteArrayOp Nothing (bWord dflags) res args -emitPrimOp dflags res ReadByteArrayOp_Int8 args = doIndexByteArrayOp (Just (mo_s_8ToWord dflags)) b8 res args -emitPrimOp dflags res ReadByteArrayOp_Int16 args = doIndexByteArrayOp (Just (mo_s_16ToWord dflags)) b16 res args -emitPrimOp dflags res ReadByteArrayOp_Int32 args = doIndexByteArrayOp (Just (mo_s_32ToWord dflags)) b32 res args -emitPrimOp _ res ReadByteArrayOp_Int64 args = doIndexByteArrayOp Nothing b64 res args -emitPrimOp dflags res ReadByteArrayOp_Word8 args = doIndexByteArrayOp (Just (mo_u_8ToWord dflags)) b8 res args -emitPrimOp dflags res ReadByteArrayOp_Word16 args = doIndexByteArrayOp (Just (mo_u_16ToWord dflags)) b16 res args -emitPrimOp dflags res ReadByteArrayOp_Word32 args = doIndexByteArrayOp (Just (mo_u_32ToWord dflags)) b32 res args -emitPrimOp _ res ReadByteArrayOp_Word64 args = doIndexByteArrayOp Nothing b64 res args +emitPrimOp dflags res ReadByteArrayOp_Char args = doIndexByteArrayOp (Just (mo_u_8ToWord dflags)) b8 res args +emitPrimOp dflags res ReadByteArrayOp_WideChar args = doIndexByteArrayOp (Just (mo_u_32ToWord dflags)) b32 res args +emitPrimOp dflags res ReadByteArrayOp_Int args = doIndexByteArrayOp Nothing (bWord dflags) res args +emitPrimOp dflags res ReadByteArrayOp_Word args = doIndexByteArrayOp Nothing (bWord dflags) res args +emitPrimOp dflags res ReadByteArrayOp_Addr args = doIndexByteArrayOp Nothing (bWord dflags) res args +emitPrimOp _ res ReadByteArrayOp_Float args = doIndexByteArrayOp Nothing f32 res args +emitPrimOp _ res ReadByteArrayOp_Double args = doIndexByteArrayOp Nothing f64 res args +emitPrimOp dflags res ReadByteArrayOp_StablePtr args = doIndexByteArrayOp Nothing (bWord dflags) res args +emitPrimOp dflags res ReadByteArrayOp_Int8 args = doIndexByteArrayOp (Just (mo_s_8ToWord dflags)) b8 res args +emitPrimOp dflags res ReadByteArrayOp_Int16 args = doIndexByteArrayOp (Just (mo_s_16ToWord dflags)) b16 res args +emitPrimOp dflags res ReadByteArrayOp_Int32 args = doIndexByteArrayOp (Just (mo_s_32ToWord dflags)) b32 res args +emitPrimOp _ res ReadByteArrayOp_Int64 args = doIndexByteArrayOp Nothing b64 res args +emitPrimOp dflags res ReadByteArrayOp_Word8 args = doIndexByteArrayOp (Just (mo_u_8ToWord dflags)) b8 res args +emitPrimOp dflags res ReadByteArrayOp_Word16 args = doIndexByteArrayOp (Just (mo_u_16ToWord dflags)) b16 res args +emitPrimOp dflags res ReadByteArrayOp_Word32 args = doIndexByteArrayOp (Just (mo_u_32ToWord dflags)) b32 res args +emitPrimOp _ res ReadByteArrayOp_Word64 args = doIndexByteArrayOp Nothing b64 res args +emitPrimOp _ res ReadByteArrayOp_FloatX4 args = doIndexByteArrayOp Nothing vec4f32 res args +emitPrimOp _ res ReadByteArrayOp_FloatAsFloatX4 args = doIndexByteArrayOpAs Nothing vec4f32 f32 res args -- WriteXXXoffAddr -emitPrimOp dflags res WriteOffAddrOp_Char args = doWriteOffAddrOp (Just (mo_WordTo8 dflags)) res args -emitPrimOp dflags res WriteOffAddrOp_WideChar args = doWriteOffAddrOp (Just (mo_WordTo32 dflags)) res args -emitPrimOp _ res WriteOffAddrOp_Int args = doWriteOffAddrOp Nothing res args -emitPrimOp _ res WriteOffAddrOp_Word args = doWriteOffAddrOp Nothing res args -emitPrimOp _ res WriteOffAddrOp_Addr args = doWriteOffAddrOp Nothing res args -emitPrimOp _ res WriteOffAddrOp_Float args = doWriteOffAddrOp Nothing res args -emitPrimOp _ res WriteOffAddrOp_Double args = doWriteOffAddrOp Nothing res args -emitPrimOp _ res WriteOffAddrOp_StablePtr args = doWriteOffAddrOp Nothing res args -emitPrimOp dflags res WriteOffAddrOp_Int8 args = doWriteOffAddrOp (Just (mo_WordTo8 dflags)) res args -emitPrimOp dflags res WriteOffAddrOp_Int16 args = doWriteOffAddrOp (Just (mo_WordTo16 dflags)) res args -emitPrimOp dflags res WriteOffAddrOp_Int32 args = doWriteOffAddrOp (Just (mo_WordTo32 dflags)) res args -emitPrimOp _ res WriteOffAddrOp_Int64 args = doWriteOffAddrOp Nothing res args -emitPrimOp dflags res WriteOffAddrOp_Word8 args = doWriteOffAddrOp (Just (mo_WordTo8 dflags)) res args -emitPrimOp dflags res WriteOffAddrOp_Word16 args = doWriteOffAddrOp (Just (mo_WordTo16 dflags)) res args -emitPrimOp dflags res WriteOffAddrOp_Word32 args = doWriteOffAddrOp (Just (mo_WordTo32 dflags)) res args -emitPrimOp _ res WriteOffAddrOp_Word64 args = doWriteOffAddrOp Nothing res args +emitPrimOp dflags res WriteOffAddrOp_Char args = doWriteOffAddrOp (Just (mo_WordTo8 dflags)) b8 res args +emitPrimOp dflags res WriteOffAddrOp_WideChar args = doWriteOffAddrOp (Just (mo_WordTo32 dflags)) b32 res args +emitPrimOp dflags res WriteOffAddrOp_Int args = doWriteOffAddrOp Nothing (bWord dflags) res args +emitPrimOp dflags res WriteOffAddrOp_Word args = doWriteOffAddrOp Nothing (bWord dflags) res args +emitPrimOp dflags res WriteOffAddrOp_Addr args = doWriteOffAddrOp Nothing (bWord dflags) res args +emitPrimOp _ res WriteOffAddrOp_Float args = doWriteOffAddrOp Nothing f32 res args +emitPrimOp _ res WriteOffAddrOp_Double args = doWriteOffAddrOp Nothing f64 res args +emitPrimOp dflags res WriteOffAddrOp_StablePtr args = doWriteOffAddrOp Nothing (bWord dflags) res args +emitPrimOp dflags res WriteOffAddrOp_Int8 args = doWriteOffAddrOp (Just (mo_WordTo8 dflags)) b8 res args +emitPrimOp dflags res WriteOffAddrOp_Int16 args = doWriteOffAddrOp (Just (mo_WordTo16 dflags)) b16 res args +emitPrimOp dflags res WriteOffAddrOp_Int32 args = doWriteOffAddrOp (Just (mo_WordTo32 dflags)) b32 res args +emitPrimOp _ res WriteOffAddrOp_Int64 args = doWriteOffAddrOp Nothing b64 res args +emitPrimOp dflags res WriteOffAddrOp_Word8 args = doWriteOffAddrOp (Just (mo_WordTo8 dflags)) b8 res args +emitPrimOp dflags res WriteOffAddrOp_Word16 args = doWriteOffAddrOp (Just (mo_WordTo16 dflags)) b16 res args +emitPrimOp dflags res WriteOffAddrOp_Word32 args = doWriteOffAddrOp (Just (mo_WordTo32 dflags)) b32 res args +emitPrimOp _ res WriteOffAddrOp_Word64 args = doWriteOffAddrOp Nothing b64 res args +emitPrimOp _ res WriteOffAddrOp_FloatX4 args = doWriteOffAddrOp Nothing vec4f32 res args +emitPrimOp _ res WriteOffAddrOp_FloatAsFloatX4 args = doWriteOffAddrOp Nothing f32 res args -- WriteXXXArray -emitPrimOp dflags res WriteByteArrayOp_Char args = doWriteByteArrayOp (Just (mo_WordTo8 dflags)) res args -emitPrimOp dflags res WriteByteArrayOp_WideChar args = doWriteByteArrayOp (Just (mo_WordTo32 dflags)) res args -emitPrimOp _ res WriteByteArrayOp_Int args = doWriteByteArrayOp Nothing res args -emitPrimOp _ res WriteByteArrayOp_Word args = doWriteByteArrayOp Nothing res args -emitPrimOp _ res WriteByteArrayOp_Addr args = doWriteByteArrayOp Nothing res args -emitPrimOp _ res WriteByteArrayOp_Float args = doWriteByteArrayOp Nothing res args -emitPrimOp _ res WriteByteArrayOp_Double args = doWriteByteArrayOp Nothing res args -emitPrimOp _ res WriteByteArrayOp_StablePtr args = doWriteByteArrayOp Nothing res args -emitPrimOp dflags res WriteByteArrayOp_Int8 args = doWriteByteArrayOp (Just (mo_WordTo8 dflags)) res args -emitPrimOp dflags res WriteByteArrayOp_Int16 args = doWriteByteArrayOp (Just (mo_WordTo16 dflags)) res args -emitPrimOp dflags res WriteByteArrayOp_Int32 args = doWriteByteArrayOp (Just (mo_WordTo32 dflags)) res args -emitPrimOp _ res WriteByteArrayOp_Int64 args = doWriteByteArrayOp Nothing res args -emitPrimOp dflags res WriteByteArrayOp_Word8 args = doWriteByteArrayOp (Just (mo_WordTo8 dflags)) res args -emitPrimOp dflags res WriteByteArrayOp_Word16 args = doWriteByteArrayOp (Just (mo_WordTo16 dflags)) res args -emitPrimOp dflags res WriteByteArrayOp_Word32 args = doWriteByteArrayOp (Just (mo_WordTo32 dflags)) res args -emitPrimOp _ res WriteByteArrayOp_Word64 args = doWriteByteArrayOp Nothing res args +emitPrimOp dflags res WriteByteArrayOp_Char args = doWriteByteArrayOp (Just (mo_WordTo8 dflags)) b8 res args +emitPrimOp dflags res WriteByteArrayOp_WideChar args = doWriteByteArrayOp (Just (mo_WordTo32 dflags)) b32 res args +emitPrimOp dflags res WriteByteArrayOp_Int args = doWriteByteArrayOp Nothing (bWord dflags) res args +emitPrimOp dflags res WriteByteArrayOp_Word args = doWriteByteArrayOp Nothing (bWord dflags) res args +emitPrimOp dflags res WriteByteArrayOp_Addr args = doWriteByteArrayOp Nothing (bWord dflags) res args +emitPrimOp _ res WriteByteArrayOp_Float args = doWriteByteArrayOp Nothing f32 res args +emitPrimOp _ res WriteByteArrayOp_Double args = doWriteByteArrayOp Nothing f64 res args +emitPrimOp dflags res WriteByteArrayOp_StablePtr args = doWriteByteArrayOp Nothing (bWord dflags) res args +emitPrimOp dflags res WriteByteArrayOp_Int8 args = doWriteByteArrayOp (Just (mo_WordTo8 dflags)) b8 res args +emitPrimOp dflags res WriteByteArrayOp_Int16 args = doWriteByteArrayOp (Just (mo_WordTo16 dflags)) b16 res args +emitPrimOp dflags res WriteByteArrayOp_Int32 args = doWriteByteArrayOp (Just (mo_WordTo32 dflags)) b32 res args +emitPrimOp _ res WriteByteArrayOp_Int64 args = doWriteByteArrayOp Nothing b64 res args +emitPrimOp dflags res WriteByteArrayOp_Word8 args = doWriteByteArrayOp (Just (mo_WordTo8 dflags)) b8 res args +emitPrimOp dflags res WriteByteArrayOp_Word16 args = doWriteByteArrayOp (Just (mo_WordTo16 dflags)) b16 res args +emitPrimOp dflags res WriteByteArrayOp_Word32 args = doWriteByteArrayOp (Just (mo_WordTo32 dflags)) b32 res args +emitPrimOp _ res WriteByteArrayOp_Word64 args = doWriteByteArrayOp Nothing b64 res args +emitPrimOp _ res WriteByteArrayOp_FloatX4 args = doWriteByteArrayOp Nothing vec4f32 res args +emitPrimOp _ res WriteByteArrayOp_FloatAsFloatX4 args = doWriteByteArrayOp Nothing f32 res args -- Copying and setting byte arrays emitPrimOp _ [] CopyByteArrayOp [src,src_off,dst,dst_off,n] = @@ -498,6 +510,25 @@ emitPrimOp _ [res] Word2FloatOp [w] = emitPrimCall [res] emitPrimOp _ [res] Word2DoubleOp [w] = emitPrimCall [res] (MO_UF_Conv W64) [w] +-- SIMD vector packing and unpacking +emitPrimOp _ [res] FloatToFloatX4Op [e] = + doVecPackOp Nothing vec4f32 zero [e,e,e,e] res + where + zero :: CmmExpr + zero = CmmLit $ CmmVec (replicate 4 (CmmFloat 0 W32)) + +emitPrimOp _ [res] FloatX4PackOp es@[_,_,_,_] = + doVecPackOp Nothing vec4f32 zero es res + where + zero :: CmmExpr + zero = CmmLit $ CmmVec (replicate 4 (CmmFloat 0 W32)) + +emitPrimOp _ res@[_,_,_,_] FloatX4UnpackOp [arg] = + doVecUnpackOp Nothing vec4f32 arg res + +emitPrimOp _ [res] FloatX4InsertOp [v,e,i] = + doVecInsertOp Nothing vec4f32 v e i res + -- The rest just translate straightforwardly emitPrimOp dflags [res] op [arg] | nopOp op @@ -804,6 +835,14 @@ translateOp _ FloatMulOp = Just (MO_F_Mul W32) translateOp _ FloatDivOp = Just (MO_F_Quot W32) translateOp _ FloatNegOp = Just (MO_F_Neg W32) +-- Floating point vector ops + +translateOp _ FloatX4AddOp = Just (MO_VF_Add 4 W32) +translateOp _ FloatX4SubOp = Just (MO_VF_Sub 4 W32) +translateOp _ FloatX4MulOp = Just (MO_VF_Mul 4 W32) +translateOp _ FloatX4DivOp = Just (MO_VF_Quot 4 W32) +translateOp _ FloatX4NegOp = Just (MO_VF_Neg 4 W32) + -- Conversions translateOp dflags Int2DoubleOp = Just (MO_SF_Conv (wordWidth dflags) W64) @@ -864,42 +903,87 @@ callishOp _ = Nothing ------------------------------------------------------------------------------ -- Helpers for translating various minor variants of array indexing. -doIndexOffAddrOp :: Maybe MachOp -> CmmType -> [LocalReg] -> [CmmExpr] -> FCode () +doIndexOffAddrOp :: Maybe MachOp + -> CmmType + -> [LocalReg] + -> [CmmExpr] + -> FCode () doIndexOffAddrOp maybe_post_read_cast rep [res] [addr,idx] - = mkBasicIndexedRead 0 maybe_post_read_cast rep res addr idx + = mkBasicIndexedRead 0 maybe_post_read_cast rep res addr rep idx doIndexOffAddrOp _ _ _ _ - = panic "CgPrimOp: doIndexOffAddrOp" - -doIndexByteArrayOp :: Maybe MachOp -> CmmType -> [LocalReg] -> [CmmExpr] -> FCode () + = panic "StgCmmPrim: doIndexOffAddrOp" + +doIndexOffAddrOpAs :: Maybe MachOp + -> CmmType + -> CmmType + -> [LocalReg] + -> [CmmExpr] + -> FCode () +doIndexOffAddrOpAs maybe_post_read_cast rep idx_rep [res] [addr,idx] + = mkBasicIndexedRead 0 maybe_post_read_cast rep res addr idx_rep idx +doIndexOffAddrOpAs _ _ _ _ _ + = panic "StgCmmPrim: doIndexOffAddrOpAs" + +doIndexByteArrayOp :: Maybe MachOp + -> CmmType + -> [LocalReg] + -> [CmmExpr] + -> FCode () doIndexByteArrayOp maybe_post_read_cast rep [res] [addr,idx] = do dflags <- getDynFlags - mkBasicIndexedRead (arrWordsHdrSize dflags) maybe_post_read_cast rep res addr idx -doIndexByteArrayOp _ _ _ _ - = panic "CgPrimOp: doIndexByteArrayOp" + mkBasicIndexedRead (arrWordsHdrSize dflags) maybe_post_read_cast rep res addr rep idx +doIndexByteArrayOp _ _ _ _ + = panic "StgCmmPrim: doIndexByteArrayOp" + +doIndexByteArrayOpAs :: Maybe MachOp + -> CmmType + -> CmmType + -> [LocalReg] + -> [CmmExpr] + -> FCode () +doIndexByteArrayOpAs maybe_post_read_cast rep idx_rep [res] [addr,idx] + = do dflags <- getDynFlags + mkBasicIndexedRead (arrWordsHdrSize dflags) maybe_post_read_cast rep res addr idx_rep idx +doIndexByteArrayOpAs _ _ _ _ _ + = panic "StgCmmPrim: doIndexByteArrayOpAs" -doReadPtrArrayOp :: LocalReg -> CmmExpr -> CmmExpr -> FCode () +doReadPtrArrayOp :: LocalReg + -> CmmExpr + -> CmmExpr + -> FCode () doReadPtrArrayOp res addr idx = do dflags <- getDynFlags - mkBasicIndexedRead (arrPtrsHdrSize dflags) Nothing (gcWord dflags) res addr idx - - -doWriteOffAddrOp :: Maybe MachOp -> [LocalReg] -> [CmmExpr] -> FCode () -doWriteOffAddrOp maybe_pre_write_cast [] [addr,idx,val] - = mkBasicIndexedWrite 0 maybe_pre_write_cast addr idx val -doWriteOffAddrOp _ _ _ - = panic "CgPrimOp: doWriteOffAddrOp" + mkBasicIndexedRead (arrPtrsHdrSize dflags) Nothing (gcWord dflags) res addr (gcWord dflags) idx -doWriteByteArrayOp :: Maybe MachOp -> [LocalReg] -> [CmmExpr] -> FCode () -doWriteByteArrayOp maybe_pre_write_cast [] [addr,idx,val] +doWriteOffAddrOp :: Maybe MachOp + -> CmmType + -> [LocalReg] + -> [CmmExpr] + -> FCode () +doWriteOffAddrOp maybe_pre_write_cast idx_ty [] [addr,idx,val] + = mkBasicIndexedWrite 0 maybe_pre_write_cast addr idx_ty idx val +doWriteOffAddrOp _ _ _ _ + = panic "StgCmmPrim: doWriteOffAddrOp" + +doWriteByteArrayOp :: Maybe MachOp + -> CmmType + -> [LocalReg] + -> [CmmExpr] + -> FCode () +doWriteByteArrayOp maybe_pre_write_cast idx_ty [] [addr,idx,val] = do dflags <- getDynFlags - mkBasicIndexedWrite (arrWordsHdrSize dflags) maybe_pre_write_cast addr idx val -doWriteByteArrayOp _ _ _ - = panic "CgPrimOp: doWriteByteArrayOp" + mkBasicIndexedWrite (arrWordsHdrSize dflags) maybe_pre_write_cast addr idx_ty idx val +doWriteByteArrayOp _ _ _ _ + = panic "StgCmmPrim: doWriteByteArrayOp" -doWritePtrArrayOp :: CmmExpr -> CmmExpr -> CmmExpr -> FCode () +doWritePtrArrayOp :: CmmExpr + -> CmmExpr + -> CmmExpr + -> FCode () doWritePtrArrayOp addr idx val = do dflags <- getDynFlags - mkBasicIndexedWrite (arrPtrsHdrSize dflags) Nothing addr idx val + let ty = cmmExprType dflags val + mkBasicIndexedWrite (arrPtrsHdrSize dflags) Nothing addr ty idx val emit (setInfo addr (CmmLit (CmmLabel mkMAP_DIRTY_infoLabel))) -- the write barrier. We must write a byte into the mark table: -- bits8[a + header_size + StgMutArrPtrs_size(a) + x >> N] @@ -915,38 +999,154 @@ loadArrPtrsSize :: DynFlags -> CmmExpr -> CmmExpr loadArrPtrsSize dflags addr = CmmLoad (cmmOffsetB dflags addr off) (bWord dflags) where off = fixedHdrSize dflags * wORD_SIZE dflags + oFFSET_StgMutArrPtrs_ptrs dflags -mkBasicIndexedRead :: ByteOff -> Maybe MachOp -> CmmType - -> LocalReg -> CmmExpr -> CmmExpr -> FCode () -mkBasicIndexedRead off Nothing read_rep res base idx +mkBasicIndexedRead :: ByteOff -- Initial offset in bytes + -> Maybe MachOp -- Optional result cast + -> CmmType -- Type of element we are accessing + -> LocalReg -- Destination + -> CmmExpr -- Base address + -> CmmType -- Type of element by which we are indexing + -> CmmExpr -- Index + -> FCode () +mkBasicIndexedRead off Nothing ty res base idx_ty idx = do dflags <- getDynFlags - emitAssign (CmmLocal res) (cmmLoadIndexOffExpr dflags off read_rep base idx) -mkBasicIndexedRead off (Just cast) read_rep res base idx + emitAssign (CmmLocal res) (cmmLoadIndexOffExpr dflags off ty base idx_ty idx) +mkBasicIndexedRead off (Just cast) ty res base idx_ty idx = do dflags <- getDynFlags emitAssign (CmmLocal res) (CmmMachOp cast [ - cmmLoadIndexOffExpr dflags off read_rep base idx]) - -mkBasicIndexedWrite :: ByteOff -> Maybe MachOp - -> CmmExpr -> CmmExpr -> CmmExpr -> FCode () -mkBasicIndexedWrite off Nothing base idx val + cmmLoadIndexOffExpr dflags off ty base idx_ty idx]) + +mkBasicIndexedWrite :: ByteOff -- Initial offset in bytes + -> Maybe MachOp -- Optional value cast + -> CmmExpr -- Base address + -> CmmType -- Type of element by which we are indexing + -> CmmExpr -- Index + -> CmmExpr -- Value to write + -> FCode () +mkBasicIndexedWrite off Nothing base idx_ty idx val = do dflags <- getDynFlags - emitStore (cmmIndexOffExpr dflags off (typeWidth (cmmExprType dflags val)) base idx) val -mkBasicIndexedWrite off (Just cast) base idx val - = mkBasicIndexedWrite off Nothing base idx (CmmMachOp cast [val]) + emitStore (cmmIndexOffExpr dflags off (typeWidth idx_ty) base idx) val +mkBasicIndexedWrite off (Just cast) base idx_ty idx val + = mkBasicIndexedWrite off Nothing base idx_ty idx (CmmMachOp cast [val]) -- ---------------------------------------------------------------------------- -- Misc utils -cmmIndexOffExpr :: DynFlags -> ByteOff -> Width -> CmmExpr -> CmmExpr -> CmmExpr +cmmIndexOffExpr :: DynFlags + -> ByteOff -- Initial offset in bytes + -> Width -- Width of element by which we are indexing + -> CmmExpr -- Base address + -> CmmExpr -- Index + -> CmmExpr cmmIndexOffExpr dflags off width base idx = cmmIndexExpr dflags width (cmmOffsetB dflags base off) idx -cmmLoadIndexOffExpr :: DynFlags -> ByteOff -> CmmType -> CmmExpr -> CmmExpr -> CmmExpr -cmmLoadIndexOffExpr dflags off ty base idx - = CmmLoad (cmmIndexOffExpr dflags off (typeWidth ty) base idx) ty +cmmLoadIndexOffExpr :: DynFlags + -> ByteOff -- Initial offset in bytes + -> CmmType -- Type of element we are accessing + -> CmmExpr -- Base address + -> CmmType -- Type of element by which we are indexing + -> CmmExpr -- Index + -> CmmExpr +cmmLoadIndexOffExpr dflags off ty base idx_ty idx + = CmmLoad (cmmIndexOffExpr dflags off (typeWidth idx_ty) base idx) ty setInfo :: CmmExpr -> CmmExpr -> CmmAGraph setInfo closure_ptr info_ptr = mkStore closure_ptr info_ptr +------------------------------------------------------------------------------ +-- Helpers for translating vector packing and unpacking. + +doVecPackOp :: Maybe MachOp -- Cast from element to vector component + -> CmmType -- Type of vector + -> CmmExpr -- Initial vector + -> [CmmExpr] -- Elements + -> CmmFormal -- Destination for result + -> FCode () +doVecPackOp maybe_pre_write_cast ty z es res = do + dst <- newTemp ty + emitAssign (CmmLocal dst) z + vecPack dst es 0 + where + vecPack :: CmmFormal -> [CmmExpr] -> Int -> FCode () + vecPack src [] _ = + emitAssign (CmmLocal res) (CmmReg (CmmLocal src)) + + vecPack src (e : es) i = do + dst <- newTemp ty + emitAssign (CmmLocal dst) (CmmMachOp (MO_V_Insert len wid) + [CmmReg (CmmLocal src), cast e, iLit]) + vecPack dst es (i + 1) + where + -- vector indices are always 32-bits + iLit = CmmLit (CmmInt (toInteger i) W32) + + cast :: CmmExpr -> CmmExpr + cast val = case maybe_pre_write_cast of + Nothing -> val + Just cast -> CmmMachOp cast [val] + + len :: Length + len = vecLength ty + + wid :: Width + wid = typeWidth (vecElemType ty) + +doVecUnpackOp :: Maybe MachOp -- Cast from vector component to element result + -> CmmType -- Type of vector + -> CmmExpr -- Vector + -> [CmmFormal] -- Element results + -> FCode () +doVecUnpackOp maybe_post_read_cast ty e res = + vecUnpack res 0 + where + vecUnpack :: [CmmFormal] -> Int -> FCode () + vecUnpack [] _ = + return () + + vecUnpack (r : rs) i = do + emitAssign (CmmLocal r) (cast (CmmMachOp (MO_V_Extract len wid) + [e, iLit])) + vecUnpack rs (i + 1) + where + -- vector indices are always 32-bits + iLit = CmmLit (CmmInt (toInteger i) W32) + + cast :: CmmExpr -> CmmExpr + cast val = case maybe_post_read_cast of + Nothing -> val + Just cast -> CmmMachOp cast [val] + + len :: Length + len = vecLength ty + + wid :: Width + wid = typeWidth (vecElemType ty) + +doVecInsertOp :: Maybe MachOp -- Cast from element to vector component + -> CmmType -- Vector type + -> CmmExpr -- Source vector + -> CmmExpr -- Element + -> CmmExpr -- Index at which to insert element + -> CmmFormal -- Destination for result + -> FCode () +doVecInsertOp maybe_pre_write_cast ty src e idx res = do + dflags <- getDynFlags + -- vector indices are always 32-bits + let idx' :: CmmExpr + idx' = CmmMachOp (MO_SS_Conv (wordWidth dflags) W32) [idx] + emitAssign (CmmLocal res) (CmmMachOp (MO_V_Insert len wid) [src, cast e, idx']) + where + cast :: CmmExpr -> CmmExpr + cast val = case maybe_pre_write_cast of + Nothing -> val + Just cast -> CmmMachOp cast [val] + + len :: Length + len = vecLength ty + + wid :: Width + wid = typeWidth (vecElemType ty) + -- ---------------------------------------------------------------------------- -- Copying byte arrays diff --git a/compiler/llvmGen/LlvmCodeGen/CodeGen.hs b/compiler/llvmGen/LlvmCodeGen/CodeGen.hs index b7e085116d..cd864ca1a2 100644 --- a/compiler/llvmGen/LlvmCodeGen/CodeGen.hs +++ b/compiler/llvmGen/LlvmCodeGen/CodeGen.hs @@ -879,6 +879,13 @@ genMachOp env _ op [x] = case op of MO_FF_Conv from to -> sameConv from (widthToLlvmFloat to) LM_Fptrunc LM_Fpext + MO_VF_Neg len w -> + let ty = widthToLlvmFloat w + vecty = LMVector len ty + all0 = LMFloatLit (-0) ty + all0s = LMLitVar $ LMVectorLit (replicate len all0) + in negate vecty all0s LM_MO_FSub + -- Handle unsupported cases explicitly so we get a warning -- of missing case when new MachOps added MO_Add _ -> panicOp @@ -919,6 +926,14 @@ genMachOp env _ op [x] = case op of MO_Shl _ -> panicOp MO_U_Shr _ -> panicOp MO_S_Shr _ -> panicOp + + MO_V_Insert _ _ -> panicOp + MO_V_Extract _ _ -> panicOp + + MO_VF_Add _ _ -> panicOp + MO_VF_Sub _ _ -> panicOp + MO_VF_Mul _ _ -> panicOp + MO_VF_Quot _ _ -> panicOp where dflags = getDflags env @@ -984,6 +999,24 @@ genMachOp_fast env opt op r n e -- This handles all the cases not handle by the specialised genMachOp_fast. genMachOp_slow :: LlvmEnv -> EOption -> MachOp -> [CmmExpr] -> UniqSM ExprData +-- Element extraction +genMachOp_slow env _ (MO_V_Extract {}) [val, idx] = do + (env1, vval, stmts1, top1) <- exprToVar env val + (env2, vidx, stmts2, top2) <- exprToVar env1 idx + let (LMVector _ ty) = getVarType vval + (v1, s1) <- doExpr ty $ Extract vval vidx + return (env2, v1, stmts1 `appOL` stmts2 `snocOL` s1, top1 ++ top2) + +-- Element insertion +genMachOp_slow env _ (MO_V_Insert {}) [val, elt, idx] = do + (env1, vval, stmts1, top1) <- exprToVar env val + (env2, velt, stmts2, top2) <- exprToVar env1 elt + (env3, vidx, stmts3, top3) <- exprToVar env2 idx + let ty = getVarType vval + (v1, s1) <- doExpr ty $ Insert vval velt vidx + return (env3, v1, stmts1 `appOL` stmts2 `appOL` stmts3 `snocOL` s1, + top1 ++ top2 ++ top3) + -- Binary MachOp genMachOp_slow env opt op [x, y] = case op of @@ -1032,6 +1065,11 @@ genMachOp_slow env opt op [x, y] = case op of MO_Shl _ -> genBinMach LM_MO_Shl MO_U_Shr _ -> genBinMach LM_MO_LShr MO_S_Shr _ -> genBinMach LM_MO_AShr + + MO_VF_Add _ _ -> genBinMach LM_MO_FAdd + MO_VF_Sub _ _ -> genBinMach LM_MO_FSub + MO_VF_Mul _ _ -> genBinMach LM_MO_FMul + MO_VF_Quot _ _ -> genBinMach LM_MO_FDiv MO_Not _ -> panicOp MO_S_Neg _ -> panicOp @@ -1043,6 +1081,11 @@ genMachOp_slow env opt op [x, y] = case op of MO_UU_Conv _ _ -> panicOp MO_FF_Conv _ _ -> panicOp + MO_V_Insert {} -> panicOp + MO_V_Extract {} -> panicOp + + MO_VF_Neg {} -> panicOp + where dflags = getDflags env diff --git a/compiler/nativeGen/X86/CodeGen.hs b/compiler/nativeGen/X86/CodeGen.hs index 5d904204ac..46e8e9b81f 100644 --- a/compiler/nativeGen/X86/CodeGen.hs +++ b/compiler/nativeGen/X86/CodeGen.hs @@ -602,6 +602,14 @@ getRegister' dflags is32Bit (CmmMachOp mop [x]) = do -- unary MachOps MO_FS_Conv from to -> coerceFP2Int from to x MO_SF_Conv from to -> coerceInt2FP from to x + MO_V_Insert {} -> needLlvm + MO_V_Extract {} -> needLlvm + MO_VF_Add {} -> needLlvm + MO_VF_Sub {} -> needLlvm + MO_VF_Mul {} -> needLlvm + MO_VF_Quot {} -> needLlvm + MO_VF_Neg {} -> needLlvm + _other -> pprPanic "getRegister" (pprMachOp mop) where triv_ucode :: (Size -> Operand -> Instr) -> Size -> NatM Register @@ -694,6 +702,14 @@ getRegister' _ is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps MO_U_Shr rep -> shift_code rep SHR x y {-False-} MO_S_Shr rep -> shift_code rep SAR x y {-False-} + MO_V_Insert {} -> needLlvm + MO_V_Extract {} -> needLlvm + MO_VF_Add {} -> needLlvm + MO_VF_Sub {} -> needLlvm + MO_VF_Mul {} -> needLlvm + MO_VF_Quot {} -> needLlvm + MO_VF_Neg {} -> needLlvm + _other -> pprPanic "getRegister(x86) - binary CmmMachOp (1)" (pprMachOp mop) where -------------------- @@ -884,7 +900,9 @@ getRegister' dflags _ (CmmLit lit) code dst = unitOL (MOV size (OpImm imm) (OpReg dst)) return (Any size code) -getRegister' _ _ other = pprPanic "getRegister(x86)" (ppr other) +getRegister' _ _ other + | isVecExpr other = needLlvm + | otherwise = pprPanic "getRegister(x86)" (ppr other) intLoadCode :: (Operand -> Operand -> Instr) -> CmmExpr @@ -2690,3 +2708,19 @@ sse2NegCode w x = do ] -- return (Any sz code) + +isVecExpr :: CmmExpr -> Bool +isVecExpr (CmmMachOp (MO_V_Insert {}) _) = True +isVecExpr (CmmMachOp (MO_V_Extract {}) _) = True +isVecExpr (CmmMachOp (MO_VF_Add {}) _) = True +isVecExpr (CmmMachOp (MO_VF_Sub {}) _) = True +isVecExpr (CmmMachOp (MO_VF_Mul {}) _) = True +isVecExpr (CmmMachOp (MO_VF_Quot {}) _) = True +isVecExpr (CmmMachOp (MO_VF_Neg {}) _) = True +isVecExpr (CmmMachOp _ [e]) = isVecExpr e +isVecExpr _ = False + +needLlvm :: NatM a +needLlvm = + sorry $ unlines ["The native code generator does not support vector" + ,"instructions. Please use -fllvm."] diff --git a/compiler/prelude/PrelNames.lhs b/compiler/prelude/PrelNames.lhs index 261d10295f..961a823436 100644 --- a/compiler/prelude/PrelNames.lhs +++ b/compiler/prelude/PrelNames.lhs @@ -1420,6 +1420,11 @@ typeNatAddTyFamNameKey = mkPreludeTyConUnique 162 typeNatMulTyFamNameKey = mkPreludeTyConUnique 163 typeNatExpTyFamNameKey = mkPreludeTyConUnique 164 +-- SIMD vector types (Unique keys) +floatX4PrimTyConKey :: Unique + +floatX4PrimTyConKey = mkPreludeTyConUnique 170 + ---------------- Template Haskell ------------------- -- USES TyConUniques 200-299 ----------------------------------------------------- diff --git a/compiler/prelude/TysPrim.lhs b/compiler/prelude/TysPrim.lhs index 44ba035dc0..960a27b276 100644 --- a/compiler/prelude/TysPrim.lhs +++ b/compiler/prelude/TysPrim.lhs @@ -73,7 +73,10 @@ module TysPrim( eqPrimTyCon, -- ty1 ~# ty2 -- * Any - anyTy, anyTyCon, anyTypeOfKind + anyTy, anyTyCon, anyTypeOfKind, + + -- * SIMD + floatX4PrimTyCon, floatX4PrimTy ) where #include "HsVersions.h" @@ -135,6 +138,8 @@ primTyCons , constraintKindTyCon , superKindTyCon , anyKindTyCon + + , floatX4PrimTyCon ] mkPrimTc :: FastString -> Unique -> TyCon -> Name @@ -144,7 +149,7 @@ mkPrimTc fs unique tycon (ATyCon tycon) -- Relevant TyCon UserSyntax -- None are built-in syntax -charPrimTyConName, intPrimTyConName, int32PrimTyConName, int64PrimTyConName, wordPrimTyConName, word32PrimTyConName, word64PrimTyConName, addrPrimTyConName, floatPrimTyConName, doublePrimTyConName, statePrimTyConName, realWorldTyConName, arrayPrimTyConName, arrayArrayPrimTyConName, byteArrayPrimTyConName, mutableArrayPrimTyConName, mutableByteArrayPrimTyConName, mutableArrayArrayPrimTyConName, mutVarPrimTyConName, mVarPrimTyConName, tVarPrimTyConName, stablePtrPrimTyConName, stableNamePrimTyConName, bcoPrimTyConName, weakPrimTyConName, threadIdPrimTyConName, eqPrimTyConName :: Name +charPrimTyConName, intPrimTyConName, int32PrimTyConName, int64PrimTyConName, wordPrimTyConName, word32PrimTyConName, word64PrimTyConName, addrPrimTyConName, floatPrimTyConName, doublePrimTyConName, statePrimTyConName, realWorldTyConName, arrayPrimTyConName, arrayArrayPrimTyConName, byteArrayPrimTyConName, mutableArrayPrimTyConName, mutableByteArrayPrimTyConName, mutableArrayArrayPrimTyConName, mutVarPrimTyConName, mVarPrimTyConName, tVarPrimTyConName, stablePtrPrimTyConName, stableNamePrimTyConName, bcoPrimTyConName, weakPrimTyConName, threadIdPrimTyConName, eqPrimTyConName, floatX4PrimTyConName :: Name charPrimTyConName = mkPrimTc (fsLit "Char#") charPrimTyConKey charPrimTyCon intPrimTyConName = mkPrimTc (fsLit "Int#") intPrimTyConKey intPrimTyCon int32PrimTyConName = mkPrimTc (fsLit "Int32#") int32PrimTyConKey int32PrimTyCon @@ -172,6 +177,7 @@ stableNamePrimTyConName = mkPrimTc (fsLit "StableName#") stableNamePrimTyC bcoPrimTyConName = mkPrimTc (fsLit "BCO#") bcoPrimTyConKey bcoPrimTyCon weakPrimTyConName = mkPrimTc (fsLit "Weak#") weakPrimTyConKey weakPrimTyCon threadIdPrimTyConName = mkPrimTc (fsLit "ThreadId#") threadIdPrimTyConKey threadIdPrimTyCon +floatX4PrimTyConName = mkPrimTc (fsLit "FloatX4#") floatX4PrimTyConKey floatX4PrimTyCon \end{code} %************************************************************************ @@ -729,3 +735,16 @@ anyTyCon = mkSynTyCon anyTyConName kind [kKiVar] anyTypeOfKind :: Kind -> Type anyTypeOfKind kind = mkNakedTyConApp anyTyCon [kind] \end{code} + +%************************************************************************ +%* * +\subsection{SIMD vector type} +%* * +%************************************************************************ + +\begin{code} +floatX4PrimTy :: Type +floatX4PrimTy = mkTyConTy floatX4PrimTyCon +floatX4PrimTyCon :: TyCon +floatX4PrimTyCon = pcPrimTyCon0 floatX4PrimTyConName (VecRep 4 FloatElemRep) +\end{code} diff --git a/compiler/prelude/primops.txt.pp b/compiler/prelude/primops.txt.pp index 6d551d90e5..9cdda0ec8f 100644 --- a/compiler/prelude/primops.txt.pp +++ b/compiler/prelude/primops.txt.pp @@ -2202,6 +2202,101 @@ primop TraceMarkerOp "traceMarker#" GenPrimOp ------------------------------------------------------------------------ +section "Float SIMD Vectors" + {Operations on SIMD vectors of 4 single-precision (32-bit) + floating-point numbers.} +------------------------------------------------------------------------ + +primtype FloatX4# + +primop FloatToFloatX4Op "floatToFloatX4#" GenPrimOp + Float# -> FloatX4# + +primop FloatX4PackOp "packFloatX4#" GenPrimOp + Float# -> Float# -> Float# -> Float# -> FloatX4# + +primop FloatX4UnpackOp "unpackFloatX4#" GenPrimOp + FloatX4# -> (# Float#, Float#, Float#, Float# #) + +primop FloatX4InsertOp "insertFloatX4#" GenPrimOp + FloatX4# -> Float# -> Int# -> FloatX4# + with can_fail = True + +primop FloatX4AddOp "plusFloatX4#" Dyadic + FloatX4# -> FloatX4# -> FloatX4# + with commutable = True + +primop FloatX4SubOp "minusFloatX4#" Dyadic + FloatX4# -> FloatX4# -> FloatX4# + +primop FloatX4MulOp "timesFloatX4#" Dyadic + FloatX4# -> FloatX4# -> FloatX4# + with commutable = True + +primop FloatX4DivOp "divideFloatX4#" Dyadic + FloatX4# -> FloatX4# -> FloatX4# + with can_fail = True + +primop FloatX4NegOp "negateFloatX4#" Monadic + FloatX4# -> FloatX4# + +primop IndexByteArrayOp_FloatX4 "indexFloatX4Array#" GenPrimOp + ByteArray# -> Int# -> FloatX4# + with can_fail = True + +primop ReadByteArrayOp_FloatX4 "readFloatX4Array#" GenPrimOp + MutableByteArray# s -> Int# -> State# s -> (# State# s, FloatX4# #) + with has_side_effects = True + can_fail = True + +primop WriteByteArrayOp_FloatX4 "writeFloatX4Array#" GenPrimOp + MutableByteArray# s -> Int# -> FloatX4# -> State# s -> State# s + with has_side_effects = True + can_fail = True + +primop IndexOffAddrOp_FloatX4 "indexFloatX4OffAddr#" GenPrimOp + Addr# -> Int# -> FloatX4# + with can_fail = True + +primop ReadOffAddrOp_FloatX4 "readFloatX4OffAddr#" GenPrimOp + Addr# -> Int# -> State# s -> (# State# s, FloatX4# #) + with has_side_effects = True + can_fail = True + +primop WriteOffAddrOp_FloatX4 "writeFloatX4OffAddr#" GenPrimOp + Addr# -> Int# -> FloatX4# -> State# s -> State# s + with has_side_effects = True + can_fail = True + +primop IndexByteArrayOp_FloatAsFloatX4 "indexFloatArrayAsFloatX4#" GenPrimOp + ByteArray# -> Int# -> FloatX4# + with can_fail = True + +primop ReadByteArrayOp_FloatAsFloatX4 "readFloatArrayAsFloatX4#" GenPrimOp + MutableByteArray# s -> Int# -> State# s -> (# State# s, FloatX4# #) + with has_side_effects = True + can_fail = True + +primop WriteByteArrayOp_FloatAsFloatX4 "writeFloatArrayAsFloatX4#" GenPrimOp + MutableByteArray# s -> Int# -> FloatX4# -> State# s -> State# s + with has_side_effects = True + can_fail = True + +primop IndexOffAddrOp_FloatAsFloatX4 "indexFloatOffAddrAsFloatX4#" GenPrimOp + Addr# -> Int# -> FloatX4# + with can_fail = True + +primop ReadOffAddrOp_FloatAsFloatX4 "readFloatOffAddrAsFloatX4#" GenPrimOp + Addr# -> Int# -> State# s -> (# State# s, FloatX4# #) + with has_side_effects = True + can_fail = True + +primop WriteOffAddrOp_FloatAsFloatX4 "writeFloatOffAddrAsFloatX4#" GenPrimOp + Addr# -> Int# -> FloatX4# -> State# s -> State# s + with has_side_effects = True + can_fail = True + +------------------------------------------------------------------------ --- --- ------------------------------------------------------------------------ diff --git a/utils/genprimopcode/Main.hs b/utils/genprimopcode/Main.hs index debdd27102..27368f3ae7 100644 --- a/utils/genprimopcode/Main.hs +++ b/utils/genprimopcode/Main.hs @@ -502,20 +502,27 @@ gen_latex_doc (Info defaults entries) gen_wrappers :: Info -> String gen_wrappers (Info _ entries) - = "{-# LANGUAGE NoImplicitPrelude, UnboxedTuples #-}\n" + = "{-# LANGUAGE CPP, NoImplicitPrelude, UnboxedTuples #-}\n" -- Dependencies on Prelude must be explicit in libraries/base, but we -- don't need the Prelude here so we add NoImplicitPrelude. ++ "module GHC.PrimopWrappers where\n" ++ "import qualified GHC.Prim\n" ++ "import GHC.Types (Bool)\n" ++ "import GHC.Tuple ()\n" - ++ "import GHC.Prim (" ++ types ++ ")\n" - ++ unlines (concatMap f specs) + ++ "import GHC.Prim (" ++ concat (intersperse ", " othertycons) ++ ")\n" + ++ "#if defined (__GLASGOW_HASKELL_LLVM__)\n" + ++ "import GHC.Prim (" ++ concat (intersperse ", " vectycons) ++ ")\n" + ++ "#endif /* defined (__GLASGOW_HASKELL_LLVM__) */\n" + ++ unlines (concatMap f otherspecs) + ++ "#if defined (__GLASGOW_HASKELL_LLVM__)\n" + ++ unlines (concatMap f vecspecs) + ++ "#endif /* defined (__GLASGOW_HASKELL_LLVM__) */\n" where specs = filter (not.dodgy) (filter is_primop entries) + (vecspecs, otherspecs) = partition (llvmOnlyTy . ty) specs tycons = foldr union [] $ map (tyconsIn . ty) specs - tycons' = filter (`notElem` ["()", "Bool"]) tycons - types = concat $ intersperse ", " tycons' + (vectycons, othertycons) = + (partition llvmOnlyTyCon . filter (`notElem` ["()", "Bool"])) tycons f spec = let args = map (\n -> "a" ++ show n) [1 .. arity (ty spec)] src_name = wrap (name spec) lhs = src_name ++ " " ++ unwords args @@ -536,6 +543,16 @@ gen_wrappers (Info _ entries) "parAtAbs#", "parAtRel#", "parAtForNow#" ] + llvmOnlyTy :: Ty -> Bool + llvmOnlyTy (TyF ty1 ty2) = llvmOnlyTy ty1 || llvmOnlyTy ty2 + llvmOnlyTy (TyApp tycon tys) = llvmOnlyTyCon tycon || any llvmOnlyTy tys + llvmOnlyTy (TyVar _) = False + llvmOnlyTy (TyUTup tys) = any llvmOnlyTy tys + + llvmOnlyTyCon :: TyCon -> Bool + llvmOnlyTyCon "FloatX4#" = True + llvmOnlyTyCon _ = False + gen_primop_list :: Info -> String gen_primop_list (Info _ entries) = unlines ( @@ -653,6 +670,7 @@ ppType (TyApp "Word64#" []) = "word64PrimTy" ppType (TyApp "Addr#" []) = "addrPrimTy" ppType (TyApp "Float#" []) = "floatPrimTy" ppType (TyApp "Double#" []) = "doublePrimTy" +ppType (TyApp "FloatX4#" []) = "floatX4PrimTy" ppType (TyApp "ByteArray#" []) = "byteArrayPrimTy" ppType (TyApp "RealWorld" []) = "realWorldTy" ppType (TyApp "ThreadId#" []) = "threadIdPrimTy" |