10 files changed, 351 insertions, 564 deletions
diff --git a/compiler/cmm/CmmMachOp.hs b/compiler/cmm/CmmMachOp.hs
index 8d42bbd2cb..c009d15e25 100644
--- a/compiler/cmm/CmmMachOp.hs
+++ b/compiler/cmm/CmmMachOp.hs
@@ -118,6 +118,10 @@ data MachOp
   | MO_VS_Rem  Length Width
   | MO_VS_Neg  Length Width
 
+  -- Unsigned vector multiply/divide
+  | MO_VU_Quot Length Width
+  | MO_VU_Rem  Length Width
+
   -- Floting point vector element insertion and extraction operations
   | MO_VF_Insert  Length Width   -- Insert scalar into vector
   | MO_VF_Extract Length Width   -- Extract scalar from vector
@@ -375,6 +379,9 @@ machOpResultType dflags mop tys =
     MO_VS_Rem  l w      -> cmmVec l (cmmBits w)
     MO_VS_Neg  l w      -> cmmVec l (cmmBits w)
 
+    MO_VU_Quot l w      -> cmmVec l (cmmBits w)
+    MO_VU_Rem  l w      -> cmmVec l (cmmBits w)
+
     MO_VF_Insert  l w   -> cmmVec l (cmmFloat w)
     MO_VF_Extract _ w   -> cmmFloat w
 
@@ -461,6 +468,9 @@ machOpArgReps dflags op =
     MO_VS_Rem  _ r      -> [r,r]
     MO_VS_Neg  _ r      -> [r]
 
+    MO_VU_Quot _ r      -> [r,r]
+    MO_VU_Rem  _ r      -> [r,r]
+
     MO_VF_Insert  l r   -> [typeWidth (vec l (cmmFloat r)),r,wordWidth dflags]
     MO_VF_Extract l r   -> [typeWidth (vec l (cmmFloat r)),wordWidth dflags]
 
diff --git a/compiler/cmm/PprC.hs b/compiler/cmm/PprC.hs
index d45b103954..c468161c73 100644
--- a/compiler/cmm/PprC.hs
+++ b/compiler/cmm/PprC.hs
@@ -651,6 +651,15 @@ pprMachOp_for_C mop = case mop of
                                 (panic $ "PprC.pprMachOp_for_C: MO_VS_Neg"
                                       ++ " should have been handled earlier!")
 
+        MO_VU_Quot {}     -> pprTrace "offending mop:"
+                                (ptext $ sLit "MO_VU_Quot")
+                                (panic $ "PprC.pprMachOp_for_C: MO_VU_Quot"
+                                      ++ " should have been handled earlier!")
+        MO_VU_Rem {}      -> pprTrace "offending mop:"
+                                (ptext $ sLit "MO_VU_Rem")
+                                (panic $ "PprC.pprMachOp_for_C: MO_VU_Rem"
+                                      ++ " should have been handled earlier!")
+
         MO_VF_Insert {}   -> pprTrace "offending mop:"
                                 (ptext $ sLit "MO_VF_Insert")
                                 (panic $ "PprC.pprMachOp_for_C: MO_VF_Insert"
diff --git a/compiler/codeGen/StgCmmPrim.hs b/compiler/codeGen/StgCmmPrim.hs
index 8560f7cf1c..5250c9378e 100644
--- a/compiler/codeGen/StgCmmPrim.hs
+++ b/compiler/codeGen/StgCmmPrim.hs
@@ -40,7 +40,7 @@ import FastString
 import Outputable
 import Util
 
-import Control.Monad (liftM)
+import Control.Monad (liftM, when)
 import Data.Bits
 
 ------------------------------------------------------------------------
@@ -380,14 +380,6 @@ emitPrimOp dflags res IndexOffAddrOp_Word8            args = doIndexOffAddrOp
 emitPrimOp dflags res IndexOffAddrOp_Word16           args = doIndexOffAddrOp   (Just (mo_u_16ToWord dflags)) b16 res args
 emitPrimOp dflags res IndexOffAddrOp_Word32           args = doIndexOffAddrOp   (Just (mo_u_32ToWord dflags)) b32 res args
 emitPrimOp _      res IndexOffAddrOp_Word64           args = doIndexOffAddrOp   Nothing b64 res args
-emitPrimOp _      res IndexOffAddrOp_FloatX4          args = doIndexOffAddrOp   Nothing vec4f32 res args
-emitPrimOp _      res IndexOffAddrOp_FloatAsFloatX4   args = doIndexOffAddrOpAs Nothing vec4f32 f32 res args
-emitPrimOp _      res IndexOffAddrOp_DoubleX2         args = doIndexOffAddrOp   Nothing vec2f64 res args
-emitPrimOp _      res IndexOffAddrOp_DoubleAsDoubleX2 args = doIndexOffAddrOpAs Nothing vec2f64 f64 res args
-emitPrimOp _      res IndexOffAddrOp_Int32X4          args = doIndexOffAddrOp   Nothing vec4b32 res args
-emitPrimOp _      res IndexOffAddrOp_Int32AsInt32X4   args = doIndexOffAddrOpAs Nothing vec4b32 b32 res args
-emitPrimOp _      res IndexOffAddrOp_Int64X2          args = doIndexOffAddrOp   Nothing vec2b64 res args
-emitPrimOp _      res IndexOffAddrOp_Int64AsInt64X2   args = doIndexOffAddrOpAs Nothing vec2b64 b64 res args
 
 -- ReadXXXoffAddr, which are identical, for our purposes, to IndexXXXoffAddr.
 
@@ -407,14 +399,6 @@ emitPrimOp dflags res ReadOffAddrOp_Word8            args = doIndexOffAddrOp   (
 emitPrimOp dflags res ReadOffAddrOp_Word16           args = doIndexOffAddrOp   (Just (mo_u_16ToWord dflags)) b16 res args
 emitPrimOp dflags res ReadOffAddrOp_Word32           args = doIndexOffAddrOp   (Just (mo_u_32ToWord dflags)) b32 res args
 emitPrimOp _      res ReadOffAddrOp_Word64           args = doIndexOffAddrOp   Nothing b64 res args
-emitPrimOp _      res ReadOffAddrOp_FloatX4          args = doIndexOffAddrOp   Nothing vec4f32 res args
-emitPrimOp _      res ReadOffAddrOp_FloatAsFloatX4   args = doIndexOffAddrOpAs Nothing vec4f32 b32 res args
-emitPrimOp _      res ReadOffAddrOp_DoubleX2         args = doIndexOffAddrOp   Nothing vec2f64 res args
-emitPrimOp _      res ReadOffAddrOp_DoubleAsDoubleX2 args = doIndexOffAddrOpAs Nothing vec2f64 b64 res args
-emitPrimOp _      res ReadOffAddrOp_Int32X4          args = doIndexOffAddrOp   Nothing vec4b32 res args
-emitPrimOp _      res ReadOffAddrOp_Int32AsInt32X4   args = doIndexOffAddrOpAs Nothing vec4b32 b32 res args
-emitPrimOp _      res ReadOffAddrOp_Int64X2          args = doIndexOffAddrOp   Nothing vec2b64 res args
-emitPrimOp _      res ReadOffAddrOp_Int64AsInt64X2   args = doIndexOffAddrOpAs Nothing vec2b64 b64 res args
 
 -- IndexXXXArray
 
@@ -434,14 +418,6 @@ emitPrimOp dflags res IndexByteArrayOp_Word8            args = doIndexByteArrayO
 emitPrimOp dflags res IndexByteArrayOp_Word16           args = doIndexByteArrayOp   (Just (mo_u_16ToWord dflags)) b16  res args
 emitPrimOp dflags res IndexByteArrayOp_Word32           args = doIndexByteArrayOp   (Just (mo_u_32ToWord dflags)) b32  res args
 emitPrimOp _      res IndexByteArrayOp_Word64           args = doIndexByteArrayOp   Nothing b64  res args
-emitPrimOp _      res IndexByteArrayOp_FloatX4          args = doIndexByteArrayOp   Nothing vec4f32 res args
-emitPrimOp _      res IndexByteArrayOp_FloatAsFloatX4   args = doIndexByteArrayOpAs Nothing vec4f32 f32 res args
-emitPrimOp _      res IndexByteArrayOp_DoubleX2         args = doIndexByteArrayOp   Nothing vec2f64 res args
-emitPrimOp _      res IndexByteArrayOp_DoubleAsDoubleX2 args = doIndexByteArrayOpAs Nothing vec2f64 f64 res args
-emitPrimOp _      res IndexByteArrayOp_Int32X4          args = doIndexByteArrayOp   Nothing vec4b32 res args
-emitPrimOp _      res IndexByteArrayOp_Int32AsInt32X4   args = doIndexByteArrayOpAs Nothing vec4b32 b32 res args
-emitPrimOp _      res IndexByteArrayOp_Int64X2          args = doIndexByteArrayOp   Nothing vec2b64 res args
-emitPrimOp _      res IndexByteArrayOp_Int64AsInt64X2   args = doIndexByteArrayOpAs Nothing vec2b64 b64 res args
 
 -- ReadXXXArray, identical to IndexXXXArray.
 
@@ -461,14 +437,6 @@ emitPrimOp dflags res ReadByteArrayOp_Word8            args = doIndexByteArrayOp
 emitPrimOp dflags res ReadByteArrayOp_Word16           args = doIndexByteArrayOp   (Just (mo_u_16ToWord dflags)) b16  res args
 emitPrimOp dflags res ReadByteArrayOp_Word32           args = doIndexByteArrayOp   (Just (mo_u_32ToWord dflags)) b32  res args
 emitPrimOp _      res ReadByteArrayOp_Word64           args = doIndexByteArrayOp   Nothing b64  res args
-emitPrimOp _      res ReadByteArrayOp_FloatX4          args = doIndexByteArrayOp   Nothing vec4f32 res args
-emitPrimOp _      res ReadByteArrayOp_FloatAsFloatX4   args = doIndexByteArrayOpAs Nothing vec4f32 f32 res args
-emitPrimOp _      res ReadByteArrayOp_DoubleX2         args = doIndexByteArrayOp   Nothing vec2f64 res args
-emitPrimOp _      res ReadByteArrayOp_DoubleAsDoubleX2 args = doIndexByteArrayOpAs Nothing vec2f64 f64 res args
-emitPrimOp _      res ReadByteArrayOp_Int32X4          args = doIndexByteArrayOp   Nothing vec4b32 res args
-emitPrimOp _      res ReadByteArrayOp_Int32AsInt32X4   args = doIndexByteArrayOpAs Nothing vec4b32 b32 res args
-emitPrimOp _      res ReadByteArrayOp_Int64X2          args = doIndexByteArrayOp   Nothing vec2b64 res args
-emitPrimOp _      res ReadByteArrayOp_Int64AsInt64X2   args = doIndexByteArrayOpAs Nothing vec2b64 b64 res args
 
 -- WriteXXXoffAddr
 
@@ -488,14 +456,6 @@ emitPrimOp dflags res WriteOffAddrOp_Word8            args = doWriteOffAddrOp (J
 emitPrimOp dflags res WriteOffAddrOp_Word16           args = doWriteOffAddrOp (Just (mo_WordTo16 dflags)) b16 res args
 emitPrimOp dflags res WriteOffAddrOp_Word32           args = doWriteOffAddrOp (Just (mo_WordTo32 dflags)) b32 res args
 emitPrimOp _      res WriteOffAddrOp_Word64           args = doWriteOffAddrOp Nothing b64 res args
-emitPrimOp _      res WriteOffAddrOp_FloatX4          args = doWriteOffAddrOp Nothing vec4f32 res args
-emitPrimOp _      res WriteOffAddrOp_FloatAsFloatX4   args = doWriteOffAddrOp Nothing f32 res args
-emitPrimOp _      res WriteOffAddrOp_DoubleX2         args = doWriteOffAddrOp Nothing vec2f64 res args
-emitPrimOp _      res WriteOffAddrOp_DoubleAsDoubleX2 args = doWriteOffAddrOp Nothing f64 res args
-emitPrimOp _      res WriteOffAddrOp_Int32X4          args = doWriteOffAddrOp Nothing vec4b32 res args
-emitPrimOp _      res WriteOffAddrOp_Int32AsInt32X4   args = doWriteOffAddrOp Nothing b32 res args
-emitPrimOp _      res WriteOffAddrOp_Int64X2          args = doWriteOffAddrOp Nothing vec2b64 res args
-emitPrimOp _      res WriteOffAddrOp_Int64AsInt64X2   args = doWriteOffAddrOp Nothing b64 res args
 
 -- WriteXXXArray
 
@@ -515,14 +475,6 @@ emitPrimOp dflags res WriteByteArrayOp_Word8            args = doWriteByteArrayO
 emitPrimOp dflags res WriteByteArrayOp_Word16           args = doWriteByteArrayOp (Just (mo_WordTo16 dflags)) b16 res args
 emitPrimOp dflags res WriteByteArrayOp_Word32           args = doWriteByteArrayOp (Just (mo_WordTo32 dflags)) b32 res args
 emitPrimOp _      res WriteByteArrayOp_Word64           args = doWriteByteArrayOp Nothing b64 res args
-emitPrimOp _      res WriteByteArrayOp_FloatX4          args = doWriteByteArrayOp Nothing vec4f32 res args
-emitPrimOp _      res WriteByteArrayOp_FloatAsFloatX4   args = doWriteByteArrayOp Nothing f32 res args
-emitPrimOp _      res WriteByteArrayOp_DoubleX2         args = doWriteByteArrayOp Nothing vec2f64 res args
-emitPrimOp _      res WriteByteArrayOp_DoubleAsDoubleX2 args = doWriteByteArrayOp Nothing f64 res args
-emitPrimOp _      res WriteByteArrayOp_Int32X4          args = doWriteByteArrayOp Nothing vec4b32 res args
-emitPrimOp _      res WriteByteArrayOp_Int32AsInt32X4   args = doWriteByteArrayOp Nothing b32 res args
-emitPrimOp _      res WriteByteArrayOp_Int64X2          args = doWriteByteArrayOp Nothing vec2b64 res args
-emitPrimOp _      res WriteByteArrayOp_Int64AsInt64X2   args = doWriteByteArrayOp Nothing b64 res args
 
 -- Copying and setting byte arrays
 emitPrimOp _      [] CopyByteArrayOp [src,src_off,dst,dst_off,n] =
@@ -556,78 +508,136 @@ emitPrimOp _      [res] Word2FloatOp  [w] = emitPrimCall [res]
 emitPrimOp _      [res] Word2DoubleOp [w] = emitPrimCall [res]
                                             (MO_UF_Conv W64) [w]
 
--- SIMD vector packing and unpacking
-emitPrimOp _ [res] FloatToFloatX4Op [e] =
-    doVecPackOp Nothing vec4f32 zero [e,e,e,e] res
+-- SIMD primops
+emitPrimOp dflags [res] (VecBroadcastOp vcat n w) [e] =
+    doVecPackOp (vecElemInjectCast dflags vcat w) ty zeros (replicate n e) res
   where
-    zero :: CmmExpr
-    zero = CmmLit $ CmmVec (replicate 4 (CmmFloat 0 W32))
+    zeros :: CmmExpr
+    zeros = CmmLit $ CmmVec (replicate n zero)
+
+    zero :: CmmLit
+    zero = case vcat of
+             IntVec   -> CmmInt 0 w
+             WordVec  -> CmmInt 0 w
+             FloatVec -> CmmFloat 0 w
+
+    ty :: CmmType
+    ty = vecVmmType vcat n w
+
+emitPrimOp dflags [res] (VecPackOp vcat n w) es = do
+    when (length es /= n) $
+        panic "emitPrimOp: VecPackOp has wrong number of arguments"
+    doVecPackOp (vecElemInjectCast dflags vcat w) ty zeros es res
+  where
+    zeros :: CmmExpr
+    zeros = CmmLit $ CmmVec (replicate n zero)
+
+    zero :: CmmLit
+    zero = case vcat of
+             IntVec   -> CmmInt 0 w
+             WordVec  -> CmmInt 0 w
+             FloatVec -> CmmFloat 0 w
+
+    ty :: CmmType
+    ty = vecVmmType vcat n w
+
+emitPrimOp dflags res (VecUnpackOp vcat n w) [arg] = do
+    when (length res /= n) $
+        panic "emitPrimOp: VecUnpackOp has wrong number of results"
+    doVecUnpackOp (vecElemProjectCast dflags vcat w) ty arg res
+  where
+    ty :: CmmType
+    ty = vecVmmType vcat n w
 
-emitPrimOp _ [res] FloatX4PackOp es@[_,_,_,_] =
-    doVecPackOp Nothing vec4f32 zero es res
+emitPrimOp dflags [res] (VecInsertOp vcat n w) [v,e,i] =
+    doVecInsertOp (vecElemInjectCast dflags vcat w) ty v e i res
   where
-    zero :: CmmExpr
-    zero = CmmLit $ CmmVec (replicate 4 (CmmFloat 0 W32))
+    ty :: CmmType
+    ty = vecVmmType vcat n w
 
-emitPrimOp _ res@[_,_,_,_] FloatX4UnpackOp [arg] =
-    doVecUnpackOp Nothing vec4f32 arg res
+emitPrimOp _ res (VecIndexByteArrayOp vcat n w) args =
+    doIndexByteArrayOp Nothing ty res args
+  where
+    ty :: CmmType
+    ty = vecVmmType vcat n w
 
-emitPrimOp _ [res] FloatX4InsertOp [v,e,i] =
-    doVecInsertOp Nothing vec4f32 v e i res
+emitPrimOp _ res (VecReadByteArrayOp vcat n w) args =
+    doIndexByteArrayOp Nothing ty res args
+  where
+    ty :: CmmType
+    ty = vecVmmType vcat n w
 
-emitPrimOp _ [res] DoubleToDoubleX2Op [e] =
-    doVecPackOp Nothing vec2f64 zero [e,e] res
+emitPrimOp _ res (VecWriteByteArrayOp vcat n w) args =
+    doWriteByteArrayOp Nothing ty res args
   where
-    zero :: CmmExpr
-    zero = CmmLit $ CmmVec (replicate 2 (CmmFloat 0 W64))
+    ty :: CmmType
+    ty = vecVmmType vcat n w
 
-emitPrimOp _ [res] DoubleX2PackOp es@[_,_] =
-    doVecPackOp Nothing vec2f64 zero es res
+emitPrimOp _ res (VecIndexOffAddrOp vcat n w) args =
+    doIndexOffAddrOp Nothing ty res args
   where
-    zero :: CmmExpr
-    zero = CmmLit $ CmmVec (replicate 2 (CmmFloat 0 W64))
+    ty :: CmmType
+    ty = vecVmmType vcat n w
 
-emitPrimOp _ res@[_,_] DoubleX2UnpackOp [arg] =
-    doVecUnpackOp Nothing vec2f64 arg res
+emitPrimOp _ res (VecReadOffAddrOp vcat n w) args =
+    doIndexOffAddrOp Nothing ty res args
+  where
+    ty :: CmmType
+    ty = vecVmmType vcat n w
 
-emitPrimOp _ [res] DoubleX2InsertOp [v,e,i] =
-    doVecInsertOp Nothing vec2f64 v e i res
+emitPrimOp _ res (VecWriteOffAddrOp vcat n w) args =
+    doWriteOffAddrOp Nothing ty res args
+  where
+    ty :: CmmType
+    ty = vecVmmType vcat n w
 
-emitPrimOp dflags [res] Int32ToInt32X4Op [e] =
-    doVecPackOp (Just (mo_WordTo32 dflags)) vec4b32 zero [e,e,e,e] res
+emitPrimOp _ res (VecIndexScalarByteArrayOp vcat n w) args =
+    doIndexByteArrayOpAs Nothing vecty ty res args
   where
-    zero :: CmmExpr
-    zero = CmmLit $ CmmVec (replicate 4 (CmmInt 0 W32))
+    vecty :: CmmType
+    vecty = vecVmmType vcat n w
 
-emitPrimOp dflags [res] Int32X4PackOp es@[_,_,_,_] =
-    doVecPackOp (Just (mo_WordTo32 dflags)) vec4b32 zero es res
+    ty :: CmmType
+    ty = vecCmmCat vcat w
+
+emitPrimOp _ res (VecReadScalarByteArrayOp vcat n w) args =
+    doIndexByteArrayOpAs Nothing vecty ty res args
   where
-    zero :: CmmExpr
-    zero = CmmLit $ CmmVec (replicate 4 (CmmInt 0 W32))
+    vecty :: CmmType
+    vecty = vecVmmType vcat n w
 
-emitPrimOp dflags res@[_,_,_,_] Int32X4UnpackOp [arg] =
-    doVecUnpackOp (Just (mo_s_32ToWord dflags)) vec4b32 arg res
+    ty :: CmmType
+    ty = vecCmmCat vcat w
 
-emitPrimOp dflags [res] Int32X4InsertOp [v,e,i] =
-    doVecInsertOp (Just (mo_WordTo32 dflags)) vec4b32 v e i res
+emitPrimOp _ res (VecWriteScalarByteArrayOp vcat _ w) args =
+    doWriteByteArrayOp Nothing ty res args
+  where
+    ty :: CmmType
+    ty = vecCmmCat vcat w
 
-emitPrimOp _ [res] Int64ToInt64X2Op [e] =
-    doVecPackOp Nothing vec2b64 zero [e,e] res
+emitPrimOp _ res (VecIndexScalarOffAddrOp vcat n w) args =
+    doIndexOffAddrOpAs Nothing vecty ty res args
   where
-    zero :: CmmExpr
-    zero = CmmLit $ CmmVec (replicate 2 (CmmInt 0 W64))
+    vecty :: CmmType
+    vecty = vecVmmType vcat n w
 
-emitPrimOp _ [res] Int64X2PackOp es@[_,_] =
-    doVecPackOp Nothing vec2b64 zero es res
+    ty :: CmmType
+    ty = vecCmmCat vcat w
+
+emitPrimOp _ res (VecReadScalarOffAddrOp vcat n w) args =
+    doIndexOffAddrOpAs Nothing vecty ty res args
   where
-    zero :: CmmExpr
-    zero = CmmLit $ CmmVec (replicate 2 (CmmInt 0 W64))
+    vecty :: CmmType
+    vecty = vecVmmType vcat n w
 
-emitPrimOp _ res@[_,_] Int64X2UnpackOp [arg] =
-    doVecUnpackOp Nothing vec2b64 arg res
+    ty :: CmmType
+    ty = vecCmmCat vcat w
 
-emitPrimOp _ [res] Int64X2InsertOp [v,e,i] =
-    doVecInsertOp Nothing vec2b64 v e i res
+emitPrimOp _ res (VecWriteScalarOffAddrOp vcat _ w) args =
+    doWriteOffAddrOp Nothing ty res args
+  where
+    ty :: CmmType
+    ty = vecCmmCat vcat w
 
 -- Prefetch
 emitPrimOp _ res PrefetchByteArrayOp        args = doPrefetchByteArrayOp res args
@@ -944,33 +954,26 @@ translateOp _      FloatMulOp    = Just (MO_F_Mul  W32)
 translateOp _      FloatDivOp    = Just (MO_F_Quot W32)
 translateOp _      FloatNegOp    = Just (MO_F_Neg  W32)
 
--- Floating point vector ops
-
-translateOp _ FloatX4AddOp  = Just (MO_VF_Add  4 W32)
-translateOp _ FloatX4SubOp  = Just (MO_VF_Sub  4 W32)
-translateOp _ FloatX4MulOp  = Just (MO_VF_Mul  4 W32)
-translateOp _ FloatX4DivOp  = Just (MO_VF_Quot 4 W32)
-translateOp _ FloatX4NegOp  = Just (MO_VF_Neg  4 W32)
-
-translateOp _ DoubleX2AddOp  = Just (MO_VF_Add  2 W64)
-translateOp _ DoubleX2SubOp  = Just (MO_VF_Sub  2 W64)
-translateOp _ DoubleX2MulOp  = Just (MO_VF_Mul  2 W64)
-translateOp _ DoubleX2DivOp  = Just (MO_VF_Quot 2 W64)
-translateOp _ DoubleX2NegOp  = Just (MO_VF_Neg  2 W64)
-
-translateOp _ Int32X4AddOp   = Just (MO_V_Add   4 W32)
-translateOp _ Int32X4SubOp   = Just (MO_V_Sub   4 W32)
-translateOp _ Int32X4MulOp   = Just (MO_V_Mul   4 W32)
-translateOp _ Int32X4QuotOp  = Just (MO_VS_Quot 4 W32)
-translateOp _ Int32X4RemOp   = Just (MO_VS_Rem  4 W32)
-translateOp _ Int32X4NegOp   = Just (MO_VS_Neg  4 W32)
-
-translateOp _ Int64X2AddOp   = Just (MO_V_Add   2 W64)
-translateOp _ Int64X2SubOp   = Just (MO_V_Sub   2 W64)
-translateOp _ Int64X2MulOp   = Just (MO_V_Mul   2 W64)
-translateOp _ Int64X2QuotOp  = Just (MO_VS_Quot 2 W64)
-translateOp _ Int64X2RemOp   = Just (MO_VS_Rem  2 W64)
-translateOp _ Int64X2NegOp   = Just (MO_VS_Neg  2 W64)
+-- Vector ops
+
+translateOp _ (VecAddOp FloatVec n w) = Just (MO_VF_Add  n w)
+translateOp _ (VecSubOp FloatVec n w) = Just (MO_VF_Sub  n w)
+translateOp _ (VecMulOp FloatVec n w) = Just (MO_VF_Mul  n w)
+translateOp _ (VecDivOp FloatVec n w) = Just (MO_VF_Quot n w)
+translateOp _ (VecNegOp FloatVec n w) = Just (MO_VF_Neg  n w)
+
+translateOp _ (VecAddOp  IntVec n w) = Just (MO_V_Add   n w)
+translateOp _ (VecSubOp  IntVec n w) = Just (MO_V_Sub   n w)
+translateOp _ (VecMulOp  IntVec n w) = Just (MO_V_Mul   n w)
+translateOp _ (VecQuotOp IntVec n w) = Just (MO_VS_Quot n w)
+translateOp _ (VecRemOp  IntVec n w) = Just (MO_VS_Rem  n w)
+translateOp _ (VecNegOp  IntVec n w) = Just (MO_VS_Neg  n w)
+
+translateOp _ (VecAddOp  WordVec n w) = Just (MO_V_Add   n w)
+translateOp _ (VecSubOp  WordVec n w) = Just (MO_V_Sub   n w)
+translateOp _ (VecMulOp  WordVec n w) = Just (MO_V_Mul   n w)
+translateOp _ (VecQuotOp WordVec n w) = Just (MO_VU_Quot n w)
+translateOp _ (VecRemOp  WordVec n w) = Just (MO_VU_Rem  n w)
 
 -- Conversions
 
@@ -1183,6 +1186,41 @@ setInfo :: CmmExpr -> CmmExpr -> CmmAGraph
 setInfo closure_ptr info_ptr = mkStore closure_ptr info_ptr
 
 ------------------------------------------------------------------------------
+-- Helpers for translating vector primops.
+
+vecVmmType :: PrimOpVecCat -> Length -> Width -> CmmType
+vecVmmType pocat n w = vec n (vecCmmCat pocat w)
+
+vecCmmCat :: PrimOpVecCat -> Width -> CmmType
+vecCmmCat IntVec   = cmmBits
+vecCmmCat WordVec  = cmmBits
+vecCmmCat FloatVec = cmmFloat
+
+vecElemInjectCast :: DynFlags -> PrimOpVecCat -> Width -> Maybe MachOp
+vecElemInjectCast _      FloatVec _   =  Nothing
+vecElemInjectCast dflags IntVec   W8  =  Just (mo_WordTo8  dflags)
+vecElemInjectCast dflags IntVec   W16 =  Just (mo_WordTo16 dflags)
+vecElemInjectCast dflags IntVec   W32 =  Just (mo_WordTo32 dflags)
+vecElemInjectCast _      IntVec   W64 =  Nothing
+vecElemInjectCast dflags WordVec  W8  =  Just (mo_WordTo8  dflags)
+vecElemInjectCast dflags WordVec  W16 =  Just (mo_WordTo16 dflags)
+vecElemInjectCast dflags WordVec  W32 =  Just (mo_WordTo32 dflags)
+vecElemInjectCast _      WordVec  W64 =  Nothing
+vecElemInjectCast _      _        _   =  Nothing
+
+vecElemProjectCast :: DynFlags -> PrimOpVecCat -> Width -> Maybe MachOp
+vecElemProjectCast _      FloatVec _   =  Nothing
+vecElemProjectCast dflags IntVec   W8  =  Just (mo_s_8ToWord  dflags)
+vecElemProjectCast dflags IntVec   W16 =  Just (mo_s_16ToWord dflags)
+vecElemProjectCast dflags IntVec   W32 =  Just (mo_s_32ToWord dflags)
+vecElemProjectCast _      IntVec   W64 =  Nothing
+vecElemProjectCast dflags WordVec  W8  =  Just (mo_u_8ToWord  dflags)
+vecElemProjectCast dflags WordVec  W16 =  Just (mo_u_16ToWord dflags)
+vecElemProjectCast dflags WordVec  W32 =  Just (mo_u_32ToWord dflags)
+vecElemProjectCast _      WordVec  W64 =  Nothing
+vecElemProjectCast _      _        _   =  Nothing
+
+------------------------------------------------------------------------------
 -- Helpers for translating vector packing and unpacking.
 
 doVecPackOp :: Maybe MachOp  -- Cast from element to vector component
diff --git a/compiler/ghc.mk b/compiler/ghc.mk
index 4fdadd7c30..5b9610103b 100644
--- a/compiler/ghc.mk
+++ b/compiler/ghc.mk
@@ -250,8 +250,12 @@ PRIMOP_BITS_NAMES = primop-data-decl.hs-incl        \
                     primop-code-size.hs-incl        \
                     primop-can-fail.hs-incl         \
                     primop-strictness.hs-incl       \
-                    primop-fixity.hs-incl       \
-                    primop-primop-info.hs-incl
+                    primop-fixity.hs-incl           \
+                    primop-primop-info.hs-incl      \
+                    primop-vector-uniques.hs-incl   \
+                    primop-vector-tys.hs-incl       \
+                    primop-vector-tys-exports.hs-incl \
+                    primop-vector-tycons.hs-incl
 
 PRIMOP_BITS_STAGE1 = $(addprefix compiler/stage1/build/,$(PRIMOP_BITS_NAMES))
 PRIMOP_BITS_STAGE2 = $(addprefix compiler/stage2/build/,$(PRIMOP_BITS_NAMES))
@@ -290,6 +294,14 @@ compiler/stage$1/build/primop-fixity.hs-incl: compiler/stage$1/build/primops.txt
 	"$$(genprimopcode_INPLACE)" --fixity             < $$< > $$@
 compiler/stage$1/build/primop-primop-info.hs-incl: compiler/stage$1/build/primops.txt $$$$(genprimopcode_INPLACE)
 	"$$(genprimopcode_INPLACE)" --primop-primop-info < $$< > $$@
+compiler/stage$1/build/primop-vector-uniques.hs-incl: compiler/stage$1/build/primops.txt $$$$(genprimopcode_INPLACE)
+	"$$(genprimopcode_INPLACE)" --primop-vector-uniques     < $$< > $$@
+compiler/stage$1/build/primop-vector-tys.hs-incl: compiler/stage$1/build/primops.txt $$$$(genprimopcode_INPLACE)
+	"$$(genprimopcode_INPLACE)" --primop-vector-tys         < $$< > $$@
+compiler/stage$1/build/primop-vector-tys-exports.hs-incl: compiler/stage$1/build/primops.txt $$$$(genprimopcode_INPLACE)
+	"$$(genprimopcode_INPLACE)" --primop-vector-tys-exports < $$< > $$@
+compiler/stage$1/build/primop-vector-tycons.hs-incl: compiler/stage$1/build/primops.txt $$$$(genprimopcode_INPLACE)
+	"$$(genprimopcode_INPLACE)" --primop-vector-tycons      < $$< > $$@
 
 # Usages aren't used any more; but the generator
 # can still generate them if we want them back
diff --git a/compiler/llvmGen/LlvmCodeGen/CodeGen.hs b/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
index 33107c0b68..c52640b17f 100644
--- a/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
+++ b/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
@@ -967,6 +967,9 @@ genMachOp _ op [x] = case op of
 
     MO_VS_Quot    _ _ -> panicOp
     MO_VS_Rem     _ _ -> panicOp
+
+    MO_VU_Quot    _ _ -> panicOp
+    MO_VU_Rem     _ _ -> panicOp
  
     MO_VF_Insert  _ _ -> panicOp
     MO_VF_Extract _ _ -> panicOp
@@ -1140,6 +1143,9 @@ genMachOp_slow opt op [x, y] = case op of
 
     MO_VS_Quot l w -> genCastBinMach (LMVector l (widthToLlvmInt w)) LM_MO_SDiv
     MO_VS_Rem  l w -> genCastBinMach (LMVector l (widthToLlvmInt w)) LM_MO_SRem
+
+    MO_VU_Quot l w -> genCastBinMach (LMVector l (widthToLlvmInt w)) LM_MO_UDiv
+    MO_VU_Rem  l w -> genCastBinMach (LMVector l (widthToLlvmInt w)) LM_MO_URem
  
     MO_VF_Add  l w -> genCastBinMach (LMVector l (widthToLlvmFloat w)) LM_MO_FAdd
     MO_VF_Sub  l w -> genCastBinMach (LMVector l (widthToLlvmFloat w)) LM_MO_FSub
diff --git a/compiler/nativeGen/X86/CodeGen.hs b/compiler/nativeGen/X86/CodeGen.hs
index f6143d3fb9..e18da25347 100644
--- a/compiler/nativeGen/X86/CodeGen.hs
+++ b/compiler/nativeGen/X86/CodeGen.hs
@@ -610,6 +610,8 @@ getRegister' dflags is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
       MO_VS_Quot {}    -> needLlvm
       MO_VS_Rem {}     -> needLlvm
       MO_VS_Neg {}     -> needLlvm
+      MO_VU_Quot {}    -> needLlvm
+      MO_VU_Rem {}     -> needLlvm
       MO_VF_Insert {}  -> needLlvm
       MO_VF_Extract {} -> needLlvm
       MO_VF_Add {}     -> needLlvm
diff --git a/compiler/prelude/PrelNames.lhs b/compiler/prelude/PrelNames.lhs
index 81fb9be52a..07730e653d 100644
--- a/compiler/prelude/PrelNames.lhs
+++ b/compiler/prelude/PrelNames.lhs
@@ -1474,15 +1474,6 @@ typeNatMulTyFamNameKey    = mkPreludeTyConUnique 163
 typeNatExpTyFamNameKey    = mkPreludeTyConUnique 164
 typeNatLeqTyFamNameKey    = mkPreludeTyConUnique 165
 
--- SIMD vector types (Unique keys)
-floatX4PrimTyConKey, doubleX2PrimTyConKey, int32X4PrimTyConKey,
-  int64X2PrimTyConKey :: Unique
-
-floatX4PrimTyConKey  = mkPreludeTyConUnique 170
-doubleX2PrimTyConKey = mkPreludeTyConUnique 171
-int32X4PrimTyConKey  = mkPreludeTyConUnique 172
-int64X2PrimTyConKey  = mkPreludeTyConUnique 173
-
 ntTyConKey:: Unique
 ntTyConKey = mkPreludeTyConUnique 174
 coercibleTyConKey :: Unique
@@ -1492,6 +1483,12 @@ coercibleTyConKey = mkPreludeTyConUnique 175
 --      USES TyConUniques 200-299
 -----------------------------------------------------
 
+----------------------- SIMD ------------------------
+--      USES TyConUniques 300-399
+-----------------------------------------------------
+
+#include "primop-vector-uniques.hs-incl"
+
 unitTyConKey :: Unique
 unitTyConKey = mkTupleTyConUnique BoxedTuple 0
 \end{code}
diff --git a/compiler/prelude/PrimOp.lhs b/compiler/prelude/PrimOp.lhs
index 8b1970c37f..22753ee3ea 100644
--- a/compiler/prelude/PrimOp.lhs
+++ b/compiler/prelude/PrimOp.lhs
@@ -5,7 +5,7 @@
 
 \begin{code}
 module PrimOp (
-        PrimOp(..), allThePrimOps,
+        PrimOp(..), PrimOpVecCat(..), allThePrimOps,
         primOpType, primOpSig,
         primOpTag, maxPrimOpTag, primOpOcc,
 
@@ -25,6 +25,7 @@ module PrimOp (
 import TysPrim
 import TysWiredIn
 
+import CmmType
 import Demand
 import Var              ( TyVar )
 import OccName          ( OccName, pprOccName, mkVarOccFS )
@@ -64,6 +65,7 @@ primOpTag op = iBox (tagOf_PrimOp op)
 -- supplies
 -- tagOf_PrimOp :: PrimOp -> FastInt
 #include "primop-tag.hs-incl"
+tagOf_PrimOp _ = error "tagOf_PrimOp: unknown primop"
 
 
 instance Eq PrimOp where
@@ -82,6 +84,12 @@ instance Outputable PrimOp where
     ppr op = pprPrimOp op
 \end{code}
 
+\begin{code}
+data PrimOpVecCat = IntVec
+                  | WordVec
+                  | FloatVec
+\end{code}
+
 An @Enum@-derived list would be better; meanwhile... (ToDo)
 
 \begin{code}
@@ -173,6 +181,7 @@ else, notably a type, can be constructed) for each @PrimOp@.
 \begin{code}
 primOpInfo :: PrimOp -> PrimOpInfo
 #include "primop-primop-info.hs-incl"
+primOpInfo _ = error "primOpInfo: unknown primop"
 \end{code}
 
 Here are a load of comments from the old primOp info:
diff --git a/compiler/prelude/TysPrim.lhs b/compiler/prelude/TysPrim.lhs
index f166065b22..b17f1a6f9a 100644
--- a/compiler/prelude/TysPrim.lhs
+++ b/compiler/prelude/TysPrim.lhs
@@ -76,11 +76,8 @@ module TysPrim(
 	-- * Any
 	anyTy, anyTyCon, anyTypeOfKind,
 
-        -- * SIMD
-	floatX4PrimTyCon,		floatX4PrimTy,
-	doubleX2PrimTyCon,		doubleX2PrimTy,
-	int32X4PrimTyCon,		int32X4PrimTy,
-	int64X2PrimTyCon,		int64X2PrimTy
+	-- * SIMD
+#include "primop-vector-tys-exports.hs-incl"
   ) where
 
 #include "HsVersions.h"
@@ -144,10 +141,7 @@ primTyCons
     , superKindTyCon
     , anyKindTyCon
 
-    , floatX4PrimTyCon
-    , doubleX2PrimTyCon
-    , int32X4PrimTyCon
-    , int64X2PrimTyCon
+#include "primop-vector-tycons.hs-incl"
     ]
 
 mkPrimTc :: FastString -> Unique -> TyCon -> Name
@@ -157,7 +151,7 @@ mkPrimTc fs unique tycon
 		  (ATyCon tycon)	-- Relevant TyCon
 		  UserSyntax		-- None are built-in syntax
 
-charPrimTyConName, intPrimTyConName, int32PrimTyConName, int64PrimTyConName, wordPrimTyConName, word32PrimTyConName, word64PrimTyConName, addrPrimTyConName, floatPrimTyConName, doublePrimTyConName, statePrimTyConName, realWorldTyConName, arrayPrimTyConName, arrayArrayPrimTyConName, byteArrayPrimTyConName, mutableArrayPrimTyConName, mutableByteArrayPrimTyConName, mutableArrayArrayPrimTyConName, mutVarPrimTyConName, mVarPrimTyConName, tVarPrimTyConName, stablePtrPrimTyConName, stableNamePrimTyConName, bcoPrimTyConName, weakPrimTyConName, threadIdPrimTyConName, eqPrimTyConName, eqReprPrimTyConName, floatX4PrimTyConName, doubleX2PrimTyConName, int32X4PrimTyConName, int64X2PrimTyConName :: Name
+charPrimTyConName, intPrimTyConName, int32PrimTyConName, int64PrimTyConName, wordPrimTyConName, word32PrimTyConName, word64PrimTyConName, addrPrimTyConName, floatPrimTyConName, doublePrimTyConName, statePrimTyConName, realWorldTyConName, arrayPrimTyConName, arrayArrayPrimTyConName, byteArrayPrimTyConName, mutableArrayPrimTyConName, mutableByteArrayPrimTyConName, mutableArrayArrayPrimTyConName, mutVarPrimTyConName, mVarPrimTyConName, tVarPrimTyConName, stablePtrPrimTyConName, stableNamePrimTyConName, bcoPrimTyConName, weakPrimTyConName, threadIdPrimTyConName, eqPrimTyConName, eqReprPrimTyConName :: Name
 charPrimTyConName    	      = mkPrimTc (fsLit "Char#") charPrimTyConKey charPrimTyCon
 intPrimTyConName     	      = mkPrimTc (fsLit "Int#") intPrimTyConKey  intPrimTyCon
 int32PrimTyConName	      = mkPrimTc (fsLit "Int32#") int32PrimTyConKey int32PrimTyCon
@@ -186,10 +180,6 @@ stableNamePrimTyConName       = mkPrimTc (fsLit "StableName#") stableNamePrimTyC
 bcoPrimTyConName 	      = mkPrimTc (fsLit "BCO#") bcoPrimTyConKey bcoPrimTyCon
 weakPrimTyConName  	      = mkPrimTc (fsLit "Weak#") weakPrimTyConKey weakPrimTyCon
 threadIdPrimTyConName  	      = mkPrimTc (fsLit "ThreadId#") threadIdPrimTyConKey threadIdPrimTyCon
-floatX4PrimTyConName          = mkPrimTc (fsLit "FloatX4#") floatX4PrimTyConKey floatX4PrimTyCon
-doubleX2PrimTyConName         = mkPrimTc (fsLit "DoubleX2#") doubleX2PrimTyConKey doubleX2PrimTyCon
-int32X4PrimTyConName          = mkPrimTc (fsLit "Int32X4#") int32X4PrimTyConKey int32X4PrimTyCon
-int64X2PrimTyConName          = mkPrimTc (fsLit "Int64X2#") int64X2PrimTyConKey int64X2PrimTyCon
 \end{code}
 
 %************************************************************************
@@ -766,28 +756,10 @@ anyTypeOfKind kind = TyConApp anyTyCon [kind]
 
 %************************************************************************
 %*									*
-\subsection{SIMD vector type}
+\subsection{SIMD vector types}
 %*									*
 %************************************************************************
 
 \begin{code}
-floatX4PrimTy :: Type
-floatX4PrimTy = mkTyConTy floatX4PrimTyCon
-floatX4PrimTyCon :: TyCon
-floatX4PrimTyCon = pcPrimTyCon0 floatX4PrimTyConName (VecRep 4 FloatElemRep)
-
-doubleX2PrimTy :: Type
-doubleX2PrimTy = mkTyConTy doubleX2PrimTyCon
-doubleX2PrimTyCon :: TyCon
-doubleX2PrimTyCon = pcPrimTyCon0 doubleX2PrimTyConName (VecRep 2 DoubleElemRep)
-
-int32X4PrimTy :: Type
-int32X4PrimTy = mkTyConTy int32X4PrimTyCon
-int32X4PrimTyCon :: TyCon
-int32X4PrimTyCon = pcPrimTyCon0 int32X4PrimTyConName (VecRep 4 Int32ElemRep)
-
-int64X2PrimTy :: Type
-int64X2PrimTy = mkTyConTy int64X2PrimTyCon
-int64X2PrimTyCon :: TyCon
-int64X2PrimTyCon = pcPrimTyCon0 int64X2PrimTyConName (VecRep 2 Int64ElemRep)
+#include "primop-vector-tys.hs-incl"
 \end{code}
diff --git a/compiler/prelude/primops.txt.pp b/compiler/prelude/primops.txt.pp
index cfd6afa4c6..f4b7b6c5d0 100644
--- a/compiler/prelude/primops.txt.pp
+++ b/compiler/prelude/primops.txt.pp
@@ -39,6 +39,22 @@
 -- (eg, out_of_line), whilst avoiding parsing complex expressions
 -- needed for strictness info.
 
+-- The vector attribute is rather special. It takes a list of 3-tuples, each of
+-- which is of the form <ELEM_TYPE,SCALAR_TYPE,LENGTH>. ELEM_TYPE is the type of
+-- the elements in the vector; LENGTH is the length of the vector; and
+-- SCALAR_TYPE is the scalar type used to inject to/project from vector
+-- element. Note that ELEM_TYPE and SCALAR_TYPE are not the same; for example,
+-- to broadcast a scalar value to a vector whose elements are of type Int8, we
+-- use an Int#.
+
+-- When a primtype or primop has a vector attribute, it is instantiated at each
+-- 3-tuple in the list of 3-tuples. That is, the vector attribute allows us to
+-- define a family of types or primops. Vector support also adds three new
+-- keywords: VECTOR, SCALAR, and VECTUPLE. These keywords are expanded to types
+-- derived from the 3-tuple. For the 3-tuple <Int64,INT64,2>, VECTOR expands to
+-- Int64X2#, SCALAR expands to INT64, and VECTUPLE expands to (# INT64, INT64
+-- #).
+
 defaults
    has_side_effects = False
    out_of_line      = False   -- See Note Note [PrimOp can_fail and has_side_effects] in PrimOp
@@ -48,6 +64,7 @@ defaults
    strictness       = { \ arity -> mkStrictSig (mkTopDmdType (replicate arity topDmd) topRes) }
    fixity           = Nothing
    llvm_only        = False
+   vector           = []
 
 -- Currently, documentation is produced using latex, so contents of
 -- description fields should be legal latex. Descriptions can contain
@@ -2373,479 +2390,194 @@ primclass Coercible a b
    }
 
 ------------------------------------------------------------------------
-section "Float SIMD Vectors"
-	{Operations on SIMD vectors of 4 single-precision (32-bit)
-         floating-point numbers.}
+section "SIMD Vectors" 
+	{Operations on SIMD vectors.}
 ------------------------------------------------------------------------
 
-primtype FloatX4#
-   with llvm_only = True
+#define ALL_VECTOR_TYPES \
+  [<Int32,INT32,4>,<Int64,INT64,2> \
+  ,<Word32,WORD32,4>,<Word64,WORD64,2> \
+  ,<Float,Float#,4>,<Double,Double#,2>]
 
-primop FloatToFloatX4Op "floatToFloatX4#" GenPrimOp
-   Float# -> FloatX4#
-   with llvm_only = True
+#define SIGNED_VECTOR_TYPES \
+  [<Int32,INT32,4>,<Int64,INT64,2> \
+  ,<Float,Float#,4>,<Double,Double#,2>]
 
-primop FloatX4PackOp "packFloatX4#" GenPrimOp
-   Float# -> Float# -> Float# -> Float# -> FloatX4#
-   with llvm_only = True
+#define FLOAT_VECTOR_TYPES \
+  [<Float,Float#,4>,<Double,Double#,2>]
 
-primop FloatX4UnpackOp "unpackFloatX4#" GenPrimOp
-   FloatX4# -> (# Float#, Float#, Float#, Float# #)
-   with llvm_only = True
+#define INT_VECTOR_TYPES \
+  [<Int32,INT32,4>,<Int64,INT64,2> \
+  ,<Word32,WORD32,4>,<Word64,WORD64,2>]
 
-primop FloatX4InsertOp "insertFloatX4#" GenPrimOp
-   FloatX4# -> Float# -> Int# -> FloatX4#
-   with can_fail = True
-        llvm_only = True
-
-primop FloatX4AddOp "plusFloatX4#" Dyadic
-   FloatX4# -> FloatX4# -> FloatX4#
-   with commutable = True
-        llvm_only = True
-
-primop FloatX4SubOp "minusFloatX4#" Dyadic
-  FloatX4# -> FloatX4# -> FloatX4#
+primtype VECTOR
    with llvm_only = True
+   	vector = ALL_VECTOR_TYPES
 
-primop FloatX4MulOp "timesFloatX4#" Dyadic
-   FloatX4# -> FloatX4# -> FloatX4#
-   with commutable = True
-        llvm_only = True
-
-primop FloatX4DivOp "divideFloatX4#" Dyadic
-   FloatX4# -> FloatX4# -> FloatX4#
-   with can_fail = True
-        llvm_only = True
-
-primop FloatX4NegOp "negateFloatX4#" Monadic
-   FloatX4# -> FloatX4#
+primop VecBroadcastOp "broadcast#" GenPrimOp
+   SCALAR -> VECTOR
+   { Broadcast a scalar to all elements of a vector. }
    with llvm_only = True
+   	vector = ALL_VECTOR_TYPES
 
-primop IndexByteArrayOp_FloatX4 "indexFloatX4Array#" GenPrimOp
-   ByteArray# -> Int# -> FloatX4#
-   with can_fail = True
-        llvm_only = True
-
-primop ReadByteArrayOp_FloatX4 "readFloatX4Array#" GenPrimOp
-   MutableByteArray# s -> Int# -> State# s -> (# State# s, FloatX4# #)
-   with has_side_effects = True
-        can_fail = True
-        llvm_only = True
-
-primop WriteByteArrayOp_FloatX4 "writeFloatX4Array#" GenPrimOp
-   MutableByteArray# s -> Int# -> FloatX4# -> State# s -> State# s
-   with has_side_effects = True
-        can_fail = True
-        llvm_only = True
-
-primop IndexOffAddrOp_FloatX4 "indexFloatX4OffAddr#" GenPrimOp
-   Addr# -> Int# -> FloatX4#
-   with can_fail = True
-        llvm_only = True
-
-primop ReadOffAddrOp_FloatX4 "readFloatX4OffAddr#" GenPrimOp
-   Addr# -> Int# -> State# s -> (# State# s, FloatX4# #)
-   with has_side_effects = True
-        can_fail = True
-        llvm_only = True
-
-primop  WriteOffAddrOp_FloatX4 "writeFloatX4OffAddr#" GenPrimOp
-   Addr# -> Int# -> FloatX4# -> State# s -> State# s
-   with has_side_effects = True
-        can_fail = True
-        llvm_only = True
-
-primop IndexByteArrayOp_FloatAsFloatX4 "indexFloatArrayAsFloatX4#" GenPrimOp
-   ByteArray# -> Int# -> FloatX4#
-   with can_fail = True
-        llvm_only = True
-
-primop ReadByteArrayOp_FloatAsFloatX4 "readFloatArrayAsFloatX4#" GenPrimOp
-   MutableByteArray# s -> Int# -> State# s -> (# State# s, FloatX4# #)
-   with has_side_effects = True
-        can_fail = True
-        llvm_only = True
-
-primop WriteByteArrayOp_FloatAsFloatX4 "writeFloatArrayAsFloatX4#" GenPrimOp
-   MutableByteArray# s -> Int# -> FloatX4# -> State# s -> State# s
-   with has_side_effects = True
-        can_fail = True
-        llvm_only = True
-
-primop IndexOffAddrOp_FloatAsFloatX4 "indexFloatOffAddrAsFloatX4#" GenPrimOp
-   Addr# -> Int# -> FloatX4#
-   with can_fail = True
-        llvm_only = True
-
-primop ReadOffAddrOp_FloatAsFloatX4 "readFloatOffAddrAsFloatX4#" GenPrimOp
-   Addr# -> Int# -> State# s -> (# State# s, FloatX4# #)
-   with has_side_effects = True
-        can_fail = True
-        llvm_only = True
-
-primop  WriteOffAddrOp_FloatAsFloatX4 "writeFloatOffAddrAsFloatX4#" GenPrimOp
-   Addr# -> Int# -> FloatX4# -> State# s -> State# s
-   with has_side_effects = True
-        can_fail = True
-        llvm_only = True
-
-------------------------------------------------------------------------
-section "Double SIMD Vectors"
-	{Operations on SIMD vectors of 2 double-precision (64-bit)
-         floating-point numbers.}
-------------------------------------------------------------------------
-
-primtype DoubleX2#
+primop VecPackOp "pack#" GenPrimOp
+   VECTUPLE -> VECTOR
+   { Pack the elements of an unboxed tuple into a vector. }
    with llvm_only = True
+   	vector = ALL_VECTOR_TYPES
 
-primop DoubleToDoubleX2Op "doubleToDoubleX2#" GenPrimOp
-   Double# -> DoubleX2#
+primop VecUnpackOp "unpack#" GenPrimOp
+   VECTOR -> VECTUPLE
+   { Unpack the elements of a vector into an unboxed tuple. #}
    with llvm_only = True
+   	vector = ALL_VECTOR_TYPES
 
-primop DoubleX2InsertOp "insertDoubleX2#" GenPrimOp
-   DoubleX2# -> Double# -> Int# -> DoubleX2#
+primop VecInsertOp "insert#" GenPrimOp
+   VECTOR -> SCALAR -> Int# -> VECTOR
+   { Insert a scalar at the given position in a vector. }
    with can_fail = True
         llvm_only = True
+   	vector = ALL_VECTOR_TYPES
 
-primop DoubleX2PackOp "packDoubleX2#" GenPrimOp
-   Double# -> Double# -> DoubleX2#
-   with llvm_only = True
-
-primop DoubleX2UnpackOp "unpackDoubleX2#" GenPrimOp
-   DoubleX2# -> (# Double#, Double# #)
-   with llvm_only = True
-
-primop DoubleX2AddOp "plusDoubleX2#" Dyadic
-   DoubleX2# -> DoubleX2# -> DoubleX2#
+primop VecAddOp "plus#" Dyadic
+   VECTOR -> VECTOR -> VECTOR
+   { Add two vectors element-wise. }
    with commutable = True
         llvm_only = True
+   	vector = ALL_VECTOR_TYPES
 
-primop DoubleX2SubOp "minusDoubleX2#" Dyadic
-  DoubleX2# -> DoubleX2# -> DoubleX2#
+primop VecSubOp "minus#" Dyadic
+   VECTOR -> VECTOR -> VECTOR
+   { Subtract two vectors element-wise. }
    with llvm_only = True
+   	vector = ALL_VECTOR_TYPES
 
-primop DoubleX2MulOp "timesDoubleX2#" Dyadic
-   DoubleX2# -> DoubleX2# -> DoubleX2#
+primop VecMulOp "times#" Dyadic
+   VECTOR -> VECTOR -> VECTOR
+   { Multiply two vectors element-wise. }
    with commutable = True
         llvm_only = True
+   	vector = ALL_VECTOR_TYPES
 
-primop DoubleX2DivOp "divideDoubleX2#" Dyadic
-   DoubleX2# -> DoubleX2# -> DoubleX2#
-   with can_fail = True
-        llvm_only = True
-
-primop DoubleX2NegOp "negateDoubleX2#" Monadic
-   DoubleX2# -> DoubleX2#
-   with llvm_only = True
-
-primop IndexByteArrayOp_DoubleX2 "indexDoubleX2Array#" GenPrimOp
-   ByteArray# -> Int# -> DoubleX2#
-   with can_fail = True
-        llvm_only = True
-
-primop ReadByteArrayOp_DoubleX2 "readDoubleX2Array#" GenPrimOp
-   MutableByteArray# s -> Int# -> State# s -> (# State# s, DoubleX2# #)
-   with has_side_effects = True
-        can_fail = True
-        llvm_only = True
-
-primop WriteByteArrayOp_DoubleX2 "writeDoubleX2Array#" GenPrimOp
-   MutableByteArray# s -> Int# -> DoubleX2# -> State# s -> State# s
-   with has_side_effects = True
-        can_fail = True
-        llvm_only = True
-
-primop IndexOffAddrOp_DoubleX2 "indexDoubleX2OffAddr#" GenPrimOp
-   Addr# -> Int# -> DoubleX2#
-   with can_fail = True
-        llvm_only = True
-
-primop ReadOffAddrOp_DoubleX2 "readDoubleX2OffAddr#" GenPrimOp
-   Addr# -> Int# -> State# s -> (# State# s, DoubleX2# #)
-   with has_side_effects = True
-        can_fail = True
-        llvm_only = True
-
-primop  WriteOffAddrOp_DoubleX2 "writeDoubleX2OffAddr#" GenPrimOp
-   Addr# -> Int# -> DoubleX2# -> State# s -> State# s
-   with has_side_effects = True
-        can_fail = True
-        llvm_only = True
-
-primop IndexByteArrayOp_DoubleAsDoubleX2 "indexDoubleArrayAsDoubleX2#" GenPrimOp
-   ByteArray# -> Int# -> DoubleX2#
-   with can_fail = True
-        llvm_only = True
-
-primop ReadByteArrayOp_DoubleAsDoubleX2 "readDoubleArrayAsDoubleX2#" GenPrimOp
-   MutableByteArray# s -> Int# -> State# s -> (# State# s, DoubleX2# #)
-   with has_side_effects = True
-        can_fail = True
-        llvm_only = True
-
-primop WriteByteArrayOp_DoubleAsDoubleX2 "writeDoubleArrayAsDoubleX2#" GenPrimOp
-   MutableByteArray# s -> Int# -> DoubleX2# -> State# s -> State# s
-   with has_side_effects = True
-        can_fail = True
-        llvm_only = True
-
-primop IndexOffAddrOp_DoubleAsDoubleX2 "indexDoubleOffAddrAsDoubleX2#" GenPrimOp
-   Addr# -> Int# -> DoubleX2#
-   with can_fail = True
-        llvm_only = True
-
-primop ReadOffAddrOp_DoubleAsDoubleX2 "readDoubleOffAddrAsDoubleX2#" GenPrimOp
-   Addr# -> Int# -> State# s -> (# State# s, DoubleX2# #)
-   with has_side_effects = True
-        can_fail = True
-        llvm_only = True
-
-primop  WriteOffAddrOp_DoubleAsDoubleX2 "writeDoubleOffAddrAsDoubleX2#" GenPrimOp
-   Addr# -> Int# -> DoubleX2# -> State# s -> State# s
-   with has_side_effects = True
-        can_fail = True
-        llvm_only = True
-
-------------------------------------------------------------------------
-section "Int32 SIMD Vectors"
-	{Operations on SIMD vectors of 4 32-bit signed integers.}
-------------------------------------------------------------------------
-
-primtype Int32X4#
-   with llvm_only = True
-
-primop Int32ToInt32X4Op "int32ToInt32X4#" GenPrimOp
-   INT32 -> Int32X4#
-   with llvm_only = True
-
-primop Int32X4InsertOp "insertInt32X4#" GenPrimOp
-   Int32X4# -> INT32 -> Int# -> Int32X4#
+primop VecDivOp "divide#" Dyadic
+   VECTOR -> VECTOR -> VECTOR
+   { Divide two vectors element-wise. }
    with can_fail = True
         llvm_only = True
+   	vector = FLOAT_VECTOR_TYPES
 
-primop Int32X4PackOp "packInt32X4#" GenPrimOp
-   INT32 -> INT32 -> INT32 -> INT32 -> Int32X4#
-   with llvm_only = True
-
-primop Int32X4UnpackOp "unpackInt32X4#" GenPrimOp
-   Int32X4# -> (# INT32, INT32, INT32, INT32 #)
-   with llvm_only = True
-
-primop Int32X4AddOp "plusInt32X4#" Dyadic
-   Int32X4# -> Int32X4# -> Int32X4#
-   with commutable = True
-        llvm_only = True
-
-primop Int32X4SubOp "minusInt32X4#" Dyadic
-  Int32X4# -> Int32X4# -> Int32X4#
-   with llvm_only = True
-
-primop Int32X4MulOp "timesInt32X4#" Dyadic
-   Int32X4# -> Int32X4# -> Int32X4#
-   with commutable = True
-        llvm_only = True
-
-primop Int32X4QuotOp "quotInt32X4#" Dyadic
-   Int32X4# -> Int32X4# -> Int32X4#
+primop VecQuotOp "quot#" Dyadic
+   VECTOR -> VECTOR -> VECTOR
+   { Rounds towards zero element-wise. }
    with can_fail = True
         llvm_only = True
+   	vector = INT_VECTOR_TYPES
 
-primop Int32X4RemOp "remInt32X4#" Dyadic
-   Int32X4# -> Int32X4# -> Int32X4#
+primop VecRemOp "rem#" Dyadic
+   VECTOR -> VECTOR -> VECTOR
+   { Satisfies \texttt{(quot\# x y) times\# y plus\# (rem\# x y) == x}. }
    with can_fail = True
         llvm_only = True
+   	vector = INT_VECTOR_TYPES
 
-primop Int32X4NegOp "negateInt32X4#" Monadic
-   Int32X4# -> Int32X4#
+primop VecNegOp "negate#" Monadic
+   VECTOR -> VECTOR
+   { Negate element-wise. }
    with llvm_only = True
+   	vector = SIGNED_VECTOR_TYPES
 
-primop IndexByteArrayOp_Int32X4 "indexInt32X4Array#" GenPrimOp
-   ByteArray# -> Int# -> Int32X4#
-   with can_fail = True
-        llvm_only = True
-
-primop ReadByteArrayOp_Int32X4 "readInt32X4Array#" GenPrimOp
-   MutableByteArray# s -> Int# -> State# s -> (# State# s, Int32X4# #)
-   with has_side_effects = True
-        can_fail = True
-        llvm_only = True
-
-primop WriteByteArrayOp_Int32X4 "writeInt32X4Array#" GenPrimOp
-   MutableByteArray# s -> Int# -> Int32X4# -> State# s -> State# s
-   with has_side_effects = True
-        can_fail = True
-        llvm_only = True
-
-primop IndexOffAddrOp_Int32X4 "indexInt32X4OffAddr#" GenPrimOp
-   Addr# -> Int# -> Int32X4#
-   with can_fail = True
-        llvm_only = True
-
-primop ReadOffAddrOp_Int32X4 "readInt32X4OffAddr#" GenPrimOp
-   Addr# -> Int# -> State# s -> (# State# s, Int32X4# #)
-   with has_side_effects = True
-        can_fail = True
-        llvm_only = True
-
-primop  WriteOffAddrOp_Int32X4 "writeInt32X4OffAddr#" GenPrimOp
-   Addr# -> Int# -> Int32X4# -> State# s -> State# s
-   with has_side_effects = True
-        can_fail = True
-        llvm_only = True
-
-primop IndexByteArrayOp_Int32AsInt32X4 "indexInt32ArrayAsInt32X4#" GenPrimOp
-   ByteArray# -> Int# -> Int32X4#
-   with can_fail = True
-        llvm_only = True
-
-primop ReadByteArrayOp_Int32AsInt32X4 "readInt32ArrayAsInt32X4#" GenPrimOp
-   MutableByteArray# s -> Int# -> State# s -> (# State# s, Int32X4# #)
-   with has_side_effects = True
-        can_fail = True
-        llvm_only = True
-
-primop WriteByteArrayOp_Int32AsInt32X4 "writeInt32ArrayAsInt32X4#" GenPrimOp
-   MutableByteArray# s -> Int# -> Int32X4# -> State# s -> State# s
-   with has_side_effects = True
-        can_fail = True
-        llvm_only = True
-
-primop IndexOffAddrOp_Int32AsInt32X4 "indexInt32OffAddrAsInt32X4#" GenPrimOp
-   Addr# -> Int# -> Int32X4#
+primop VecIndexByteArrayOp "indexArray#" GenPrimOp
+   ByteArray# -> Int# -> VECTOR
+   { Read a vector from specified index of immutable array. }
    with can_fail = True
         llvm_only = True
+   	vector = ALL_VECTOR_TYPES
 
-primop ReadOffAddrOp_Int32AsInt32X4 "readInt32OffAddrAsInt32X4#" GenPrimOp
-   Addr# -> Int# -> State# s -> (# State# s, Int32X4# #)
+primop VecReadByteArrayOp "readArray#" GenPrimOp
+   MutableByteArray# s -> Int# -> State# s -> (# State# s, VECTOR #)
+   { Read a vector from specified index of mutable array. }
    with has_side_effects = True
         can_fail = True
         llvm_only = True
+   	vector = ALL_VECTOR_TYPES
 
-primop  WriteOffAddrOp_Int32AsInt32X4 "writeInt32OffAddrAsInt32X4#" GenPrimOp
-   Addr# -> Int# -> Int32X4# -> State# s -> State# s
+primop VecWriteByteArrayOp "writeArray#" GenPrimOp
+   MutableByteArray# s -> Int# -> VECTOR -> State# s -> State# s
+   { Write a vector to specified index of mutable array. }
    with has_side_effects = True
         can_fail = True
         llvm_only = True
+   	vector = ALL_VECTOR_TYPES
 
-------------------------------------------------------------------------
-section "Int64 SIMD Vectors"
-	{Operations on SIMD vectors of 2 64-bit signed integers.}
-------------------------------------------------------------------------
-
-primtype Int64X2#
-   with llvm_only = True
-
-primop Int64ToInt64X2Op "int64ToInt64X2#" GenPrimOp
-   INT64 -> Int64X2#
-   with llvm_only = True
-
-primop Int64X2InsertOp "insertInt64X2#" GenPrimOp
-   Int64X2# -> INT64 -> Int# -> Int64X2#
-   with can_fail = True
-        llvm_only = True
-
-primop Int64X2PackOp "packInt64X2#" GenPrimOp
-   INT64 -> INT64 -> Int64X2#
-   with llvm_only = True
-
-primop Int64X2UnpackOp "unpackInt64X2#" GenPrimOp
-   Int64X2# -> (# INT64, INT64 #)
-   with llvm_only = True
-
-primop Int64X2AddOp "plusInt64X2#" Dyadic
-   Int64X2# -> Int64X2# -> Int64X2#
-   with commutable = True
-        llvm_only = True
-
-primop Int64X2SubOp "minusInt64X2#" Dyadic
-  Int64X2# -> Int64X2# -> Int64X2#
-   with llvm_only = True
-
-primop Int64X2MulOp "timesInt64X2#" Dyadic
-   Int64X2# -> Int64X2# -> Int64X2#
-   with commutable = True
-        llvm_only = True
-
-primop Int64X2QuotOp "quotInt64X2#" Dyadic
-   Int64X2# -> Int64X2# -> Int64X2#
-   with can_fail = True
-        llvm_only = True
-
-primop Int64X2RemOp "remInt64X2#" Dyadic
-   Int64X2# -> Int64X2# -> Int64X2#
-   with can_fail = True
-        llvm_only = True
-
-primop Int64X2NegOp "negateInt64X2#" Monadic
-   Int64X2# -> Int64X2#
-   with llvm_only = True
-
-primop IndexByteArrayOp_Int64X2 "indexInt64X2Array#" GenPrimOp
-   ByteArray# -> Int# -> Int64X2#
+primop VecIndexOffAddrOp "indexOffAddr#" GenPrimOp
+   Addr# -> Int# -> VECTOR
+   { Reads vector; offset in bytes. }
    with can_fail = True
         llvm_only = True
+   	vector = ALL_VECTOR_TYPES
 
-primop ReadByteArrayOp_Int64X2 "readInt64X2Array#" GenPrimOp
-   MutableByteArray# s -> Int# -> State# s -> (# State# s, Int64X2# #)
+primop VecReadOffAddrOp "readOffAddr#" GenPrimOp
+   Addr# -> Int# -> State# s -> (# State# s, VECTOR #)
+   { Reads vector; offset in bytes. }
    with has_side_effects = True
         can_fail = True
         llvm_only = True
+   	vector = ALL_VECTOR_TYPES
 
-primop WriteByteArrayOp_Int64X2 "writeInt64X2Array#" GenPrimOp
-   MutableByteArray# s -> Int# -> Int64X2# -> State# s -> State# s
+primop VecWriteOffAddrOp "writeOffAddr#" GenPrimOp
+   Addr# -> Int# -> VECTOR -> State# s -> State# s
+   { Write vector; offset in bytes. }
    with has_side_effects = True
         can_fail = True
         llvm_only = True
+   	vector = ALL_VECTOR_TYPES
 
-primop IndexOffAddrOp_Int64X2 "indexInt64X2OffAddr#" GenPrimOp
-   Addr# -> Int# -> Int64X2#
-   with can_fail = True
-        llvm_only = True
-
-primop ReadOffAddrOp_Int64X2 "readInt64X2OffAddr#" GenPrimOp
-   Addr# -> Int# -> State# s -> (# State# s, Int64X2# #)
-   with has_side_effects = True
-        llvm_only = True
-
-primop  WriteOffAddrOp_Int64X2 "writeInt64X2OffAddr#" GenPrimOp
-   Addr# -> Int# -> Int64X2# -> State# s -> State# s
-   with has_side_effects = True
-        can_fail = True
-        llvm_only = True
 
-primop IndexByteArrayOp_Int64AsInt64X2 "indexInt64ArrayAsInt64X2#" GenPrimOp
-   ByteArray# -> Int# -> Int64X2#
+primop VecIndexScalarByteArrayOp "indexArrayAs#" GenPrimOp
+   ByteArray# -> Int# -> VECTOR
+   { Read a vector from specified index of immutable array of scalars; offset is in scalar elements. }
    with can_fail = True
         llvm_only = True
+   	vector = ALL_VECTOR_TYPES
 
-primop ReadByteArrayOp_Int64AsInt64X2 "readInt64ArrayAsInt64X2#" GenPrimOp
-   MutableByteArray# s -> Int# -> State# s -> (# State# s, Int64X2# #)
+primop VecReadScalarByteArrayOp "readArrayAs#" GenPrimOp
+   MutableByteArray# s -> Int# -> State# s -> (# State# s, VECTOR #)
+   { Read a vector from specified index of mutable array of scalars; offset is in scalar elements. }
    with has_side_effects = True
         can_fail = True
         llvm_only = True
+   	vector = ALL_VECTOR_TYPES
 
-primop WriteByteArrayOp_Int64AsInt64X2 "writeInt64ArrayAsInt64X2#" GenPrimOp
-   MutableByteArray# s -> Int# -> Int64X2# -> State# s -> State# s
+primop VecWriteScalarByteArrayOp "writeArrayAs#" GenPrimOp
+   MutableByteArray# s -> Int# -> VECTOR -> State# s -> State# s
+   { Write a vector to specified index of mutable array of scalars; offset is in scalar elements. }
    with has_side_effects = True
         can_fail = True
         llvm_only = True
+   	vector = ALL_VECTOR_TYPES
 
-primop IndexOffAddrOp_Int64AsInt64X2 "indexInt64OffAddrAsInt64X2#" GenPrimOp
-   Addr# -> Int# -> Int64X2#
+primop VecIndexScalarOffAddrOp "indexOffAddrAs#" GenPrimOp
+   Addr# -> Int# -> VECTOR
+   { Reads vector; offset in scalar elements. }
    with can_fail = True
         llvm_only = True
+   	vector = ALL_VECTOR_TYPES
 
-primop ReadOffAddrOp_Int64AsInt64X2 "readInt64OffAddrAsInt64X2#" GenPrimOp
-   Addr# -> Int# -> State# s -> (# State# s, Int64X2# #)
+primop VecReadScalarOffAddrOp "readOffAddrAs#" GenPrimOp
+   Addr# -> Int# -> State# s -> (# State# s, VECTOR #)
+   { Reads vector; offset in scalar elements. }
    with has_side_effects = True
         can_fail = True
         llvm_only = True
+   	vector = ALL_VECTOR_TYPES
 
-primop  WriteOffAddrOp_Int64AsInt64X2 "writeInt64OffAddrAsInt64X2#" GenPrimOp
-   Addr# -> Int# -> Int64X2# -> State# s -> State# s
+primop VecWriteScalarOffAddrOp "writeOffAddrAs#" GenPrimOp
+   Addr# -> Int# -> VECTOR -> State# s -> State# s
+   { Write vector; offset in scalar elements. }
    with has_side_effects = True
         can_fail = True
         llvm_only = True
+   	vector = ALL_VECTOR_TYPES
 
 ------------------------------------------------------------------------
 section "Prefetch"