16 files changed, 308 insertions, 114 deletions
diff --git a/compiler/cmm/CmmCallConv.hs b/compiler/cmm/CmmCallConv.hs
index dd4d6a6c1a..913f15d436 100644
--- a/compiler/cmm/CmmCallConv.hs
+++ b/compiler/cmm/CmmCallConv.hs
@@ -70,7 +70,9 @@ assignArgumentsPos dflags off conv arg_ty reps = (stk_off, assignments)
       assign_regs assts (r:rs) regs | isVecType ty   = vec
                                     | isFloatType ty = float
                                     | otherwise      = int
-        where vec = (assts, (r:rs))
+        where vec = case (w, regs) of
+                      (W128, (vs, fs, ds, ls, s:ss)) -> k (RegisterParam (XmmReg s), (vs, fs, ds, ls, ss))
+                      _ -> (assts, (r:rs))
               float = case (w, regs) of
                         (W32, (vs, fs, ds, ls, s:ss)) -> k (RegisterParam (FloatReg s), (vs, fs, ds, ls, ss))
                         (W32, (vs, f:fs, ds, ls, ss))
diff --git a/compiler/cmm/CmmExpr.hs b/compiler/cmm/CmmExpr.hs
index dce962443b..1df8e848b8 100644
--- a/compiler/cmm/CmmExpr.hs
+++ b/compiler/cmm/CmmExpr.hs
@@ -336,6 +336,9 @@ data GlobalReg
   | LongReg             -- long int registers (64-bit, really)
         {-# UNPACK #-} !Int     -- its number
 
+  | XmmReg                      -- 128-bit SIMD vector register 
+        {-# UNPACK #-} !Int     -- its number
+
   -- STG registers
   | Sp                  -- Stack ptr; points to last occupied stack location.
   | SpLim               -- Stack limit
@@ -371,6 +374,7 @@ instance Eq GlobalReg where
    FloatReg i == FloatReg j = i==j
    DoubleReg i == DoubleReg j = i==j
    LongReg i == LongReg j = i==j
+   XmmReg i == XmmReg j = i==j
    Sp == Sp = True
    SpLim == SpLim = True
    Hp == Hp = True
@@ -392,6 +396,7 @@ instance Ord GlobalReg where
    compare (FloatReg i)  (FloatReg  j) = compare i j
    compare (DoubleReg i) (DoubleReg j) = compare i j
    compare (LongReg i)   (LongReg   j) = compare i j
+   compare (XmmReg i)    (XmmReg    j) = compare i j
    compare Sp Sp = EQ
    compare SpLim SpLim = EQ
    compare Hp Hp = EQ
@@ -413,6 +418,8 @@ instance Ord GlobalReg where
    compare _ (DoubleReg _)    = GT
    compare (LongReg _) _      = LT
    compare _ (LongReg _)      = GT
+   compare (XmmReg _) _       = LT
+   compare _ (XmmReg _)       = GT
    compare Sp _ = LT
    compare _ Sp = GT
    compare SpLim _ = LT
@@ -455,6 +462,8 @@ globalRegType dflags (VanillaReg _ VNonGcPtr) = bWord dflags
 globalRegType _      (FloatReg _)      = cmmFloat W32
 globalRegType _      (DoubleReg _)     = cmmFloat W64
 globalRegType _      (LongReg _)       = cmmBits W64
+globalRegType _      (XmmReg _)        = cmmVec 4 (cmmBits W32)
+
 globalRegType dflags Hp                = gcWord dflags
                                             -- The initialiser for all
                                             -- dynamically allocated closures
@@ -465,4 +474,5 @@ isArgReg (VanillaReg {}) = True
 isArgReg (FloatReg {})   = True
 isArgReg (DoubleReg {})  = True
 isArgReg (LongReg {})    = True
+isArgReg (XmmReg {})     = True
 isArgReg _               = False
diff --git a/compiler/cmm/CmmMachOp.hs b/compiler/cmm/CmmMachOp.hs
index 4e38cd42b9..0f18029d20 100644
--- a/compiler/cmm/CmmMachOp.hs
+++ b/compiler/cmm/CmmMachOp.hs
@@ -118,6 +118,10 @@ data MachOp
   | MO_VS_Rem  Length Width
   | MO_VS_Neg  Length Width
 
+  -- Floting point vector element insertion and extraction operations
+  | MO_VF_Insert  Length Width   -- Insert scalar into vector
+  | MO_VF_Extract Length Width   -- Extract scalar from vector
+
   -- Floating point vector operations
   | MO_VF_Add  Length Width  
   | MO_VF_Sub  Length Width  
@@ -360,22 +364,25 @@ machOpResultType dflags mop tys =
     MO_SF_Conv _ to     -> cmmFloat to
     MO_FF_Conv _ to     -> cmmFloat to
 
-    MO_V_Insert {}      -> ty1
-    MO_V_Extract {}     -> vecElemType ty1
-    
-    MO_V_Add {}         -> ty1
-    MO_V_Sub {}         -> ty1
-    MO_V_Mul {}         -> ty1
-
-    MO_VS_Quot {}       -> ty1
-    MO_VS_Rem {}        -> ty1
-    MO_VS_Neg {}        -> ty1
-
-    MO_VF_Add {}        -> ty1
-    MO_VF_Sub {}        -> ty1
-    MO_VF_Mul {}        -> ty1
-    MO_VF_Quot {}       -> ty1
-    MO_VF_Neg {}        -> ty1
+    MO_V_Insert  l w    -> cmmVec l (cmmBits w)
+    MO_V_Extract _ w    -> cmmBits w
+
+    MO_V_Add l w        -> cmmVec l (cmmBits w)
+    MO_V_Sub l w        -> cmmVec l (cmmBits w)
+    MO_V_Mul l w        -> cmmVec l (cmmBits w)
+
+    MO_VS_Quot l w      -> cmmVec l (cmmBits w)
+    MO_VS_Rem  l w      -> cmmVec l (cmmBits w)
+    MO_VS_Neg  l w      -> cmmVec l (cmmBits w)
+
+    MO_VF_Insert  l w   -> cmmVec l (cmmFloat w)
+    MO_VF_Extract _ w   -> cmmFloat w
+
+    MO_VF_Add  l w      -> cmmVec l (cmmFloat w)
+    MO_VF_Sub  l w      -> cmmVec l (cmmFloat w)
+    MO_VF_Mul  l w      -> cmmVec l (cmmFloat w)
+    MO_VF_Quot l w      -> cmmVec l (cmmFloat w)
+    MO_VF_Neg  l w      -> cmmVec l (cmmFloat w)
   where
     (ty1:_) = tys
 
@@ -443,8 +450,8 @@ machOpArgReps dflags op =
     MO_FS_Conv from _   -> [from]
     MO_FF_Conv from _   -> [from]
 
-    MO_V_Insert  l r    -> [typeWidth (vec l (cmmFloat r)),r,wordWidth dflags]
-    MO_V_Extract l r    -> [typeWidth (vec l (cmmFloat r)),wordWidth dflags]
+    MO_V_Insert  l r    -> [typeWidth (vec l (cmmBits r)),r,wordWidth dflags]
+    MO_V_Extract l r    -> [typeWidth (vec l (cmmBits r)),wordWidth dflags]
 
     MO_V_Add _ r        -> [r,r]
     MO_V_Sub _ r        -> [r,r]
@@ -454,6 +461,9 @@ machOpArgReps dflags op =
     MO_VS_Rem  _ r      -> [r,r]
     MO_VS_Neg  _ r      -> [r]
 
+    MO_VF_Insert  l r   -> [typeWidth (vec l (cmmFloat r)),r,wordWidth dflags]
+    MO_VF_Extract l r   -> [typeWidth (vec l (cmmFloat r)),wordWidth dflags]
+
     MO_VF_Add  _ r      -> [r,r]
     MO_VF_Sub  _ r      -> [r,r]
     MO_VF_Mul  _ r      -> [r,r]
diff --git a/compiler/cmm/PprC.hs b/compiler/cmm/PprC.hs
index 8712d5fb5c..cda68ef39e 100644
--- a/compiler/cmm/PprC.hs
+++ b/compiler/cmm/PprC.hs
@@ -661,6 +661,15 @@ pprMachOp_for_C mop = case mop of
                                 (panic $ "PprC.pprMachOp_for_C: MO_VS_Neg"
                                       ++ " should have been handled earlier!")
 
+        MO_VF_Insert {}   -> pprTrace "offending mop:"
+                                (ptext $ sLit "MO_VF_Insert")
+                                (panic $ "PprC.pprMachOp_for_C: MO_VF_Insert"
+                                      ++ " should have been handled earlier!")
+        MO_VF_Extract {}  -> pprTrace "offending mop:"
+                                (ptext $ sLit "MO_VF_Extract")
+                                (panic $ "PprC.pprMachOp_for_C: MO_VF_Extract"
+                                      ++ " should have been handled earlier!")
+
         MO_VF_Add {}      -> pprTrace "offending mop:"
                                 (ptext $ sLit "MO_VF_Add")
                                 (panic $ "PprC.pprMachOp_for_C: MO_VF_Add"
diff --git a/compiler/cmm/PprCmmExpr.hs b/compiler/cmm/PprCmmExpr.hs
index 3c9fa063ff..d1128b07d3 100644
--- a/compiler/cmm/PprCmmExpr.hs
+++ b/compiler/cmm/PprCmmExpr.hs
@@ -255,6 +255,7 @@ pprGlobalReg gr
         FloatReg   n   -> char 'F' <> int n
         DoubleReg  n   -> char 'D' <> int n
         LongReg    n   -> char 'L' <> int n
+        XmmReg     n   -> ptext (sLit "XMM") <> int n
         Sp             -> ptext (sLit "Sp")
         SpLim          -> ptext (sLit "SpLim")
         Hp             -> ptext (sLit "Hp")
diff --git a/compiler/codeGen/CgUtils.hs b/compiler/codeGen/CgUtils.hs
index bdb7f69b11..c06dd60cb1 100644
--- a/compiler/codeGen/CgUtils.hs
+++ b/compiler/codeGen/CgUtils.hs
@@ -49,6 +49,13 @@ baseRegOffset dflags (DoubleReg 4)       = oFFSET_StgRegTable_rD4 dflags
 baseRegOffset dflags (DoubleReg 5)       = oFFSET_StgRegTable_rD5 dflags
 baseRegOffset dflags (DoubleReg 6)       = oFFSET_StgRegTable_rD6 dflags
 baseRegOffset _      (DoubleReg n)       = panic ("Registers above D6 are not supported (tried to use D" ++ show n ++ ")")
+baseRegOffset dflags (XmmReg 1)          = oFFSET_StgRegTable_rXMM1 dflags
+baseRegOffset dflags (XmmReg 2)          = oFFSET_StgRegTable_rXMM2 dflags
+baseRegOffset dflags (XmmReg 3)          = oFFSET_StgRegTable_rXMM3 dflags
+baseRegOffset dflags (XmmReg 4)          = oFFSET_StgRegTable_rXMM4 dflags
+baseRegOffset dflags (XmmReg 5)          = oFFSET_StgRegTable_rXMM5 dflags
+baseRegOffset dflags (XmmReg 6)          = oFFSET_StgRegTable_rXMM6 dflags
+baseRegOffset _      (XmmReg n)          = panic ("Registers above XMM6 are not supported (tried to use XMM" ++ show n ++ ")")
 baseRegOffset dflags Sp                  = oFFSET_StgRegTable_rSp dflags
 baseRegOffset dflags SpLim               = oFFSET_StgRegTable_rSpLim dflags
 baseRegOffset dflags (LongReg 1)         = oFFSET_StgRegTable_rL1 dflags
diff --git a/compiler/codeGen/StgCmmPrim.hs b/compiler/codeGen/StgCmmPrim.hs
index 10a514b6e1..4e0d773097 100644
--- a/compiler/codeGen/StgCmmPrim.hs
+++ b/compiler/codeGen/StgCmmPrim.hs
@@ -1183,8 +1183,11 @@ doVecPackOp maybe_pre_write_cast ty z es res = do
 
     vecPack src (e : es) i = do
         dst <- newTemp ty
-        emitAssign (CmmLocal dst) (CmmMachOp (MO_V_Insert len wid)
-                                             [CmmReg (CmmLocal src), cast e, iLit])
+        if isFloatType (vecElemType ty)
+          then emitAssign (CmmLocal dst) (CmmMachOp (MO_VF_Insert len wid)
+                                                    [CmmReg (CmmLocal src), cast e, iLit])
+          else emitAssign (CmmLocal dst) (CmmMachOp (MO_V_Insert len wid)
+                                                    [CmmReg (CmmLocal src), cast e, iLit])
         vecPack dst es (i + 1)
       where
         -- vector indices are always 32-bits
@@ -1214,8 +1217,11 @@ doVecUnpackOp maybe_post_read_cast ty e res =
         return ()
 
     vecUnpack (r : rs) i = do
-        emitAssign (CmmLocal r) (cast (CmmMachOp (MO_V_Extract len wid)
-                                      [e, iLit]))
+        if isFloatType (vecElemType ty)
+          then emitAssign (CmmLocal r) (cast (CmmMachOp (MO_VF_Extract len wid)
+                                             [e, iLit]))
+          else emitAssign (CmmLocal r) (cast (CmmMachOp (MO_V_Extract len wid)
+                                             [e, iLit]))
         vecUnpack rs (i + 1)
       where
         -- vector indices are always 32-bits
@@ -1244,7 +1250,9 @@ doVecInsertOp maybe_pre_write_cast ty src e idx res = do
     -- vector indices are always 32-bits
     let idx' :: CmmExpr
         idx' = CmmMachOp (MO_SS_Conv (wordWidth dflags) W32) [idx]
-    emitAssign (CmmLocal res) (CmmMachOp (MO_V_Insert len wid) [src, cast e, idx'])
+    if isFloatType (vecElemType ty)
+      then emitAssign (CmmLocal res) (CmmMachOp (MO_VF_Insert len wid) [src, cast e, idx'])
+      else emitAssign (CmmLocal res) (CmmMachOp (MO_V_Insert len wid) [src, cast e, idx'])
   where
     cast :: CmmExpr -> CmmExpr
     cast val = case maybe_pre_write_cast of
diff --git a/compiler/llvmGen/LlvmCodeGen/Base.hs b/compiler/llvmGen/LlvmCodeGen/Base.hs
index 1457efe3cb..bcfce3401e 100644
--- a/compiler/llvmGen/LlvmCodeGen/Base.hs
+++ b/compiler/llvmGen/LlvmCodeGen/Base.hs
@@ -131,11 +131,12 @@ llvmFunArgs :: DynFlags -> LiveGlobalRegs -> [LlvmVar]
 llvmFunArgs dflags live =
     map (lmGlobalRegArg dflags) (filter isPassed (activeStgRegs platform))
     where platform = targetPlatform dflags
-          isLive r = not (isFloat r) || r `elem` alwaysLive || r `elem` live
-          isPassed r = not (isFloat r) || isLive r
-          isFloat (FloatReg _)  = True
-          isFloat (DoubleReg _) = True
-          isFloat _             = False
+          isLive r = not (isSSE r) || r `elem` alwaysLive || r `elem` live
+          isPassed r = not (isSSE r) || isLive r
+          isSSE (FloatReg _)  = True
+          isSSE (DoubleReg _) = True
+          isSSE (XmmReg _)    = True
+          isSSE _             = False
 
 -- | Llvm standard fun attributes
 llvmStdFunAttrs :: [LlvmFuncAttr]
diff --git a/compiler/llvmGen/LlvmCodeGen/CodeGen.hs b/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
index efa7e9a706..969bca8ec0 100644
--- a/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
+++ b/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
@@ -470,6 +470,7 @@ castVar dflags v t
                       (vt, _) | isInt vt && isPointer t     -> LM_Inttoptr
                       (vt, _) | isPointer vt && isInt t     -> LM_Ptrtoint
                       (vt, _) | isPointer vt && isPointer t -> LM_Bitcast
+                      (vt, _) | isVector vt && isVector t   -> LM_Bitcast
 
                       (vt, _) -> panic $ "castVars: Can't cast this type ("
                                   ++ show vt ++ ") to (" ++ show t ++ ")"
@@ -582,16 +583,21 @@ genAssign env reg val = do
     let stmts = stmts1 `appOL` stmts2
 
     let ty = (pLower . getVarType) vreg
-    case isPointer ty && getVarType vval == llvmWord dflags of
-         -- Some registers are pointer types, so need to cast value to pointer
-         True -> do
-             (v, s1) <- doExpr ty $ Cast LM_Inttoptr vval ty
-             let s2 = Store v vreg
-             return (env2, stmts `snocOL` s1 `snocOL` s2, top1 ++ top2)
+    case ty of
+      -- Some registers are pointer types, so need to cast value to pointer
+      LMPointer _ | getVarType vval == llvmWord dflags -> do
+          (v, s1) <- doExpr ty $ Cast LM_Inttoptr vval ty
+          let s2 = Store v vreg
+          return (env2, stmts `snocOL` s1 `snocOL` s2, top1 ++ top2)
 
-         False -> do
-             let s1 = Store vval vreg
-             return (env2, stmts `snocOL` s1, top1 ++ top2)
+      LMVector _ _ -> do
+          (v, s1) <- doExpr ty $ Cast LM_Bitcast vval ty
+          let s2 = Store v vreg
+          return (env2, stmts `snocOL` s1 `snocOL` s2, top1 ++ top2)
+
+      _ -> do
+          let s1 = Store vval vreg
+          return (env2, stmts `snocOL` s1, top1 ++ top2)
 
 
 -- | CmmStore operation
@@ -884,14 +890,14 @@ genMachOp env _ op [x] = case op of
             vecty = LMVector len ty
             all0  = LMIntLit (-0) ty
             all0s = LMLitVar $ LMVectorLit (replicate len all0)
-        in negate vecty all0s LM_MO_Sub
+        in negateVec vecty all0s LM_MO_Sub
 
     MO_VF_Neg len w ->
         let ty    = widthToLlvmFloat w
             vecty = LMVector len ty
             all0  = LMFloatLit (-0) ty
             all0s = LMLitVar $ LMVectorLit (replicate len all0)
-        in negate vecty all0s LM_MO_FSub
+        in negateVec vecty all0s LM_MO_FSub
 
     -- Handle unsupported cases explicitly so we get a warning
     -- of missing case when new MachOps added
@@ -943,6 +949,9 @@ genMachOp env _ op [x] = case op of
 
     MO_VS_Quot    _ _ -> panicOp
     MO_VS_Rem     _ _ -> panicOp
+ 
+    MO_VF_Insert  _ _ -> panicOp
+    MO_VF_Extract _ _ -> panicOp
 
     MO_VF_Add     _ _ -> panicOp
     MO_VF_Sub     _ _ -> panicOp
@@ -957,6 +966,12 @@ genMachOp env _ op [x] = case op of
             (v1, s1) <- doExpr ty $ LlvmOp negOp v2 vx
             return (env', v1, stmts `snocOL` s1, top)
 
+        negateVec ty v2 negOp = do
+            (env', vx, stmts1, top) <- exprToVar env x
+            ([vx'], stmts2) <- castVars dflags [(vx, ty)]
+            (v1, s1) <- doExpr ty $ LlvmOp negOp v2 vx'
+            return (env', v1, stmts1 `appOL` stmts2 `snocOL` s1, top)
+
         fiConv ty convOp = do
             (env', vx, stmts, top) <- exprToVar env x
             (v1, s1) <- doExpr ty $ Cast convOp vx ty
@@ -1014,22 +1029,50 @@ genMachOp_fast env opt op r n e
 genMachOp_slow :: LlvmEnv -> EOption -> MachOp -> [CmmExpr] -> UniqSM ExprData
 
 -- Element extraction
-genMachOp_slow env _ (MO_V_Extract {}) [val, idx] = do
+genMachOp_slow env _ (MO_V_Extract l w) [val, idx] = do
+    (env1, vval, stmts1, top1) <- exprToVar env  val
+    (env2, vidx, stmts2, top2) <- exprToVar env1 idx
+    ([vval'], stmts3)          <- castVars dflags [(vval, LMVector l ty)]
+    (v1, s1)                   <- doExpr ty $ Extract vval' vidx
+    return (env2, v1, stmts1 `appOL` stmts2 `appOL` stmts3 `snocOL` s1, top1 ++ top2)
+  where
+    dflags = getDflags env
+    ty = widthToLlvmInt w
+
+genMachOp_slow env _ (MO_VF_Extract l w) [val, idx] = do
     (env1, vval, stmts1, top1) <- exprToVar env  val
     (env2, vidx, stmts2, top2) <- exprToVar env1 idx
-    let (LMVector _ ty)        =  getVarType vval
-    (v1, s1)                   <- doExpr ty $ Extract vval vidx
-    return (env2, v1, stmts1 `appOL` stmts2 `snocOL` s1, top1 ++ top2)
+    ([vval'], stmts3)          <- castVars dflags [(vval, LMVector l ty)]
+    (v1, s1)                   <- doExpr ty $ Extract vval' vidx
+    return (env2, v1, stmts1 `appOL` stmts2 `appOL` stmts3 `snocOL` s1, top1 ++ top2)
+  where
+    dflags = getDflags env
+    ty = widthToLlvmFloat w
 
 -- Element insertion
-genMachOp_slow env _ (MO_V_Insert {}) [val, elt, idx] = do
+genMachOp_slow env _ (MO_V_Insert l w) [val, elt, idx] = do
+    (env1, vval, stmts1, top1) <- exprToVar env  val
+    (env2, velt, stmts2, top2) <- exprToVar env1 elt
+    (env3, vidx, stmts3, top3) <- exprToVar env2 idx
+    ([vval'], stmts4)          <- castVars dflags [(vval, ty)]
+    (v1, s1)                   <- doExpr ty $ Insert vval' velt vidx
+    return (env3, v1, stmts1 `appOL` stmts2 `appOL` stmts3 `appOL` stmts4 `snocOL` s1,
+            top1 ++ top2 ++ top3)
+  where
+    dflags = getDflags env
+    ty = LMVector l (widthToLlvmInt w)
+
+genMachOp_slow env _ (MO_VF_Insert l w) [val, elt, idx] = do
     (env1, vval, stmts1, top1) <- exprToVar env  val
     (env2, velt, stmts2, top2) <- exprToVar env1 elt
     (env3, vidx, stmts3, top3) <- exprToVar env2 idx
-    let ty                     =  getVarType vval
-    (v1, s1)                   <- doExpr ty $ Insert vval velt vidx
-    return (env3, v1, stmts1 `appOL` stmts2 `appOL` stmts3 `snocOL` s1,
+    ([vval'], stmts4)          <- castVars dflags [(vval, ty)]
+    (v1, s1)                   <- doExpr ty $ Insert vval' velt vidx
+    return (env3, v1, stmts1 `appOL` stmts2 `appOL` stmts3 `appOL` stmts4 `snocOL` s1,
             top1 ++ top2 ++ top3)
+  where
+    dflags = getDflags env
+    ty = LMVector l (widthToLlvmFloat w)
     
 -- Binary MachOp
 genMachOp_slow env opt op [x, y] = case op of
@@ -1080,17 +1123,17 @@ genMachOp_slow env opt op [x, y] = case op of
     MO_U_Shr _ -> genBinMach LM_MO_LShr
     MO_S_Shr _ -> genBinMach LM_MO_AShr
 
-    MO_V_Add _ _   -> genBinMach LM_MO_Add
-    MO_V_Sub _ _   -> genBinMach LM_MO_Sub
-    MO_V_Mul _ _   -> genBinMach LM_MO_Mul
+    MO_V_Add l w   -> genCastBinMach (LMVector l (widthToLlvmInt w)) LM_MO_Add
+    MO_V_Sub l w   -> genCastBinMach (LMVector l (widthToLlvmInt w)) LM_MO_Sub
+    MO_V_Mul l w   -> genCastBinMach (LMVector l (widthToLlvmInt w)) LM_MO_Mul
 
-    MO_VS_Quot _ _ -> genBinMach LM_MO_SDiv
-    MO_VS_Rem _ _  -> genBinMach LM_MO_SRem
+    MO_VS_Quot l w -> genCastBinMach (LMVector l (widthToLlvmInt w)) LM_MO_SDiv
+    MO_VS_Rem  l w -> genCastBinMach (LMVector l (widthToLlvmInt w)) LM_MO_SRem
  
-    MO_VF_Add _ _  -> genBinMach LM_MO_FAdd
-    MO_VF_Sub _ _  -> genBinMach LM_MO_FSub
-    MO_VF_Mul _ _  -> genBinMach LM_MO_FMul
-    MO_VF_Quot _ _ -> genBinMach LM_MO_FDiv
+    MO_VF_Add  l w -> genCastBinMach (LMVector l (widthToLlvmFloat w)) LM_MO_FAdd
+    MO_VF_Sub  l w -> genCastBinMach (LMVector l (widthToLlvmFloat w)) LM_MO_FSub
+    MO_VF_Mul  l w -> genCastBinMach (LMVector l (widthToLlvmFloat w)) LM_MO_FMul
+    MO_VF_Quot l w -> genCastBinMach (LMVector l (widthToLlvmFloat w)) LM_MO_FDiv
 
     MO_Not _       -> panicOp
     MO_S_Neg _     -> panicOp
@@ -1107,6 +1150,9 @@ genMachOp_slow env opt op [x, y] = case op of
 
     MO_VS_Neg {} -> panicOp
 
+    MO_VF_Insert  {} -> panicOp
+    MO_VF_Extract {} -> panicOp
+
     MO_VF_Neg {} -> panicOp
 
     where
@@ -1134,6 +1180,14 @@ genMachOp_slow env opt op [x, y] = case op of
                                     `snocOL` dy `snocOL` s1
                     return (env2, v1, allStmts, top1 ++ top2)
 
+        binCastLlvmOp ty binOp = do
+            (env1, vx, stmts1, top1) <- exprToVar env x
+            (env2, vy, stmts2, top2) <- exprToVar env1 y
+            ([vx', vy'], stmts3) <- castVars dflags [(vx, ty), (vy, ty)]
+            (v1, s1) <- doExpr ty $ binOp vx' vy'
+            return (env2, v1, stmts1 `appOL` stmts2 `appOL` stmts3 `snocOL` s1,
+                    top1 ++ top2)
+
         -- | Need to use EOption here as Cmm expects word size results from
         -- comparisons while LLVM return i1. Need to extend to llvmWord type
         -- if expected. See Note [Literals and branch conditions].
@@ -1152,6 +1206,8 @@ genMachOp_slow env opt op [x, y] = case op of
 
         genBinMach op = binLlvmOp getVarType (LlvmOp op)
 
+        genCastBinMach ty op = binCastLlvmOp ty (LlvmOp op)
+
         -- | Detect if overflow will occur in signed multiply of the two
         -- CmmExpr's. This is the LLVM assembly equivalent of the NCG
         -- implementation. Its much longer due to type information/safety.
@@ -1427,10 +1483,11 @@ funEpilogue env live = do
     dflags = getDflags env
     platform = targetPlatform dflags
     isLive r = r `elem` alwaysLive || r `elem` live
-    isPassed r = not (isFloat r) || isLive r
-    isFloat (FloatReg _)  = True
-    isFloat (DoubleReg _) = True
-    isFloat _             = False
+    isPassed r = not (isSSE r) || isLive r
+    isSSE (FloatReg _)  = True
+    isSSE (DoubleReg _) = True
+    isSSE (XmmReg _)    = True
+    isSSE _             = False
     loadExpr r | isLive r = do
         let reg  = lmGlobalRegVar dflags r
         (v,s) <- doExpr (pLower $ getVarType reg) $ Load reg
diff --git a/compiler/llvmGen/LlvmCodeGen/Regs.hs b/compiler/llvmGen/LlvmCodeGen/Regs.hs
index e6cfcb2e18..7271c2f3d9 100644
--- a/compiler/llvmGen/LlvmCodeGen/Regs.hs
+++ b/compiler/llvmGen/LlvmCodeGen/Regs.hs
@@ -55,6 +55,12 @@ lmGlobalReg dflags suf reg
         DoubleReg 4    -> doubleGlobal $ "D4" ++ suf
         DoubleReg 5    -> doubleGlobal $ "D5" ++ suf
         DoubleReg 6    -> doubleGlobal $ "D6" ++ suf
+        XmmReg 1       -> xmmGlobal $ "XMM1" ++ suf
+        XmmReg 2       -> xmmGlobal $ "XMM2" ++ suf
+        XmmReg 3       -> xmmGlobal $ "XMM3" ++ suf
+        XmmReg 4       -> xmmGlobal $ "XMM4" ++ suf
+        XmmReg 5       -> xmmGlobal $ "XMM5" ++ suf
+        XmmReg 6       -> xmmGlobal $ "XMM6" ++ suf
         _other         -> panic $ "LlvmCodeGen.Reg: GlobalReg (" ++ (show reg)
                                 ++ ") not supported!"
         -- LongReg, HpLim, CCSS, CurrentTSO, CurrentNusery, HpAlloc
@@ -64,6 +70,7 @@ lmGlobalReg dflags suf reg
         ptrGlobal    name = LMNLocalVar (fsLit name) (llvmWordPtr dflags)
         floatGlobal  name = LMNLocalVar (fsLit name) LMFloat
         doubleGlobal name = LMNLocalVar (fsLit name) LMDouble
+        xmmGlobal    name = LMNLocalVar (fsLit name) (LMVector 4 (LMInt 32))
 
 -- | A list of STG Registers that should always be considered alive
 alwaysLive :: [GlobalReg]
diff --git a/compiler/nativeGen/X86/CodeGen.hs b/compiler/nativeGen/X86/CodeGen.hs
index 0df95a2f73..4177cadbf6 100644
--- a/compiler/nativeGen/X86/CodeGen.hs
+++ b/compiler/nativeGen/X86/CodeGen.hs
@@ -602,19 +602,21 @@ getRegister' dflags is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
       MO_FS_Conv from to -> coerceFP2Int from to x
       MO_SF_Conv from to -> coerceInt2FP from to x
 
-      MO_V_Insert {}  -> needLlvm
-      MO_V_Extract {} -> needLlvm
-      MO_V_Add {}     -> needLlvm
-      MO_V_Sub {}     -> needLlvm
-      MO_V_Mul {}     -> needLlvm
-      MO_VS_Quot {}   -> needLlvm
-      MO_VS_Rem {}    -> needLlvm
-      MO_VS_Neg {}    -> needLlvm
-      MO_VF_Add {}    -> needLlvm
-      MO_VF_Sub {}    -> needLlvm
-      MO_VF_Mul {}    -> needLlvm
-      MO_VF_Quot {}   -> needLlvm
-      MO_VF_Neg {}    -> needLlvm
+      MO_V_Insert {}   -> needLlvm
+      MO_V_Extract {}  -> needLlvm
+      MO_V_Add {}      -> needLlvm
+      MO_V_Sub {}      -> needLlvm
+      MO_V_Mul {}      -> needLlvm
+      MO_VS_Quot {}    -> needLlvm
+      MO_VS_Rem {}     -> needLlvm
+      MO_VS_Neg {}     -> needLlvm
+      MO_VF_Insert {}  -> needLlvm
+      MO_VF_Extract {} -> needLlvm
+      MO_VF_Add {}     -> needLlvm
+      MO_VF_Sub {}     -> needLlvm
+      MO_VF_Mul {}     -> needLlvm
+      MO_VF_Quot {}    -> needLlvm
+      MO_VF_Neg {}     -> needLlvm
 
       _other -> pprPanic "getRegister" (pprMachOp mop)
    where
@@ -708,19 +710,21 @@ getRegister' _ is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
       MO_U_Shr rep -> shift_code rep SHR x y {-False-}
       MO_S_Shr rep -> shift_code rep SAR x y {-False-}
 
-      MO_V_Insert {}  -> needLlvm
-      MO_V_Extract {} -> needLlvm
-      MO_V_Add {}     -> needLlvm
-      MO_V_Sub {}     -> needLlvm
-      MO_V_Mul {}     -> needLlvm
-      MO_VS_Quot {}   -> needLlvm
-      MO_VS_Rem {}    -> needLlvm
-      MO_VS_Neg {}    -> needLlvm
-      MO_VF_Add {}    -> needLlvm
-      MO_VF_Sub {}    -> needLlvm
-      MO_VF_Mul {}    -> needLlvm
-      MO_VF_Quot {}   -> needLlvm
-      MO_VF_Neg {}    -> needLlvm
+      MO_V_Insert {}   -> needLlvm
+      MO_V_Extract {}  -> needLlvm
+      MO_V_Add {}      -> needLlvm
+      MO_V_Sub {}      -> needLlvm
+      MO_V_Mul {}      -> needLlvm
+      MO_VS_Quot {}    -> needLlvm
+      MO_VS_Rem {}     -> needLlvm
+      MO_VS_Neg {}     -> needLlvm
+      MO_VF_Insert {}  -> needLlvm
+      MO_VF_Extract {} -> needLlvm
+      MO_VF_Add {}     -> needLlvm
+      MO_VF_Sub {}     -> needLlvm
+      MO_VF_Mul {}     -> needLlvm
+      MO_VF_Quot {}    -> needLlvm
+      MO_VF_Neg {}     -> needLlvm
 
       _other -> pprPanic "getRegister(x86) - binary CmmMachOp (1)" (pprMachOp mop)
   where
@@ -2722,21 +2726,23 @@ sse2NegCode w x = do
   return (Any sz code)
 
 isVecExpr :: CmmExpr -> Bool
-isVecExpr (CmmMachOp (MO_V_Insert {}) _)  = True
-isVecExpr (CmmMachOp (MO_V_Extract {}) _) = True
-isVecExpr (CmmMachOp (MO_V_Add {}) _)     = True
-isVecExpr (CmmMachOp (MO_V_Sub {}) _)     = True
-isVecExpr (CmmMachOp (MO_V_Mul {}) _)     = True
-isVecExpr (CmmMachOp (MO_VS_Quot {}) _)   = True
-isVecExpr (CmmMachOp (MO_VS_Rem {}) _)    = True
-isVecExpr (CmmMachOp (MO_VS_Neg {}) _)    = True
-isVecExpr (CmmMachOp (MO_VF_Add {}) _)    = True
-isVecExpr (CmmMachOp (MO_VF_Sub {}) _)    = True
-isVecExpr (CmmMachOp (MO_VF_Mul {}) _)    = True
-isVecExpr (CmmMachOp (MO_VF_Quot {}) _)   = True
-isVecExpr (CmmMachOp (MO_VF_Neg {}) _)    = True
-isVecExpr (CmmMachOp _ [e])               = isVecExpr e
-isVecExpr _                               = False
+isVecExpr (CmmMachOp (MO_V_Insert {}) _)   = True
+isVecExpr (CmmMachOp (MO_V_Extract {}) _)  = True
+isVecExpr (CmmMachOp (MO_V_Add {}) _)      = True
+isVecExpr (CmmMachOp (MO_V_Sub {}) _)      = True
+isVecExpr (CmmMachOp (MO_V_Mul {}) _)      = True
+isVecExpr (CmmMachOp (MO_VS_Quot {}) _)    = True
+isVecExpr (CmmMachOp (MO_VS_Rem {}) _)     = True
+isVecExpr (CmmMachOp (MO_VS_Neg {}) _)     = True
+isVecExpr (CmmMachOp (MO_VF_Insert {}) _)  = True
+isVecExpr (CmmMachOp (MO_VF_Extract {}) _) = True
+isVecExpr (CmmMachOp (MO_VF_Add {}) _)     = True
+isVecExpr (CmmMachOp (MO_VF_Sub {}) _)     = True
+isVecExpr (CmmMachOp (MO_VF_Mul {}) _)     = True
+isVecExpr (CmmMachOp (MO_VF_Quot {}) _)    = True
+isVecExpr (CmmMachOp (MO_VF_Neg {}) _)     = True
+isVecExpr (CmmMachOp _ [e])                = isVecExpr e
+isVecExpr _                                = False
 
 needLlvm :: NatM a
 needLlvm =
diff --git a/includes/CodeGen.Platform.hs b/includes/CodeGen.Platform.hs
index 14642bd1c5..beff19601d 100644
--- a/includes/CodeGen.Platform.hs
+++ b/includes/CodeGen.Platform.hs
@@ -388,36 +388,54 @@ activeStgRegs = [
 #ifdef REG_D1
     ,DoubleReg 1
 #endif
+#ifdef REG_XMM1
+    ,XmmReg 1
+#endif
 #ifdef REG_F2
     ,FloatReg 2
 #endif
 #ifdef REG_D2
     ,DoubleReg 2
 #endif
+#ifdef REG_XMM2
+    ,XmmReg 2
+#endif
 #ifdef REG_F3
     ,FloatReg 3
 #endif
 #ifdef REG_D3
     ,DoubleReg 3
 #endif
+#ifdef REG_XMM3
+    ,XmmReg 3
+#endif
 #ifdef REG_F4
     ,FloatReg 4
 #endif
 #ifdef REG_D4
     ,DoubleReg 4
 #endif
+#ifdef REG_XMM4
+    ,XmmReg 4
+#endif
 #ifdef REG_F5
     ,FloatReg 5
 #endif
 #ifdef REG_D5
     ,DoubleReg 5
 #endif
+#ifdef REG_XMM5
+    ,XmmReg 5
+#endif
 #ifdef REG_F6
     ,FloatReg 6
 #endif
 #ifdef REG_D6
     ,DoubleReg 6
 #endif
+#ifdef REG_XMM6
+    ,XmmReg 6
+#endif
 #else /* MAX_REAL_SSE_REG == 0 */
 #ifdef REG_F1
     ,FloatReg 1
@@ -569,6 +587,14 @@ globalRegMaybe (DoubleReg 6)            =
                                           Just (RealRegSingle REG_D6)
 #  endif
 # endif
+#if MAX_REAL_SSE_REG != 0
+globalRegMaybe (XmmReg 1)               = Just (RealRegSingle REG_XMM1)
+globalRegMaybe (XmmReg 2)               = Just (RealRegSingle REG_XMM2)
+globalRegMaybe (XmmReg 3)               = Just (RealRegSingle REG_XMM3)
+globalRegMaybe (XmmReg 4)               = Just (RealRegSingle REG_XMM4)
+globalRegMaybe (XmmReg 5)               = Just (RealRegSingle REG_XMM5)
+globalRegMaybe (XmmReg 6)               = Just (RealRegSingle REG_XMM6)
+# endif
 # ifdef REG_Sp
 globalRegMaybe Sp                       = Just (RealRegSingle REG_Sp)
 # endif
diff --git a/includes/stg/MachRegs.h b/includes/stg/MachRegs.h
index 6dc81f63c1..76bdb1fc21 100644
--- a/includes/stg/MachRegs.h
+++ b/includes/stg/MachRegs.h
@@ -167,12 +167,12 @@
 #define REG_D5    xmm5
 #define REG_D6    xmm6
 
-#define REG_SSE1    xmm1
-#define REG_SSE2    xmm2
-#define REG_SSE3    xmm3
-#define REG_SSE4    xmm4
-#define REG_SSE5    xmm5
-#define REG_SSE6    xmm6
+#define REG_XMM1    xmm1
+#define REG_XMM2    xmm2
+#define REG_XMM3    xmm3
+#define REG_XMM4    xmm4
+#define REG_XMM5    xmm5
+#define REG_XMM6    xmm6
 
 #if !defined(mingw32_HOST_OS)
 #define CALLER_SAVES_R3
@@ -199,13 +199,13 @@
 #define CALLER_SAVES_D6
 #endif
 
-#define CALLER_SAVES_SSE1
-#define CALLER_SAVES_SSE2
-#define CALLER_SAVES_SSE3
-#define CALLER_SAVES_SSE4
-#define CALLER_SAVES_SSE5
+#define CALLER_SAVES_XMM1
+#define CALLER_SAVES_XMM2
+#define CALLER_SAVES_XMM3
+#define CALLER_SAVES_XMM4
+#define CALLER_SAVES_XMM5
 #if !defined(mingw32_HOST_OS)
-#define CALLER_SAVES_SSE6
+#define CALLER_SAVES_XMM6
 #endif
 
 #define MAX_REAL_VANILLA_REG 6
diff --git a/includes/stg/Regs.h b/includes/stg/Regs.h
index fd1577e71a..10ae2851ac 100644
--- a/includes/stg/Regs.h
+++ b/includes/stg/Regs.h
@@ -81,6 +81,12 @@ typedef struct {
   StgDouble 	  rD4;
   StgDouble 	  rD5;
   StgDouble 	  rD6;
+  StgWord128 	  rXMM1;
+  StgWord128 	  rXMM2;
+  StgWord128 	  rXMM3;
+  StgWord128 	  rXMM4;
+  StgWord128 	  rXMM5;
+  StgWord128 	  rXMM6;
   StgWord64       rL1;
   StgPtr 	  rSp;
   StgPtr 	  rSpLim;
@@ -270,6 +276,42 @@ GLOBAL_REG_DECL(StgDouble,D6,REG_D6)
 #define D6 (BaseReg->rD6)
 #endif
 
+#if defined(REG_XMM1) && !defined(NO_GLOBAL_REG_DECLS)
+GLOBAL_REG_DECL(StgWord128,XMM1,REG_XMM1)
+#else
+#define XMM1 (BaseReg->rXMM1)
+#endif
+
+#if defined(REG_XMM2) && !defined(NO_GLOBAL_REG_DECLS)
+GLOBAL_REG_DECL(StgWord128,XMM2,REG_XMM2)
+#else
+#define XMM2 (BaseReg->rXMM2)
+#endif
+
+#if defined(REG_XMM3) && !defined(NO_GLOBAL_REG_DECLS)
+GLOBAL_REG_DECL(StgWord128,XMM3,REG_XMM3)
+#else
+#define XMM3 (BaseReg->rXMM3)
+#endif
+
+#if defined(REG_XMM4) && !defined(NO_GLOBAL_REG_DECLS)
+GLOBAL_REG_DECL(StgWord128,XMM4,REG_XMM4)
+#else
+#define XMM4 (BaseReg->rXMM4)
+#endif
+
+#if defined(REG_XMM5) && !defined(NO_GLOBAL_REG_DECLS)
+GLOBAL_REG_DECL(StgWord128,XMM5,REG_XMM5)
+#else
+#define XMM5 (BaseReg->rXMM5)
+#endif
+
+#if defined(REG_XMM6) && !defined(NO_GLOBAL_REG_DECLS)
+GLOBAL_REG_DECL(StgWord128,XMM6,REG_XMM6)
+#else
+#define XMM6 (BaseReg->rXMM6)
+#endif
+
 #if defined(REG_L1) && !defined(NO_GLOBAL_REG_DECLS)
 GLOBAL_REG_DECL(StgWord64,L1,REG_L1)
 #else
diff --git a/includes/stg/Types.h b/includes/stg/Types.h
index d6bdc9042b..ccc06a175b 100644
--- a/includes/stg/Types.h
+++ b/includes/stg/Types.h
@@ -83,6 +83,8 @@ typedef unsigned long long int StgWord64;
 #error cannot find a way to define StgInt64
 #endif
 
+typedef struct { StgWord64 h; StgWord64 l; } StgWord128;
+
 /*
  * Define the standard word size we'll use on this machine: make it
  * big enough to hold a pointer.
diff --git a/utils/deriveConstants/DeriveConstants.hs b/utils/deriveConstants/DeriveConstants.hs
index 66c1f0e456..e726bf7e0e 100644
--- a/utils/deriveConstants/DeriveConstants.hs
+++ b/utils/deriveConstants/DeriveConstants.hs
@@ -307,6 +307,12 @@ wanteds = concat
           ,fieldOffset Both "StgRegTable" "rD4"
           ,fieldOffset Both "StgRegTable" "rD5"
           ,fieldOffset Both "StgRegTable" "rD6"
+          ,fieldOffset Both "StgRegTable" "rXMM1"
+          ,fieldOffset Both "StgRegTable" "rXMM2"
+          ,fieldOffset Both "StgRegTable" "rXMM3"
+          ,fieldOffset Both "StgRegTable" "rXMM4"
+          ,fieldOffset Both "StgRegTable" "rXMM5"
+          ,fieldOffset Both "StgRegTable" "rXMM6"
           ,fieldOffset Both "StgRegTable" "rL1"
           ,fieldOffset Both "StgRegTable" "rSp"
           ,fieldOffset Both "StgRegTable" "rSpLim"