Diffstat (limited to 'compiler/nativeGen/X86')
-rw-r--r--  compiler/nativeGen/X86/CodeGen.hs | 31
-rw-r--r--  compiler/nativeGen/X86/Instr.hs   | 20
-rw-r--r--  compiler/nativeGen/X86/Ppr.hs     |  9
3 files changed, 57 insertions(+), 3 deletions(-)
diff --git a/compiler/nativeGen/X86/CodeGen.hs b/compiler/nativeGen/X86/CodeGen.hs
index 73cfb28d46..fe6d4d0499 100644
--- a/compiler/nativeGen/X86/CodeGen.hs
+++ b/compiler/nativeGen/X86/CodeGen.hs
@@ -284,11 +284,13 @@ data ChildCode64
 data Register
        = Fixed Format Reg InstrBlock
        | Any   Format (Reg -> InstrBlock)
+       | AnyV  (Reg -> InstrBlock)

 swizzleRegisterRep :: Register -> Format -> Register
 swizzleRegisterRep (Fixed _ reg code) format = Fixed format reg code
 swizzleRegisterRep (Any _ codefn) format = Any format codefn
+swizzleRegisterRep r@(AnyV{}) _ = r
-- | Grab the Reg for a CmmReg
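
The new AnyV constructor deliberately drops the Format argument: a vector
result always lands in an SSE-class register, so there is no format to
consult when choosing one. A minimal sketch of how a consumer discharges the
three constructors (useRegister is an illustrative name; getSomeReg below
does exactly this for real):

    -- Sketch, not part of the patch: choose a destination register for
    -- each Register shape and materialise its instruction block.
    useRegister :: Register -> NatM (Reg, InstrBlock)
    useRegister (Fixed _ reg code) = return (reg, code)
    useRegister (Any fmt codefn) = do
        tmp <- getNewRegNat fmt          -- format-directed scalar virtual
        return (tmp, codefn tmp)
    useRegister (AnyV codefn) = do
        tmp <- getVectorReg              -- SSE-class virtual, no format needed
        return (tmp, codefn tmp)
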
@@ -368,6 +370,9 @@ getSomeReg expr = do
     Any rep code -> do
         tmp <- getNewRegNat rep
         return (tmp, code tmp)
+    AnyV code -> do
+        tmp <- getVectorReg
+        return (tmp, code tmp)
     Fixed _ reg code ->
         return (reg, code)
@@ -484,6 +489,16 @@ iselExpr64 (CmmMachOp (MO_UU_Conv _ W64) [expr]) = do
r_dst_lo
)
+iselExpr64 (CmmMachOp (MO_V_Add len width) [expr1, expr2]) = do
+     r_dst <- getVectorReg
+     ChildCode64 code1 r1 <- iselExpr64 expr1
+     ChildCode64 code2 r2 <- iselExpr64 expr2
+     let fmt = VecFormat len width FmtInt
+     return $ ChildCode64 (code1 `appOL` code2 `appOL`
+        toOL [ V_MOV fmt (OpReg r1) (OpReg r_dst)   -- r_dst := r1 (src-then-dst, as in reg2reg)
+             , V_ADD fmt (OpReg r2) (OpReg r_dst)   -- r_dst := r_dst + r2
+             ]) r_dst
+
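
For a concrete picture of what the new MO_V_Add clause emits, here is the
instruction pair it builds for a hypothetical 2 x W64 integer add
(emittedVecAdd and the register names are illustrative, not part of the
patch):

    -- Sketch: the OrdList appended after the operands' own code.
    emittedVecAdd :: Reg -> Reg -> Reg -> OrdList Instr
    emittedVecAdd r1 r2 r_dst = toOL
        [ V_MOV fmt (OpReg r1) (OpReg r_dst)  -- r_dst := r1         (movdqa)
        , V_ADD fmt (OpReg r2) (OpReg r_dst)  -- r_dst := r_dst + r2 (paddq)
        ]
      where fmt = VecFormat 2 W64 FmtInt
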
iselExpr64 (CmmMachOp (MO_SS_Conv W32 W64) [expr]) = do
fn <- getAnyReg expr
r_dst_lo <- getNewRegNat II32
@@ -501,6 +516,8 @@ iselExpr64 (CmmMachOp (MO_SS_Conv W32 W64) [expr]) = do
iselExpr64 expr
= pprPanic "iselExpr64(i386)" (ppr expr)
+getVectorReg :: NatM Reg
+getVectorReg = RegVirtual . VirtualRegSSE <$> getUniqueM
--------------------------------------------------------------------------------
getRegister :: CmmExpr -> NatM Register
@@ -822,7 +839,7 @@ getRegister' _ is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
MO_V_Insert {} -> needLlvm
MO_V_Extract {} -> needLlvm
- MO_V_Add {} -> needLlvm
+ MO_V_Add {} -> triv_op W512 V_ADD
MO_V_Sub {} -> needLlvm
MO_V_Mul {} -> needLlvm
MO_VS_Quot {} -> needLlvm
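
Note that triv_op hands its instruction constructor a scalar Format (it goes
through trivialCode and intFormat), while V_ADD's first argument is a
VecFormat, so this line presumably relies on further changes to triv_op or
trivialCode that this hunk does not show. For reference, the helper has
roughly this shape in the unmodified file (reconstructed, an assumption):

    -- triv_op width instr = trivialCode width op (Just op) x y
    --   where op = instr (intFormat width)
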
@@ -985,6 +1002,12 @@ getRegister' _ is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
getRegister' _ _ (CmmLoad mem pk)
+  | isVecType pk
+  = do
+      code <- vecLoadCode (V_MOV format) mem
+      return (AnyV code)
+
+getRegister' _ _ (CmmLoad mem pk)
| isFloatType pk
= do
Amode addr mem_code <- getAmode mem
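
Two names in this new clause, format and vecLoadCode, are not bound in the
lines shown. Presumably format is derived from the vector type pk and
vecLoadCode builds the load in the shape AnyV expects; a sketch under those
assumptions (both helpers are illustrative, not confirmed parts of the
patch):

    -- Assumed: derive a VecFormat from a Cmm vector type, using the
    -- CmmType accessors vecLength, vecElemType and typeWidth.
    vecFormatOf :: CmmType -> VecFormat
    vecFormatOf pk
        | isFloatType elemTy = VecFormat len (typeWidth elemTy) FmtFloat
        | otherwise          = VecFormat len (typeWidth elemTy) FmtInt
      where
        len    = vecLength pk
        elemTy = vecElemType pk

    -- Assumed: build the load code; the destination register is supplied
    -- later by whoever discharges the AnyV.
    vecLoadCode :: (Operand -> Operand -> Instr) -> CmmExpr -> NatM (Reg -> InstrBlock)
    vecLoadCode mkLoad mem = do
        Amode addr mem_code <- getAmode mem
        return (\dst -> mem_code `snocOL` mkLoad (OpAddr addr) (OpReg dst))
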
@@ -1096,6 +1119,9 @@ getByteReg expr = do
-- ToDo: could optimise slightly by checking for
-- byte-addressable real registers, but that will
-- happen very rarely if at all.
+        AnyV code -> do
+            tmp <- getVectorReg
+            return (tmp, code tmp)
else getSomeReg expr -- all regs are byte-addressable on x86_64
-- Another variant: this time we want the result in a register that cannot
@@ -1116,6 +1142,9 @@ getNonClobberedReg expr = do
             return (tmp, code `snocOL` reg2reg rep reg tmp)
         | otherwise ->
             return (reg, code)
+    AnyV code -> do
+        tmp <- getVectorReg
+        return (tmp, code tmp)
reg2reg :: Format -> Reg -> Reg -> Instr
reg2reg format src dst = MOV format (OpReg src) (OpReg dst)
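
reg2reg itself remains scalar-only; a vector analogue, if one becomes
necessary, would presumably route through V_MOV (illustrative sketch,
mirroring reg2reg's src-then-dst operand order):

    vecReg2Reg :: VecFormat -> Reg -> Reg -> Instr
    vecReg2Reg fmt src dst = V_MOV fmt (OpReg src) (OpReg dst)
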
diff --git a/compiler/nativeGen/X86/Instr.hs b/compiler/nativeGen/X86/Instr.hs
index 6e5d656beb..e59594365b 100644
--- a/compiler/nativeGen/X86/Instr.hs
+++ b/compiler/nativeGen/X86/Instr.hs
@@ -8,8 +8,9 @@
--
-----------------------------------------------------------------------------
-module X86.Instr (Instr(..), Operand(..), PrefetchVariant(..), JumpDest(..),
- getJumpDestBlockId, canShortcut, shortcutStatics,
+module X86.Instr (Instr(..), Operand(..), PrefetchVariant(..),
+ VecFormat(..), ScalarFormat(..),
+ JumpDest(..), getJumpDestBlockId, canShortcut, shortcutStatics,
shortcutJump, allocMoreStack,
maxSpillSlots, archWordFormat )
where
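
Exporting VecFormat(..) and ScalarFormat(..) lets downstream modules such as
the pretty-printer pattern-match on the new formats directly, e.g.
(illustrative import):

    import X86.Instr (Instr(..), Operand(..), VecFormat(..), ScalarFormat(..))
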
@@ -71,6 +72,19 @@ instance Instruction Instr where
mkStackAllocInstr = x86_mkStackAllocInstr
mkStackDeallocInstr = x86_mkStackDeallocInstr
+data VecFormat = VecFormat { vecLength :: !Length
+                           , vecWidth  :: !Width
+                           , vecScalar :: !ScalarFormat
+                           }
+
+instance Outputable VecFormat where
+  ppr (VecFormat l w f) = ppr (l, w, f)
+
+data ScalarFormat = FmtFloat | FmtInt
+
+instance Outputable ScalarFormat where
+  ppr FmtFloat = text "float"
+  ppr FmtInt   = text "int"
-- -----------------------------------------------------------------------------
-- Intel x86 instructions
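
Example values of the new format type, matching the two shapes the Ppr
changes below know how to print (the names are illustrative):

    packedSingles, packedQuadInts :: VecFormat
    packedSingles  = VecFormat { vecLength = 4, vecWidth = W32, vecScalar = FmtFloat }
    packedQuadInts = VecFormat { vecLength = 2, vecWidth = W64, vecScalar = FmtInt }
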
@@ -193,6 +207,7 @@ data Instr
-- Moves.
| MOV Format Operand Operand
+ | V_MOV VecFormat Operand Operand
| CMOV Cond Format Operand Reg
| MOVZxL Format Operand Operand -- format is the size of operand 1
| MOVSxL Format Operand Operand -- format is the size of operand 1
@@ -205,6 +220,7 @@ data Instr
-- Int Arithmetic.
| ADD Format Operand Operand
+ | V_ADD VecFormat Operand Operand
| ADC Format Operand Operand
| SUB Format Operand Operand
| SBB Format Operand Operand
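
New Instr constructors normally also need cases in the register-allocator
hooks (regUsageOfInstr, patchRegsOfInstr); those hunks are not shown here.
Following the scalar MOV and ADD cases, they would presumably look like this
(sketch, an assumption):

    -- regUsageOfInstr fragments: V_MOV reads src and writes dst;
    -- V_ADD reads src and both reads and modifies dst.
    --   V_MOV _ src dst -> usageRW src dst
    --   V_ADD _ src dst -> usageRM src dst
    --
    -- patchRegsOfInstr fragments, via the local patch2 helper:
    --   V_MOV fmt src dst -> patch2 (V_MOV fmt) src dst
    --   V_ADD fmt src dst -> patch2 (V_ADD fmt) src dst
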
diff --git a/compiler/nativeGen/X86/Ppr.hs b/compiler/nativeGen/X86/Ppr.hs
index 095d9eba7c..7a799cfabd 100644
--- a/compiler/nativeGen/X86/Ppr.hs
+++ b/compiler/nativeGen/X86/Ppr.hs
@@ -833,6 +833,15 @@ pprInstr (XADD format src dst) = pprFormatOpOp (sLit "xadd") format src dst
pprInstr (CMPXCHG format src dst)
= pprFormatOpOp (sLit "cmpxchg") format src dst
+-- SIMD
+pprInstr (V_ADD format src dst)
+  | VecFormat _ W32 FmtFloat <- format = pprVecFormatOpOp (sLit "addps") format src dst
+  | VecFormat _ W64 FmtFloat <- format = pprVecFormatOpOp (sLit "addpd") format src dst
+  | VecFormat _ W64 FmtInt   <- format = pprVecFormatOpOp (sLit "paddq") format src dst
+pprInstr (V_MOV format src dst)
+  | VecFormat _ W32 FmtFloat <- format = pprVecFormatOpOp (sLit "movaps") format src dst
+  | VecFormat _ W64 FmtFloat <- format = pprVecFormatOpOp (sLit "movapd") format src dst
+  | VecFormat _ _   FmtInt   <- format = pprVecFormatOpOp (sLit "movdqa") format src dst
--------------------------
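
The integer cases above cover only what the patch exercises; a total version
of the V_ADD mnemonic choice would also handle 32-bit lanes via paddd
(sketch; vaddMnemonic is an illustrative name, and addps/addpd/paddd/paddq
are the SSE/SSE2 packed adds):

    vaddMnemonic :: VecFormat -> LitString
    vaddMnemonic (VecFormat _ W32 FmtFloat) = sLit "addps"  -- packed single float
    vaddMnemonic (VecFormat _ W64 FmtFloat) = sLit "addpd"  -- packed double float
    vaddMnemonic (VecFormat _ W32 FmtInt)   = sLit "paddd"  -- packed 32-bit int
    vaddMnemonic (VecFormat _ W64 FmtInt)   = sLit "paddq"  -- packed 64-bit int
    vaddMnemonic _ = panic "vaddMnemonic: unsupported vector format"
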