7 files changed, 52 insertions, 122 deletions
diff --git a/compiler/cmm/CmmCallConv.hs b/compiler/cmm/CmmCallConv.hs
index 6df910edfa..4e6a9d293a 100644
--- a/compiler/cmm/CmmCallConv.hs
+++ b/compiler/cmm/CmmCallConv.hs
@@ -64,20 +64,13 @@ assignArgumentsPos dflags off conv arg_ty reps = (stk_off, assignments)
       assign_regs assts (r:rs) regs | isVecType ty   = vec
                                     | isFloatType ty = float
                                     | otherwise      = int
-        where vec = case regs of
-                      (vs, fs, ds, ls, s:ss)
-                        | passVectorInReg w dflags
-                          -> let elt_ty = vecElemType ty
-                                 reg_ty = if isFloatType elt_ty
-                                          then Float else Integer
-                                 reg_class = case w of
-                                               W128 -> XmmReg
-                                               W256 -> YmmReg
-                                               W512 -> ZmmReg
-                                               _    -> panic "CmmCallConv.assignArgumentsPos: Invalid vector width"
-                              in k (RegisterParam
-                                     (reg_class s (vecLength ty) (typeWidth elt_ty) reg_ty),
-                                     (vs, fs, ds, ls, ss))
+        where vec = case (w, regs) of
+                      (W128, (vs, fs, ds, ls, s:ss))
+                          | passVectorInReg W128 dflags -> k (RegisterParam (XmmReg s), (vs, fs, ds, ls, ss))
+                      (W256, (vs, fs, ds, ls, s:ss))
+                          | passVectorInReg W256 dflags -> k (RegisterParam (YmmReg s), (vs, fs, ds, ls, ss))
+                      (W512, (vs, fs, ds, ls, s:ss))
+                          | passVectorInReg W512 dflags -> k (RegisterParam (ZmmReg s), (vs, fs, ds, ls, ss))
                       _ -> (assts, (r:rs))
               float = case (w, regs) of
                         (W32, (vs, fs, ds, ls, s:ss))
@@ -96,7 +89,6 @@ assignArgumentsPos dflags off conv arg_ty reps = (stk_off, assignments)
                       (_, (vs, fs, ds, l:ls, ss)) | widthInBits w > widthInBits (wordWidth dflags)
                           -> k (RegisterParam l, (vs, fs, ds, ls, ss))
                       _   -> (assts, (r:rs))
-
               k (asst, regs') = assign_regs ((r, asst) : assts) rs regs'
               ty = arg_ty r
               w  = typeWidth ty
@@ -210,13 +202,11 @@ nodeOnly = ([VanillaReg 1], [], [], [], [])
 -- only use this functionality in hand-written C-- code in the RTS.
 realArgRegsCover :: DynFlags -> [GlobalReg]
 realArgRegsCover dflags
-    | passFloatArgsInXmm dflags
-      = map ($VGcPtr) (realVanillaRegs dflags) ++
-        realLongRegs dflags ++
-        map (\x -> XmmReg x 2 W64 Integer) (realXmmRegNos dflags)
-    | otherwise
-      = map ($VGcPtr) (realVanillaRegs dflags) ++
-        realFloatRegs dflags ++
-        realDoubleRegs dflags ++
-        realLongRegs dflags ++
-        map (\x -> XmmReg x 2 W64 Integer) (realXmmRegNos dflags)
+    | passFloatArgsInXmm dflags = map ($VGcPtr) (realVanillaRegs dflags) ++
+                                  realLongRegs dflags ++
+                                  map XmmReg (realXmmRegNos dflags)
+    | otherwise                 = map ($VGcPtr) (realVanillaRegs dflags) ++
+                                  realFloatRegs dflags ++
+                                  realDoubleRegs dflags ++
+                                  realLongRegs dflags ++
+                                  map XmmReg (realXmmRegNos dflags)
diff --git a/compiler/cmm/CmmExpr.hs b/compiler/cmm/CmmExpr.hs
index 79eaf8f89c..901df5d908 100644
--- a/compiler/cmm/CmmExpr.hs
+++ b/compiler/cmm/CmmExpr.hs
@@ -14,7 +14,6 @@ module CmmExpr
     , currentTSOReg, currentNurseryReg, hpAllocReg, cccsReg
     , node, baseReg
     , VGcPtr(..)
-    , GlobalVecRegTy(..)
 
     , DefinerOfRegs, UserOfRegs
     , foldRegsDefd, foldRegsUsed
@@ -42,7 +41,6 @@ import Outputable (panic)
 import Unique
 
 import Data.Set (Set)
-import Data.Monoid ((<>))
 import qualified Data.Set as Set
 
 import BasicTypes (Alignment, mkAlignment, alignmentOf)
@@ -394,7 +392,6 @@ data VGcPtr = VGcPtr | VNonGcPtr deriving( Eq, Show )
 -----------------------------------------------------------------------------
 {-
 Note [Overlapping global registers]
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 The backend might not faithfully implement the abstraction of the STG
 machine with independent registers for different values of type
@@ -416,26 +413,6 @@ on a particular platform. The instance Eq GlobalReg is syntactic
 equality of STG registers and does not take overlap into
 account. However it is still used in UserOfRegs/DefinerOfRegs and
 there are likely still bugs there, beware!
-
-
-Note [SIMD registers]
-~~~~~~~~~~~~~~~~~~~~~
-
-GHC's treatment of SIMD registers is heavily modelled after the x86_64
-architecture. Namely we have 128- (XMM), 256- (YMM), and 512-bit (ZMM)
-registers. Furthermore, we treat each possible format in these registers as a
-distinct register which overlaps with the others. For instance, we XMM1 as a
-2xI64 register is distinct from but overlaps with (in the sense defined in Note
-[Overlapping global registers]) its use as a 4xI32 register.
-
-This model makes it easier to fit SIMD registers into the NCG, which generally
-expects that each global register has a single, known CmmType.
-
-In the future we could consider further refactoring this to eliminate the
-XMM, YMM, and ZMM register names (which are quite x86-specific) and instead just
-having a set of NxM-bit vector registers (e.g. Vec2x64A, Vec2x64B, ...,
-Vec4x32A, ..., Vec4x64A).
-
 -}
 
 data GlobalReg
@@ -455,15 +432,12 @@ data GlobalReg
 
   | XmmReg                      -- 128-bit SIMD vector register
         {-# UNPACK #-} !Int     -- its number
-        !Length !Width !GlobalVecRegTy
 
   | YmmReg                      -- 256-bit SIMD vector register
         {-# UNPACK #-} !Int     -- its number
-        !Length !Width !GlobalVecRegTy
 
   | ZmmReg                      -- 512-bit SIMD vector register
         {-# UNPACK #-} !Int     -- its number
-        !Length !Width !GlobalVecRegTy
 
   -- STG registers
   | Sp                  -- Stack ptr; points to last occupied stack location.
@@ -504,17 +478,17 @@ data GlobalReg
 
   deriving( Show )
 
-data GlobalVecRegTy = Integer | Float
-  deriving (Show, Eq, Ord)
-
 instance Eq GlobalReg where
    VanillaReg i _ == VanillaReg j _ = i==j -- Ignore type when seeking clashes
    FloatReg i == FloatReg j = i==j
    DoubleReg i == DoubleReg j = i==j
    LongReg i == LongReg j = i==j
-   XmmReg i l w grt == XmmReg j l' w' grt' = i==j && l == l' && w == w' && grt == grt'
-   YmmReg i l w grt == YmmReg j l' w' grt' = i==j && l == l' && w == w' && grt == grt'
-   ZmmReg i l w grt == ZmmReg j l' w' grt' = i==j && l == l' && w == w' && grt == grt'
+   -- NOTE: XMM, YMM, and ZMM registers are actually the same registers,
+   -- at least with respect to a store to YMM i followed by a read from
+   -- XMM i, and similarly for ZMM etc.
+   XmmReg i == XmmReg j = i==j
+   YmmReg i == YmmReg j = i==j
+   ZmmReg i == ZmmReg j = i==j
    Sp == Sp = True
    SpLim == SpLim = True
    Hp == Hp = True
@@ -538,21 +512,9 @@ instance Ord GlobalReg where
    compare (FloatReg i)  (FloatReg  j) = compare i j
    compare (DoubleReg i) (DoubleReg j) = compare i j
    compare (LongReg i)   (LongReg   j) = compare i j
-   compare (XmmReg i l w grt)
-           (XmmReg j l' w' grt')       = compare i j
-                                         <> compare l l'
-                                         <> compare w w'
-                                         <> compare grt grt'
-   compare (YmmReg i l w grt)
-           (YmmReg j l' w' grt')       = compare i j
-                                         <> compare l l'
-                                         <> compare w w'
-                                         <> compare grt grt'
-   compare (ZmmReg i l w grt)
-           (ZmmReg j l' w' grt')       = compare i j
-                                         <> compare l l'
-                                         <> compare w w'
-                                         <> compare grt grt'
+   compare (XmmReg i)    (XmmReg    j) = compare i j
+   compare (YmmReg i)    (YmmReg    j) = compare i j
+   compare (ZmmReg i)    (ZmmReg    j) = compare i j
    compare Sp Sp = EQ
    compare SpLim SpLim = EQ
    compare Hp Hp = EQ
@@ -576,12 +538,12 @@ instance Ord GlobalReg where
    compare _ (DoubleReg _)    = GT
    compare (LongReg _) _      = LT
    compare _ (LongReg _)      = GT
-   compare (XmmReg _ _ _ _) _ = LT
-   compare _ (XmmReg _ _ _ _) = GT
-   compare (YmmReg _ _ _ _) _ = LT
-   compare _ (YmmReg _ _ _ _) = GT
-   compare (ZmmReg _ _ _ _) _ = LT
-   compare _ (ZmmReg _ _ _ _) = GT
+   compare (XmmReg _) _       = LT
+   compare _ (XmmReg _)       = GT
+   compare (YmmReg _) _       = LT
+   compare _ (YmmReg _)       = GT
+   compare (ZmmReg _) _       = LT
+   compare _ (ZmmReg _)       = GT
    compare Sp _ = LT
    compare _ Sp = GT
    compare SpLim _ = LT
@@ -634,15 +596,12 @@ globalRegType dflags (VanillaReg _ VNonGcPtr) = bWord dflags
 globalRegType _      (FloatReg _)      = cmmFloat W32
 globalRegType _      (DoubleReg _)     = cmmFloat W64
 globalRegType _      (LongReg _)       = cmmBits W64
-globalRegType _      (XmmReg _ l w ty) = case ty of
-                                           Integer -> cmmVec l (cmmBits w)
-                                           Float   -> cmmVec l (cmmFloat w)
-globalRegType _      (YmmReg _ l w ty) = case ty of
-                                           Integer -> cmmVec l (cmmBits w)
-                                           Float   -> cmmVec l (cmmFloat w)
-globalRegType _      (ZmmReg _ l w ty) = case ty of
-                                           Integer -> cmmVec l (cmmBits w)
-                                           Float   -> cmmVec l (cmmFloat w)
+-- TODO: improve the internal model of SIMD/vectorized registers;
+-- the right design should improve handling of float and double code too.
+-- See remarks in "NOTE [SIMD Design for the future]" in StgCmmPrim.
+globalRegType _      (XmmReg _)        = cmmVec 4 (cmmBits W32)
+globalRegType _      (YmmReg _)        = cmmVec 8 (cmmBits W32)
+globalRegType _      (ZmmReg _)        = cmmVec 16 (cmmBits W32)
 
 globalRegType dflags Hp                = gcWord dflags
                                             -- The initialiser for all
diff --git a/compiler/cmm/CmmLint.hs b/compiler/cmm/CmmLint.hs
index 53dcd70b7b..d5c3f84443 100644
--- a/compiler/cmm/CmmLint.hs
+++ b/compiler/cmm/CmmLint.hs
@@ -148,13 +148,9 @@ lintCmmMiddle node = case node of
             dflags <- getDynFlags
             erep <- lintCmmExpr expr
             let reg_ty = cmmRegType dflags reg
-            case isVecCatType reg_ty of
-              True -> if ((typeWidth reg_ty) == (typeWidth erep))
-                         then return ()
-                         else cmmLintAssignErr (CmmAssign reg expr) erep reg_ty
-              _    -> if (erep `cmmEqType_ignoring_ptrhood` reg_ty)
-                         then return ()
-                          else cmmLintAssignErr (CmmAssign reg expr) erep reg_ty
+            if (erep `cmmEqType_ignoring_ptrhood` reg_ty)
+                then return ()
+                else cmmLintAssignErr (CmmAssign reg expr) erep reg_ty
 
   CmmStore l r -> do
             _ <- lintCmmExpr l
diff --git a/compiler/cmm/CmmMachOp.hs b/compiler/cmm/CmmMachOp.hs
index 38d9edb480..9740d21bef 100644
--- a/compiler/cmm/CmmMachOp.hs
+++ b/compiler/cmm/CmmMachOp.hs
@@ -136,9 +136,8 @@ data MachOp
   | MO_VU_Rem  Length Width
 
   -- Floting point vector element insertion and extraction operations
-  | MO_VF_Broadcast Length Width   -- Broadcast a scalar into a vector
-  | MO_VF_Insert    Length Width   -- Insert scalar into vector
-  | MO_VF_Extract   Length Width   -- Extract scalar from vector
+  | MO_VF_Insert  Length Width   -- Insert scalar into vector
+  | MO_VF_Extract Length Width   -- Extract scalar from vector
 
   -- Floating point vector operations
   | MO_VF_Add  Length Width
@@ -431,7 +430,6 @@ machOpResultType dflags mop tys =
     MO_VU_Quot l w      -> cmmVec l (cmmBits w)
     MO_VU_Rem  l w      -> cmmVec l (cmmBits w)
 
-    MO_VF_Broadcast l w -> cmmVec l (cmmFloat w)
     MO_VF_Insert  l w   -> cmmVec l (cmmFloat w)
     MO_VF_Extract _ w   -> cmmFloat w
 
@@ -524,21 +522,16 @@ machOpArgReps dflags op =
     MO_VU_Quot _ r      -> [r,r]
     MO_VU_Rem  _ r      -> [r,r]
 
-    -- offset is always W32 as mentioned in StgCmmPrim.hs
-    MO_VF_Broadcast l r -> [vecwidth l r, r]
-    MO_VF_Insert    l r -> [vecwidth l r, r, W32]
-    MO_VF_Extract   l r -> [vecwidth l r, W32]
+    MO_VF_Insert  l r   -> [typeWidth (vec l (cmmFloat r)),r,wordWidth dflags]
+    MO_VF_Extract l r   -> [typeWidth (vec l (cmmFloat r)),wordWidth dflags]
 
-    -- NOTE: The below is owing to the fact that floats use the SSE registers
-    MO_VF_Add  l w      -> [vecwidth l w, vecwidth l w]
-    MO_VF_Sub  l w      -> [vecwidth l w, vecwidth l w]
-    MO_VF_Mul  l w      -> [vecwidth l w, vecwidth l w]
-    MO_VF_Quot l w      -> [vecwidth l w, vecwidth l w]
-    MO_VF_Neg  l w      -> [vecwidth l w]
+    MO_VF_Add  _ r      -> [r,r]
+    MO_VF_Sub  _ r      -> [r,r]
+    MO_VF_Mul  _ r      -> [r,r]
+    MO_VF_Quot _ r      -> [r,r]
+    MO_VF_Neg  _ r      -> [r]
 
     MO_AlignmentCheck _ r -> [r]
-    where
-      vecwidth l w = widthFromBytes (l*widthInBytes w)
 
 -----------------------------------------------------------------------------
 -- CallishMachOp
diff --git a/compiler/cmm/CmmType.hs b/compiler/cmm/CmmType.hs
index 17b588720f..43d23c7ee7 100644
--- a/compiler/cmm/CmmType.hs
+++ b/compiler/cmm/CmmType.hs
@@ -6,7 +6,6 @@ module CmmType
     , typeWidth, cmmEqType, cmmEqType_ignoring_ptrhood
     , isFloatType, isGcPtrType, isBitsType
     , isWord32, isWord64, isFloat64, isFloat32
-    , isVecCatType
 
     , Width(..)
     , widthInBits, widthInBytes, widthInLog, widthFromBytes
@@ -134,7 +133,7 @@ cInt :: DynFlags -> CmmType
 cInt dflags = cmmBits (cIntWidth  dflags)
 
 ------------ Predicates ----------------
-isFloatType, isGcPtrType, isBitsType, isVecCatType :: CmmType -> Bool
+isFloatType, isGcPtrType, isBitsType :: CmmType -> Bool
 isFloatType (CmmType FloatCat    _) = True
 isFloatType _other                  = False
 
@@ -144,9 +143,6 @@ isGcPtrType _other               = False
 isBitsType (CmmType BitsCat _) = True
 isBitsType _                   = False
 
-isVecCatType (CmmType (VecCat _ _) _) = True
-isVecCatType _other                   = False
-
 isWord32, isWord64, isFloat32, isFloat64 :: CmmType -> Bool
 -- isWord64 is true of 64-bit non-floats (both gc-ptrs and otherwise)
 -- isFloat32 and 64 are obvious
diff --git a/compiler/cmm/PprC.hs b/compiler/cmm/PprC.hs
index a60a26229b..7227edd57e 100644
--- a/compiler/cmm/PprC.hs
+++ b/compiler/cmm/PprC.hs
@@ -713,10 +713,6 @@ pprMachOp_for_C mop = case mop of
                                 (panic $ "PprC.pprMachOp_for_C: MO_VU_Rem"
                                       ++ " should have been handled earlier!")
 
-        MO_VF_Broadcast {} -> pprTrace "offending mop:"
-                                 (text "MO_VF_Broadcast")
-                                 (panic $ "PprC.pprMachOp_for_C: MO_VF_Broadcast"
-                                      ++ " should have been handled earlier!")
         MO_VF_Insert {}   -> pprTrace "offending mop:"
                                 (text "MO_VF_Insert")
                                 (panic $ "PprC.pprMachOp_for_C: MO_VF_Insert"
diff --git a/compiler/cmm/PprCmmExpr.hs b/compiler/cmm/PprCmmExpr.hs
index 2080c1f5d8..7bf73f1ca6 100644
--- a/compiler/cmm/PprCmmExpr.hs
+++ b/compiler/cmm/PprCmmExpr.hs
@@ -261,9 +261,9 @@ pprGlobalReg gr
         FloatReg   n   -> char 'F' <> int n
         DoubleReg  n   -> char 'D' <> int n
         LongReg    n   -> char 'L' <> int n
-        XmmReg     n _ _ _ -> text "XMM" <> int n
-        YmmReg     n _ _ _ -> text "YMM" <> int n
-        ZmmReg     n _ _ _ -> text "ZMM" <> int n
+        XmmReg     n   -> text "XMM" <> int n
+        YmmReg     n   -> text "YMM" <> int n
+        ZmmReg     n   -> text "ZMM" <> int n
         Sp             -> text "Sp"
         SpLim          -> text "SpLim"
         Hp             -> text "Hp"