10 files changed, 191 insertions, 109 deletions
diff --git a/compiler/cmm/CmmUtils.hs b/compiler/cmm/CmmUtils.hs
index c822da9673..435df58596 100644
--- a/compiler/cmm/CmmUtils.hs
+++ b/compiler/cmm/CmmUtils.hs
@@ -62,7 +62,7 @@ module CmmUtils(
 
 #include "HsVersions.h"
 
-import TyCon    ( PrimRep(..) )
+import TyCon    ( PrimRep(..), PrimElemRep(..) )
 import Type     ( UnaryType, typePrimRep )
 
 import SMRep
@@ -87,15 +87,28 @@ import Hoopl
 ---------------------------------------------------
 
 primRepCmmType :: DynFlags -> PrimRep -> CmmType
-primRepCmmType _      VoidRep    = panic "primRepCmmType:VoidRep"
-primRepCmmType dflags PtrRep     = gcWord dflags
-primRepCmmType dflags IntRep     = bWord dflags
-primRepCmmType dflags WordRep    = bWord dflags
-primRepCmmType _      Int64Rep   = b64
-primRepCmmType _      Word64Rep  = b64
-primRepCmmType dflags AddrRep    = bWord dflags
-primRepCmmType _      FloatRep   = f32
-primRepCmmType _      DoubleRep  = f64
+primRepCmmType _      VoidRep          = panic "primRepCmmType:VoidRep"
+primRepCmmType dflags PtrRep           = gcWord dflags
+primRepCmmType dflags IntRep           = bWord dflags
+primRepCmmType dflags WordRep          = bWord dflags
+primRepCmmType _      Int64Rep         = b64
+primRepCmmType _      Word64Rep        = b64
+primRepCmmType dflags AddrRep          = bWord dflags
+primRepCmmType _      FloatRep         = f32
+primRepCmmType _      DoubleRep        = f64
+primRepCmmType _      (VecRep len rep) = vec len (primElemRepCmmType rep)
+
+primElemRepCmmType :: PrimElemRep -> CmmType
+primElemRepCmmType Int8ElemRep   = b8
+primElemRepCmmType Int16ElemRep  = b16
+primElemRepCmmType Int32ElemRep  = b32
+primElemRepCmmType Int64ElemRep  = b64
+primElemRepCmmType Word8ElemRep  = b8
+primElemRepCmmType Word16ElemRep = b16
+primElemRepCmmType Word32ElemRep = b32
+primElemRepCmmType Word64ElemRep = b64
+primElemRepCmmType FloatElemRep  = f32
+primElemRepCmmType DoubleElemRep = f64
 
 typeCmmType :: DynFlags -> UnaryType -> CmmType
 typeCmmType dflags ty = primRepCmmType dflags (typePrimRep ty)
@@ -110,6 +123,7 @@ primRepForeignHint Word64Rep    = NoHint
 primRepForeignHint AddrRep      = AddrHint -- NB! AddrHint, but NonPtrArg
 primRepForeignHint FloatRep     = NoHint
 primRepForeignHint DoubleRep    = NoHint
+primRepForeignHint (VecRep {})  = NoHint
 
 typeForeignHint :: UnaryType -> ForeignHint
 typeForeignHint = primRepForeignHint . typePrimRep
diff --git a/compiler/codeGen/StgCmmLayout.hs b/compiler/codeGen/StgCmmLayout.hs
index 8544709bd8..a3bbefeb44 100644
--- a/compiler/codeGen/StgCmmLayout.hs
+++ b/compiler/codeGen/StgCmmLayout.hs
@@ -46,7 +46,7 @@ import CLabel
 import StgSyn
 import Id
 import Name
-import TyCon		( PrimRep(..) )
+import TyCon		( PrimRep(..), primElemRepSizeB )
 import BasicTypes	( RepArity )
 import DynFlags
 import Module
@@ -317,6 +317,7 @@ slowCallPattern (N: _)		      = (fsLit "stg_ap_n", 1)
 slowCallPattern (F: _)		      = (fsLit "stg_ap_f", 1)
 slowCallPattern (D: _)		      = (fsLit "stg_ap_d", 1)
 slowCallPattern (L: _)		      = (fsLit "stg_ap_l", 1)
+slowCallPattern (V16: _)	      = (fsLit "stg_ap_v16", 1)
 slowCallPattern []		      = (fsLit "stg_ap_0", 0)
 
 
@@ -333,36 +334,42 @@ data ArgRep = P   -- GC Ptr
             | V   -- Void
             | F   -- Float
             | D   -- Double
+            | V16 -- 16-byte (128-bit) vectors of Float/Double/Int8/Word32/etc.
 instance Outputable ArgRep where
-  ppr P = text "P"
-  ppr N = text "N"
-  ppr L = text "L"
-  ppr V = text "V"
-  ppr F = text "F"
-  ppr D = text "D"
+  ppr P   = text "P"
+  ppr N   = text "N"
+  ppr L   = text "L"
+  ppr V   = text "V"
+  ppr F   = text "F"
+  ppr D   = text "D"
+  ppr V16 = text "V16"
 
 toArgRep :: PrimRep -> ArgRep
-toArgRep VoidRep   = V
-toArgRep PtrRep    = P
-toArgRep IntRep    = N
-toArgRep WordRep   = N
-toArgRep AddrRep   = N
-toArgRep Int64Rep  = L
-toArgRep Word64Rep = L
-toArgRep FloatRep  = F
-toArgRep DoubleRep = D
+toArgRep VoidRep           = V
+toArgRep PtrRep            = P
+toArgRep IntRep            = N
+toArgRep WordRep           = N
+toArgRep AddrRep           = N
+toArgRep Int64Rep          = L
+toArgRep Word64Rep         = L
+toArgRep FloatRep          = F
+toArgRep DoubleRep         = D
+toArgRep (VecRep len elem)
+    | len*primElemRepSizeB elem == 16 = V16
+    | otherwise                       = error "toArgRep: bad vector primrep"
 
 isNonV :: ArgRep -> Bool
 isNonV V = False
 isNonV _ = True
 
 argRepSizeW :: DynFlags -> ArgRep -> WordOff                -- Size in words
-argRepSizeW _      N = 1
-argRepSizeW _      P = 1
-argRepSizeW _      F = 1
-argRepSizeW dflags L = wORD64_SIZE        `quot` wORD_SIZE dflags
-argRepSizeW dflags D = dOUBLE_SIZE dflags `quot` wORD_SIZE dflags
-argRepSizeW _      V = 0
+argRepSizeW _      N   = 1
+argRepSizeW _      P   = 1
+argRepSizeW _      F   = 1
+argRepSizeW dflags L   = wORD64_SIZE        `quot` wORD_SIZE dflags
+argRepSizeW dflags D   = dOUBLE_SIZE dflags `quot` wORD_SIZE dflags
+argRepSizeW _      V   = 0
+argRepSizeW dflags V16 = 16                 `quot` wORD_SIZE dflags
 
 idArgRep :: Id -> ArgRep
 idArgRep = toArgRep . idPrimRep
@@ -456,12 +463,13 @@ argBits dflags (arg : args) = take (argRepSizeW dflags arg) (repeat True)
 stdPattern :: [ArgRep] -> Maybe Int
 stdPattern reps
   = case reps of
-	[]  -> Just ARG_NONE	-- just void args, probably
-	[N] -> Just ARG_N
-	[P] -> Just ARG_P
-	[F] -> Just ARG_F
-	[D] -> Just ARG_D
-	[L] -> Just ARG_L
+	[]    -> Just ARG_NONE	-- just void args, probably
+	[N]   -> Just ARG_N
+	[P]   -> Just ARG_P
+	[F]   -> Just ARG_F
+	[D]   -> Just ARG_D
+	[L]   -> Just ARG_L
+	[V16] -> Just ARG_V16
 
 	[N,N] -> Just ARG_NN
 	[N,P] -> Just ARG_NP
diff --git a/compiler/ghci/ByteCodeAsm.lhs b/compiler/ghci/ByteCodeAsm.lhs
index 9631add3a9..b63778c801 100644
--- a/compiler/ghci/ByteCodeAsm.lhs
+++ b/compiler/ghci/ByteCodeAsm.lhs
@@ -437,20 +437,22 @@ isLarge :: Word -> Bool
 isLarge n = n > 65535
 
 push_alts :: ArgRep -> Word16
-push_alts V = bci_PUSH_ALTS_V
-push_alts P = bci_PUSH_ALTS_P
-push_alts N = bci_PUSH_ALTS_N
-push_alts L = bci_PUSH_ALTS_L
-push_alts F = bci_PUSH_ALTS_F
-push_alts D = bci_PUSH_ALTS_D
+push_alts V   = bci_PUSH_ALTS_V
+push_alts P   = bci_PUSH_ALTS_P
+push_alts N   = bci_PUSH_ALTS_N
+push_alts L   = bci_PUSH_ALTS_L
+push_alts F   = bci_PUSH_ALTS_F
+push_alts D   = bci_PUSH_ALTS_D
+push_alts V16 = error "push_alts: vector"
 
 return_ubx :: ArgRep -> Word16
-return_ubx V = bci_RETURN_V
-return_ubx P = bci_RETURN_P
-return_ubx N = bci_RETURN_N
-return_ubx L = bci_RETURN_L
-return_ubx F = bci_RETURN_F
-return_ubx D = bci_RETURN_D
+return_ubx V   = bci_RETURN_V
+return_ubx P   = bci_RETURN_P
+return_ubx N   = bci_RETURN_N
+return_ubx L   = bci_RETURN_L
+return_ubx F   = bci_RETURN_F
+return_ubx D   = bci_RETURN_D
+return_ubx V16 = error "return_ubx: vector"
 
 -- Make lists of host-sized words for literals, so that when the
 -- words are placed in memory at increasing addresses, the
diff --git a/compiler/llvmGen/LlvmCodeGen/CodeGen.hs b/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
index 28933831f4..b7e085116d 100644
--- a/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
+++ b/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
@@ -1260,6 +1260,17 @@ genLit opt env (CmmInt i w)
 genLit _ env (CmmFloat r w)
   = return (env, LMLitVar $ LMFloatLit (fromRational r) (widthToLlvmFloat w),
               nilOL, [])
+ 
+genLit opt env (CmmVec ls)
+  = do llvmLits <- mapM toLlvmLit ls
+       return (env, LMLitVar $ LMVectorLit llvmLits, nilOL, [])
+  where
+    toLlvmLit :: CmmLit -> UniqSM LlvmLit
+    toLlvmLit lit = do
+        (_, llvmLitVar, _, _) <- genLit opt env lit
+        case llvmLitVar of
+          LMLitVar llvmLit -> return llvmLit
+          _ -> panic "genLit"
 
 genLit _ env cmm@(CmmLabel l)
   = let dflags = getDflags env
diff --git a/compiler/types/TyCon.lhs b/compiler/types/TyCon.lhs
index 1ad8a297ad..2ad8db01b0 100644
--- a/compiler/types/TyCon.lhs
+++ b/compiler/types/TyCon.lhs
@@ -79,9 +79,9 @@ module TyCon(
         pprPromotionQuote,
 
         -- * Primitive representations of Types
-        PrimRep(..),
+        PrimRep(..), PrimElemRep(..),
         tyConPrimRep,
-        primRepSizeW
+        primRepSizeW, primElemRepSizeB
 ) where
 
 #include "HsVersions.h"
@@ -784,22 +784,52 @@ data PrimRep
   | AddrRep             -- ^ A pointer, but /not/ to a Haskell value (use 'PtrRep')
   | FloatRep
   | DoubleRep
+  | VecRep Int PrimElemRep  -- ^ A vector
   deriving( Eq, Show )
 
+data PrimElemRep
+  = Int8ElemRep
+  | Int16ElemRep
+  | Int32ElemRep
+  | Int64ElemRep
+  | Word8ElemRep
+  | Word16ElemRep
+  | Word32ElemRep
+  | Word64ElemRep
+  | FloatElemRep
+  | DoubleElemRep
+   deriving( Eq, Show )
+
 instance Outputable PrimRep where
   ppr r = text (show r)
 
+instance Outputable PrimElemRep where
+  ppr r = text (show r)
+
 -- | Find the size of a 'PrimRep', in words
 primRepSizeW :: DynFlags -> PrimRep -> Int
-primRepSizeW _      IntRep   = 1
-primRepSizeW _      WordRep  = 1
-primRepSizeW dflags Int64Rep = wORD64_SIZE `quot` wORD_SIZE dflags
-primRepSizeW dflags Word64Rep= wORD64_SIZE `quot` wORD_SIZE dflags
-primRepSizeW _      FloatRep = 1    -- NB. might not take a full word
-primRepSizeW dflags DoubleRep= dOUBLE_SIZE dflags `quot` wORD_SIZE dflags
-primRepSizeW _      AddrRep  = 1
-primRepSizeW _      PtrRep   = 1
-primRepSizeW _      VoidRep  = 0
+primRepSizeW _      IntRep           = 1
+primRepSizeW _      WordRep          = 1
+primRepSizeW dflags Int64Rep         = wORD64_SIZE `quot` wORD_SIZE dflags
+primRepSizeW dflags Word64Rep        = wORD64_SIZE `quot` wORD_SIZE dflags
+primRepSizeW _      FloatRep         = 1    -- NB. might not take a full word
+primRepSizeW dflags DoubleRep        = dOUBLE_SIZE dflags `quot` wORD_SIZE dflags
+primRepSizeW _      AddrRep          = 1
+primRepSizeW _      PtrRep           = 1
+primRepSizeW _      VoidRep          = 0
+primRepSizeW dflags (VecRep len rep) = len * primElemRepSizeB rep `quot` wORD_SIZE dflags
+
+primElemRepSizeB :: PrimElemRep -> Int
+primElemRepSizeB Int8ElemRep   = 1
+primElemRepSizeB Int16ElemRep  = 2
+primElemRepSizeB Int32ElemRep  = 4
+primElemRepSizeB Int64ElemRep  = 8
+primElemRepSizeB Word8ElemRep  = 1
+primElemRepSizeB Word16ElemRep = 2
+primElemRepSizeB Word32ElemRep = 4
+primElemRepSizeB Word64ElemRep = 8
+primElemRepSizeB FloatElemRep  = 4
+primElemRepSizeB DoubleElemRep = 8
 \end{code}
 
 %************************************************************************
diff --git a/includes/Cmm.h b/includes/Cmm.h
index 41e7b894d2..1505b1cb6a 100644
--- a/includes/Cmm.h
+++ b/includes/Cmm.h
@@ -95,9 +95,10 @@
 #error Unknown long size
 #endif
 
-#define F_ float32
-#define D_ float64
-#define L_ bits64
+#define F_   float32
+#define D_   float64
+#define L_   bits64
+#define V16_ bits128
 
 #define SIZEOF_StgDouble 8
 #define SIZEOF_StgWord64 8
diff --git a/includes/rts/storage/FunTypes.h b/includes/rts/storage/FunTypes.h
index b44366789b..0ba65bb79d 100644
--- a/includes/rts/storage/FunTypes.h
+++ b/includes/rts/storage/FunTypes.h
@@ -33,22 +33,23 @@
 #define ARG_F        6 
 #define ARG_D        7 
 #define ARG_L        8 
-#define ARG_NN       9 
-#define ARG_NP       10
-#define ARG_PN       11
-#define ARG_PP       12
-#define ARG_NNN      13
-#define ARG_NNP      14
-#define ARG_NPN      15
-#define ARG_NPP      16
-#define ARG_PNN      17
-#define ARG_PNP      18
-#define ARG_PPN      19
-#define ARG_PPP      20
-#define ARG_PPPP     21
-#define ARG_PPPPP    22
-#define ARG_PPPPPP   23
-#define ARG_PPPPPPP  24
-#define ARG_PPPPPPPP 25
+#define ARG_V16      9 
+#define ARG_NN       10 
+#define ARG_NP       11
+#define ARG_PN       12
+#define ARG_PP       13
+#define ARG_NNN      14
+#define ARG_NNP      15
+#define ARG_NPN      16
+#define ARG_NPP      17
+#define ARG_PNN      18
+#define ARG_PNP      19
+#define ARG_PPN      20
+#define ARG_PPP      21
+#define ARG_PPPP     22
+#define ARG_PPPPP    23
+#define ARG_PPPPPP   24
+#define ARG_PPPPPPP  25
+#define ARG_PPPPPPPP 26
 
 #endif /* RTS_STORAGE_FUNTYPES_H */
diff --git a/includes/stg/MiscClosures.h b/includes/stg/MiscClosures.h
index 0eccfbf0e5..eec98c2357 100644
--- a/includes/stg/MiscClosures.h
+++ b/includes/stg/MiscClosures.h
@@ -223,6 +223,7 @@ RTS_RET(stg_ap_v);
 RTS_RET(stg_ap_f);
 RTS_RET(stg_ap_d);
 RTS_RET(stg_ap_l);
+RTS_RET(stg_ap_v16);
 RTS_RET(stg_ap_n);
 RTS_RET(stg_ap_p);
 RTS_RET(stg_ap_pv);
@@ -239,6 +240,7 @@ RTS_FUN_DECL(stg_ap_v_fast);
 RTS_FUN_DECL(stg_ap_f_fast);
 RTS_FUN_DECL(stg_ap_d_fast);
 RTS_FUN_DECL(stg_ap_l_fast);
+RTS_FUN_DECL(stg_ap_v16_fast);
 RTS_FUN_DECL(stg_ap_n_fast);
 RTS_FUN_DECL(stg_ap_p_fast);
 RTS_FUN_DECL(stg_ap_pv_fast);
diff --git a/rts/Linker.c b/rts/Linker.c
index 39b7897c94..4a539f5562 100644
--- a/rts/Linker.c
+++ b/rts/Linker.c
@@ -881,6 +881,7 @@ typedef struct _RtsSymbolVal {
       SymI_HasProto(stg_ap_f_ret)                       \
       SymI_HasProto(stg_ap_d_ret)                       \
       SymI_HasProto(stg_ap_l_ret)                       \
+      SymI_HasProto(stg_ap_v16_ret)                     \
       SymI_HasProto(stg_ap_n_ret)                       \
       SymI_HasProto(stg_ap_p_ret)                       \
       SymI_HasProto(stg_ap_pv_ret)                      \
@@ -1232,6 +1233,7 @@ typedef struct _RtsSymbolVal {
       SymI_HasProto(stg_ap_f_info)                                      \
       SymI_HasProto(stg_ap_d_info)                                      \
       SymI_HasProto(stg_ap_l_info)                                      \
+      SymI_HasProto(stg_ap_v16_info)                                    \
       SymI_HasProto(stg_ap_n_info)                                      \
       SymI_HasProto(stg_ap_p_info)                                      \
       SymI_HasProto(stg_ap_pv_info)                                     \
@@ -1247,6 +1249,7 @@ typedef struct _RtsSymbolVal {
       SymI_HasProto(stg_ap_f_fast)                                      \
       SymI_HasProto(stg_ap_d_fast)                                      \
       SymI_HasProto(stg_ap_l_fast)                                      \
+      SymI_HasProto(stg_ap_v16_fast)                                    \
       SymI_HasProto(stg_ap_n_fast)                                      \
       SymI_HasProto(stg_ap_p_fast)                                      \
       SymI_HasProto(stg_ap_pv_fast)                                     \
diff --git a/utils/genapply/GenApply.hs b/utils/genapply/GenApply.hs
index 33146b2a2c..2baf85896a 100644
--- a/utils/genapply/GenApply.hs
+++ b/utils/genapply/GenApply.hs
@@ -26,29 +26,32 @@ import System.IO
 -- Argument kinds (rougly equivalent to PrimRep)
 
 data ArgRep 
-  = N           -- non-ptr
-  | P           -- ptr
-  | V           -- void
-  | F           -- float
-  | D           -- double
-  | L           -- long (64-bit)
+  = N   -- non-ptr
+  | P   -- ptr
+  | V   -- void
+  | F   -- float
+  | D   -- double
+  | L   -- long (64-bit)
+  | V16 -- 16-byte (128-bit) vectors
 
 -- size of a value in *words*
 argSize :: ArgRep -> Int
-argSize N = 1
-argSize P = 1
-argSize V = 0
-argSize F = 1
-argSize D = (SIZEOF_DOUBLE `quot` SIZEOF_VOID_P :: Int)
-argSize L = (8 `quot` SIZEOF_VOID_P :: Int)
-
-showArg :: ArgRep -> Char
-showArg N = 'n'
-showArg P = 'p'
-showArg V = 'v'
-showArg F = 'f'
-showArg D = 'd'
-showArg L = 'l'
+argSize N   = 1
+argSize P   = 1
+argSize V   = 0
+argSize F   = 1
+argSize D   = (SIZEOF_DOUBLE `quot` SIZEOF_VOID_P :: Int)
+argSize L   = (8 `quot` SIZEOF_VOID_P :: Int)
+argSize V16 = (16 `quot` SIZEOF_VOID_P :: Int)
+
+showArg :: ArgRep -> String
+showArg N   = "n"
+showArg P   = "p"
+showArg V   = "v"
+showArg F   = "f"
+showArg D   = "d"
+showArg L   = "l"
+showArg V16 = "v16"
 
 -- is a value a pointer?
 isPtr :: ArgRep -> Bool
@@ -174,7 +177,7 @@ mkBitmap args = foldr f 0 args
 -- when we start passing args to stg_ap_* in regs).
 
 mkApplyName args
-  = text "stg_ap_" <> text (map showArg args)
+  = text "stg_ap_" <> text (concatMap showArg args)
 
 mkApplyRetName args
   = mkApplyName args <> text "_ret"
@@ -496,11 +499,12 @@ formalParam arg n =
     text "arg" <> int n <> text ", "
 formalParamType arg = argRep arg
 
-argRep F = text "F_"
-argRep D = text "D_"
-argRep L = text "L_"
-argRep P = text "gcptr"
-argRep _ = text "W_"
+argRep F   = text "F_"
+argRep D   = text "D_"
+argRep L   = text "L_"
+argRep P   = text "gcptr"
+argRep V16 = text "V16_"
+argRep _   = text "W_"
 
 genApply regstatus args =
    let
@@ -758,7 +762,7 @@ genApplyFast regstatus args =
 -- void arguments.
 
 mkStackApplyEntryLabel:: [ArgRep] -> Doc
-mkStackApplyEntryLabel args = text "stg_ap_stk_" <> text (map showArg args)
+mkStackApplyEntryLabel args = text "stg_ap_stk_" <> text (concatMap showArg args)
 
 genStackApply :: RegStatus -> [ArgRep] -> Doc
 genStackApply regstatus args = 
@@ -783,7 +787,7 @@ genStackApply regstatus args =
 -- in HeapStackCheck.hc for more details.
 
 mkStackSaveEntryLabel :: [ArgRep] -> Doc
-mkStackSaveEntryLabel args = text "stg_stk_save_" <> text (map showArg args)
+mkStackSaveEntryLabel args = text "stg_stk_save_" <> text (concatMap showArg args)
 
 genStackSave :: RegStatus -> [ArgRep] -> Doc
 genStackSave regstatus args =
@@ -849,6 +853,7 @@ applyTypes = [
         [F],
         [D],
         [L],
+        [V16],
         [N],
         [P],
         [P,V],
@@ -865,6 +870,10 @@ applyTypes = [
 -- ToDo: the stack apply and stack save code doesn't make a distinction
 -- between N and P (they both live in the same register), only the bitmap
 -- changes, so we could share the apply/save code between lots of cases.
+--
+--  NOTE: other places to change if you change stackApplyTypes:
+--       - includes/rts/storage/FunTypes.h
+--       - compiler/codeGen/CgCallConv.lhs: stdPattern
 stackApplyTypes = [
         [],
         [N],
@@ -872,6 +881,7 @@ stackApplyTypes = [
         [F],
         [D],
         [L],
+        [V16],
         [N,N],
         [N,P],
         [P,N],