27 files changed, 343 insertions, 985 deletions
diff --git a/compiler/cmm/CmmCallConv.hs b/compiler/cmm/CmmCallConv.hs
index f338415adc..078390638d 100644
--- a/compiler/cmm/CmmCallConv.hs
+++ b/compiler/cmm/CmmCallConv.hs
@@ -81,7 +81,6 @@ assignArgumentsPos dflags off conv arg_ty reps = (stk_off, assignments)
                             | passFloatInXmm          -> k (RegisterParam (DoubleReg s), (vs, fs, ds, ls, ss))
                         (W64, (vs, fs, d:ds, ls, ss))
                             | not passFloatInXmm      -> k (RegisterParam d, (vs, fs, ds, ls, ss))
-                        (W80, _) -> panic "F80 unsupported register type"
                         _ -> (assts, (r:rs))
               int = case (w, regs) of
                       (W128, _) -> panic "W128 unsupported register type"
@@ -100,6 +99,7 @@ assignArgumentsPos dflags off conv arg_ty reps = (stk_off, assignments)
 passFloatArgsInXmm :: DynFlags -> Bool
 passFloatArgsInXmm dflags = case platformArch (targetPlatform dflags) of
                               ArchX86_64 -> True
+                              ArchX86    -> False
                               _          -> False
 
 -- We used to spill vector registers to the stack since the LLVM backend didn't
diff --git a/compiler/cmm/CmmExpr.hs b/compiler/cmm/CmmExpr.hs
index 601b1d9b85..dd4e777436 100644
--- a/compiler/cmm/CmmExpr.hs
+++ b/compiler/cmm/CmmExpr.hs
@@ -474,6 +474,9 @@ instance Eq GlobalReg where
    FloatReg i == FloatReg j = i==j
    DoubleReg i == DoubleReg j = i==j
    LongReg i == LongReg j = i==j
+   -- NOTE: XMM, YMM, ZMM registers actually are the same registers
+   -- at least with respect to store at YMM i and then read from XMM i
+   -- and similarly for ZMM etc.
    XmmReg i == XmmReg j = i==j
    YmmReg i == YmmReg j = i==j
    ZmmReg i == ZmmReg j = i==j
@@ -584,6 +587,9 @@ globalRegType dflags (VanillaReg _ VNonGcPtr) = bWord dflags
 globalRegType _      (FloatReg _)      = cmmFloat W32
 globalRegType _      (DoubleReg _)     = cmmFloat W64
 globalRegType _      (LongReg _)       = cmmBits W64
+-- TODO: improve the internal model of SIMD/vectorized registers
+-- the right design SHOULd improve handling of float and double code too.
+-- see remarks in "NOTE [SIMD Design for the future]"" in StgCmmPrim
 globalRegType _      (XmmReg _)        = cmmVec 4 (cmmBits W32)
 globalRegType _      (YmmReg _)        = cmmVec 8 (cmmBits W32)
 globalRegType _      (ZmmReg _)        = cmmVec 16 (cmmBits W32)
diff --git a/compiler/cmm/CmmType.hs b/compiler/cmm/CmmType.hs
index 0d6e770904..276fbff534 100644
--- a/compiler/cmm/CmmType.hs
+++ b/compiler/cmm/CmmType.hs
@@ -166,9 +166,6 @@ isFloat64 _other                 = False
 -----------------------------------------------------------------------------
 
 data Width   = W8 | W16 | W32 | W64
-             | W80      -- Extended double-precision float,
-                        -- used in x86 native codegen only.
-                        -- (we use Ord, so it'd better be in this order)
              | W128
              | W256
              | W512
@@ -185,7 +182,7 @@ mrStr W64  = sLit("W64")
 mrStr W128 = sLit("W128")
 mrStr W256 = sLit("W256")
 mrStr W512 = sLit("W512")
-mrStr W80  = sLit("W80")
+
 
 
 -------- Common Widths  ------------
@@ -222,7 +219,7 @@ widthInBits W64  = 64
 widthInBits W128 = 128
 widthInBits W256 = 256
 widthInBits W512 = 512
-widthInBits W80  = 80
+
 
 widthInBytes :: Width -> Int
 widthInBytes W8   = 1
@@ -232,7 +229,7 @@ widthInBytes W64  = 8
 widthInBytes W128 = 16
 widthInBytes W256 = 32
 widthInBytes W512 = 64
-widthInBytes W80  = 10
+
 
 widthFromBytes :: Int -> Width
 widthFromBytes 1  = W8
@@ -242,7 +239,7 @@ widthFromBytes 8  = W64
 widthFromBytes 16 = W128
 widthFromBytes 32 = W256
 widthFromBytes 64 = W512
-widthFromBytes 10 = W80
+
 widthFromBytes n  = pprPanic "no width for given number of bytes" (ppr n)
 
 -- log_2 of the width in bytes, useful for generating shifts.
@@ -254,7 +251,7 @@ widthInLog W64  = 3
 widthInLog W128 = 4
 widthInLog W256 = 5
 widthInLog W512 = 6
-widthInLog W80  = panic "widthInLog: F80"
+
 
 -- widening / narrowing
 
diff --git a/compiler/codeGen/StgCmmPrim.hs b/compiler/codeGen/StgCmmPrim.hs
index 63d8f7bc0e..0adb2272ee 100644
--- a/compiler/codeGen/StgCmmPrim.hs
+++ b/compiler/codeGen/StgCmmPrim.hs
@@ -1727,8 +1727,38 @@ vecElemProjectCast dflags WordVec  W32 =  Just (mo_u_32ToWord dflags)
 vecElemProjectCast _      WordVec  W64 =  Nothing
 vecElemProjectCast _      _        _   =  Nothing
 
+
+-- NOTE [SIMD Design for the future]
 -- Check to make sure that we can generate code for the specified vector type
 -- given the current set of dynamic flags.
+-- Currently these checks are specific to x86 and x86_64 architecture.
+-- This should be fixed!
+-- In particular,
+-- 1) Add better support for other architectures! (this may require a redesign)
+-- 2) Decouple design choices from LLVM's pseudo SIMD model!
+--   The high level LLVM naive rep makes per CPU family SIMD generation is own
+--   optimization problem, and hides important differences in eg ARM vs x86_64 simd
+-- 3) Depending on the architecture, the SIMD registers may also support general
+--    computations on Float/Double/Word/Int scalars, but currently on
+--    for example x86_64, we always put Word/Int (or sized) in GPR
+--    (general purpose) registers. Would relaxing that allow for
+--    useful optimization opportunities?
+--      Phrased differently, it is worth experimenting with supporting
+--    different register mapping strategies than we currently have, especially if
+--    someday we want SIMD to be a first class denizen in GHC along with scalar
+--    values!
+--      The current design with respect to register mapping of scalars could
+--    very well be the best,but exploring the  design space and doing careful
+--    measurments is the only only way to validate that.
+--      In some next generation CPU ISAs, notably RISC V, the SIMD extension
+--    includes  support for a sort of run time CPU dependent vectorization parameter,
+--    where a loop may act upon a single scalar each iteration OR some 2,4,8 ...
+--    element chunk! Time will tell if that direction sees wide adoption,
+--    but it is from that context that unifying our handling of simd and scalars
+--    may benefit. It is not likely to benefit current architectures, though
+--    it may very well be a design perspective that helps guide improving the NCG.
+
+
 checkVecCompatibility :: DynFlags -> PrimOpVecCat -> Length -> Width -> FCode ()
 checkVecCompatibility dflags vcat l w = do
     when (hscTarget dflags /= HscLlvm) $ do
diff --git a/compiler/llvmGen/LlvmCodeGen/Base.hs b/compiler/llvmGen/LlvmCodeGen/Base.hs
index e978d6feaf..5ebb7b3830 100644
--- a/compiler/llvmGen/LlvmCodeGen/Base.hs
+++ b/compiler/llvmGen/LlvmCodeGen/Base.hs
@@ -97,7 +97,6 @@ cmmToLlvmType ty | isVecType ty   = LMVector (vecLength ty) (cmmToLlvmType (vecE
 widthToLlvmFloat :: Width -> LlvmType
 widthToLlvmFloat W32  = LMFloat
 widthToLlvmFloat W64  = LMDouble
-widthToLlvmFloat W80  = LMFloat80
 widthToLlvmFloat W128 = LMFloat128
 widthToLlvmFloat w    = panic $ "widthToLlvmFloat: Bad float size: " ++ show w
 
diff --git a/compiler/main/DynFlags.hs b/compiler/main/DynFlags.hs
index 6f2dffff3c..ced0be5506 100644
--- a/compiler/main/DynFlags.hs
+++ b/compiler/main/DynFlags.hs
@@ -5833,20 +5833,24 @@ data SseVersion = SSE1
 isSseEnabled :: DynFlags -> Bool
 isSseEnabled dflags = case platformArch (targetPlatform dflags) of
     ArchX86_64 -> True
-    ArchX86    -> sseVersion dflags >= Just SSE1
+    ArchX86    -> True
     _          -> False
 
 isSse2Enabled :: DynFlags -> Bool
 isSse2Enabled dflags = case platformArch (targetPlatform dflags) of
-    ArchX86_64 -> -- SSE2 is fixed on for x86_64.  It would be
-                  -- possible to make it optional, but we'd need to
-                  -- fix at least the foreign call code where the
-                  -- calling convention specifies the use of xmm regs,
-                  -- and possibly other places.
-                  True
-    ArchX86    -> sseVersion dflags >= Just SSE2
+  -- We Assume  SSE1 and SSE2 operations are available on both
+  -- x86 and x86_64. Historically we didn't default to SSE2 and
+  -- SSE1 on x86, which results in defacto nondeterminism for how
+  -- rounding behaves in the associated x87 floating point instructions
+  -- because variations in the spill/fpu stack placement of arguments for
+  -- operations would change the precision and final result of what
+  -- would otherwise be the same expressions with respect to single or
+  -- double precision IEEE floating point computations.
+    ArchX86_64 -> True
+    ArchX86    -> True
     _          -> False
 
+
 isSse4_2Enabled :: DynFlags -> Bool
 isSse4_2Enabled dflags = sseVersion dflags >= Just SSE42
 
diff --git a/compiler/nativeGen/AsmCodeGen.hs b/compiler/nativeGen/AsmCodeGen.hs
index 84c6a84845..c78ea5fcb1 100644
--- a/compiler/nativeGen/AsmCodeGen.hs
+++ b/compiler/nativeGen/AsmCodeGen.hs
@@ -179,7 +179,7 @@ nativeCodeGen dflags this_mod modLoc h us cmms
 x86NcgImpl :: DynFlags -> NcgImpl (Alignment, CmmStatics)
                                   X86.Instr.Instr X86.Instr.JumpDest
 x86NcgImpl dflags
- = (x86_64NcgImpl dflags) { ncg_x86fp_kludge = map x86fp_kludge }
+ = (x86_64NcgImpl dflags)
 
 x86_64NcgImpl :: DynFlags -> NcgImpl (Alignment, CmmStatics)
                                   X86.Instr.Instr X86.Instr.JumpDest
@@ -194,7 +194,6 @@ x86_64NcgImpl dflags
        ,pprNatCmmDecl             = X86.Ppr.pprNatCmmDecl
        ,maxSpillSlots             = X86.Instr.maxSpillSlots dflags
        ,allocatableRegs           = X86.Regs.allocatableRegs platform
-       ,ncg_x86fp_kludge          = id
        ,ncgAllocMoreStack         = X86.Instr.allocMoreStack platform
        ,ncgExpandTop              = id
        ,ncgMakeFarBranches        = const id
@@ -215,7 +214,6 @@ ppcNcgImpl dflags
        ,pprNatCmmDecl             = PPC.Ppr.pprNatCmmDecl
        ,maxSpillSlots             = PPC.Instr.maxSpillSlots dflags
        ,allocatableRegs           = PPC.Regs.allocatableRegs platform
-       ,ncg_x86fp_kludge          = id
        ,ncgAllocMoreStack         = PPC.Instr.allocMoreStack platform
        ,ncgExpandTop              = id
        ,ncgMakeFarBranches        = PPC.Instr.makeFarBranches
@@ -236,7 +234,6 @@ sparcNcgImpl dflags
        ,pprNatCmmDecl             = SPARC.Ppr.pprNatCmmDecl
        ,maxSpillSlots             = SPARC.Instr.maxSpillSlots dflags
        ,allocatableRegs           = SPARC.Regs.allocatableRegs
-       ,ncg_x86fp_kludge          = id
        ,ncgAllocMoreStack         = noAllocMoreStack
        ,ncgExpandTop              = map SPARC.CodeGen.Expand.expandTop
        ,ncgMakeFarBranches        = const id
@@ -680,19 +677,10 @@ cmmNativeGen dflags this_mod modLoc ncgImpl us fileIds dbgMap cmm count
                 foldl' (\m (from,to) -> addImmediateSuccessor from to m )
                        cfgWithFixupBlks stack_updt_blks
 
-        ---- x86fp_kludge.  This pass inserts ffree instructions to clear
-        ---- the FPU stack on x86.  The x86 ABI requires that the FPU stack
-        ---- is clear, and library functions can return odd results if it
-        ---- isn't.
-        ----
-        ---- NB. must happen before shortcutBranches, because that
-        ---- generates JXX_GBLs which we can't fix up in x86fp_kludge.
-        let kludged = {-# SCC "x86fp_kludge" #-} ncg_x86fp_kludge ncgImpl alloced
-
         ---- generate jump tables
         let tabled      =
                 {-# SCC "generateJumpTables" #-}
-                generateJumpTables ncgImpl kludged
+                generateJumpTables ncgImpl alloced
 
         dumpIfSet_dyn dflags
                 Opt_D_dump_cfg_weights "CFG Update information"
@@ -787,12 +775,6 @@ checkLayout procsUnsequenced procsSequenced =
         getBlockIds (CmmProc _ _ _ (ListGraph blocks)) =
                 setFromList $ map blockId blocks
 
-
-x86fp_kludge :: NatCmmDecl (Alignment, CmmStatics) X86.Instr.Instr -> NatCmmDecl (Alignment, CmmStatics) X86.Instr.Instr
-x86fp_kludge top@(CmmData _ _) = top
-x86fp_kludge (CmmProc info lbl live (ListGraph code)) =
-        CmmProc info lbl live (ListGraph $ X86.Instr.i386_insert_ffrees code)
-
 -- | Compute unwinding tables for the blocks of a procedure
 computeUnwinding :: Instruction instr
                  => DynFlags -> NcgImpl statics instr jumpDest
diff --git a/compiler/nativeGen/Format.hs b/compiler/nativeGen/Format.hs
index 82ecbecc14..31472893e7 100644
--- a/compiler/nativeGen/Format.hs
+++ b/compiler/nativeGen/Format.hs
@@ -47,7 +47,6 @@ data Format
         | II64
         | FF32
         | FF64
-        | FF80
         deriving (Show, Eq)
 
 
@@ -70,7 +69,7 @@ floatFormat width
  = case width of
         W32     -> FF32
         W64     -> FF64
-        W80     -> FF80
+
         other   -> pprPanic "Format.floatFormat" (ppr other)
 
 
@@ -80,7 +79,6 @@ isFloatFormat format
  = case format of
         FF32    -> True
         FF64    -> True
-        FF80    -> True
         _       -> False
 
 
@@ -101,7 +99,7 @@ formatToWidth format
         II64            -> W64
         FF32            -> W32
         FF64            -> W64
-        FF80            -> W80
+
 
 formatInBytes :: Format -> Int
 formatInBytes = widthInBytes . formatToWidth
diff --git a/compiler/nativeGen/NCGMonad.hs b/compiler/nativeGen/NCGMonad.hs
index c22a656d2a..0f53ef6690 100644
--- a/compiler/nativeGen/NCGMonad.hs
+++ b/compiler/nativeGen/NCGMonad.hs
@@ -76,7 +76,6 @@ data NcgImpl statics instr jumpDest = NcgImpl {
     pprNatCmmDecl             :: NatCmmDecl statics instr -> SDoc,
     maxSpillSlots             :: Int,
     allocatableRegs           :: [RealReg],
-    ncg_x86fp_kludge          :: [NatCmmDecl statics instr] -> [NatCmmDecl statics instr],
     ncgExpandTop              :: [NatCmmDecl statics instr] -> [NatCmmDecl statics instr],
     ncgAllocMoreStack         :: Int -> NatCmmDecl statics instr
                               -> UniqSM (NatCmmDecl statics instr, [(BlockId,BlockId)]),
diff --git a/compiler/nativeGen/PPC/CodeGen.hs b/compiler/nativeGen/PPC/CodeGen.hs
index 86525f4736..a9bddfd5f7 100644
--- a/compiler/nativeGen/PPC/CodeGen.hs
+++ b/compiler/nativeGen/PPC/CodeGen.hs
@@ -1593,7 +1593,7 @@ genCCall'
     -> [CmmActual]        -- arguments (of mixed type)
     -> NatM InstrBlock
 
-{- 
+{-
     PowerPC Linux uses the System V Release 4 Calling Convention
     for PowerPC. It is described in the
     "System V Application Binary Interface PowerPC Processor Supplement".
@@ -1906,7 +1906,7 @@ genCCall' dflags gcp target dest_regs args
                           FF32 -> (1, 1, 4, fprs)
                           FF64 -> (2, 1, 8, fprs)
                           II64 -> panic "genCCall' passArguments II64"
-                          FF80 -> panic "genCCall' passArguments FF80"
+
                       GCP32ELF ->
                           case cmmTypeFormat rep of
                           II8  -> (1, 0, 4, gprs)
@@ -1916,7 +1916,6 @@ genCCall' dflags gcp target dest_regs args
                           FF32 -> (0, 1, 4, fprs)
                           FF64 -> (0, 1, 8, fprs)
                           II64 -> panic "genCCall' passArguments II64"
-                          FF80 -> panic "genCCall' passArguments FF80"
                       GCP64ELF _ ->
                           case cmmTypeFormat rep of
                           II8  -> (1, 0, 8, gprs)
@@ -1928,7 +1927,6 @@ genCCall' dflags gcp target dest_regs args
                           -- the FPRs.
                           FF32 -> (1, 1, 8, fprs)
                           FF64 -> (1, 1, 8, fprs)
-                          FF80 -> panic "genCCall' passArguments FF80"
 
         moveResult reduceToFF32 =
             case dest_regs of
diff --git a/compiler/nativeGen/PPC/Ppr.hs b/compiler/nativeGen/PPC/Ppr.hs
index 23525ffb5a..8e9ff95b9f 100644
--- a/compiler/nativeGen/PPC/Ppr.hs
+++ b/compiler/nativeGen/PPC/Ppr.hs
@@ -161,7 +161,7 @@ pprReg r
       RegVirtual (VirtualRegHi u)  -> text "%vHi_"  <> pprUniqueAlways u
       RegVirtual (VirtualRegF  u)  -> text "%vF_"   <> pprUniqueAlways u
       RegVirtual (VirtualRegD  u)  -> text "%vD_"   <> pprUniqueAlways u
-      RegVirtual (VirtualRegSSE u) -> text "%vSSE_" <> pprUniqueAlways u
+
   where
     ppr_reg_no :: Int -> SDoc
     ppr_reg_no i
@@ -179,8 +179,7 @@ pprFormat x
                 II32 -> sLit "w"
                 II64 -> sLit "d"
                 FF32 -> sLit "fs"
-                FF64 -> sLit "fd"
-                _    -> panic "PPC.Ppr.pprFormat: no match")
+                FF64 -> sLit "fd")
 
 
 pprCond :: Cond -> SDoc
@@ -365,7 +364,6 @@ pprInstr (LD fmt reg addr) = hcat [
             II64 -> sLit "d"
             FF32 -> sLit "fs"
             FF64 -> sLit "fd"
-            _         -> panic "PPC.Ppr.pprInstr: no match"
             ),
         case addr of AddrRegImm _ _ -> empty
                      AddrRegReg _ _ -> char 'x',
@@ -405,7 +403,6 @@ pprInstr (LA fmt reg addr) = hcat [
             II64 -> sLit "d"
             FF32 -> sLit "fs"
             FF64 -> sLit "fd"
-            _         -> panic "PPC.Ppr.pprInstr: no match"
             ),
         case addr of AddrRegImm _ _ -> empty
                      AddrRegReg _ _ -> char 'x',
diff --git a/compiler/nativeGen/PPC/Regs.hs b/compiler/nativeGen/PPC/Regs.hs
index 7b16f217d9..8ba2ead926 100644
--- a/compiler/nativeGen/PPC/Regs.hs
+++ b/compiler/nativeGen/PPC/Regs.hs
@@ -131,7 +131,7 @@ regDotColor reg
         RcInteger       -> text "blue"
         RcFloat         -> text "red"
         RcDouble        -> text "green"
-        RcDoubleSSE     -> text "yellow"
+
 
 
 -- immediates ------------------------------------------------------------------
diff --git a/compiler/nativeGen/Reg.hs b/compiler/nativeGen/Reg.hs
index d9d56d47c4..7f69ea01a4 100644
--- a/compiler/nativeGen/Reg.hs
+++ b/compiler/nativeGen/Reg.hs
@@ -56,7 +56,7 @@ data VirtualReg
         | VirtualRegHi {-# UNPACK #-} !Unique  -- High part of 2-word register
         | VirtualRegF  {-# UNPACK #-} !Unique
         | VirtualRegD  {-# UNPACK #-} !Unique
-        | VirtualRegSSE {-# UNPACK #-} !Unique
+
         deriving (Eq, Show)
 
 -- This is laborious, but necessary. We can't derive Ord because
@@ -69,15 +69,14 @@ instance Ord VirtualReg where
   compare (VirtualRegHi a) (VirtualRegHi b) = nonDetCmpUnique a b
   compare (VirtualRegF a) (VirtualRegF b) = nonDetCmpUnique a b
   compare (VirtualRegD a) (VirtualRegD b) = nonDetCmpUnique a b
-  compare (VirtualRegSSE a) (VirtualRegSSE b) = nonDetCmpUnique a b
+
   compare VirtualRegI{} _ = LT
   compare _ VirtualRegI{} = GT
   compare VirtualRegHi{} _ = LT
   compare _ VirtualRegHi{} = GT
   compare VirtualRegF{} _ = LT
   compare _ VirtualRegF{} = GT
-  compare VirtualRegD{} _ = LT
-  compare _ VirtualRegD{} = GT
+
 
 
 instance Uniquable VirtualReg where
@@ -87,16 +86,18 @@ instance Uniquable VirtualReg where
                 VirtualRegHi u  -> u
                 VirtualRegF u   -> u
                 VirtualRegD u   -> u
-                VirtualRegSSE u -> u
 
 instance Outputable VirtualReg where
         ppr reg
          = case reg of
                 VirtualRegI  u  -> text "%vI_"   <> pprUniqueAlways u
                 VirtualRegHi u  -> text "%vHi_"  <> pprUniqueAlways u
-                VirtualRegF  u  -> text "%vF_"   <> pprUniqueAlways u
-                VirtualRegD  u  -> text "%vD_"   <> pprUniqueAlways u
-                VirtualRegSSE u -> text "%vSSE_" <> pprUniqueAlways u
+                -- this code is kinda wrong on x86
+                -- because float and double occupy the same register set
+                -- namely SSE2 register xmm0 .. xmm15
+                VirtualRegF  u  -> text "%vFloat_"   <> pprUniqueAlways u
+                VirtualRegD  u  -> text "%vDouble_"   <> pprUniqueAlways u
+
 
 
 renameVirtualReg :: Unique -> VirtualReg -> VirtualReg
@@ -106,7 +107,6 @@ renameVirtualReg u r
         VirtualRegHi _  -> VirtualRegHi u
         VirtualRegF _   -> VirtualRegF  u
         VirtualRegD _   -> VirtualRegD  u
-        VirtualRegSSE _ -> VirtualRegSSE u
 
 
 classOfVirtualReg :: VirtualReg -> RegClass
@@ -116,7 +116,7 @@ classOfVirtualReg vr
         VirtualRegHi{}  -> RcInteger
         VirtualRegF{}   -> RcFloat
         VirtualRegD{}   -> RcDouble
-        VirtualRegSSE{} -> RcDoubleSSE
+
 
 
 -- Determine the upper-half vreg for a 64-bit quantity on a 32-bit platform
diff --git a/compiler/nativeGen/RegAlloc/Graph/TrivColorable.hs b/compiler/nativeGen/RegAlloc/Graph/TrivColorable.hs
index 7774985dce..8db80ef064 100644
--- a/compiler/nativeGen/RegAlloc/Graph/TrivColorable.hs
+++ b/compiler/nativeGen/RegAlloc/Graph/TrivColorable.hs
@@ -134,6 +134,10 @@ trivColorable platform virtualRegSqueeze realRegSqueeze RcInteger conflicts excl
 trivColorable platform virtualRegSqueeze realRegSqueeze RcFloat conflicts exclusions
         | let cALLOCATABLE_REGS_FLOAT
                   =        (case platformArch platform of
+                    -- On x86_64 and x86, Float and RcDouble
+                    -- use the same registers,
+                    -- so we only use RcDouble to represent the
+                    -- register allocation problem on those types.
                             ArchX86       -> 0
                             ArchX86_64    -> 0
                             ArchPPC       -> 0
@@ -160,8 +164,14 @@ trivColorable platform virtualRegSqueeze realRegSqueeze RcFloat conflicts exclus
 trivColorable platform virtualRegSqueeze realRegSqueeze RcDouble conflicts exclusions
         | let cALLOCATABLE_REGS_DOUBLE
                   =        (case platformArch platform of
-                            ArchX86       -> 6
-                            ArchX86_64    -> 0
+                            ArchX86       -> 8
+                            -- in x86 32bit mode sse2 there are only
+                            -- 8 XMM registers xmm0 ... xmm7
+                            ArchX86_64    -> 10
+                            -- in x86_64 there are 16 XMM registers
+                            -- xmm0 .. xmm15, here 10 is a
+                            -- "dont need to solve conflicts" count that
+                            -- was chosen at some point in the past.
                             ArchPPC       -> 26
                             ArchSPARC     -> 11
                             ArchSPARC64   -> panic "trivColorable ArchSPARC64"
@@ -183,31 +193,7 @@ trivColorable platform virtualRegSqueeze realRegSqueeze RcDouble conflicts exclu
 
         = count3 < cALLOCATABLE_REGS_DOUBLE
 
-trivColorable platform virtualRegSqueeze realRegSqueeze RcDoubleSSE conflicts exclusions
-        | let cALLOCATABLE_REGS_SSE
-                  =        (case platformArch platform of
-                            ArchX86       -> 8
-                            ArchX86_64    -> 10
-                            ArchPPC       -> 0
-                            ArchSPARC     -> 0
-                            ArchSPARC64   -> panic "trivColorable ArchSPARC64"
-                            ArchPPC_64 _  -> 0
-                            ArchARM _ _ _ -> panic "trivColorable ArchARM"
-                            ArchARM64     -> panic "trivColorable ArchARM64"
-                            ArchAlpha     -> panic "trivColorable ArchAlpha"
-                            ArchMipseb    -> panic "trivColorable ArchMipseb"
-                            ArchMipsel    -> panic "trivColorable ArchMipsel"
-                            ArchJavaScript-> panic "trivColorable ArchJavaScript"
-                            ArchUnknown   -> panic "trivColorable ArchUnknown")
-        , count2        <- accSqueeze 0 cALLOCATABLE_REGS_SSE
-                                (virtualRegSqueeze RcDoubleSSE)
-                                conflicts
-
-        , count3        <- accSqueeze  count2    cALLOCATABLE_REGS_SSE
-                                (realRegSqueeze   RcDoubleSSE)
-                                exclusions
 
-        = count3 < cALLOCATABLE_REGS_SSE
 
 
 -- Specification Code ----------------------------------------------------------
diff --git a/compiler/nativeGen/RegClass.hs b/compiler/nativeGen/RegClass.hs
index cd008bbbb1..fbbb786817 100644
--- a/compiler/nativeGen/RegClass.hs
+++ b/compiler/nativeGen/RegClass.hs
@@ -18,7 +18,6 @@ data RegClass
         = RcInteger
         | RcFloat
         | RcDouble
-        | RcDoubleSSE -- x86 only: the SSE regs are a separate class
         deriving Eq
 
 
@@ -26,10 +25,8 @@ instance Uniquable RegClass where
     getUnique RcInteger = mkRegClassUnique 0
     getUnique RcFloat   = mkRegClassUnique 1
     getUnique RcDouble  = mkRegClassUnique 2
-    getUnique RcDoubleSSE = mkRegClassUnique 3
 
 instance Outputable RegClass where
     ppr RcInteger       = Outputable.text "I"
     ppr RcFloat         = Outputable.text "F"
     ppr RcDouble        = Outputable.text "D"
-    ppr RcDoubleSSE     = Outputable.text "S"
diff --git a/compiler/nativeGen/SPARC/Instr.hs b/compiler/nativeGen/SPARC/Instr.hs
index 54fb513478..3f78ac51d7 100644
--- a/compiler/nativeGen/SPARC/Instr.hs
+++ b/compiler/nativeGen/SPARC/Instr.hs
@@ -384,7 +384,6 @@ sparc_mkSpillInstr dflags reg _ slot
                         RcInteger -> II32
                         RcFloat   -> FF32
                         RcDouble  -> FF64
-                        _         -> panic "sparc_mkSpillInstr"
 
     in ST fmt reg (fpRel (negate off_w))
 
@@ -405,7 +404,6 @@ sparc_mkLoadInstr dflags reg _ slot
                         RcInteger -> II32
                         RcFloat   -> FF32
                         RcDouble  -> FF64
-                        _         -> panic "sparc_mkLoadInstr"
 
         in LD fmt (fpRel (- off_w)) reg
 
@@ -454,7 +452,6 @@ sparc_mkRegRegMoveInstr platform src dst
                 RcInteger -> ADD  False False src (RIReg g0) dst
                 RcDouble  -> FMOV FF64 src dst
                 RcFloat   -> FMOV FF32 src dst
-                _         -> panic "sparc_mkRegRegMoveInstr"
 
         | otherwise
         = panic "SPARC.Instr.mkRegRegMoveInstr: classes of src and dest not the same"
diff --git a/compiler/nativeGen/SPARC/Ppr.hs b/compiler/nativeGen/SPARC/Ppr.hs
index d367b0726a..6b441819a6 100644
--- a/compiler/nativeGen/SPARC/Ppr.hs
+++ b/compiler/nativeGen/SPARC/Ppr.hs
@@ -143,7 +143,7 @@ pprReg reg
                 VirtualRegHi  u -> text "%vHi_"  <> pprUniqueAlways u
                 VirtualRegF   u -> text "%vF_"   <> pprUniqueAlways u
                 VirtualRegD   u -> text "%vD_"   <> pprUniqueAlways u
-                VirtualRegSSE u -> text "%vSSE_" <> pprUniqueAlways u
+
 
         RegReal rr
          -> case rr of
@@ -211,8 +211,7 @@ pprFormat x
         II32    -> sLit ""
         II64    -> sLit "d"
         FF32    -> sLit ""
-        FF64    -> sLit "d"
-        _       -> panic "SPARC.Ppr.pprFormat: no match")
+        FF64    -> sLit "d")
 
 
 -- | Pretty print a format for an instruction suffix.
@@ -226,8 +225,8 @@ pprStFormat x
         II32  -> sLit ""
         II64  -> sLit "x"
         FF32  -> sLit ""
-        FF64  -> sLit "d"
-        _       -> panic "SPARC.Ppr.pprFormat: no match")
+        FF64  -> sLit "d")
+
 
 
 -- | Pretty print a condition code.
diff --git a/compiler/nativeGen/SPARC/Regs.hs b/compiler/nativeGen/SPARC/Regs.hs
index d6aadbae94..0d7edc346a 100644
--- a/compiler/nativeGen/SPARC/Regs.hs
+++ b/compiler/nativeGen/SPARC/Regs.hs
@@ -104,7 +104,6 @@ virtualRegSqueeze cls vr
                 VirtualRegD{}           -> 1
                 _other                  -> 0
 
-        _other -> 0
 
 {-# INLINE realRegSqueeze #-}
 realRegSqueeze :: RegClass -> RealReg -> Int
@@ -135,7 +134,6 @@ realRegSqueeze cls rr
 
                 RealRegPair{}           -> 1
 
-        _other -> 0
 
 -- | All the allocatable registers in the machine,
 --      including register pairs.
diff --git a/compiler/nativeGen/X86/CodeGen.hs b/compiler/nativeGen/X86/CodeGen.hs
index 61686186f1..70df468857 100644
--- a/compiler/nativeGen/X86/CodeGen.hs
+++ b/compiler/nativeGen/X86/CodeGen.hs
@@ -98,17 +98,25 @@ is32BitPlatform = do
 sse2Enabled :: NatM Bool
 sse2Enabled = do
   dflags <- getDynFlags
-  return (isSse2Enabled dflags)
+  case platformArch (targetPlatform dflags) of
+  -- We Assume  SSE1 and SSE2 operations are available on both
+  -- x86 and x86_64. Historically we didn't default to SSE2 and
+  -- SSE1 on x86, which results in defacto nondeterminism for how
+  -- rounding behaves in the associated x87 floating point instructions
+  -- because variations in the spill/fpu stack placement of arguments for
+  -- operations would change the precision and final result of what
+  -- would otherwise be the same expressions with respect to single or
+  -- double precision IEEE floating point computations.
+    ArchX86_64 -> return True
+    ArchX86    -> return True
+    _          -> panic "trying to generate x86/x86_64 on the wrong platform"
+
 
 sse4_2Enabled :: NatM Bool
 sse4_2Enabled = do
   dflags <- getDynFlags
   return (isSse4_2Enabled dflags)
 
-if_sse2 :: NatM a -> NatM a -> NatM a
-if_sse2 sse2 x87 = do
-  b <- sse2Enabled
-  if b then sse2 else x87
 
 cmmTopCodeGen
         :: RawCmmDecl
@@ -284,15 +292,14 @@ swizzleRegisterRep (Any _ codefn)     format = Any   format codefn
 
 
 -- | Grab the Reg for a CmmReg
-getRegisterReg :: Platform -> Bool -> CmmReg -> Reg
+getRegisterReg :: Platform  -> CmmReg -> Reg
 
-getRegisterReg _ use_sse2 (CmmLocal (LocalReg u pk))
-  = let fmt = cmmTypeFormat pk in
-    if isFloatFormat fmt && not use_sse2
-       then RegVirtual (mkVirtualReg u FF80)
-       else RegVirtual (mkVirtualReg u fmt)
+getRegisterReg _   (CmmLocal (LocalReg u pk))
+  = -- by Assuming SSE2, Int,Word,Float,Double all can be register allocated
+   let fmt = cmmTypeFormat pk in
+        RegVirtual (mkVirtualReg u fmt)
 
-getRegisterReg platform _ (CmmGlobal mid)
+getRegisterReg platform  (CmmGlobal mid)
   = case globalRegMaybe platform mid of
         Just reg -> RegReal $ reg
         Nothing  -> pprPanic "getRegisterReg-memory" (ppr $ CmmGlobal mid)
@@ -513,15 +520,14 @@ getRegister' dflags is32Bit (CmmReg reg)
             do reg' <- getPicBaseNat (archWordFormat is32Bit)
                return (Fixed (archWordFormat is32Bit) reg' nilOL)
         _ ->
-            do use_sse2 <- sse2Enabled
+            do
                let
                  fmt = cmmTypeFormat (cmmRegType dflags reg)
-                 format | not use_sse2 && isFloatFormat fmt = FF80
-                        | otherwise                         = fmt
+                 format  = fmt
                --
                let platform = targetPlatform dflags
                return (Fixed format
-                             (getRegisterReg platform use_sse2 reg)
+                             (getRegisterReg platform  reg)
                              nilOL)
 
 
@@ -557,8 +563,7 @@ getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W64 W32) [x])
   return $ Fixed II32 rlo code
 
 getRegister' _ _ (CmmLit lit@(CmmFloat f w)) =
-  if_sse2 float_const_sse2 float_const_x87
- where
+  float_const_sse2  where
   float_const_sse2
     | f == 0.0 = do
       let
@@ -570,21 +575,7 @@ getRegister' _ _ (CmmLit lit@(CmmFloat f w)) =
 
    | otherwise = do
       Amode addr code <- memConstant (mkAlignment $ widthInBytes w) lit
-      loadFloatAmode True w addr code
-
-  float_const_x87 = case w of
-    W64
-      | f == 0.0 ->
-        let code dst = unitOL (GLDZ dst)
-        in  return (Any FF80 code)
-
-      | f == 1.0 ->
-        let code dst = unitOL (GLD1 dst)
-        in  return (Any FF80 code)
-
-    _otherwise -> do
-      Amode addr code <- memConstant (mkAlignment $ widthInBytes w) lit
-      loadFloatAmode False w addr code
+      loadFloatAmode w addr code
 
 -- catch simple cases of zero- or sign-extended load
 getRegister' _ _ (CmmMachOp (MO_UU_Conv W8 W32) [CmmLoad addr _]) = do
@@ -641,11 +632,9 @@ getRegister' _ is32Bit (CmmMachOp (MO_Add W64) [CmmReg (CmmGlobal PicBaseReg),
         LEA II64 (OpAddr (ripRel (litToImm displacement))) (OpReg dst))
 
 getRegister' dflags is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
-    sse2 <- sse2Enabled
     case mop of
-      MO_F_Neg w
-         | sse2      -> sse2NegCode w x
-         | otherwise -> trivialUFCode FF80 (GNEG FF80) x
+      MO_F_Neg w  -> sse2NegCode w x
+
 
       MO_S_Neg w -> triv_ucode NEGI (intFormat w)
       MO_Not w   -> triv_ucode NOT  (intFormat w)
@@ -711,9 +700,8 @@ getRegister' dflags is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
       MO_XX_Conv W16 W64 | not is32Bit -> integerExtend W16 W64 MOV x
       MO_XX_Conv W32 W64 | not is32Bit -> integerExtend W32 W64 MOV x
 
-      MO_FF_Conv W32 W64
-        | sse2      -> coerceFP2FP W64 x
-        | otherwise -> conversionNop FF80 x
+      MO_FF_Conv W32 W64 -> coerceFP2FP W64 x
+
 
       MO_FF_Conv W64 W32 -> coerceFP2FP W32 x
 
@@ -776,7 +764,6 @@ getRegister' dflags is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
 
 
 getRegister' _ is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
-  sse2 <- sse2Enabled
   case mop of
       MO_F_Eq _ -> condFltReg is32Bit EQQ x y
       MO_F_Ne _ -> condFltReg is32Bit NE  x y
@@ -800,14 +787,14 @@ getRegister' _ is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
       MO_U_Lt _ -> condIntReg LU  x y
       MO_U_Le _ -> condIntReg LEU x y
 
-      MO_F_Add w  | sse2      -> trivialFCode_sse2 w ADD  x y
-                  | otherwise -> trivialFCode_x87    GADD x y
-      MO_F_Sub w  | sse2      -> trivialFCode_sse2 w SUB  x y
-                  | otherwise -> trivialFCode_x87    GSUB x y
-      MO_F_Quot w | sse2      -> trivialFCode_sse2 w FDIV x y
-                  | otherwise -> trivialFCode_x87    GDIV x y
-      MO_F_Mul w  | sse2      -> trivialFCode_sse2 w MUL x y
-                  | otherwise -> trivialFCode_x87    GMUL x y
+      MO_F_Add w   -> trivialFCode_sse2 w ADD  x y
+
+      MO_F_Sub w   -> trivialFCode_sse2 w SUB  x y
+
+      MO_F_Quot w  -> trivialFCode_sse2 w FDIV x y
+
+      MO_F_Mul w   -> trivialFCode_sse2 w MUL x y
+
 
       MO_Add rep -> add_code rep x y
       MO_Sub rep -> sub_code rep x y
@@ -1001,8 +988,7 @@ getRegister' _ _ (CmmLoad mem pk)
   | isFloatType pk
   = do
     Amode addr mem_code <- getAmode mem
-    use_sse2 <- sse2Enabled
-    loadFloatAmode use_sse2 (typeWidth pk) addr mem_code
+    loadFloatAmode  (typeWidth pk) addr mem_code
 
 getRegister' _ is32Bit (CmmLoad mem pk)
   | is32Bit && not (isWord64 pk)
@@ -1132,9 +1118,7 @@ getNonClobberedReg expr = do
                 return (reg, code)
 
 reg2reg :: Format -> Reg -> Reg -> Instr
-reg2reg format src dst
-  | format == FF80 = GMOV src dst
-  | otherwise    = MOV format (OpReg src) (OpReg dst)
+reg2reg format src dst = MOV format (OpReg src) (OpReg dst)
 
 
 --------------------------------------------------------------------------------
@@ -1243,8 +1227,7 @@ x86_complex_amode base index shift offset
 
 getNonClobberedOperand :: CmmExpr -> NatM (Operand, InstrBlock)
 getNonClobberedOperand (CmmLit lit) = do
-  use_sse2 <- sse2Enabled
-  if use_sse2 && isSuitableFloatingPointLit lit
+  if  isSuitableFloatingPointLit lit
     then do
       let CmmFloat _ w = lit
       Amode addr code <- memConstant (mkAlignment $ widthInBytes w) lit
@@ -1259,9 +1242,12 @@ getNonClobberedOperand (CmmLit lit) = do
 
 getNonClobberedOperand (CmmLoad mem pk) = do
   is32Bit <- is32BitPlatform
-  use_sse2 <- sse2Enabled
-  if (not (isFloatType pk) || use_sse2)
-      && (if is32Bit then not (isWord64 pk) else True)
+  -- this logic could be simplified
+  -- TODO FIXME
+  if   (if is32Bit then not (isWord64 pk) else True)
+      -- if 32bit and pk is at float/double/simd value
+      -- or if 64bit
+      --  this could use some eyeballs or i'll need to stare at it more later
     then do
       dflags <- getDynFlags
       let platform = targetPlatform dflags
@@ -1278,6 +1264,7 @@ getNonClobberedOperand (CmmLoad mem pk) = do
                    return (src, nilOL)
       return (OpAddr src', mem_code `appOL` save_code)
     else do
+      -- if its a word or gcptr on 32bit?
       getNonClobberedOperand_generic (CmmLoad mem pk)
 
 getNonClobberedOperand e = getNonClobberedOperand_generic e
@@ -1370,14 +1357,13 @@ memConstant align lit = do
   return (Amode addr code)
 
 
-loadFloatAmode :: Bool -> Width -> AddrMode -> InstrBlock -> NatM Register
-loadFloatAmode use_sse2 w addr addr_code = do
+loadFloatAmode :: Width -> AddrMode -> InstrBlock -> NatM Register
+loadFloatAmode w addr addr_code = do
   let format = floatFormat w
       code dst = addr_code `snocOL`
-                 if use_sse2
-                    then MOV format (OpAddr addr) (OpReg dst)
-                    else GLD format addr dst
-  return (Any (if use_sse2 then format else FF80) code)
+                    MOV format (OpAddr addr) (OpReg dst)
+
+  return (Any format code)
 
 
 -- if we want a floating-point literal as an operand, we can
@@ -1538,19 +1524,9 @@ condIntCode' _ cond x y = do
 condFltCode :: Cond -> CmmExpr -> CmmExpr -> NatM CondCode
 
 condFltCode cond x y
-  = if_sse2 condFltCode_sse2 condFltCode_x87
+  =  condFltCode_sse2
   where
 
-  condFltCode_x87
-    = ASSERT(cond `elem` ([EQQ, NE, LE, LTT, GE, GTT])) do
-    (x_reg, x_code) <- getNonClobberedReg x
-    (y_reg, y_code) <- getSomeReg y
-    let
-        code = x_code `appOL` y_code `snocOL`
-                GCMP cond x_reg y_reg
-    -- The GCMP insn does the test and sets the zero flag if comparable
-    -- and true.  Hence we always supply EQQ as the condition to test.
-    return (CondCode True EQQ code)
 
   -- in the SSE2 comparison ops (ucomiss, ucomisd) the left arg may be
   -- an operand, but the right must be a reg.  We can probably do better
@@ -1634,35 +1610,33 @@ assignReg_IntCode pk reg (CmmLoad src _) = do
   load_code <- intLoadCode (MOV pk) src
   dflags <- getDynFlags
   let platform = targetPlatform dflags
-  return (load_code (getRegisterReg platform False{-no sse2-} reg))
+  return (load_code (getRegisterReg platform reg))
 
 -- dst is a reg, but src could be anything
 assignReg_IntCode _ reg src = do
   dflags <- getDynFlags
   let platform = targetPlatform dflags
   code <- getAnyReg src
-  return (code (getRegisterReg platform False{-no sse2-} reg))
+  return (code (getRegisterReg platform reg))
 
 
 -- Floating point assignment to memory
 assignMem_FltCode pk addr src = do
   (src_reg, src_code) <- getNonClobberedReg src
   Amode addr addr_code <- getAmode addr
-  use_sse2 <- sse2Enabled
   let
         code = src_code `appOL`
                addr_code `snocOL`
-                if use_sse2 then MOV pk (OpReg src_reg) (OpAddr addr)
-                            else GST pk src_reg addr
+               MOV pk (OpReg src_reg) (OpAddr addr)
+
   return code
 
 -- Floating point assignment to a register/temporary
 assignReg_FltCode _ reg src = do
-  use_sse2 <- sse2Enabled
   src_code <- getAnyReg src
   dflags <- getDynFlags
   let platform = targetPlatform dflags
-  return (src_code (getRegisterReg platform use_sse2 reg))
+  return (src_code (getRegisterReg platform  reg))
 
 
 genJump :: CmmExpr{-the branch target-} -> [Reg] -> NatM InstrBlock
@@ -1945,7 +1919,7 @@ genCCall _ is32bit (PrimTarget (MO_Prefetch_Data n )) _  [src] _ =
 
 genCCall dflags is32Bit (PrimTarget (MO_BSwap width)) [dst] [src] _ = do
     let platform = targetPlatform dflags
-    let dst_r = getRegisterReg platform False (CmmLocal dst)
+    let dst_r = getRegisterReg platform (CmmLocal dst)
     case width of
         W64 | is32Bit -> do
                ChildCode64 vcode rlo <- iselExpr64 src
@@ -1972,7 +1946,7 @@ genCCall dflags is32Bit (PrimTarget (MO_PopCnt width)) dest_regs@[dst]
     if sse4_2
         then do code_src <- getAnyReg src
                 src_r <- getNewRegNat format
-                let dst_r = getRegisterReg platform False (CmmLocal dst)
+                let dst_r = getRegisterReg platform  (CmmLocal dst)
                 return $ code_src src_r `appOL`
                     (if width == W8 then
                          -- The POPCNT instruction doesn't take a r/m8
@@ -2004,7 +1978,7 @@ genCCall dflags is32Bit (PrimTarget (MO_Pdep width)) dest_regs@[dst]
                 code_mask <- getAnyReg mask
                 src_r     <- getNewRegNat format
                 mask_r    <- getNewRegNat format
-                let dst_r = getRegisterReg platform False (CmmLocal dst)
+                let dst_r = getRegisterReg platform  (CmmLocal dst)
                 return $ code_src src_r `appOL` code_mask mask_r `appOL`
                     (if width == W8 then
                          -- The PDEP instruction doesn't take a r/m8
@@ -2037,7 +2011,7 @@ genCCall dflags is32Bit (PrimTarget (MO_Pext width)) dest_regs@[dst]
                 code_mask <- getAnyReg mask
                 src_r     <- getNewRegNat format
                 mask_r    <- getNewRegNat format
-                let dst_r = getRegisterReg platform False (CmmLocal dst)
+                let dst_r = getRegisterReg platform  (CmmLocal dst)
                 return $ code_src src_r `appOL` code_mask mask_r `appOL`
                     (if width == W8 then
                          -- The PEXT instruction doesn't take a r/m8
@@ -2073,7 +2047,7 @@ genCCall dflags is32Bit (PrimTarget (MO_Clz width)) dest_regs@[dst] args@[src] b
 
   | otherwise = do
     code_src <- getAnyReg src
-    let dst_r = getRegisterReg platform False (CmmLocal dst)
+    let dst_r = getRegisterReg platform (CmmLocal dst)
     if isBmi2Enabled dflags
         then do
             src_r <- getNewRegNat (intFormat width)
@@ -2110,7 +2084,7 @@ genCCall dflags is32Bit (PrimTarget (MO_Ctz width)) [dst] [src] bid
   | is32Bit, width == W64 = do
       ChildCode64 vcode rlo <- iselExpr64 src
       let rhi     = getHiVRegFromLo rlo
-          dst_r   = getRegisterReg platform False (CmmLocal dst)
+          dst_r   = getRegisterReg platform  (CmmLocal dst)
       lbl1 <- getBlockIdNat
       lbl2 <- getBlockIdNat
       let format = if width == W8 then II16 else intFormat width
@@ -2150,7 +2124,7 @@ genCCall dflags is32Bit (PrimTarget (MO_Ctz width)) [dst] [src] bid
 
   | otherwise = do
     code_src <- getAnyReg src
-    let dst_r = getRegisterReg platform False (CmmLocal dst)
+    let dst_r = getRegisterReg platform (CmmLocal dst)
 
     if isBmi2Enabled dflags
     then do
@@ -2201,9 +2175,8 @@ genCCall dflags is32Bit (PrimTarget (MO_AtomicRMW width amop))
         else getSimpleAmode dflags is32Bit addr  -- See genCCall for MO_Cmpxchg
     arg <- getNewRegNat format
     arg_code <- getAnyReg n
-    use_sse2 <- sse2Enabled
     let platform = targetPlatform dflags
-        dst_r    = getRegisterReg platform use_sse2 (CmmLocal dst)
+        dst_r    = getRegisterReg platform  (CmmLocal dst)
     code <- op_code dst_r arg amode
     return $ addr_code `appOL` arg_code arg `appOL` code
   where
@@ -2260,8 +2233,8 @@ genCCall dflags is32Bit (PrimTarget (MO_AtomicRMW width amop))
 genCCall dflags _ (PrimTarget (MO_AtomicRead width)) [dst] [addr] _ = do
   load_code <- intLoadCode (MOV (intFormat width)) addr
   let platform = targetPlatform dflags
-  use_sse2 <- sse2Enabled
-  return (load_code (getRegisterReg platform use_sse2 (CmmLocal dst)))
+
+  return (load_code (getRegisterReg platform  (CmmLocal dst)))
 
 genCCall _ _ (PrimTarget (MO_AtomicWrite width)) [] [addr, val] _ = do
     code <- assignMem_IntCode (intFormat width) addr val
@@ -2276,9 +2249,8 @@ genCCall dflags is32Bit (PrimTarget (MO_Cmpxchg width)) [dst] [addr, old, new] _
     newval_code <- getAnyReg new
     oldval <- getNewRegNat format
     oldval_code <- getAnyReg old
-    use_sse2 <- sse2Enabled
     let platform = targetPlatform dflags
-        dst_r    = getRegisterReg platform use_sse2 (CmmLocal dst)
+        dst_r    = getRegisterReg platform  (CmmLocal dst)
         code     = toOL
                    [ MOV format (OpReg oldval) (OpReg eax)
                    , LOCK (CMPXCHG format (OpReg newval) (OpAddr amode))
@@ -2292,14 +2264,12 @@ genCCall dflags is32Bit (PrimTarget (MO_Cmpxchg width)) [dst] [addr, old, new] _
 genCCall _ is32Bit target dest_regs args bid = do
   dflags <- getDynFlags
   let platform = targetPlatform dflags
-      sse2     = isSse2Enabled dflags
   case (target, dest_regs) of
     -- void return type prim op
     (PrimTarget op, []) ->
         outOfLineCmmOp bid op Nothing args
     -- we only cope with a single result for foreign calls
-    (PrimTarget op, [r])
-      | sse2 -> case op of
+    (PrimTarget op, [r])  -> case op of
           MO_F32_Fabs -> case args of
             [x] -> sse2FabsCode W32 x
             _ -> panic "genCCall: Wrong number of arguments for fabs"
@@ -2310,36 +2280,16 @@ genCCall _ is32Bit target dest_regs args bid = do
           MO_F32_Sqrt -> actuallyInlineSSE2Op (\fmt r -> SQRT fmt (OpReg r)) FF32 args
           MO_F64_Sqrt -> actuallyInlineSSE2Op (\fmt r -> SQRT fmt (OpReg r)) FF64 args
           _other_op -> outOfLineCmmOp bid op (Just r) args
-      | otherwise -> do
-        l1 <- getNewLabelNat
-        l2 <- getNewLabelNat
-        if sse2
-          then outOfLineCmmOp bid op (Just r) args
-          else case op of
-              MO_F32_Sqrt -> actuallyInlineFloatOp GSQRT FF32 args
-              MO_F64_Sqrt -> actuallyInlineFloatOp GSQRT FF64 args
-
-              MO_F32_Sin  -> actuallyInlineFloatOp (\s -> GSIN s l1 l2) FF32 args
-              MO_F64_Sin  -> actuallyInlineFloatOp (\s -> GSIN s l1 l2) FF64 args
-
-              MO_F32_Cos  -> actuallyInlineFloatOp (\s -> GCOS s l1 l2) FF32 args
-              MO_F64_Cos  -> actuallyInlineFloatOp (\s -> GCOS s l1 l2) FF64 args
-
-              MO_F32_Tan  -> actuallyInlineFloatOp (\s -> GTAN s l1 l2) FF32 args
-              MO_F64_Tan  -> actuallyInlineFloatOp (\s -> GTAN s l1 l2) FF64 args
-
-              _other_op   -> outOfLineCmmOp bid op (Just r) args
 
        where
-        actuallyInlineFloatOp = actuallyInlineFloatOp' False
-        actuallyInlineSSE2Op = actuallyInlineFloatOp' True
+        actuallyInlineSSE2Op = actuallyInlineFloatOp'
 
-        actuallyInlineFloatOp' usesSSE instr format [x]
+        actuallyInlineFloatOp'  instr format [x]
               = do res <- trivialUFCode format (instr format) x
                    any <- anyReg res
-                   return (any (getRegisterReg platform usesSSE (CmmLocal r)))
+                   return (any (getRegisterReg platform  (CmmLocal r)))
 
-        actuallyInlineFloatOp' _ _ _ args
+        actuallyInlineFloatOp' _ _ args
               = panic $ "genCCall.actuallyInlineFloatOp': bad number of arguments! ("
                       ++ show (length args) ++ ")"
 
@@ -2358,7 +2308,7 @@ genCCall _ is32Bit target dest_regs args bid = do
                 AND fmt (OpReg tmp) (OpReg dst)
                 ]
 
-          return $ code (getRegisterReg platform True (CmmLocal r))
+          return $ code (getRegisterReg platform (CmmLocal r))
 
     (PrimTarget (MO_S_QuotRem  width), _) -> divOp1 platform True  width dest_regs args
     (PrimTarget (MO_U_QuotRem  width), _) -> divOp1 platform False width dest_regs args
@@ -2370,8 +2320,8 @@ genCCall _ is32Bit target dest_regs args bid = do
                let format = intFormat width
                lCode <- anyReg =<< trivialCode width (ADD_CC format)
                                      (Just (ADD_CC format)) arg_x arg_y
-               let reg_l = getRegisterReg platform True (CmmLocal res_l)
-                   reg_h = getRegisterReg platform True (CmmLocal res_h)
+               let reg_l = getRegisterReg platform (CmmLocal res_l)
+                   reg_h = getRegisterReg platform (CmmLocal res_h)
                    code = hCode reg_h `appOL`
                           lCode reg_l `snocOL`
                           ADC format (OpImm (ImmInteger 0)) (OpReg reg_h)
@@ -2391,8 +2341,8 @@ genCCall _ is32Bit target dest_regs args bid = do
             do (y_reg, y_code) <- getRegOrMem arg_y
                x_code <- getAnyReg arg_x
                let format = intFormat width
-                   reg_h = getRegisterReg platform True (CmmLocal res_h)
-                   reg_l = getRegisterReg platform True (CmmLocal res_l)
+                   reg_h = getRegisterReg platform (CmmLocal res_h)
+                   reg_l = getRegisterReg platform (CmmLocal res_l)
                    code = y_code `appOL`
                           x_code rax `appOL`
                           toOL [MUL2 format y_reg,
@@ -2428,8 +2378,8 @@ genCCall _ is32Bit target dest_regs args bid = do
         divOp platform signed width [res_q, res_r]
               m_arg_x_high arg_x_low arg_y
             = do let format = intFormat width
-                     reg_q = getRegisterReg platform True (CmmLocal res_q)
-                     reg_r = getRegisterReg platform True (CmmLocal res_r)
+                     reg_q = getRegisterReg platform (CmmLocal res_q)
+                     reg_r = getRegisterReg platform (CmmLocal res_r)
                      widen | signed    = CLTD format
                            | otherwise = XOR format (OpReg rdx) (OpReg rdx)
                      instr | signed    = IDIV
@@ -2456,8 +2406,8 @@ genCCall _ is32Bit target dest_regs args bid = do
                  rCode <- anyReg =<< trivialCode width (instr format)
                                        (mrevinstr format) arg_x arg_y
                  reg_tmp <- getNewRegNat II8
-                 let reg_c = getRegisterReg platform True (CmmLocal res_c)
-                     reg_r = getRegisterReg platform True (CmmLocal res_r)
+                 let reg_c = getRegisterReg platform  (CmmLocal res_c)
+                     reg_r = getRegisterReg platform  (CmmLocal res_r)
                      code = rCode reg_r `snocOL`
                             SETCC cond (OpReg reg_tmp) `snocOL`
                             MOVZxL II8 (OpReg reg_tmp) (OpReg reg_c)
@@ -2501,8 +2451,7 @@ genCCall32' dflags target dest_regs args = do
         delta0 <- getDeltaNat
         setDeltaNat (delta0 - arg_pad_size)
 
-        use_sse2 <- sse2Enabled
-        push_codes <- mapM (push_arg use_sse2) (reverse prom_args)
+        push_codes <- mapM push_arg (reverse prom_args)
         delta <- getDeltaNat
         MASSERT(delta == delta0 - tot_arg_size)
 
@@ -2555,18 +2504,21 @@ genCCall32' dflags target dest_regs args = do
             assign_code []     = nilOL
             assign_code [dest]
               | isFloatType ty =
-                 if use_sse2
-                    then let tmp_amode = AddrBaseIndex (EABaseReg esp)
+                  -- we assume SSE2
+                  let tmp_amode = AddrBaseIndex (EABaseReg esp)
                                                        EAIndexNone
                                                        (ImmInt 0)
-                             fmt = floatFormat w
+                      fmt = floatFormat w
                          in toOL [ SUB II32 (OpImm (ImmInt b)) (OpReg esp),
                                    DELTA (delta0 - b),
-                                   GST fmt fake0 tmp_amode,
+                                   X87Store fmt  tmp_amode,
+                                   -- X87Store only supported for the CDECL ABI
+                                   -- NB: This code will need to be
+                                   -- revisted once GHC does more work around
+                                   -- SIGFPE f
                                    MOV fmt (OpAddr tmp_amode) (OpReg r_dest),
                                    ADD II32 (OpImm (ImmInt b)) (OpReg esp),
                                    DELTA delta0]
-                    else unitOL (GMOV fake0 r_dest)
               | isWord64 ty    = toOL [MOV II32 (OpReg eax) (OpReg r_dest),
                                         MOV II32 (OpReg edx) (OpReg r_dest_hi)]
               | otherwise      = unitOL (MOV (intFormat w)
@@ -2577,7 +2529,7 @@ genCCall32' dflags target dest_regs args = do
                     w  = typeWidth ty
                     b  = widthInBytes w
                     r_dest_hi = getHiVRegFromLo r_dest
-                    r_dest    = getRegisterReg platform use_sse2 (CmmLocal dest)
+                    r_dest    = getRegisterReg platform  (CmmLocal dest)
             assign_code many = pprPanic "genCCall.assign_code - too many return values:" (ppr many)
 
         return (push_code `appOL`
@@ -2592,10 +2544,10 @@ genCCall32' dflags target dest_regs args = do
         roundTo a x | x `mod` a == 0 = x
                     | otherwise = x + a - (x `mod` a)
 
-        push_arg :: Bool -> CmmActual {-current argument-}
+        push_arg :: CmmActual {-current argument-}
                         -> NatM InstrBlock  -- code
 
-        push_arg use_sse2 arg -- we don't need the hints on x86
+        push_arg  arg -- we don't need the hints on x86
           | isWord64 arg_ty = do
             ChildCode64 code r_lo <- iselExpr64 arg
             delta <- getDeltaNat
@@ -2619,9 +2571,10 @@ genCCall32' dflags target dest_regs args = do
                                                             (ImmInt 0)
                                       format = floatFormat (typeWidth arg_ty)
                                   in
-                                  if use_sse2
-                                     then MOV format (OpReg reg) (OpAddr addr)
-                                     else GST format reg addr
+
+                                  -- assume SSE2
+                                   MOV format (OpReg reg) (OpAddr addr)
+
                                  ]
                            )
 
@@ -2749,7 +2702,7 @@ genCCall64' dflags target dest_regs args = do
                 _ -> unitOL (MOV (cmmTypeFormat rep) (OpReg rax) (OpReg r_dest))
           where
                 rep = localRegType dest
-                r_dest = getRegisterReg platform True (CmmLocal dest)
+                r_dest = getRegisterReg platform  (CmmLocal dest)
         assign_code _many = panic "genCCall.assign_code many"
 
     return (adjust_rsp          `appOL`
@@ -3162,17 +3115,9 @@ condIntReg cond x y = do
 -- and plays better with the uOP cache.
 
 condFltReg :: Bool -> Cond -> CmmExpr -> CmmExpr -> NatM Register
-condFltReg is32Bit cond x y = if_sse2 condFltReg_sse2 condFltReg_x87
+condFltReg is32Bit cond x y = condFltReg_sse2
  where
-  condFltReg_x87 = do
-    CondCode _ cond cond_code <- condFltCode cond x y
-    tmp <- getNewRegNat II8
-    let
-        code dst = cond_code `appOL` toOL [
-                    SETCC cond (OpReg tmp),
-                    MOVZxL II8 (OpReg tmp) (OpReg dst)
-                  ]
-    return (Any II32 code)
+
 
   condFltReg_sse2 = do
     CondCode _ cond cond_code <- condFltCode cond x y
@@ -3336,18 +3281,6 @@ trivialUCode rep instr x = do
 
 -----------
 
-trivialFCode_x87 :: (Format -> Reg -> Reg -> Reg -> Instr)
-                 -> CmmExpr -> CmmExpr -> NatM Register
-trivialFCode_x87 instr x y = do
-  (x_reg, x_code) <- getNonClobberedReg x -- these work for float regs too
-  (y_reg, y_code) <- getSomeReg y
-  let
-     format = FF80 -- always, on x87
-     code dst =
-        x_code `appOL`
-        y_code `snocOL`
-        instr format x_reg y_reg dst
-  return (Any format code)
 
 trivialFCode_sse2 :: Width -> (Format -> Operand -> Operand -> Instr)
                   -> CmmExpr -> CmmExpr -> NatM Register
@@ -3368,17 +3301,8 @@ trivialUFCode format instr x = do
 
 --------------------------------------------------------------------------------
 coerceInt2FP :: Width -> Width -> CmmExpr -> NatM Register
-coerceInt2FP from to x = if_sse2 coerce_sse2 coerce_x87
+coerceInt2FP from to x =  coerce_sse2
  where
-   coerce_x87 = do
-     (x_reg, x_code) <- getSomeReg x
-     let
-           opc  = case to of W32 -> GITOF; W64 -> GITOD;
-                             n -> panic $ "coerceInt2FP.x87: unhandled width ("
-                                         ++ show n ++ ")"
-           code dst = x_code `snocOL` opc x_reg dst
-        -- ToDo: works for non-II32 reps?
-     return (Any FF80 code)
 
    coerce_sse2 = do
      (x_op, x_code) <- getOperand x  -- ToDo: could be a safe operand
@@ -3392,18 +3316,8 @@ coerceInt2FP from to x = if_sse2 coerce_sse2 coerce_x87
 
 --------------------------------------------------------------------------------
 coerceFP2Int :: Width -> Width -> CmmExpr -> NatM Register
-coerceFP2Int from to x = if_sse2 coerceFP2Int_sse2 coerceFP2Int_x87
+coerceFP2Int from to x =  coerceFP2Int_sse2
  where
-   coerceFP2Int_x87 = do
-     (x_reg, x_code) <- getSomeReg x
-     let
-           opc  = case from of W32 -> GFTOI; W64 -> GDTOI
-                               n -> panic $ "coerceFP2Int.x87: unhandled width ("
-                                           ++ show n ++ ")"
-           code dst = x_code `snocOL` opc x_reg dst
-        -- ToDo: works for non-II32 reps?
-     return (Any (intFormat to) code)
-
    coerceFP2Int_sse2 = do
      (x_op, x_code) <- getOperand x  -- ToDo: could be a safe operand
      let
@@ -3418,15 +3332,13 @@ coerceFP2Int from to x = if_sse2 coerceFP2Int_sse2 coerceFP2Int_x87
 --------------------------------------------------------------------------------
 coerceFP2FP :: Width -> CmmExpr -> NatM Register
 coerceFP2FP to x = do
-  use_sse2 <- sse2Enabled
   (x_reg, x_code) <- getSomeReg x
   let
-        opc | use_sse2  = case to of W32 -> CVTSD2SS; W64 -> CVTSS2SD;
+        opc  = case to of W32 -> CVTSD2SS; W64 -> CVTSS2SD;
                                      n -> panic $ "coerceFP2FP: unhandled width ("
                                                  ++ show n ++ ")"
-            | otherwise = GDTOF
         code dst = x_code `snocOL` opc x_reg dst
-  return (Any (if use_sse2 then floatFormat to else FF80) code)
+  return (Any ( floatFormat to) code)
 
 --------------------------------------------------------------------------------
 
@@ -3443,7 +3355,7 @@ sse2NegCode w x = do
       x@II16 -> wrongFmt x
       x@II32 -> wrongFmt x
       x@II64 -> wrongFmt x
-      x@FF80 -> wrongFmt x
+
       where
         wrongFmt x = panic $ "sse2NegCode: " ++ show x
   Amode amode amode_code <- memConstant (mkAlignment $ widthInBytes w) const
diff --git a/compiler/nativeGen/X86/Instr.hs b/compiler/nativeGen/X86/Instr.hs
index beb9a97097..0e69d421a3 100644
--- a/compiler/nativeGen/X86/Instr.hs
+++ b/compiler/nativeGen/X86/Instr.hs
@@ -10,7 +10,7 @@
 
 module X86.Instr (Instr(..), Operand(..), PrefetchVariant(..), JumpDest(..),
                   getJumpDestBlockId, canShortcut, shortcutStatics,
-                  shortcutJump, i386_insert_ffrees, allocMoreStack,
+                  shortcutJump, allocMoreStack,
                   maxSpillSlots, archWordFormat )
 where
 
@@ -240,46 +240,14 @@ data Instr
         | BT          Format Imm Operand
         | NOP
 
-        -- x86 Float Arithmetic.
-        -- Note that we cheat by treating G{ABS,MOV,NEG} of doubles
-        -- as single instructions right up until we spit them out.
-        -- all the 3-operand fake fp insns are src1 src2 dst
-        -- and furthermore are constrained to be fp regs only.
-        -- IMPORTANT: keep is_G_insn up to date with any changes here
-        | GMOV        Reg Reg -- src(fpreg), dst(fpreg)
-        | GLD         Format AddrMode Reg -- src, dst(fpreg)
-        | GST         Format Reg AddrMode -- src(fpreg), dst
 
-        | GLDZ        Reg -- dst(fpreg)
-        | GLD1        Reg -- dst(fpreg)
-
-        | GFTOI       Reg Reg -- src(fpreg), dst(intreg)
-        | GDTOI       Reg Reg -- src(fpreg), dst(intreg)
-
-        | GITOF       Reg Reg -- src(intreg), dst(fpreg)
-        | GITOD       Reg Reg -- src(intreg), dst(fpreg)
-
-        | GDTOF       Reg Reg -- src(fpreg), dst(fpreg)
-
-        | GADD        Format Reg Reg Reg -- src1, src2, dst
-        | GDIV        Format Reg Reg Reg -- src1, src2, dst
-        | GSUB        Format Reg Reg Reg -- src1, src2, dst
-        | GMUL        Format Reg Reg Reg -- src1, src2, dst
-
-                -- FP compare.  Cond must be `elem` [EQQ, NE, LE, LTT, GE, GTT]
-                -- Compare src1 with src2; set the Zero flag iff the numbers are
-                -- comparable and the comparison is True.  Subsequent code must
-                -- test the %eflags zero flag regardless of the supplied Cond.
-        | GCMP        Cond Reg Reg -- src1, src2
-
-        | GABS        Format Reg Reg -- src, dst
-        | GNEG        Format Reg Reg -- src, dst
-        | GSQRT       Format Reg Reg -- src, dst
-        | GSIN        Format CLabel CLabel Reg Reg -- src, dst
-        | GCOS        Format CLabel CLabel Reg Reg -- src, dst
-        | GTAN        Format CLabel CLabel Reg Reg -- src, dst
-
-        | GFREE         -- do ffree on all x86 regs; an ugly hack
+        -- We need to support the FSTP (x87 store and pop) instruction
+        -- so that we can correctly read off the return value of an
+        -- x86 CDECL C function call when its floating point.
+        -- so we dont include a register argument, and just use st(0)
+        -- this instruction is used ONLY for return values of C ffi calls
+        -- in x86_32 abi
+        | X87Store         Format  AddrMode -- st(0), dst
 
 
         -- SSE2 floating point: we use a restricted set of the available SSE2
@@ -427,33 +395,7 @@ x86_regUsageOfInstr platform instr
     CLTD   _            -> mkRU [eax] [edx]
     NOP                 -> mkRU [] []
 
-    GMOV   src dst      -> mkRU [src] [dst]
-    GLD    _ src dst    -> mkRU (use_EA src []) [dst]
-    GST    _ src dst    -> mkRUR (src : use_EA dst [])
-
-    GLDZ   dst          -> mkRU [] [dst]
-    GLD1   dst          -> mkRU [] [dst]
-
-    GFTOI  src dst      -> mkRU [src] [dst]
-    GDTOI  src dst      -> mkRU [src] [dst]
-
-    GITOF  src dst      -> mkRU [src] [dst]
-    GITOD  src dst      -> mkRU [src] [dst]
-
-    GDTOF  src dst      -> mkRU [src] [dst]
-
-    GADD   _ s1 s2 dst  -> mkRU [s1,s2] [dst]
-    GSUB   _ s1 s2 dst  -> mkRU [s1,s2] [dst]
-    GMUL   _ s1 s2 dst  -> mkRU [s1,s2] [dst]
-    GDIV   _ s1 s2 dst  -> mkRU [s1,s2] [dst]
-
-    GCMP   _ src1 src2   -> mkRUR [src1,src2]
-    GABS   _ src dst     -> mkRU [src] [dst]
-    GNEG   _ src dst     -> mkRU [src] [dst]
-    GSQRT  _ src dst     -> mkRU [src] [dst]
-    GSIN   _ _ _ src dst -> mkRU [src] [dst]
-    GCOS   _ _ _ src dst -> mkRU [src] [dst]
-    GTAN   _ _ _ src dst -> mkRU [src] [dst]
+    X87Store    _  dst    -> mkRUR ( use_EA dst [])
 
     CVTSS2SD   src dst  -> mkRU [src] [dst]
     CVTSD2SS   src dst  -> mkRU [src] [dst]
@@ -603,33 +545,8 @@ x86_patchRegsOfInstr instr env
     JMP op regs          -> JMP (patchOp op) regs
     JMP_TBL op ids s lbl -> JMP_TBL (patchOp op) ids s lbl
 
-    GMOV src dst         -> GMOV (env src) (env dst)
-    GLD  fmt src dst     -> GLD fmt (lookupAddr src) (env dst)
-    GST  fmt src dst     -> GST fmt (env src) (lookupAddr dst)
-
-    GLDZ dst            -> GLDZ (env dst)
-    GLD1 dst            -> GLD1 (env dst)
-
-    GFTOI src dst       -> GFTOI (env src) (env dst)
-    GDTOI src dst       -> GDTOI (env src) (env dst)
-
-    GITOF src dst       -> GITOF (env src) (env dst)
-    GITOD src dst       -> GITOD (env src) (env dst)
-
-    GDTOF src dst       -> GDTOF (env src) (env dst)
-
-    GADD fmt s1 s2 dst   -> GADD fmt (env s1) (env s2) (env dst)
-    GSUB fmt s1 s2 dst   -> GSUB fmt (env s1) (env s2) (env dst)
-    GMUL fmt s1 s2 dst   -> GMUL fmt (env s1) (env s2) (env dst)
-    GDIV fmt s1 s2 dst   -> GDIV fmt (env s1) (env s2) (env dst)
-
-    GCMP fmt src1 src2   -> GCMP fmt (env src1) (env src2)
-    GABS fmt src dst     -> GABS fmt (env src) (env dst)
-    GNEG fmt src dst     -> GNEG fmt (env src) (env dst)
-    GSQRT fmt src dst    -> GSQRT fmt (env src) (env dst)
-    GSIN fmt l1 l2 src dst       -> GSIN fmt l1 l2 (env src) (env dst)
-    GCOS fmt l1 l2 src dst       -> GCOS fmt l1 l2 (env src) (env dst)
-    GTAN fmt l1 l2 src dst       -> GTAN fmt l1 l2 (env src) (env dst)
+    -- literally only support storing the top x87 stack value st(0)
+    X87Store  fmt  dst     -> X87Store fmt  (lookupAddr dst)
 
     CVTSS2SD src dst    -> CVTSS2SD (env src) (env dst)
     CVTSD2SS src dst    -> CVTSD2SS (env src) (env dst)
@@ -752,8 +669,7 @@ x86_mkSpillInstr dflags reg delta slot
     case targetClassOfReg platform reg of
            RcInteger   -> MOV (archWordFormat is32Bit)
                               (OpReg reg) (OpAddr (spRel dflags off))
-           RcDouble    -> GST FF80 reg (spRel dflags off) {- RcFloat/RcDouble -}
-           RcDoubleSSE -> MOV FF64 (OpReg reg) (OpAddr (spRel dflags off))
+           RcDouble    -> MOV FF64 (OpReg reg) (OpAddr (spRel dflags off))
            _         -> panic "X86.mkSpillInstr: no match"
     where platform = targetPlatform dflags
           is32Bit = target32Bit platform
@@ -772,8 +688,7 @@ x86_mkLoadInstr dflags reg delta slot
         case targetClassOfReg platform reg of
               RcInteger -> MOV (archWordFormat is32Bit)
                                (OpAddr (spRel dflags off)) (OpReg reg)
-              RcDouble  -> GLD FF80 (spRel dflags off) reg {- RcFloat/RcDouble -}
-              RcDoubleSSE -> MOV FF64 (OpAddr (spRel dflags off)) (OpReg reg)
+              RcDouble  -> MOV FF64 (OpAddr (spRel dflags off)) (OpReg reg)
               _           -> panic "X86.x86_mkLoadInstr"
     where platform = targetPlatform dflags
           is32Bit = target32Bit platform
@@ -827,6 +742,7 @@ x86_isMetaInstr instr
 
 
 
+---  TODO: why is there
 -- | Make a reg-reg move instruction.
 --      On SPARC v8 there are no instructions to move directly between
 --      floating point and integer regs. If we need to do that then we
@@ -844,8 +760,10 @@ x86_mkRegRegMoveInstr platform src dst
                      ArchX86    -> MOV II32 (OpReg src) (OpReg dst)
                      ArchX86_64 -> MOV II64 (OpReg src) (OpReg dst)
                      _          -> panic "x86_mkRegRegMoveInstr: Bad arch"
-        RcDouble    -> GMOV src dst
-        RcDoubleSSE -> MOV FF64 (OpReg src) (OpReg dst)
+        RcDouble    ->  MOV FF64 (OpReg src) (OpReg dst)
+        -- this code is the lie we tell ourselves because both float and double
+        -- use the same register class.on x86_64 and x86 32bit with SSE2,
+        -- more plainly, both use the XMM registers
         _     -> panic "X86.RegInfo.mkRegRegMoveInstr: no match"
 
 -- | Check whether an instruction represents a reg-reg move.
@@ -970,58 +888,6 @@ x86_mkStackDeallocInstr platform amount
       ArchX86_64 -> [ADD II64 (OpImm (ImmInt amount)) (OpReg rsp)]
       _ -> panic "x86_mkStackDeallocInstr"
 
-i386_insert_ffrees
-        :: [GenBasicBlock Instr]
-        -> [GenBasicBlock Instr]
-
-i386_insert_ffrees blocks
-   | any (any is_G_instr) [ instrs | BasicBlock _ instrs <- blocks ]
-   = map insertGFREEs blocks
-   | otherwise
-   = blocks
- where
-   insertGFREEs (BasicBlock id insns)
-     = BasicBlock id (insertBeforeNonlocalTransfers GFREE insns)
-
-insertBeforeNonlocalTransfers :: Instr -> [Instr] -> [Instr]
-insertBeforeNonlocalTransfers insert insns
-     = foldr p [] insns
-     where p insn r = case insn of
-                        CALL _ _    -> insert : insn : r
-                        JMP _ _     -> insert : insn : r
-                        JXX_GBL _ _ -> panic "insertBeforeNonlocalTransfers: cannot handle JXX_GBL"
-                        _           -> insn : r
-
-
--- if you ever add a new FP insn to the fake x86 FP insn set,
--- you must update this too
-is_G_instr :: Instr -> Bool
-is_G_instr instr
-   = case instr of
-        GMOV{}          -> True
-        GLD{}           -> True
-        GST{}           -> True
-        GLDZ{}          -> True
-        GLD1{}          -> True
-        GFTOI{}         -> True
-        GDTOI{}         -> True
-        GITOF{}         -> True
-        GITOD{}         -> True
-        GDTOF{}         -> True
-        GADD{}          -> True
-        GDIV{}          -> True
-        GSUB{}          -> True
-        GMUL{}          -> True
-        GCMP{}          -> True
-        GABS{}          -> True
-        GNEG{}          -> True
-        GSQRT{}         -> True
-        GSIN{}          -> True
-        GCOS{}          -> True
-        GTAN{}          -> True
-        GFREE           -> panic "is_G_instr: GFREE (!)"
-        _               -> False
-
 
 --
 -- Note [extra spill slots]
diff --git a/compiler/nativeGen/X86/Ppr.hs b/compiler/nativeGen/X86/Ppr.hs
index 83d53be553..ccfaff4753 100644
--- a/compiler/nativeGen/X86/Ppr.hs
+++ b/compiler/nativeGen/X86/Ppr.hs
@@ -272,7 +272,7 @@ pprReg f r
       RegVirtual (VirtualRegHi u)  -> text "%vHi_"  <> pprUniqueAlways u
       RegVirtual (VirtualRegF  u)  -> text "%vF_"   <> pprUniqueAlways u
       RegVirtual (VirtualRegD  u)  -> text "%vD_"   <> pprUniqueAlways u
-      RegVirtual (VirtualRegSSE u) -> text "%vSSE_" <> pprUniqueAlways u
+
   where
     ppr32_reg_no :: Format -> Int -> SDoc
     ppr32_reg_no II8   = ppr32_reg_byte
@@ -364,17 +364,14 @@ pprReg f r
 
 ppr_reg_float :: Int -> PtrString
 ppr_reg_float i = case i of
-        16 -> sLit "%fake0";  17 -> sLit "%fake1"
-        18 -> sLit "%fake2";  19 -> sLit "%fake3"
-        20 -> sLit "%fake4";  21 -> sLit "%fake5"
-        24 -> sLit "%xmm0";   25 -> sLit "%xmm1"
-        26 -> sLit "%xmm2";   27 -> sLit "%xmm3"
-        28 -> sLit "%xmm4";   29 -> sLit "%xmm5"
-        30 -> sLit "%xmm6";   31 -> sLit "%xmm7"
-        32 -> sLit "%xmm8";   33 -> sLit "%xmm9"
-        34 -> sLit "%xmm10";  35 -> sLit "%xmm11"
-        36 -> sLit "%xmm12";  37 -> sLit "%xmm13"
-        38 -> sLit "%xmm14";  39 -> sLit "%xmm15"
+        16 -> sLit "%xmm0" ;   17 -> sLit "%xmm1"
+        18 -> sLit "%xmm2" ;   19 -> sLit "%xmm3"
+        20 -> sLit "%xmm4" ;   21 -> sLit "%xmm5"
+        22 -> sLit "%xmm6" ;   23 -> sLit "%xmm7"
+        24 -> sLit "%xmm8" ;   25 -> sLit "%xmm9"
+        26 -> sLit "%xmm10";   27 -> sLit "%xmm11"
+        28 -> sLit "%xmm12";   29 -> sLit "%xmm13"
+        30 -> sLit "%xmm14";   31 -> sLit "%xmm15"
         _  -> sLit "very naughty x86 register"
 
 pprFormat :: Format -> SDoc
@@ -386,7 +383,6 @@ pprFormat x
                 II64  -> sLit "q"
                 FF32  -> sLit "ss"      -- "scalar single-precision float" (SSE2)
                 FF64  -> sLit "sd"      -- "scalar double-precision float" (SSE2)
-                FF80  -> sLit "t"
                 )
 
 pprFormat_x87 :: Format -> SDoc
@@ -394,9 +390,9 @@ pprFormat_x87 x
   = ptext $ case x of
                 FF32  -> sLit "s"
                 FF64  -> sLit "l"
-                FF80  -> sLit "t"
                 _     -> panic "X86.Ppr.pprFormat_x87"
 
+
 pprCond :: Cond -> SDoc
 pprCond c
  = ptext (case c of {
@@ -807,224 +803,12 @@ pprInstr (FETCHPC reg)
           ]
 
 
--- -----------------------------------------------------------------------------
--- i386 floating-point
-
--- Simulating a flat register set on the x86 FP stack is tricky.
--- you have to free %st(7) before pushing anything on the FP reg stack
--- so as to preclude the possibility of a FP stack overflow exception.
-pprInstr g@(GMOV src dst)
-   | src == dst
-   = empty
-   | otherwise
-   = pprG g (hcat [gtab, gpush src 0, gsemi, gpop dst 1])
-
--- GLD fmt addr dst ==> FLDsz addr ; FSTP (dst+1)
-pprInstr g@(GLD fmt addr dst)
- = pprG g (hcat [gtab, text "fld", pprFormat_x87 fmt, gsp,
-                 pprAddr addr, gsemi, gpop dst 1])
-
+-- the
 -- GST fmt src addr ==> FLD dst ; FSTPsz addr
-pprInstr g@(GST fmt src addr)
- | src == fake0 && fmt /= FF80 -- fstt instruction doesn't exist
- = pprG g (hcat [gtab,
-                 text "fst", pprFormat_x87 fmt, gsp, pprAddr addr])
- | otherwise
- = pprG g (hcat [gtab, gpush src 0, gsemi,
+pprInstr g@(X87Store fmt  addr)
+ = pprX87 g (hcat [gtab,
                  text "fstp", pprFormat_x87 fmt, gsp, pprAddr addr])
 
-pprInstr g@(GLDZ dst)
- = pprG g (hcat [gtab, text "fldz ; ", gpop dst 1])
-pprInstr g@(GLD1 dst)
- = pprG g (hcat [gtab, text "fld1 ; ", gpop dst 1])
-
-pprInstr (GFTOI src dst)
-   = pprInstr (GDTOI src dst)
-
-pprInstr g@(GDTOI src dst)
-   = pprG g (vcat [
-         hcat [gtab, text "subl $8, %esp ; fnstcw 4(%esp)"],
-         hcat [gtab, gpush src 0],
-         hcat [gtab, text "movzwl 4(%esp), ", reg,
-                     text " ; orl $0xC00, ", reg],
-         hcat [gtab, text "movl ", reg, text ", 0(%esp) ; fldcw 0(%esp)"],
-         hcat [gtab, text "fistpl 0(%esp)"],
-         hcat [gtab, text "fldcw 4(%esp) ; movl 0(%esp), ", reg],
-         hcat [gtab, text "addl $8, %esp"]
-     ])
-   where
-     reg = pprReg II32 dst
-
-pprInstr (GITOF src dst)
-   = pprInstr (GITOD src dst)
-
-pprInstr g@(GITOD src dst)
-   = pprG g (hcat [gtab, text "pushl ", pprReg II32 src,
-                   text " ; fildl (%esp) ; ",
-                   gpop dst 1, text " ; addl $4,%esp"])
-
-pprInstr g@(GDTOF src dst)
-  = pprG g (vcat [gtab <> gpush src 0,
-                  gtab <> text "subl $4,%esp ; fstps (%esp) ; flds (%esp) ; addl $4,%esp ;",
-                  gtab <> gpop dst 1])
-
-{- Gruesome swamp follows.  If you're unfortunate enough to have ventured
-   this far into the jungle AND you give a Rat's Ass (tm) what's going
-   on, here's the deal.  Generate code to do a floating point comparison
-   of src1 and src2, of kind cond, and set the Zero flag if true.
-
-   The complications are to do with handling NaNs correctly.  We want the
-   property that if either argument is NaN, then the result of the
-   comparison is False ... except if we're comparing for inequality,
-   in which case the answer is True.
-
-   Here's how the general (non-inequality) case works.  As an
-   example, consider generating the an equality test:
-
-     pushl %eax         -- we need to mess with this
-     <get src1 to top of FPU stack>
-     fcomp <src2 location in FPU stack> and pop pushed src1
-                -- Result of comparison is in FPU Status Register bits
-                -- C3 C2 and C0
-     fstsw %ax  -- Move FPU Status Reg to %ax
-     sahf       -- move C3 C2 C0 from %ax to integer flag reg
-     -- now the serious magic begins
-     setpo %ah     -- %ah = if comparable(neither arg was NaN) then 1 else 0
-     sete  %al     -- %al = if arg1 == arg2 then 1 else 0
-     andb %ah,%al  -- %al &= %ah
-                   -- so %al == 1 iff (comparable && same); else it holds 0
-     decb %al      -- %al == 0, ZeroFlag=1  iff (comparable && same);
-                      else %al == 0xFF, ZeroFlag=0
-     -- the zero flag is now set as we desire.
-     popl %eax
-
-   The special case of inequality differs thusly:
-
-     setpe %ah     -- %ah = if incomparable(either arg was NaN) then 1 else 0
-     setne %al     -- %al = if arg1 /= arg2 then 1 else 0
-     orb %ah,%al   -- %al = if (incomparable || different) then 1 else 0
-     decb %al      -- if (incomparable || different) then (%al == 0, ZF=1)
-                                                     else (%al == 0xFF, ZF=0)
--}
-pprInstr g@(GCMP cond src1 src2)
-   | case cond of { NE -> True; _ -> False }
-   = pprG g (vcat [
-        hcat [gtab, text "pushl %eax ; ",gpush src1 0],
-        hcat [gtab, text "fcomp ", greg src2 1,
-                    text "; fstsw %ax ; sahf ;  setpe %ah"],
-        hcat [gtab, text "setne %al ;  ",
-              text "orb %ah,%al ;  decb %al ;  popl %eax"]
-    ])
-   | otherwise
-   = pprG g (vcat [
-        hcat [gtab, text "pushl %eax ; ",gpush src1 0],
-        hcat [gtab, text "fcomp ", greg src2 1,
-                    text "; fstsw %ax ; sahf ;  setpo %ah"],
-        hcat [gtab, text "set", pprCond (fix_FP_cond cond), text " %al ;  ",
-              text "andb %ah,%al ;  decb %al ;  popl %eax"]
-    ])
-    where
-        {- On the 486, the flags set by FP compare are the unsigned ones!
-           (This looks like a HACK to me.  WDP 96/03)
-        -}
-        fix_FP_cond :: Cond -> Cond
-        fix_FP_cond GE   = GEU
-        fix_FP_cond GTT  = GU
-        fix_FP_cond LTT  = LU
-        fix_FP_cond LE   = LEU
-        fix_FP_cond EQQ  = EQQ
-        fix_FP_cond NE   = NE
-        fix_FP_cond _    = panic "X86.Ppr.fix_FP_cond: no match"
-        -- there should be no others
-
-
-pprInstr g@(GABS _ src dst)
-   = pprG g (hcat [gtab, gpush src 0, text " ; fabs ; ", gpop dst 1])
-
-pprInstr g@(GNEG _ src dst)
-   = pprG g (hcat [gtab, gpush src 0, text " ; fchs ; ", gpop dst 1])
-
-pprInstr g@(GSQRT fmt src dst)
-   = pprG g (hcat [gtab, gpush src 0, text " ; fsqrt"] $$
-             hcat [gtab, gcoerceto fmt, gpop dst 1])
-
-pprInstr g@(GSIN fmt l1 l2 src dst)
-   = pprG g (pprTrigOp "fsin" False l1 l2 src dst fmt)
-
-pprInstr g@(GCOS fmt l1 l2 src dst)
-   = pprG g (pprTrigOp "fcos" False l1 l2 src dst fmt)
-
-pprInstr g@(GTAN fmt l1 l2 src dst)
-   = pprG g (pprTrigOp "fptan" True l1 l2 src dst fmt)
-
--- In the translations for GADD, GMUL, GSUB and GDIV,
--- the first two cases are mere optimisations.  The otherwise clause
--- generates correct code under all circumstances.
-
-pprInstr g@(GADD _ src1 src2 dst)
-   | src1 == dst
-   = pprG g (text "\t#GADD-xxxcase1" $$
-             hcat [gtab, gpush src2 0,
-                   text " ; faddp %st(0),", greg src1 1])
-   | src2 == dst
-   = pprG g (text "\t#GADD-xxxcase2" $$
-             hcat [gtab, gpush src1 0,
-                   text " ; faddp %st(0),", greg src2 1])
-   | otherwise
-   = pprG g (hcat [gtab, gpush src1 0,
-                   text " ; fadd ", greg src2 1, text ",%st(0)",
-                   gsemi, gpop dst 1])
-
-
-pprInstr g@(GMUL _ src1 src2 dst)
-   | src1 == dst
-   = pprG g (text "\t#GMUL-xxxcase1" $$
-             hcat [gtab, gpush src2 0,
-                   text " ; fmulp %st(0),", greg src1 1])
-   | src2 == dst
-   = pprG g (text "\t#GMUL-xxxcase2" $$
-             hcat [gtab, gpush src1 0,
-                   text " ; fmulp %st(0),", greg src2 1])
-   | otherwise
-   = pprG g (hcat [gtab, gpush src1 0,
-             text " ; fmul ", greg src2 1, text ",%st(0)",
-             gsemi, gpop dst 1])
-
-
-pprInstr g@(GSUB _ src1 src2 dst)
-   | src1 == dst
-   = pprG g (text "\t#GSUB-xxxcase1" $$
-             hcat [gtab, gpush src2 0,
-                   text " ; fsubrp %st(0),", greg src1 1])
-   | src2 == dst
-   = pprG g (text "\t#GSUB-xxxcase2" $$
-             hcat [gtab, gpush src1 0,
-                   text " ; fsubp %st(0),", greg src2 1])
-   | otherwise
-   = pprG g (hcat [gtab, gpush src1 0,
-                   text " ; fsub ", greg src2 1, text ",%st(0)",
-                   gsemi, gpop dst 1])
-
-
-pprInstr g@(GDIV _ src1 src2 dst)
-   | src1 == dst
-   = pprG g (text "\t#GDIV-xxxcase1" $$
-             hcat [gtab, gpush src2 0,
-                   text " ; fdivrp %st(0),", greg src1 1])
-   | src2 == dst
-   = pprG g (text "\t#GDIV-xxxcase2" $$
-             hcat [gtab, gpush src1 0,
-                   text " ; fdivp %st(0),", greg src2 1])
-   | otherwise
-   = pprG g (hcat [gtab, gpush src1 0,
-                   text " ; fdiv ", greg src2 1, text ",%st(0)",
-                   gsemi, gpop dst 1])
-
-
-pprInstr GFREE
-   = vcat [ text "\tffree %st(0) ;ffree %st(1) ;ffree %st(2) ;ffree %st(3)",
-            text "\tffree %st(4) ;ffree %st(5)"
-          ]
 
 -- Atomics
 
@@ -1038,70 +822,11 @@ pprInstr (CMPXCHG format src dst)
    = pprFormatOpOp (sLit "cmpxchg") format src dst
 
 
-pprTrigOp :: String -> Bool -> CLabel -> CLabel
-          -> Reg -> Reg -> Format -> SDoc
-pprTrigOp op -- fsin, fcos or fptan
-          isTan -- we need a couple of extra steps if we're doing tan
-          l1 l2 -- internal labels for us to use
-          src dst fmt
-    = -- We'll be needing %eax later on
-      hcat [gtab, text "pushl %eax;"] $$
-      -- tan is going to use an extra space on the FP stack
-      (if isTan then hcat [gtab, text "ffree %st(6)"] else empty) $$
-      -- First put the value in %st(0) and try to apply the op to it
-      hcat [gpush src 0, text ("; " ++ op)] $$
-      -- Now look to see if C2 was set (overflow, |value| >= 2^63)
-      hcat [gtab, text "fnstsw %ax"] $$
-      hcat [gtab, text "test   $0x400,%eax"] $$
-      -- If we were in bounds then jump to the end
-      hcat [gtab, text "je     " <> ppr l1] $$
-      -- Otherwise we need to shrink the value. Start by
-      -- loading pi, doubleing it (by adding it to itself),
-      -- and then swapping pi with the value, so the value we
-      -- want to apply op to is in %st(0) again
-      hcat [gtab, text "ffree %st(7); fldpi"] $$
-      hcat [gtab, text "fadd   %st(0),%st"] $$
-      hcat [gtab, text "fxch   %st(1)"] $$
-      -- Now we have a loop in which we make the value smaller,
-      -- see if it's small enough, and loop if not
-      (ppr l2 <> char ':') $$
-      hcat [gtab, text "fprem1"] $$
-      -- My Debian libc uses fstsw here for the tan code, but I can't
-      -- see any reason why it should need to be different for tan.
-      hcat [gtab, text "fnstsw %ax"] $$
-      hcat [gtab, text "test   $0x400,%eax"] $$
-      hcat [gtab, text "jne    " <> ppr l2] $$
-      hcat [gtab, text "fstp   %st(1)"] $$
-      hcat [gtab, text op] $$
-      (ppr l1 <> char ':') $$
-      -- Pop the 1.0 tan gave us
-      (if isTan then hcat [gtab, text "fstp %st(0)"] else empty) $$
-      -- Restore %eax
-      hcat [gtab, text "popl %eax;"] $$
-      -- And finally make the result the right size
-      hcat [gtab, gcoerceto fmt, gpop dst 1]
 
 --------------------------
+-- some left over
 
--- coerce %st(0) to the specified size
-gcoerceto :: Format -> SDoc
-gcoerceto FF64 = empty
-gcoerceto FF32 = empty --text "subl $4,%esp ; fstps (%esp) ; flds (%esp) ; addl $4,%esp ; "
-gcoerceto _    = panic "X86.Ppr.gcoerceto: no match"
 
-gpush :: Reg -> RegNo -> SDoc
-gpush reg offset
-   = hcat [text "fld ", greg reg offset]
-
-gpop :: Reg -> RegNo -> SDoc
-gpop reg offset
-   = hcat [text "fstp ", greg reg offset]
-
-greg :: Reg -> RegNo -> SDoc
-greg reg offset = text "%st(" <> int (gregno reg - firstfake+offset) <> char ')'
-
-gsemi :: SDoc
-gsemi = text " ; "
 
 gtab :: SDoc
 gtab  = char '\t'
@@ -1109,45 +834,15 @@ gtab  = char '\t'
 gsp :: SDoc
 gsp   = char ' '
 
-gregno :: Reg -> RegNo
-gregno (RegReal (RealRegSingle i)) = i
-gregno _           = --pprPanic "gregno" (ppr other)
-                     999   -- bogus; only needed for debug printing
-
-pprG :: Instr -> SDoc -> SDoc
-pprG fake actual
-   = (char '#' <> pprGInstr fake) $$ actual
-
-
-pprGInstr :: Instr -> SDoc
-pprGInstr (GMOV src dst)   = pprFormatRegReg (sLit "gmov") FF64 src dst
-pprGInstr (GLD fmt src dst) = pprFormatAddrReg (sLit "gld") fmt src dst
-pprGInstr (GST fmt src dst) = pprFormatRegAddr (sLit "gst") fmt src dst
-
-pprGInstr (GLDZ dst) = pprFormatReg (sLit "gldz") FF64 dst
-pprGInstr (GLD1 dst) = pprFormatReg (sLit "gld1") FF64 dst
 
-pprGInstr (GFTOI src dst) = pprFormatFormatRegReg (sLit "gftoi") FF32 II32 src dst
-pprGInstr (GDTOI src dst) = pprFormatFormatRegReg (sLit "gdtoi") FF64 II32 src dst
 
-pprGInstr (GITOF src dst) = pprFormatFormatRegReg (sLit "gitof") II32 FF32 src dst
-pprGInstr (GITOD src dst) = pprFormatFormatRegReg (sLit "gitod") II32 FF64 src dst
-pprGInstr (GDTOF src dst) = pprFormatFormatRegReg (sLit "gdtof") FF64 FF32 src dst
+pprX87 :: Instr -> SDoc -> SDoc
+pprX87 fake actual
+   = (char '#' <> pprX87Instr fake) $$ actual
 
-pprGInstr (GCMP co src dst) = pprCondRegReg (sLit "gcmp_") FF64 co src dst
-pprGInstr (GABS fmt src dst) = pprFormatRegReg (sLit "gabs") fmt src dst
-pprGInstr (GNEG fmt src dst) = pprFormatRegReg (sLit "gneg") fmt src dst
-pprGInstr (GSQRT fmt src dst) = pprFormatRegReg (sLit "gsqrt") fmt src dst
-pprGInstr (GSIN fmt _ _ src dst) = pprFormatRegReg (sLit "gsin") fmt src dst
-pprGInstr (GCOS fmt _ _ src dst) = pprFormatRegReg (sLit "gcos") fmt src dst
-pprGInstr (GTAN fmt _ _ src dst) = pprFormatRegReg (sLit "gtan") fmt src dst
-
-pprGInstr (GADD fmt src1 src2 dst) = pprFormatRegRegReg (sLit "gadd") fmt src1 src2 dst
-pprGInstr (GSUB fmt src1 src2 dst) = pprFormatRegRegReg (sLit "gsub") fmt src1 src2 dst
-pprGInstr (GMUL fmt src1 src2 dst) = pprFormatRegRegReg (sLit "gmul") fmt src1 src2 dst
-pprGInstr (GDIV fmt src1 src2 dst) = pprFormatRegRegReg (sLit "gdiv") fmt src1 src2 dst
-
-pprGInstr _ = panic "X86.Ppr.pprGInstr: no match"
+pprX87Instr :: Instr -> SDoc
+pprX87Instr (X87Store fmt  dst) = pprFormatAddr (sLit "gst") fmt  dst
+pprX87Instr _ = panic "X86.Ppr.pprX87Instr: no match"
 
 pprDollImm :: Imm -> SDoc
 pprDollImm i = text "$" <> pprImm i
@@ -1215,23 +910,6 @@ pprOpOp name format op1 op2
     ]
 
 
-pprFormatReg :: PtrString -> Format -> Reg -> SDoc
-pprFormatReg name format reg1
-  = hcat [
-        pprMnemonic name format,
-        pprReg format reg1
-    ]
-
-
-pprFormatRegReg :: PtrString -> Format -> Reg -> Reg -> SDoc
-pprFormatRegReg name format reg1 reg2
-  = hcat [
-        pprMnemonic name format,
-        pprReg format reg1,
-        comma,
-        pprReg format reg2
-    ]
-
 
 pprRegReg :: PtrString -> Reg -> Reg -> SDoc
 pprRegReg name reg1 reg2
@@ -1266,31 +944,6 @@ pprCondOpReg name format cond op1 reg2
         pprReg format reg2
     ]
 
-pprCondRegReg :: PtrString -> Format -> Cond -> Reg -> Reg -> SDoc
-pprCondRegReg name format cond reg1 reg2
-  = hcat [
-        char '\t',
-        ptext name,
-        pprCond cond,
-        space,
-        pprReg format reg1,
-        comma,
-        pprReg format reg2
-    ]
-
-pprFormatFormatRegReg :: PtrString -> Format -> Format -> Reg -> Reg -> SDoc
-pprFormatFormatRegReg name format1 format2 reg1 reg2
-  = hcat [
-        char '\t',
-        ptext name,
-        pprFormat format1,
-        pprFormat format2,
-        space,
-        pprReg format1 reg1,
-        comma,
-        pprReg format2 reg2
-    ]
-
 pprFormatFormatOpReg :: PtrString -> Format -> Format -> Operand -> Reg -> SDoc
 pprFormatFormatOpReg name format1 format2 op1 reg2
   = hcat [
@@ -1300,17 +953,6 @@ pprFormatFormatOpReg name format1 format2 op1 reg2
         pprReg format2 reg2
     ]
 
-pprFormatRegRegReg :: PtrString -> Format -> Reg -> Reg -> Reg -> SDoc
-pprFormatRegRegReg name format reg1 reg2 reg3
-  = hcat [
-        pprMnemonic name format,
-        pprReg format reg1,
-        comma,
-        pprReg format reg2,
-        comma,
-        pprReg format reg3
-    ]
-
 pprFormatOpOpReg :: PtrString -> Format -> Operand -> Operand -> Reg -> SDoc
 pprFormatOpOpReg name format op1 op2 reg3
   = hcat [
@@ -1322,26 +964,16 @@ pprFormatOpOpReg name format op1 op2 reg3
         pprReg format reg3
     ]
 
-pprFormatAddrReg :: PtrString -> Format -> AddrMode -> Reg -> SDoc
-pprFormatAddrReg name format op dst
-  = hcat [
-        pprMnemonic name format,
-        pprAddr op,
-        comma,
-        pprReg format dst
-    ]
 
 
-pprFormatRegAddr :: PtrString -> Format -> Reg -> AddrMode -> SDoc
-pprFormatRegAddr name format src op
+pprFormatAddr :: PtrString -> Format -> AddrMode -> SDoc
+pprFormatAddr name format  op
   = hcat [
         pprMnemonic name format,
-        pprReg format src,
         comma,
         pprAddr op
     ]
 
-
 pprShift :: PtrString -> Format -> Operand -> Operand -> SDoc
 pprShift name format src dest
   = hcat [
diff --git a/compiler/nativeGen/X86/RegInfo.hs b/compiler/nativeGen/X86/RegInfo.hs
index 339fdd7a8c..24823e3985 100644
--- a/compiler/nativeGen/X86/RegInfo.hs
+++ b/compiler/nativeGen/X86/RegInfo.hs
@@ -25,10 +25,13 @@ import X86.Regs
 mkVirtualReg :: Unique -> Format -> VirtualReg
 mkVirtualReg u format
    = case format of
-        FF32    -> VirtualRegSSE u
-        FF64    -> VirtualRegSSE u
-        FF80    -> VirtualRegD   u
-        _other  -> VirtualRegI   u
+        FF32    -> VirtualRegD u
+        -- for scalar F32, we use the same xmm as F64!
+        -- this is a hack that needs some improvement.
+        -- For now we map both to being allocated as "Double" Registers
+        -- on X86/X86_64
+        FF64    -> VirtualRegD u
+        _other  -> VirtualRegI u
 
 regDotColor :: Platform -> RealReg -> SDoc
 regDotColor platform reg
@@ -37,11 +40,12 @@ regDotColor platform reg
         _        -> panic "Register not assigned a color"
 
 regColors :: Platform -> UniqFM [Char]
-regColors platform = listToUFM (normalRegColors platform ++ fpRegColors platform)
+regColors platform = listToUFM (normalRegColors platform)
 
 normalRegColors :: Platform -> [(Reg,String)]
 normalRegColors platform =
     zip (map regSingle [0..lastint platform]) colors
+        ++ zip (map regSingle [firstxmm..lastxmm platform]) greys
   where
     -- 16 colors - enough for amd64 gp regs
     colors = ["#800000","#ff0000","#808000","#ffff00","#008000"
@@ -49,17 +53,6 @@ normalRegColors platform =
              ,"#800080","#ff00ff","#87005f","#875f00","#87af00"
              ,"#ff00af"]
 
-fpRegColors :: Platform -> [(Reg,String)]
-fpRegColors platform =
-        [ (fake0, "red")
-        , (fake1, "red")
-        , (fake2, "red")
-        , (fake3, "red")
-        , (fake4, "red")
-        , (fake5, "red") ]
-
-        ++ zip (map regSingle [firstxmm..lastxmm platform]) greys
-  where
     -- 16 shades of grey, enough for the currently supported
     -- SSE extensions.
     greys = ["#0e0e0e","#1c1c1c","#2a2a2a","#383838","#464646"
diff --git a/compiler/nativeGen/X86/Regs.hs b/compiler/nativeGen/X86/Regs.hs
index b7181189a4..37e99c5a71 100644
--- a/compiler/nativeGen/X86/Regs.hs
+++ b/compiler/nativeGen/X86/Regs.hs
@@ -29,7 +29,7 @@ module X86.Regs (
         EABase(..), EAIndex(..), addrModeRegs,
 
         eax, ebx, ecx, edx, esi, edi, ebp, esp,
-        fake0, fake1, fake2, fake3, fake4, fake5, firstfake,
+
 
         rax, rbx, rcx, rdx, rsi, rdi, rbp, rsp,
         r8,  r9,  r10, r11, r12, r13, r14, r15,
@@ -86,10 +86,6 @@ virtualRegSqueeze cls vr
                 VirtualRegF{}           -> 0
                 _other                  -> 0
 
-        RcDoubleSSE
-         -> case vr of
-                VirtualRegSSE{}         -> 1
-                _other                  -> 0
 
         _other -> 0
 
@@ -100,7 +96,7 @@ realRegSqueeze cls rr
         RcInteger
          -> case rr of
                 RealRegSingle regNo
-                        | regNo < firstfake -> 1
+                        | regNo < firstxmm -> 1
                         | otherwise     -> 0
 
                 RealRegPair{}           -> 0
@@ -108,15 +104,11 @@ realRegSqueeze cls rr
         RcDouble
          -> case rr of
                 RealRegSingle regNo
-                        | regNo >= firstfake && regNo <= lastfake -> 1
+                        | regNo >= firstxmm  -> 1
                         | otherwise     -> 0
 
                 RealRegPair{}           -> 0
 
-        RcDoubleSSE
-         -> case rr of
-                RealRegSingle regNo | regNo >= firstxmm -> 1
-                _otherwise                        -> 0
 
         _other -> 0
 
@@ -210,17 +202,16 @@ spRel dflags n
 -- use a Word32 to represent the set of free registers in the register
 -- allocator.
 
-firstfake, lastfake :: RegNo
-firstfake = 16
-lastfake  = 21
+
 
 firstxmm :: RegNo
-firstxmm  = 24
+firstxmm  = 16
 
+--  on 32bit platformOSs, only the first 8 XMM/YMM/ZMM registers are available
 lastxmm :: Platform -> RegNo
 lastxmm platform
- | target32Bit platform = 31
- | otherwise            = 39
+ | target32Bit platform = firstxmm + 7  -- xmm0 - xmmm7
+ | otherwise            = firstxmm + 15 -- xmm0 -xmm15
 
 lastint :: Platform -> RegNo
 lastint platform
@@ -230,14 +221,13 @@ lastint platform
 intregnos :: Platform -> [RegNo]
 intregnos platform = [0 .. lastint platform]
 
-fakeregnos :: [RegNo]
-fakeregnos  = [firstfake .. lastfake]
+
 
 xmmregnos :: Platform -> [RegNo]
 xmmregnos platform = [firstxmm  .. lastxmm platform]
 
 floatregnos :: Platform -> [RegNo]
-floatregnos platform = fakeregnos ++ xmmregnos platform
+floatregnos platform = xmmregnos platform
 
 -- argRegs is the set of regs which are read for an n-argument call to C.
 -- For archs which pass all args on the stack (x86), is empty.
@@ -257,20 +247,19 @@ classOfRealReg :: Platform -> RealReg -> RegClass
 -- However, we can get away without this at the moment because the
 -- only allocatable integer regs are also 8-bit compatible (1, 3, 4).
 classOfRealReg platform reg
- = case reg of
+    = case reg of
         RealRegSingle i
-          | i <= lastint platform -> RcInteger
-          | i <= lastfake         -> RcDouble
-          | otherwise             -> RcDoubleSSE
-
-        RealRegPair{}   -> panic "X86.Regs.classOfRealReg: RegPairs on this arch"
+            | i <= lastint platform -> RcInteger
+            | i <= lastxmm platform -> RcDouble
+            | otherwise             -> panic "X86.Reg.classOfRealReg registerSingle too high"
+        _   -> panic "X86.Regs.classOfRealReg: RegPairs on this arch"
 
 -- | Get the name of the register with this number.
+-- NOTE: fixme, we dont track which "way" the XMM registers are used
 showReg :: Platform -> RegNo -> String
 showReg platform n
-        | n >= firstxmm  = "%xmm" ++ show (n-firstxmm)
-        | n >= firstfake = "%fake" ++ show (n-firstfake)
-        | n >= 8         = "%r" ++ show n
+        | n >= firstxmm && n <= lastxmm  platform = "%xmm" ++ show (n-firstxmm)
+        | n >= 8   && n < firstxmm      = "%r" ++ show n
         | otherwise      = regNames platform A.! n
 
 regNames :: Platform -> A.Array Int String
@@ -290,17 +279,16 @@ Intel x86 architecture:
 - Only ebx, esi, edi and esp are available across a C call (they are callee-saves).
 - Registers 0-7 have 16-bit counterparts (ax, bx etc.)
 - Registers 0-3 have 8 bit counterparts (ah, bh etc.)
-- Registers fake0..fake5 are fakes; we pretend x86 has 6 conventionally-addressable
-  fp registers, and 3-operand insns for them, and we translate this into
-  real stack-based x86 fp code after register allocation.
 
 The fp registers are all Double registers; we don't have any RcFloat class
 regs.  @regClass@ barfs if you give it a VirtualRegF, and mkVReg above should
 never generate them.
+
+TODO: cleanup modelling float vs double registers and how they are the same class.
 -}
 
-fake0, fake1, fake2, fake3, fake4, fake5,
-       eax, ebx, ecx, edx, esp, ebp, esi, edi :: Reg
+
+eax, ebx, ecx, edx, esp, ebp, esi, edi :: Reg
 
 eax   = regSingle 0
 ebx   = regSingle 1
@@ -310,12 +298,7 @@ esi   = regSingle 4
 edi   = regSingle 5
 ebp   = regSingle 6
 esp   = regSingle 7
-fake0 = regSingle 16
-fake1 = regSingle 17
-fake2 = regSingle 18
-fake3 = regSingle 19
-fake4 = regSingle 20
-fake5 = regSingle 21
+
 
 
 
@@ -362,22 +345,22 @@ r12   = regSingle 12
 r13   = regSingle 13
 r14   = regSingle 14
 r15   = regSingle 15
-xmm0  = regSingle 24
-xmm1  = regSingle 25
-xmm2  = regSingle 26
-xmm3  = regSingle 27
-xmm4  = regSingle 28
-xmm5  = regSingle 29
-xmm6  = regSingle 30
-xmm7  = regSingle 31
-xmm8  = regSingle 32
-xmm9  = regSingle 33
-xmm10 = regSingle 34
-xmm11 = regSingle 35
-xmm12 = regSingle 36
-xmm13 = regSingle 37
-xmm14 = regSingle 38
-xmm15 = regSingle 39
+xmm0  = regSingle 16
+xmm1  = regSingle 17
+xmm2  = regSingle 18
+xmm3  = regSingle 19
+xmm4  = regSingle 20
+xmm5  = regSingle 21
+xmm6  = regSingle 22
+xmm7  = regSingle 23
+xmm8  = regSingle 24
+xmm9  = regSingle 25
+xmm10 = regSingle 26
+xmm11 = regSingle 27
+xmm12 = regSingle 28
+xmm13 = regSingle 29
+xmm14 = regSingle 30
+xmm15 = regSingle 31
 
 ripRel :: Displacement -> AddrMode
 ripRel imm      = AddrBaseIndex EABaseRip EAIndexNone imm
@@ -411,7 +394,7 @@ callClobberedRegs platform
    -- Only xmm0-5 are caller-saves registers on 64bit windows.
    -- ( https://docs.microsoft.com/en-us/cpp/build/register-usage )
    -- For details check the Win64 ABI.
-   ++ map regSingle fakeregnos ++ map xmm [0  .. 5]
+   ++ map xmm [0  .. 5]
  | otherwise
     -- all xmm regs are caller-saves
     -- caller-saves registers
@@ -430,11 +413,15 @@ allIntArgRegs platform
     = panic "X86.Regs.allIntArgRegs: not defined for this platform"
  | otherwise = [rdi,rsi,rdx,rcx,r8,r9]
 
+
+-- | on 64bit platforms we pass the first 8 float/double arguments
+-- in the xmm registers.
 allFPArgRegs :: Platform -> [Reg]
 allFPArgRegs platform
  | platformOS platform == OSMinGW32
     = panic "X86.Regs.allFPArgRegs: not defined for this platform"
- | otherwise = map regSingle [firstxmm .. firstxmm+7]
+ | otherwise = map regSingle [firstxmm .. firstxmm + 7 ]
+
 
 -- Machine registers which might be clobbered by instructions that
 -- generate results into fixed registers, or need arguments in a fixed
diff --git a/compiler/types/TyCon.hs b/compiler/types/TyCon.hs
index a943284612..e55cf8b2a3 100644
--- a/compiler/types/TyCon.hs
+++ b/compiler/types/TyCon.hs
@@ -1328,12 +1328,12 @@ Roughly in order of "includes more information":
    number of bits.  It may represent a signed or unsigned integer, a
    floating-point value, or an address.
 
-    data Width = W8 | W16 | W32 | W64 | W80 | W128
+    data Width = W8 | W16 | W32 | W64  | W128
 
  - Size, which is used in the native code generator, is Width +
    floating point information.
 
-   data Size = II8 | II16 | II32 | II64 | FF32 | FF64 | FF80
+   data Size = II8 | II16 | II32 | II64 | FF32 | FF64
 
    it is necessary because e.g. the instruction to move a 64-bit float
    on x86 (movsd) is different from the instruction to move a 64-bit
diff --git a/includes/CodeGen.Platform.hs b/includes/CodeGen.Platform.hs
index 2a373283d4..867dd14fb0 100644
--- a/includes/CodeGen.Platform.hs
+++ b/includes/CodeGen.Platform.hs
@@ -41,65 +41,59 @@ import Reg
 #  define r15   15
 # endif
 
-# define fake0 16
-# define fake1 17
-# define fake2 18
-# define fake3 19
-# define fake4 20
-# define fake5 21
 
 -- N.B. XMM, YMM, and ZMM are all aliased to the same hardware registers hence
 -- being assigned the same RegNos.
-# define xmm0  24
-# define xmm1  25
-# define xmm2  26
-# define xmm3  27
-# define xmm4  28
-# define xmm5  29
-# define xmm6  30
-# define xmm7  31
-# define xmm8  32
-# define xmm9  33
-# define xmm10 34
-# define xmm11 35
-# define xmm12 36
-# define xmm13 37
-# define xmm14 38
-# define xmm15 39
+# define xmm0  16
+# define xmm1  17
+# define xmm2  18
+# define xmm3  19
+# define xmm4  20
+# define xmm5  21
+# define xmm6  22
+# define xmm7  23
+# define xmm8  24
+# define xmm9  25
+# define xmm10 26
+# define xmm11 27
+# define xmm12 28
+# define xmm13 29
+# define xmm14 30
+# define xmm15 31
 
-# define ymm0  24
-# define ymm1  25
-# define ymm2  26
-# define ymm3  27
-# define ymm4  28
-# define ymm5  29
-# define ymm6  30
-# define ymm7  31
-# define ymm8  32
-# define ymm9  33
-# define ymm10 34
-# define ymm11 35
-# define ymm12 36
-# define ymm13 37
-# define ymm14 38
-# define ymm15 39
+# define ymm0  16
+# define ymm1  17
+# define ymm2  18
+# define ymm3  19
+# define ymm4  20
+# define ymm5  21
+# define ymm6  22
+# define ymm7  23
+# define ymm8  24
+# define ymm9  25
+# define ymm10 26
+# define ymm11 27
+# define ymm12 28
+# define ymm13 29
+# define ymm14 30
+# define ymm15 31
 
-# define zmm0  24
-# define zmm1  25
-# define zmm2  26
-# define zmm3  27
-# define zmm4  28
-# define zmm5  29
-# define zmm6  30
-# define zmm7  31
-# define zmm8  32
-# define zmm9  33
-# define zmm10 34
-# define zmm11 35
-# define zmm12 36
-# define zmm13 37
-# define zmm14 38
-# define zmm15 39
+# define zmm0  16
+# define zmm1  17
+# define zmm2  18
+# define zmm3  19
+# define zmm4  20
+# define zmm5  21
+# define zmm6  22
+# define zmm7  23
+# define zmm8  24
+# define zmm9  25
+# define zmm10 26
+# define zmm11 27
+# define zmm12 28
+# define zmm13 29
+# define zmm14 30
+# define zmm15 31
 
 -- Note: these are only needed for ARM/ARM64 because globalRegMaybe is now used in CmmSink.hs.
 -- Since it's only used to check 'isJust', the actual values don't matter, thus
diff --git a/libraries/base/tests/Numeric/all.T b/libraries/base/tests/Numeric/all.T
index c4c9bb4646..9969a5552a 100644
--- a/libraries/base/tests/Numeric/all.T
+++ b/libraries/base/tests/Numeric/all.T
@@ -7,21 +7,10 @@ test('num006', normal, compile_and_run, [''])
 test('num007', normal, compile_and_run, [''])
 test('num008', normal, compile_and_run, [''])
 
-# On i386, we need -msse2 to get reliable floating point results
-if config.arch == 'i386':
-    opts = '-msse2'
-else:
-    opts = ''
+
 test('num009', [ when(fast(), skip)
-               , when(wordsize(32), expect_broken(15062))
-               , when(platform('i386-apple-darwin'), expect_broken(2370))
-               , when(platform('powerpc64le-unknown-linux'), expect_broken(13634))
-               , when(opsys('mingw32'), omit_ways(['ghci'])) ],
-               # We get different results at 1e20 on x86/Windows, so there is
-               # a special output file for that.  I (SDM) don't think these are
-               # serious, since the results for lower numbers are all fine.
-               # We also get another set of results for 1e02 with GHCi, so
-               # I'm skipping that way altogether.
+               # , when(wordsize(32), expect_broken(15062))
+               , when(platform('powerpc64le-unknown-linux'), expect_broken(13634))],
              compile_and_run, [opts])
 test('num010',
      when(platform('i386-apple-darwin'), expect_broken_for(7043, 'ghci')),
diff --git a/libraries/base/tests/Numeric/num009.hs b/libraries/base/tests/Numeric/num009.hs
index e405ddf050..5133d43920 100644
--- a/libraries/base/tests/Numeric/num009.hs
+++ b/libraries/base/tests/Numeric/num009.hs
@@ -1,8 +1,6 @@
 -- trac #2059
 --
--- Note that this test fails miserably when compiled to use X87 floating point.
--- For instance, in the case of (sin 1e20) the X86 FSIN instruction doesn't even
--- get the sign right on my machine.
+
 
 module Main(main) where