diff options
| author | Herbert Valerio Riedel <hvr@gnu.org> | 2014-10-18 14:32:33 +0200 | 
|---|---|---|
| committer | Herbert Valerio Riedel <hvr@gnu.org> | 2014-10-18 14:32:56 +0200 | 
| commit | 612f3d120c65a461a4ad7f212d67bdae005f4975 (patch) | |
| tree | 5045f1a4ae209ad3201c479bc8a86d47a6cc4685 /compiler | |
| parent | 1c35f9f1cb7a293da85d649904ce731a65824cfe (diff) | |
| download | haskell-612f3d120c65a461a4ad7f212d67bdae005f4975.tar.gz | |
Implement optimized NCG `MO_Ctz W64` op for i386 (#9340)
Summary:
This is an optimization to the CTZ primops introduced for #9340
Previously we called out to `hs_ctz64`, but we can actually generate
better hand-tuned code while avoiding the FFI ccall.
With this patch, the code
  {-# LANGUAGE MagicHash #-}
  module TestClz0 where
  import GHC.Prim
  ctz64 :: Word64# -> Word#
  ctz64 x = ctz64# x
results in the following assembler generated by NCG on i386:
  TestClz.ctz64_info:
      movl (%ebp),%eax
      movl 4(%ebp),%ecx
      movl %ecx,%edx
      orl %eax,%edx
      movl $64,%edx
      je _nAO
      bsf %ecx,%ecx
      addl $32,%ecx
      bsf %eax,%eax
      cmovne %eax,%ecx
      movl %ecx,%edx
  _nAO:
      movl %edx,%esi
      addl $8,%ebp
      jmp *(%ebp)
For comparision, here's what LLVM 3.4 currently generates:
  000000fc <TestClzz_ctzz64_info>:
    fc:   0f bc 45 04             bsf    0x4(%ebp),%eax
   100:   b9 20 00 00 00          mov    $0x20,%ecx
   105:   0f 45 c8                cmovne %eax,%ecx
   108:   83 c1 20                add    $0x20,%ecx
   10b:   8b 45 00                mov    0x0(%ebp),%eax
   10e:   8b 55 08                mov    0x8(%ebp),%edx
   111:   0f bc f0                bsf    %eax,%esi
   114:   85 c0                   test   %eax,%eax
   116:   0f 44 f1                cmove  %ecx,%esi
   119:   83 c5 08                add    $0x8,%ebp
   11c:   ff e2                   jmp    *%edx
Reviewed By: austin
Auditors: simonmar
Differential Revision: https://phabricator.haskell.org/D163
Diffstat (limited to 'compiler')
| -rw-r--r-- | compiler/nativeGen/X86/CodeGen.hs | 41 | 
1 files changed, 32 insertions, 9 deletions
| diff --git a/compiler/nativeGen/X86/CodeGen.hs b/compiler/nativeGen/X86/CodeGen.hs index 9d7cb78a6c..abd87ed087 100644 --- a/compiler/nativeGen/X86/CodeGen.hs +++ b/compiler/nativeGen/X86/CodeGen.hs @@ -1799,14 +1799,38 @@ genCCall dflags is32Bit (PrimTarget (MO_Clz width)) dest_regs@[dst] args@[src]      size = if width == W8 then II16 else intSize width      lbl = mkCmmCodeLabel primPackageKey (fsLit (clzLabel width)) -genCCall dflags is32Bit (PrimTarget (MO_Ctz width)) dest_regs@[dst] args@[src] +genCCall dflags is32Bit (PrimTarget (MO_Ctz width)) [dst] [src]    | is32Bit, width == W64 = do -    -- Fallback to `hs_ctz64` on i386 -    targetExpr <- cmmMakeDynamicReference dflags CallReference lbl -    let target = ForeignTarget targetExpr (ForeignConvention CCallConv -                                           [NoHint] [NoHint] -                                           CmmMayReturn) -    genCCall dflags is32Bit target dest_regs args +      ChildCode64 vcode rlo <- iselExpr64 src +      let rhi     = getHiVRegFromLo rlo +          dst_r   = getRegisterReg platform False (CmmLocal dst) +      lbl1 <- getBlockIdNat +      lbl2 <- getBlockIdNat +      tmp_r <- getNewRegNat size + +      -- The following instruction sequence corresponds to the pseudo-code +      -- +      --  if (src) { +      --    dst = src.lo32 ? BSF(src.lo32) : (BSF(src.hi32) + 32); +      --  } else { +      --    dst = 64; +      --  } +      return $ vcode `appOL` toOL +               ([ MOV      II32 (OpReg rhi)         (OpReg tmp_r) +                , OR       II32 (OpReg rlo)         (OpReg tmp_r) +                , MOV      II32 (OpImm (ImmInt 64)) (OpReg dst_r) +                , JXX EQQ    lbl2 +                , JXX ALWAYS lbl1 + +                , NEWBLOCK   lbl1 +                , BSF     II32 (OpReg rhi)         dst_r +                , ADD     II32 (OpImm (ImmInt 32)) (OpReg dst_r) +                , BSF     II32 (OpReg rlo)         tmp_r +                , CMOV NE II32 (OpReg tmp_r)       dst_r +                , JXX ALWAYS lbl2 + +                , NEWBLOCK   lbl2 +                ])    | otherwise = do      code_src <- getAnyReg src @@ -1828,7 +1852,6 @@ genCCall dflags is32Bit (PrimTarget (MO_Ctz width)) dest_regs@[dst] args@[src]      bw = widthInBits width      platform = targetPlatform dflags      size = if width == W8 then II16 else intSize width -    lbl = mkCmmCodeLabel primPackageKey (fsLit (ctzLabel width))  genCCall dflags is32Bit (PrimTarget (MO_UF_Conv width)) dest_regs args = do      targetExpr <- cmmMakeDynamicReference dflags @@ -2485,7 +2508,7 @@ outOfLineCmmOp mop res args                MO_PopCnt _  -> fsLit "popcnt"                MO_BSwap _   -> fsLit "bswap"                MO_Clz w     -> fsLit $ clzLabel w -              MO_Ctz w     -> fsLit $ ctzLabel w +              MO_Ctz _     -> unsupported                MO_AtomicRMW _ _ -> fsLit "atomicrmw"                MO_AtomicRead _  -> fsLit "atomicread" | 
