diff options
author | Torbjorn Granlund <tg@gmplib.org> | 2019-09-13 14:32:41 +0200 |
---|---|---|
committer | Torbjorn Granlund <tg@gmplib.org> | 2019-09-13 14:32:41 +0200 |
commit | 5e6abbb09e67c726827a9d86ba0749a9d07211e8 (patch) | |
tree | 1d5aeafd197d7aa286d3faaa03a09ac64856c48c | |
parent | a2d8ee6a2971e33f66fdc3e100d86dfc39e6f472 (diff) | |
download | gmp-5e6abbb09e67c726827a9d86ba0749a9d07211e8.tar.gz |
(umul_ppmm): Fix criterion for when to use mulx.
(count_leading_zeros): Use lzcnt for appropriate CPUs.
(count_trailing_zeros): Use tzcnt for appropriate CPUs.
-rw-r--r-- | longlong.h | 40 |
1 files changed, 32 insertions, 8 deletions
```diff
diff --git a/longlong.h b/longlong.h
index dc8d2dd56..7624d6709 100644
--- a/longlong.h
+++ b/longlong.h
@@ -1058,14 +1058,15 @@ extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype);
            : "=r" (sh), "=&r" (sl) \
            : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)), \
              "1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
-#if defined (HAVE_MULX)
+#if HAVE_HOST_CPU_haswell || HAVE_HOST_CPU_broadwell || HAVE_HOST_CPU_skylake \
+  || HAVE_HOST_CPU_bd4 || HAVE_HOST_CPU_zen
 #define umul_ppmm(w1, w0, u, v) \
-  __asm__ ("mulx %3, %0, %1" \
+  __asm__ ("mulx\t%3, %0, %1" \
            : "=r" (w0), "=r" (w1) \
            : "%d" ((UDItype)(u)), "rm" ((UDItype)(v)))
 #else
 #define umul_ppmm(w1, w0, u, v) \
-  __asm__ ("mulq %3" \
+  __asm__ ("mulq\t%3" \
            : "=a" (w0), "=d" (w1) \
            : "%0" ((UDItype)(u)), "rm" ((UDItype)(v)))
 #endif
@@ -1073,21 +1074,44 @@ extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype);
   __asm__ ("divq %4" /* stringification in K&R C */ \
            : "=a" (q), "=d" (r) \
            : "0" ((UDItype)(n0)), "1" ((UDItype)(n1)), "rm" ((UDItype)(dx)))
-/* bsrq destination must be a 64-bit register, hence UDItype for __cbtmp. */
+
+#if HAVE_HOST_CPU_haswell || HAVE_HOST_CPU_broadwell || HAVE_HOST_CPU_skylake \
+  || HAVE_HOST_CPU_k10 || HAVE_HOST_CPU_bd1 || HAVE_HOST_CPU_bd2 \
+  || HAVE_HOST_CPU_bd3 || HAVE_HOST_CPU_bd4 || HAVE_HOST_CPU_zen \
+  || HAVE_HOST_CPU_bobcat || HAVE_HOST_CPU_jaguar
+#define count_leading_zeros(count, x) \
+  do { \
+    /* This is lzcnt, spelled for older assemblers. Destination and */ \
+    /* source must be a 64-bit registers, hence cast and %q. */ \
+    __asm__ ("rep;bsr\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x))); \
+  } while (0)
+#define COUNT_LEADING_ZEROS_0 64
+#else
 #define count_leading_zeros(count, x) \
   do { \
     UDItype __cbtmp; \
     ASSERT ((x) != 0); \
-    __asm__ ("bsrq %1,%0" : "=r" (__cbtmp) : "rm" ((UDItype)(x))); \
+    __asm__ ("bsr\t%1,%0" : "=r" (__cbtmp) : "rm" ((UDItype)(x))); \
     (count) = __cbtmp ^ 63; \
   } while (0)
-/* bsfq destination must be a 64-bit register, "%q0" forces this in case
-   count is only an int. */
+#endif
+
+#if HAVE_HOST_CPU_bd2 || HAVE_HOST_CPU_bd3 || HAVE_HOST_CPU_bd4 \
+  || HAVE_HOST_CPU_zen || HAVE_HOST_CPU_jaguar
+#define count_trailing_zeros(count, x) \
+  do { \
+    /* This is tzcnt, spelled for older assemblers. Destination and */ \
+    /* source must be a 64-bit registers, hence cast and %q. */ \
+    __asm__ ("rep;bsf\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x))); \
+  } while (0)
+#define COUNT_TRAILING_ZEROS_0 64
+#else
 #define count_trailing_zeros(count, x) \
   do { \
     ASSERT ((x) != 0); \
-    __asm__ ("bsfq %1,%q0" : "=r" (count) : "rm" ((UDItype)(x))); \
+    __asm__ ("bsf\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x))); \
   } while (0)
+#endif
 #endif /* __amd64__ */
 
 #if defined (__i860__) && W_TYPE_SIZE == 32
```