summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Torbjorn Granlund <tg@gmplib.org> 2019-09-13 14:32:41 +0200
committer: Torbjorn Granlund <tg@gmplib.org> 2019-09-13 14:32:41 +0200
commit: 5e6abbb09e67c726827a9d86ba0749a9d07211e8 (patch)
tree: 1d5aeafd197d7aa286d3faaa03a09ac64856c48c
parent: a2d8ee6a2971e33f66fdc3e100d86dfc39e6f472 (diff)
download: gmp-5e6abbb09e67c726827a9d86ba0749a9d07211e8.tar.gz
(umul_ppmm): Fix criterion for when to use mulx.
(count_leading_zeros): Use lzcnt for appropriate CPUs. (count_trailing_zeros): Use tzcnt for appropriate CPUs.
-rw-r--r-- longlong.h 40
1 files changed, 32 insertions, 8 deletions
diff --git a/longlong.h b/longlong.h
index dc8d2dd56..7624d6709 100644
--- a/longlong.h
+++ b/longlong.h
@@ -1058,14 +1058,15 @@ extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype);
: "=r" (sh), "=&r" (sl) \
: "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)), \
"1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
-#if defined (HAVE_MULX)
+#if HAVE_HOST_CPU_haswell || HAVE_HOST_CPU_broadwell || HAVE_HOST_CPU_skylake \
+ || HAVE_HOST_CPU_bd4 || HAVE_HOST_CPU_zen
#define umul_ppmm(w1, w0, u, v) \
- __asm__ ("mulx %3, %0, %1" \
+ __asm__ ("mulx\t%3, %0, %1" \
: "=r" (w0), "=r" (w1) \
: "%d" ((UDItype)(u)), "rm" ((UDItype)(v)))
#else
#define umul_ppmm(w1, w0, u, v) \
- __asm__ ("mulq %3" \
+ __asm__ ("mulq\t%3" \
: "=a" (w0), "=d" (w1) \
: "%0" ((UDItype)(u)), "rm" ((UDItype)(v)))
#endif
@@ -1073,21 +1074,44 @@ extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype);
__asm__ ("divq %4" /* stringification in K&R C */ \
: "=a" (q), "=d" (r) \
: "0" ((UDItype)(n0)), "1" ((UDItype)(n1)), "rm" ((UDItype)(dx)))
-/* bsrq destination must be a 64-bit register, hence UDItype for __cbtmp. */
+
+#if HAVE_HOST_CPU_haswell || HAVE_HOST_CPU_broadwell || HAVE_HOST_CPU_skylake \
+ || HAVE_HOST_CPU_k10 || HAVE_HOST_CPU_bd1 || HAVE_HOST_CPU_bd2 \
+ || HAVE_HOST_CPU_bd3 || HAVE_HOST_CPU_bd4 || HAVE_HOST_CPU_zen \
+ || HAVE_HOST_CPU_bobcat || HAVE_HOST_CPU_jaguar
+#define count_leading_zeros(count, x) \
+ do { \
+ /* This is lzcnt, spelled rep;bsr for older assemblers. Destination */ \
+ /* and source must both be 64-bit operands, hence the cast and %q. */ \
+ __asm__ ("rep;bsr\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x))); \
+ } while (0)
+#define COUNT_LEADING_ZEROS_0 64
+#else
#define count_leading_zeros(count, x) \
do { \
UDItype __cbtmp; \
ASSERT ((x) != 0); \
- __asm__ ("bsrq %1,%0" : "=r" (__cbtmp) : "rm" ((UDItype)(x))); \
+ __asm__ ("bsr\t%1,%0" : "=r" (__cbtmp) : "rm" ((UDItype)(x))); \
(count) = __cbtmp ^ 63; \
} while (0)
-/* bsfq destination must be a 64-bit register, "%q0" forces this in case
- count is only an int. */
+#endif
+
+#if HAVE_HOST_CPU_bd2 || HAVE_HOST_CPU_bd3 || HAVE_HOST_CPU_bd4 \
+ || HAVE_HOST_CPU_zen || HAVE_HOST_CPU_jaguar
+#define count_trailing_zeros(count, x) \
+ do { \
+ /* This is tzcnt, spelled rep;bsf for older assemblers. Destination */ \
+ /* and source must both be 64-bit operands, hence the cast and %q. */ \
+ __asm__ ("rep;bsf\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x))); \
+ } while (0)
+#define COUNT_TRAILING_ZEROS_0 64
+#else
#define count_trailing_zeros(count, x) \
do { \
ASSERT ((x) != 0); \
- __asm__ ("bsfq %1,%q0" : "=r" (count) : "rm" ((UDItype)(x))); \
+ __asm__ ("bsf\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x))); \
} while (0)
+#endif
#endif /* __amd64__ */
#if defined (__i860__) && W_TYPE_SIZE == 32