diff options
author | Kevin Ryde <user42@zip.com.au> | 2001-06-15 00:12:13 +0200 |
---|---|---|
committer | Kevin Ryde <user42@zip.com.au> | 2001-06-15 00:12:13 +0200 |
commit | fbaf85f32a0a267efaa8a5dea865c9fe60f3191d (patch) | |
tree | fd6c11fa277c48499a6998e514432ee8a750c573 /longlong.h | |
parent | 2401b639a6ca57ce39a438c5602fa2fb0f4b4a76 (diff) | |
download | gmp-fbaf85f32a0a267efaa8a5dea865c9fe60f3191d.tar.gz |
* longlong.h (count_leading_zeros) [pentium]: Decide to go with float
method for p54.
(count_leading_zeros) [alpha]: Add COUNT_LEADING_ZEROS_NEED_CLZ_TAB.
(__clz_tab): Provide a prototype only if it's needed.
Diffstat (limited to 'longlong.h')
-rw-r--r-- | longlong.h | 81 |
1 file changed, 48 insertions, 33 deletions
diff --git a/longlong.h b/longlong.h index fb49f7014..b2dfa8464 100644 --- a/longlong.h +++ b/longlong.h @@ -142,6 +142,7 @@ long __MPN(count_leading_zeros) _PROTO ((UDItype)); #endif #define count_leading_zeros(count, x) \ ((count) = __MPN(count_leading_zeros) (x)) +#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB #endif /* LONGLONG_STANDALONE */ #endif /* __alpha */ @@ -564,14 +565,43 @@ extern UWtype __MPN(udiv_qrnnd) _PROTO ((UWtype *, UWtype, UWtype, UWtype)); : "=a" (q), "=d" (r) \ : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "rm" ((USItype)(dx))) -#if HAVE_HOST_CPU_i586 || HAVE_HOST_CPU_pentium || HAVE_HOST_CPU_pentiummmx -/* This code should be a fixed 14 or 15 cycles, but possibly plus an L1 - cache miss reading from __clz_tab. P5 "bsrl" on the other hand takes - between 10 and 72 cycles depending where the most significant 1 bit is. +/* P5 bsrl takes between 10 and 72 cycles depending where the most + significant 1 bit is, hence the use of the alternatives below. bsfl is + slow too, between 18 and 42 depending where the least significant 1 bit + is. The faster count_leading_zeros are pressed into service via the + generic count_trailing_zeros at the end of the file. */ + +#if HAVE_HOST_CPU_i586 || HAVE_HOST_CPU_pentium + +/* The following should be a fixed 14 cycles or so. Some scheduling + opportunities should be available between the float load/store too. This + is used (with "n&-n" to get trailing zeros) in gcc 3 for __builtin_ffs + and is apparently suggested by the Intel optimizing manual (don't know + exactly where). gcc 2.95 or up will be best for this, so the "double" is + correctly aligned on the stack. */ + +#define count_leading_zeros(c,n) \ + do { \ + union { \ + double d; \ + unsigned a[2]; \ + } __u; \ + ASSERT ((n) != 0); \ + __u.d = (UWtype) (n); \ + (c) = 0x3FF + 31 - (__u.a[1] >> 20); \ + } while (0) + +#else /* ! 
pentium */ +#if HAVE_HOST_CPU_pentiummmx + +/* The following should be a fixed 14 or 15 cycles, but possibly plus an L1 + cache miss reading from __clz_tab. It's favoured over the float above so + as to avoid mixing MMX and x87, since the penalty for switching between + the two is about 100 cycles. The asm block sets __shift to -3 if the high 24 bits are clear, -2 for 16, -1 for 8, or 0 otherwise. This could be written equivalently as - follows, but as of gcc 2.95.2 this results in conditional jumps. + follows, but as of gcc 2.95.2 it results in conditional jumps. __shift = -(__n < 0x1000000); __shift -= (__n < 0x10000); @@ -580,16 +610,7 @@ extern UWtype __MPN(udiv_qrnnd) _PROTO ((UWtype *, UWtype, UWtype, UWtype)); The middle two sbbl and cmpl's pair, and with luck something gcc generates might pair with the first cmpl and the last sbbl. The "32+1" constant could be folded into __clz_tab[], but it doesn't seem worth - making a different table just for that. - - The Intel suggested method, used in __builtin_ffs of gcc 3, - - double d = (n); - (c) = (((unsigned*)&d)[1] >> 20) & 0x3FF; - - seems to measure about the same, or a touch slower. We wouldn't want it - for P55, since we don't want to mix MMX and x87 any more than can be - helped. */ + making a different table just for that. */ #define count_leading_zeros(c,n) \ do { \ @@ -609,17 +630,12 @@ extern UWtype __MPN(udiv_qrnnd) _PROTO ((UWtype *, UWtype, UWtype, UWtype)); #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB #define COUNT_LEADING_ZEROS_0 31 /* n==0 indistinguishable from n==1 */ -/* count_trailing_zeros for pentium is done by the generic code below using - count_leading_zeros. This should be a fixed 15 or 16 cycles, possibly - plus an L1 miss. P5 "bsfl" on the other hand takes between 18 and 42 - depending where the least significant 1 bit is. 
*/ - -#else -/* gcc on p6 prior to 3.0 generates a partial register stall for __cbtmp^31, - due to using "xorb $31" instead of "xorl $31", the former being 1 code - byte smaller. "31-__cbtmp" is a workaround, probably at the cost of one - extra instruction. Do this for "i386" too, since that means generic - x86. */ +#else /* !pentiummmx */ +/* On P6, gcc prior to 3.0 generates a partial register stall for + __cbtmp^31, due to using "xorb $31" instead of "xorl $31", the former + being 1 code byte smaller. "31-__cbtmp" is a workaround, probably at the + cost of one extra instruction. Do this for "i386" too, since that means + generic x86. */ #if __GNUC__ < 3 \ && (HAVE_HOST_CPU_i386 || HAVE_HOST_CPU_pentiumpro \ || HAVE_HOST_CPU_pentium2 || HAVE_HOST_CPU_pentium3) @@ -645,7 +661,8 @@ extern UWtype __MPN(udiv_qrnnd) _PROTO ((UWtype *, UWtype, UWtype, UWtype)); ASSERT ((x) != 0); \ __asm__ ("bsfl %1,%0" : "=r" (count) : "rm" ((USItype)(x))); \ } while (0) -#endif +#endif /* ! pentiummmx */ +#endif /* ! pentium */ #ifndef UMUL_TIME #define UMUL_TIME 10 @@ -1533,12 +1550,6 @@ extern mp_limb_t mpn_udiv_qrnnd _PROTO ((mp_limb_t *, #define udiv_qrnnd __udiv_qrnnd_c #endif -extern -#if __STDC__ -const -#endif -unsigned char __clz_tab[128]; - #if !defined (count_leading_zeros) #define count_leading_zeros(count, x) \ do { \ @@ -1567,6 +1578,10 @@ unsigned char __clz_tab[128]; #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB #endif +#ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB +extern const unsigned char __clz_tab[128]; +#endif + #if !defined (count_trailing_zeros) /* Define count_trailing_zeros using count_leading_zeros. The latter might be defined in asm, but if it is not, the C version above is good enough. */ |