summary | refs | log | tree | commit | diff
path: root/tune
diff options
context:
space:
mode:
author: Kevin Ryde <user42@zip.com.au> 2000-06-30 00:42:38 +0200
committer: Kevin Ryde <user42@zip.com.au> 2000-06-30 00:42:38 +0200
commit: a557fe9d07c37497c572416aebe132536bbfbe52 (patch)
tree: 94818365113516d33e825367aef3e3c2ecdd670c /tune
parent: ef0f7391964d47fd032ce46c6cd52441a6b2b39e (diff)
download: gmp-a557fe9d07c37497c572416aebe132536bbfbe52.tar.gz
* tune/*: Locate data to help direct-mapped caches, add measuring
of mpz_init/clear, mpz_add and mpz_bin_uiui, various cleanups.
Diffstat (limited to 'tune')
-rw-r--r-- tune/common.c 271
-rw-r--r-- tune/speed.c 138
-rw-r--r-- tune/speed.h 540
3 files changed, 641 insertions, 308 deletions
diff --git a/tune/common.c b/tune/common.c
index 60910a83b..93c35849d 100644
--- a/tune/common.c
+++ b/tune/common.c
@@ -43,6 +43,10 @@ MA 02111-1307, USA.
#define numberof(x) (sizeof (x) / sizeof ((x)[0]))
typedef int (*qsort_function_t) _PROTO ((const void *, const void *));
+
+int speed_option_addrs = 0;
+
+
void
pentium_wbinvd(void)
{
@@ -128,7 +132,7 @@ speed_measure (double (*fun) _PROTO ((struct speed_params *s)),
memset (&s_dummy, '\0', sizeof (s_dummy));
s = &s_dummy;
}
-
+
s->reps = 1;
s->time_divisor = 1.0;
for (i = 0; i < numberof (t); i++)
@@ -229,16 +233,74 @@ mpn_cache_fill_write (mp_ptr ptr, mp_size_t size)
void
+speed_operand_src (struct speed_params *s, mp_ptr ptr, mp_size_t size)
+{
+ if (s->src_num >= numberof (s->src))
+ {
+ fprintf (stderr, "speed_operand_src: no room left in s->src[]\n");
+ abort ();
+ }
+ s->src[s->src_num].ptr = ptr;
+ s->src[s->src_num].size = size;
+ s->src_num++;
+}
+
+
+void
+speed_operand_dst (struct speed_params *s, mp_ptr ptr, mp_size_t size)
+{
+ if (s->dst_num >= numberof (s->dst))
+ {
+ fprintf (stderr, "speed_operand_dst: no room left in s->dst[]\n");
+ abort ();
+ }
+ s->dst[s->dst_num].ptr = ptr;
+ s->dst[s->dst_num].size = size;
+ s->dst_num++;
+}
+
+
+void
speed_cache_fill (struct speed_params *s)
{
+ static struct speed_params prev;
int i;
-#if 0
- for (i = 0; i < s->dst_num; i++)
- printf ("dst %p %ld\n", s->dst[i].ptr, s->dst[i].size);
- for (i = 0; i < s->src_num; i++)
- printf ("src %p %ld\n", s->src[i].ptr, s->src[i].size);
-#endif
+ /* FIXME: need a better way to get the format string for a pointer */
+
+ if (speed_option_addrs)
+ {
+ int different;
+
+ different = (s->dst_num != prev.dst_num || s->src_num != prev.src_num);
+ for (i = 0; i < s->dst_num; i++)
+ different |= (s->dst[i].ptr != prev.dst[i].ptr);
+ for (i = 0; i < s->src_num; i++)
+ different |= (s->src[i].ptr != prev.src[i].ptr);
+
+ if (different)
+ {
+ if (s->dst_num != 0)
+ {
+ printf ("dst");
+ for (i = 0; i < s->dst_num; i++)
+ printf (" %08lX", (unsigned long) s->dst[i].ptr);
+ printf (" ");
+ }
+
+ if (s->src_num != 0)
+ {
+ printf ("src");
+ for (i = 0; i < s->src_num; i++)
+ printf (" %08lX", (unsigned long) s->src[i].ptr);
+ printf (" ");
+ }
+ printf (" (cf sp approx %08lX)\n", (unsigned long) &different);
+
+ }
+
+ memcpy (&prev, s, sizeof(prev));
+ }
switch (s->cache) {
case 0:
@@ -285,8 +347,8 @@ _mp_allocate_or_reallocate (void *ptr, size_t oldsize, size_t newsize)
}
-/* Adjust ptr to align to CACHE_LINE_SIZE plus "align". ptr needs to have
- room for up to CACHE_LINE_SIZE-4 extra bytes. */
+/* Adjust ptr to align to CACHE_LINE_SIZE bytes plus "align" limbs. ptr
+ needs to have room for up to CACHE_LINE_SIZE-4 extra bytes. */
mp_ptr
speed_tmp_alloc_adjust (void *ptr, mp_size_t align)
@@ -316,6 +378,20 @@ mpz_set_n (mpz_ptr z, mp_srcptr p, mp_size_t size)
}
+/* Miscellaneous options accepted by tune and speed programs under -o. */
+
+void
+speed_option_set (const char *s)
+{
+ if (strcmp (s, "addrs") == 0) speed_option_addrs = 1;
+ else
+ {
+ printf ("Unrecognised -o option: %s\n", s);
+ exit (1);
+ }
+}
+
+
/* The following are basic speed running routines for various gmp functions.
Many are very similar and use speed.h macros.
@@ -338,7 +414,7 @@ mpz_set_n (mpz_ptr z, mp_srcptr p, mp_size_t size)
using the routines will ensure s->xp and s->yp are aligned. Aligning
onto a CACHE_LINE_SIZE boundary is suggested. s->align_wp and
s->align_wp2 should be respected where it makes sense to do so.
- SPEED_TMP_ALLOC is a good way to do this.
+ SPEED_TMP_ALLOC_LIMBS is a good way to do this.
A loop of the following form can be expected to turn into good assembler
code on most CPUs, thereby minimizing overhead in the measurement. It
@@ -585,7 +661,7 @@ speed_mpn_sqr_n (struct speed_params *s)
double
speed_mpn_mul_n_sqr (struct speed_params *s)
{
- SPEED_ROUTINE_MPN_SQR_CALL (mpn_mul_n (wp, s->xp, s->xp, s->size), 1);
+ SPEED_ROUTINE_MPN_SQR_CALL (mpn_mul_n (wp, s->xp, s->xp, s->size));
}
double
@@ -600,32 +676,26 @@ speed_mpn_sqr_basecase (struct speed_params *s)
SPEED_ROUTINE_MPN_SQR (mpn_sqr_basecase);
}
-/* FIXME: size restrictions on kara */
double
speed_mpn_kara_mul_n (struct speed_params *s)
{
- SPEED_ROUTINE_MPN_MUL_N_CALL
- (mpn_kara_mul_n (wp, s->xp, s->xp, s->size, tspace),
- MPN_KARA_MUL_N_TSIZE (s->size));
+ SPEED_ROUTINE_MPN_KARA_MUL_N (mpn_kara_mul_n);
}
double
speed_mpn_kara_sqr_n (struct speed_params *s)
{
- SPEED_ROUTINE_MPN_SQR_CALL
- (mpn_kara_sqr_n (wp, s->xp, s->size, tspace),
- MPN_KARA_SQR_N_TSIZE (s->size));
+ SPEED_ROUTINE_MPN_KARA_SQR_N (mpn_kara_sqr_n);
}
-/* FIXME: size restrictions on toom3 */
double
speed_mpn_toom3_mul_n (struct speed_params *s)
{
- SPEED_ROUTINE_GMPN_TOOM3_MUL_N (mpn_toom3_mul_n);
+ SPEED_ROUTINE_MPN_TOOM3_MUL_N (mpn_toom3_mul_n);
}
double
speed_mpn_toom3_sqr_n (struct speed_params *s)
{
- SPEED_ROUTINE_GMPN_TOOM3_SQR_N (mpn_toom3_sqr_n);
+ SPEED_ROUTINE_MPN_TOOM3_SQR_N (mpn_toom3_sqr_n);
}
@@ -736,3 +806,162 @@ speed_noop_wxys (struct speed_params *s)
return t;
}
+
+#define SPEED_ROUTINE_ALLOC_FREE(variables, calls) \
+ { \
+ unsigned i; \
+ variables; \
+ \
+ speed_starttime (); \
+ i = s->reps; \
+ do \
+ { \
+ calls; \
+ } \
+ while (--i != 0); \
+ return speed_endtime (); \
+ }
+
+
+/* Compare these to see how much malloc/free costs and then how much
+ _mp_default_allocate/free and mpz_init/clear add. mpz_init/clear or
+ mpq_init/clear will be doing a 1 limb allocate, so use that as the size
+ when including them in comparisons. */
+
+double
+speed_malloc_free (struct speed_params *s)
+{
+ size_t bytes = s->size * BYTES_PER_MP_LIMB;
+ SPEED_ROUTINE_ALLOC_FREE (void *p,
+ p = malloc (bytes);
+ free (p));
+}
+
+double
+speed_malloc_realloc_free (struct speed_params *s)
+{
+ size_t bytes = s->size * BYTES_PER_MP_LIMB;
+ SPEED_ROUTINE_ALLOC_FREE (void *p,
+ p = malloc (BYTES_PER_MP_LIMB);
+ p = realloc (p, bytes);
+ free (p));
+}
+
+double
+speed_mp_allocate_free (struct speed_params *s)
+{
+ size_t bytes = s->size * BYTES_PER_MP_LIMB;
+ SPEED_ROUTINE_ALLOC_FREE (void *p,
+ p = (*_mp_allocate_func) (bytes);
+ (*_mp_free_func) (p, bytes));
+}
+
+double
+speed_mp_allocate_reallocate_free (struct speed_params *s)
+{
+ size_t bytes = s->size * BYTES_PER_MP_LIMB;
+ SPEED_ROUTINE_ALLOC_FREE
+ (void *p,
+ p = (*_mp_allocate_func) (BYTES_PER_MP_LIMB);
+ p = (*_mp_reallocate_func) (p, bytes, BYTES_PER_MP_LIMB);
+ (*_mp_free_func) (p, bytes));
+}
+
+double
+speed_mpz_init_clear (struct speed_params *s)
+{
+ SPEED_ROUTINE_ALLOC_FREE (mpz_t z,
+ mpz_init (z);
+ mpz_clear (z));
+}
+
+double
+speed_mpz_init_realloc_clear (struct speed_params *s)
+{
+ SPEED_ROUTINE_ALLOC_FREE (mpz_t z,
+ mpz_init (z);
+ _mpz_realloc (z, s->size);
+ mpz_clear (z));
+}
+
+double
+speed_mpq_init_clear (struct speed_params *s)
+{
+ SPEED_ROUTINE_ALLOC_FREE (mpq_t q,
+ mpq_init (q);
+ mpq_clear (q));
+}
+
+double
+speed_mpf_init_clear (struct speed_params *s)
+{
+ SPEED_ROUTINE_ALLOC_FREE (mpf_t f,
+ mpf_init (f);
+ mpf_clear (f));
+}
+
+
+/* Compare this to mpn_add_n to see how much overhead mpz_add adds. Note
+ that repeatedly calling mpz_add with the same data gives branch prediction
+ in it an advantage. */
+
+double
+speed_mpz_add (struct speed_params *s)
+{
+ mpz_t w, x, y;
+ unsigned i;
+ double t;
+
+ mpz_init (w);
+ mpz_init (x);
+ mpz_init (y);
+
+ mpz_set_n (x, s->xp, s->size);
+ mpz_set_n (y, s->yp, s->size);
+ mpz_add (w, x, y);
+
+ speed_starttime ();
+ i = s->reps;
+ do
+ {
+ mpz_add (w, x, y);
+ }
+ while (--i != 0);
+ t = speed_endtime ();
+
+ mpz_clear (w);
+ mpz_clear (x);
+ mpz_clear (y);
+ return t;
+}
+
+
+/* Measure mpz_bin_uiui: if r==0, calculate binomial (size,size/2),
+ otherwise calculate binomial (size,r). */
+
+double
+speed_mpz_bin_uiui (struct speed_params *s)
+{
+ mpz_t w;
+ unsigned long k;
+ unsigned i;
+ double t;
+
+ mpz_init (w);
+ if (s->r != 0)
+ k = s->r;
+ else
+ k = s->size/2;
+
+ speed_starttime ();
+ i = s->reps;
+ do
+ {
+ mpz_bin_uiui (w, s->size, k);
+ }
+ while (--i != 0);
+ t = speed_endtime ();
+
+ mpz_clear (w);
+ return t;
+}
diff --git a/tune/speed.c b/tune/speed.c
index b4a081e0d..3779b7bc1 100644
--- a/tune/speed.c
+++ b/tune/speed.c
@@ -24,12 +24,12 @@ MA 02111-1307, USA.
/* Usage message is in the code below, run with no arguments to print it.
See README for interesting applications.
- To add a new routine foo() to measure, create a speed_foo() function in
- the style of the existing ones and add an entry in the routine[] array.
- Put FLAG_R if the routine needs an "r" parameter.
+ To add a new routine foo(), create a speed_foo() function in the style of
+ the existing ones and add an entry in the routine[] array. Put FLAG_R if
+ speed_foo() wants an "r" parameter.
The routines don't have help messages or descriptions, but most have
- pretty suggestive names. See the source code for full details.
+ suggestive names. See the source code for full details.
*/
@@ -67,10 +67,6 @@ SPEED_EXTRA_PROTOS
#endif
-#ifndef LONG_BIT
-#define LONG_BIT (8 * sizeof(long))
-#endif
-
#define numberof(x) (sizeof (x) / sizeof ((x)[0]))
#define MPN_FILL(ptr, size, n) \
@@ -109,6 +105,7 @@ struct size_array_t {
mp_size_t size_num = 0;
mp_size_t size_allocnum = 0;
int option_resource_usage = 0;
+long option_seed = 123456789;
struct speed_params sp;
@@ -172,10 +169,6 @@ const struct routine_t {
{ "mpn_popcount", speed_mpn_popcount },
{ "mpn_hamdist", speed_mpn_hamdist },
- { "mpz_fac_ui", speed_mpz_fac_ui },
- { "mpz_fib_ui", speed_mpz_fib_ui },
- { "mpz_powm", speed_mpz_powm },
-
{ "mpn_gcdext", speed_mpn_gcdext },
{ "mpn_gcd", speed_mpn_gcd },
{ "mpn_gcd_1", speed_mpn_gcd_1, FLAG_R_OPTIONAL },
@@ -193,6 +186,12 @@ const struct routine_t {
{ "mpn_toom3_mul_n", speed_mpn_toom3_mul_n },
{ "mpn_toom3_sqr_n", speed_mpn_toom3_sqr_n },
+ { "mpz_add", speed_mpz_add },
+ { "mpz_bin_uiui", speed_mpz_bin_uiui, FLAG_R_OPTIONAL },
+ { "mpz_fac_ui", speed_mpz_fac_ui },
+ { "mpz_fib_ui", speed_mpz_fib_ui },
+ { "mpz_powm", speed_mpz_powm },
+
{ "MPN_COPY", speed_MPN_COPY },
{ "MPN_COPY_INCR", speed_MPN_COPY_INCR },
{ "MPN_COPY_DECR", speed_MPN_COPY_DECR },
@@ -200,6 +199,15 @@ const struct routine_t {
{ "modlimb_invert", speed_modlimb_invert },
+ { "malloc_free", speed_malloc_free },
+ { "malloc_realloc_free", speed_malloc_realloc_free },
+ { "mp_allocate_free", speed_mp_allocate_free },
+ { "mp_allocate_reallocate_free", speed_mp_allocate_reallocate_free },
+ { "mpz_init_clear", speed_mpz_init_clear },
+ { "mpq_init_clear", speed_mpq_init_clear },
+ { "mpf_init_clear", speed_mpf_init_clear },
+ { "mpz_init_realloc_clear", speed_mpz_init_realloc_clear },
+
#ifdef SPEED_EXTRA_ROUTINES
SPEED_EXTRA_ROUTINES
#endif
@@ -218,6 +226,32 @@ struct choice_t *choice;
int num_choices = 0;
+void
+data_fill (mp_ptr ptr, mp_size_t size)
+{
+ switch (option_data) {
+ case DATA_RANDOM:
+ mpn_random (ptr, size);
+ break;
+ case DATA_RANDOM2:
+ mpn_random2 (ptr, size);
+ break;
+ case DATA_ZEROS:
+ MPN_ZERO (ptr, size);
+ break;
+ case DATA_FFS:
+ MPN_FILL (ptr, size, MP_LIMB_T_MAX);
+ break;
+ case DATA_2FD:
+ MPN_FILL (ptr, size, MP_LIMB_T_MAX);
+ ptr[0] -= 2;
+ break;
+ default:
+ abort();
+ /*NOTREACHED*/
+ }
+}
+
/* The code here handling the various combinations of output options isn't
too attractive, but it works and is fairly clean. */
@@ -230,8 +264,16 @@ void
run_one (FILE *fp, struct speed_params *s, mp_size_t prev_size)
{
const char *first_open_fastest, *first_open_notfastest, *first_close;
- int i, fastest;
- double fastest_time;
+ int i, fastest;
+ double fastest_time;
+ TMP_DECL (marker);
+
+ TMP_MARK (marker);
+ sp.xp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_xp);
+ sp.yp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_yp);
+
+ data_fill (s->xp, s->size);
+ data_fill (s->yp, s->size);
if (prev_size == -1 && option_cmp == CMP_DIFFPREV)
{
@@ -370,55 +412,23 @@ run_one (FILE *fp, struct speed_params *s, mp_size_t prev_size)
}
fprintf (fp, "\n");
}
-}
-void
-data_fill (mp_ptr ptr, mp_size_t size)
-{
- switch (option_data) {
- case DATA_RANDOM:
- mpn_random (ptr, size);
- break;
- case DATA_RANDOM2:
- mpn_random2 (ptr, size);
- break;
- case DATA_ZEROS:
- MPN_ZERO (ptr, size);
- break;
- case DATA_FFS:
- MPN_FILL (ptr, size, MP_LIMB_T_MAX);
- break;
- case DATA_2FD:
- MPN_FILL (ptr, size, MP_LIMB_T_MAX);
- ptr[0] -= 2;
- break;
- default:
- abort();
- /*NOTREACHED*/
- }
+ TMP_FREE (marker);
}
void
run_all (FILE *fp)
{
- mp_size_t prev_size, max_size;
+ mp_size_t prev_size;
int i;
TMP_DECL (marker);
- max_size = SPEED_DATA_SIZE;
- for (i = 0; i < size_num; i++)
- max_size = MAX (max_size, size_array[i].end);
-
- for (i = 0; i < num_choices; i++)
- if (choice[i].p->flag & FLAG_RSIZE)
- max_size = MAX (max_size, choice[i].r);
-
TMP_MARK (marker);
- sp.xp = SPEED_TMP_ALLOC (max_size, sp.align_xp);
- sp.yp = SPEED_TMP_ALLOC (max_size, sp.align_yp);
+ sp.xp_block = SPEED_TMP_ALLOC_LIMBS (SPEED_BLOCK_SIZE, sp.align_xp);
+ sp.yp_block = SPEED_TMP_ALLOC_LIMBS (SPEED_BLOCK_SIZE, sp.align_yp);
- data_fill (sp.xp, max_size);
- data_fill (sp.yp, max_size);
+ data_fill (sp.xp_block, SPEED_BLOCK_SIZE);
+ data_fill (sp.yp_block, SPEED_BLOCK_SIZE);
for (i = 0; i < size_num; i++)
{
@@ -538,7 +548,7 @@ run_gnuplot (void)
/* Return a long with n many one bits (starting from the least significant) */
#define LONG_ONES(n) \
- ((n) == LONG_BIT ? -1L : (n) == 0 ? 0L : (1L << (n)) - 1)
+ ((n) == BITS_PER_LONGINT ? -1L : (n) == 0 ? 0L : (1L << (n)) - 1)
long
r_string (const char *s)
@@ -554,10 +564,10 @@ r_string (const char *s)
if (strcmp (s, "bits") == 0)
{
mp_limb_t l;
- if (n > LONG_BIT)
+ if (n > BITS_PER_LONGINT)
{
fprintf (stderr, "%ld bit parameter invalid (max %d bits)\n",
- n, LONG_BIT);
+ n, BITS_PER_LONGINT);
exit (1);
}
mpn_random (&l, 1);
@@ -565,10 +575,10 @@ r_string (const char *s)
}
else if (strcmp (s, "ones") == 0)
{
- if (n > LONG_BIT)
+ if (n > BITS_PER_LONGINT)
{
fprintf (stderr, "%ld bit parameter invalid (max %d bits)\n",
- n, LONG_BIT);
+ n, BITS_PER_LONGINT);
exit (1);
}
return LONG_ONES (n);
@@ -657,6 +667,7 @@ Times are in seconds, accuracy is shown.\n\
-P name output plot files \"name.gnuplot\" and \"name.data\"\n\
-a <type> use given data: random(default), random2, zeros, ffs\n\
-x, -y, -w, -W <align> specify data alignments, sources and dests\n\
+ -o addrs print addresses of data blocks\n\
\n\
If both -t and -f are used, it means step by the factor or the step, whichever\n\
is greater.\n\
@@ -735,7 +746,7 @@ main (int argc, char *argv[])
and isn't lost if you kill the program half way. */
setbuf (stdout, NULL);
-#define OPTSTRING "a:CcDdEFf:p:P:rRs:t:ux:y:w:W:z"
+#define OPTSTRING "a:CcDdEFf:o:p:P:rRs:t:ux:y:w:W:z"
#if HAVE_GETOPT_LONG
while ((opt = getopt_long(argc, argv, OPTSTRING, longopts, NULL))
!= EOF)
@@ -796,6 +807,9 @@ main (int argc, char *argv[])
exit (1);
}
break;
+ case 'o':
+ speed_option_set (optarg);
+ break;
case 'P':
option_gnuplot = 1;
option_gnuplot_basename = optarg;
@@ -804,9 +818,7 @@ main (int argc, char *argv[])
speed_precision = atoi (optarg);
break;
case 'R':
- srand (time (NULL));
- srandom (time (NULL));
- srand48 (time (NULL));
+ option_seed = time (NULL);
break;
case 'r':
if (option_cmp != CMP_ABSOLUTE)
@@ -892,6 +904,10 @@ main (int argc, char *argv[])
exit (1);
}
+ srand (option_seed);
+ srandom (option_seed);
+ srand48 (option_seed);
+
choice = (struct choice_t *) (*_mp_allocate_func)
((argc - optind) * sizeof(choice[0]));
for ( ; optind < argc; optind++)
diff --git a/tune/speed.h b/tune/speed.h
index e32e1e979..49078f77d 100644
--- a/tune/speed.h
+++ b/tune/speed.h
@@ -43,7 +43,7 @@ MA 02111-1307, USA.
/* A mask of the least significant n bits. Note 1<<32 doesn't give zero on
x86 family CPUs, hence the separate case for BITS_PER_MP_LIMB. */
#define MP_LIMB_T_LOWBITMASK(n) \
- ((n) == BITS_PER_MP_LIMB ? ~0 : ((mp_limb_t) 1 << (n)) - 1)
+ ((n) == BITS_PER_MP_LIMB ? MP_LIMB_T_MAX : ((mp_limb_t) 1 << (n)) - 1)
/* align must be a power of 2 here, usually CACHE_LINE_SIZE is a good choice */
@@ -53,19 +53,29 @@ MA 02111-1307, USA.
#define TMP_ALLOC_LIMBS_ALIGNED(limbs, align) \
((mp_ptr) TMP_ALLOC_ALIGNED ((limbs)*sizeof(mp_limb_t), align))
-/* 32 is right for pentium family, need to configure this for other CPUs */
+/* 32 for pentium, 64 for athlon, might want to configure this for other
+ CPUs. In truth though nothing has yet shown up that cares about cache
+ line boundaries. The only practical effect of this is to restrict the
+ range that s->align_xp can take. Perhaps this could be a variable
+ instead. */
#define CACHE_LINE_SIZE 64 /* bytes */
#define SPEED_TMP_ALLOC_ADJUST_MASK (CACHE_LINE_SIZE/BYTES_PER_MP_LIMB - 1)
-#define SPEED_TMP_ALLOC(limbs, align) \
- (speed_tmp_alloc_adjust \
+#define SPEED_TMP_ALLOC_LIMBS(limbs, align) \
+ (speed_tmp_alloc_adjust \
(TMP_ALLOC_LIMBS((limbs) + SPEED_TMP_ALLOC_ADJUST_MASK), (align)))
-/* Minimum source data limbs available in s.xp and y.sp from speed program.
- 512 means 2kbytes of data for xp and yp, making 4k total, which should
- fit easily in any L1 data cache. */
-#define SPEED_DATA_SIZE 512
+
+/* This is the size for s->xp_block and s->yp_block, used in certain
+ routines that want to run across many different data values and use
+ s->size for a different purpose, eg. SPEED_ROUTINE_MPN_GCD_1.
+
+ 512 means 2kbytes of data for each of xp_block and yp_block, making 4k
+ total, which should fit easily in any L1 data cache. */
+
+#define SPEED_BLOCK_SIZE 512 /* limbs */
+
extern double speed_unittime;
extern double speed_cycletime;
@@ -76,32 +86,44 @@ void speed_starttime _PROTO ((void));
double speed_endtime _PROTO ((void));
struct speed_params {
- unsigned reps; /* how many times to run the routine */
- mp_ptr xp; /* first argument */
- mp_ptr yp; /* second argument */
- mp_size_t size; /* size of both arguments */
- long r; /* user supplied parameter */
+ unsigned reps; /* how many times to run the routine */
+ mp_ptr xp; /* first argument */
+ mp_ptr yp; /* second argument */
+ mp_size_t size; /* size of both arguments */
+ long r; /* user supplied parameter */
mp_size_t align_xp; /* alignment of xp */
mp_size_t align_yp; /* alignment of yp */
mp_size_t align_wp; /* intended alignment of wp */
mp_size_t align_wp2; /* intended alignment of wp2 */
+ mp_ptr xp_block; /* first special SPEED_BLOCK_SIZE block */
+ mp_ptr yp_block; /* second special SPEED_BLOCK_SIZE block */
double time_divisor; /* optionally set by the speed routine */
-
+
+ /* used by the cache priming things */
int cache;
unsigned src_num, dst_num;
struct {
mp_ptr ptr;
mp_size_t size;
- } src[2], dst[2];
+ } src[2], dst[3];
};
typedef double (*speed_function_t) _PROTO ((struct speed_params *s));
-double speed_measure _PROTO ((double (*fun)_PROTO ((struct speed_params *s)),
- struct speed_params *s));
+double speed_measure _PROTO ((speed_function_t fun, struct speed_params *s));
+
+/* Prototypes for speed measuring routines */
+
+double speed_malloc_free _PROTO ((struct speed_params *s));
+double speed_malloc_realloc_free _PROTO ((struct speed_params *s));
double speed_memcpy _PROTO ((struct speed_params *s));
double speed_modlimb_invert _PROTO ((struct speed_params *s));
+double speed_mp_allocate_free _PROTO ((struct speed_params *s));
+double speed_mp_allocate_reallocate_free _PROTO ((struct speed_params *s));
+
+double speed_mpf_init_clear _PROTO ((struct speed_params *s));
+
double speed_mpn_add_n _PROTO ((struct speed_params *s));
double speed_mpn_add_n_self _PROTO ((struct speed_params *s));
double speed_mpn_add_n_inplace _PROTO ((struct speed_params *s));
@@ -156,16 +178,23 @@ double speed_mpn_toom3_sqr_n _PROTO ((struct speed_params *s));
double speed_mpn_xnor_n _PROTO ((struct speed_params *s));
double speed_mpn_xor_n _PROTO ((struct speed_params *s));
+double speed_mpq_init_clear _PROTO ((struct speed_params *s));
+
+double speed_mpz_add _PROTO ((struct speed_params *s));
+double speed_mpz_bin_uiui _PROTO ((struct speed_params *s));
double speed_mpz_fac_ui _PROTO ((struct speed_params *s));
double speed_mpz_fib_ui _PROTO ((struct speed_params *s));
+double speed_mpz_init_clear _PROTO ((struct speed_params *s));
+double speed_mpz_init_realloc_clear _PROTO ((struct speed_params *s));
double speed_mpz_powm _PROTO ((struct speed_params *s));
-double speed_mpn_jacobi_base _PROTO ((struct speed_params *s));
-double speed_mpn_jacobi_base_division _PROTO ((struct speed_params *s));
double speed_noop _PROTO ((struct speed_params *s));
double speed_noop_wxs _PROTO ((struct speed_params *s));
double speed_noop_wxys _PROTO ((struct speed_params *s));
+
+/* Prototypes for other routines */
+
/* low 32-bits in p[0], high 32-bits in p[1] */
void speed_cyclecounter _PROTO ((unsigned p[2]));
@@ -184,23 +213,14 @@ void *_mp_allocate_or_reallocate _PROTO ((void *ptr,
void *align_pointer _PROTO ((void *p, size_t align));
void *_mp_allocate_func_aligned _PROTO ((size_t bytes, size_t align));
void speed_cache_fill _PROTO ((struct speed_params *s));
+void speed_operand_src _PROTO ((struct speed_params *s,
+ mp_ptr ptr, mp_size_t size));
+void speed_operand_dst _PROTO ((struct speed_params *s,
+ mp_ptr ptr, mp_size_t size));
void mpz_set_n _PROTO ((mpz_ptr z, mp_srcptr p, mp_size_t size));
-
-
-#define SPEED_OPERAND_SRC(s,p,sz) \
- do { \
- (s)->src[(s)->src_num].ptr = (p); \
- (s)->src[(s)->src_num].size = (sz); \
- (s)->src_num++; \
- } while (0)
-
-#define SPEED_OPERAND_DST(s,p,sz) \
- do { \
- (s)->dst[(s)->dst_num].ptr = (p); \
- (s)->dst[(s)->dst_num].size = (sz); \
- (s)->dst_num++; \
- } while (0)
+extern int speed_option_addrs;
+void speed_option_set _PROTO((const char *s));
#define SPEED_RESTRICT_COND(cond) if (!(cond)) return -1.0;
@@ -216,10 +236,10 @@ void mpz_set_n _PROTO ((mpz_ptr z, mp_srcptr p, mp_size_t size));
SPEED_RESTRICT_COND (s->size >= 0); \
\
TMP_MARK (marker); \
- wp = SPEED_TMP_ALLOC (s->size, s->align_wp); \
+ wp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp); \
\
- SPEED_OPERAND_SRC (s, s->xp, s->size); \
- SPEED_OPERAND_DST (s, wp, s->size); \
+ speed_operand_src (s, s->xp, s->size); \
+ speed_operand_dst (s, wp, s->size); \
speed_cache_fill (s); \
\
speed_starttime (); \
@@ -250,11 +270,11 @@ void mpz_set_n _PROTO ((mpz_ptr z, mp_srcptr p, mp_size_t size));
SPEED_RESTRICT_COND (s->size >= 1); \
\
TMP_MARK (marker); \
- wp = SPEED_TMP_ALLOC (s->size, s->align_wp); \
+ wp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp); \
\
- SPEED_OPERAND_SRC (s, s->xp, s->size); \
- SPEED_OPERAND_SRC (s, s->yp, s->size); \
- SPEED_OPERAND_DST (s, wp, s->size); \
+ speed_operand_src (s, s->xp, s->size); \
+ speed_operand_src (s, s->yp, s->size); \
+ speed_operand_dst (s, wp, s->size); \
speed_cache_fill (s); \
\
speed_starttime (); \
@@ -292,10 +312,10 @@ void mpz_set_n _PROTO ((mpz_ptr z, mp_srcptr p, mp_size_t size));
SPEED_RESTRICT_COND (s->size >= 1); \
\
TMP_MARK (marker); \
- wp = SPEED_TMP_ALLOC (s->size, s->align_wp); \
+ wp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp); \
\
- SPEED_OPERAND_SRC (s, s->xp, s->size); \
- SPEED_OPERAND_DST (s, wp, s->size); \
+ speed_operand_src (s, s->xp, s->size); \
+ speed_operand_dst (s, wp, s->size); \
speed_cache_fill (s); \
\
speed_starttime (); \
@@ -343,11 +363,11 @@ void mpz_set_n _PROTO ((mpz_ptr z, mp_srcptr p, mp_size_t size));
SPEED_RESTRICT_COND (size1 >= s->size); \
\
TMP_MARK (marker); \
- wp = SPEED_TMP_ALLOC (size1 + s->size, s->align_wp); \
+ wp = SPEED_TMP_ALLOC_LIMBS (size1 + s->size, s->align_wp); \
\
- SPEED_OPERAND_SRC (s, s->xp, size1); \
- SPEED_OPERAND_SRC (s, s->yp, s->size); \
- SPEED_OPERAND_DST (s, wp, size1 + s->size); \
+ speed_operand_src (s, s->xp, size1); \
+ speed_operand_src (s, s->yp, s->size); \
+ speed_operand_dst (s, wp, size1 + s->size); \
speed_cache_fill (s); \
\
speed_starttime (); \
@@ -362,79 +382,147 @@ void mpz_set_n _PROTO ((mpz_ptr z, mp_srcptr p, mp_size_t size));
}
-#define SPEED_ROUTINE_MPN_MUL_N_CALL(call, tsize) \
- { \
- mp_ptr wp, tspace; \
- unsigned i; \
- double t; \
- TMP_DECL (marker); \
- \
- SPEED_RESTRICT_COND (s->size >= 1); \
- \
- TMP_MARK (marker); \
- wp = SPEED_TMP_ALLOC (2*s->size, s->align_wp); \
- tspace = SPEED_TMP_ALLOC (tsize, s->align_wp2); \
- \
- SPEED_OPERAND_SRC (s, s->xp, s->size); \
- SPEED_OPERAND_SRC (s, s->yp, s->size); \
- SPEED_OPERAND_DST (s, wp, 2*s->size); \
- SPEED_OPERAND_DST (s, tspace, tsize); \
- speed_cache_fill (s); \
- \
- speed_starttime (); \
- i = s->reps; \
- do \
- call; \
- while (--i != 0); \
- t = speed_endtime (); \
- \
- TMP_FREE (marker); \
- return t; \
+#define SPEED_ROUTINE_MPN_MUL_N(function) \
+ { \
+ mp_ptr wp; \
+ unsigned i; \
+ double t; \
+ TMP_DECL (marker); \
+ \
+ SPEED_RESTRICT_COND (s->size >= 1); \
+ \
+ TMP_MARK (marker); \
+ wp = SPEED_TMP_ALLOC_LIMBS (2*s->size, s->align_wp); \
+ \
+ speed_operand_src (s, s->xp, s->size); \
+ speed_operand_src (s, s->yp, s->size); \
+ speed_operand_dst (s, wp, 2*s->size); \
+ speed_cache_fill (s); \
+ \
+ speed_starttime (); \
+ i = s->reps; \
+ do \
+ function (wp, s->xp, s->yp, s->size); \
+ while (--i != 0); \
+ t = speed_endtime (); \
+ \
+ TMP_FREE (marker); \
+ return t; \
+ }
+
+
+#define SPEED_ROUTINE_MPN_MUL_N_TSPACE(call, tsize) \
+ { \
+ mp_ptr wp, tspace; \
+ unsigned i; \
+ double t; \
+ TMP_DECL (marker); \
+ \
+ SPEED_RESTRICT_COND (s->size >= 1); \
+ \
+ TMP_MARK (marker); \
+ wp = SPEED_TMP_ALLOC_LIMBS (2*s->size, s->align_wp); \
+ tspace = SPEED_TMP_ALLOC_LIMBS (tsize, s->align_wp2); \
+ \
+ speed_operand_src (s, s->xp, s->size); \
+ speed_operand_src (s, s->yp, s->size); \
+ speed_operand_dst (s, wp, 2*s->size); \
+ speed_operand_dst (s, tspace, tsize); \
+ speed_cache_fill (s); \
+ \
+ speed_starttime (); \
+ i = s->reps; \
+ do \
+ call; \
+ while (--i != 0); \
+ t = speed_endtime (); \
+ \
+ TMP_FREE (marker); \
+ return t; \
}
-#define SPEED_ROUTINE_MPN_MUL_N(function) \
- SPEED_ROUTINE_MPN_MUL_N_CALL (function (wp, s->xp, s->yp, s->size), 1)
+/* FIXME: size restrictions */
+#define SPEED_ROUTINE_MPN_KARA_MUL_N(function) \
+ SPEED_ROUTINE_MPN_MUL_N_TSPACE \
+ (function (wp, s->xp, s->xp, s->size, tspace), \
+ MPN_KARA_MUL_N_TSIZE (s->size))
-#define SPEED_ROUTINE_GMPN_TOOM3_MUL_N(function) \
- SPEED_ROUTINE_MPN_MUL_N_CALL (function (wp, s->xp, s->yp, s->size, tspace), \
- MPN_TOOM3_MUL_N_TSIZE (s->size))
+/* FIXME: size restrictions */
+#define SPEED_ROUTINE_MPN_TOOM3_MUL_N(function) \
+ SPEED_ROUTINE_MPN_MUL_N_TSPACE \
+ (function (wp, s->xp, s->yp, s->size, tspace), \
+ MPN_TOOM3_MUL_N_TSIZE (s->size))
-#define SPEED_ROUTINE_MPN_SQR_CALL(call, tsize) \
- { \
- mp_ptr wp, tspace; \
- unsigned i; \
- double t; \
- TMP_DECL (marker); \
- \
- SPEED_RESTRICT_COND (s->size >= 1); \
- \
- TMP_MARK (marker); \
- wp = SPEED_TMP_ALLOC (2*s->size, s->align_wp); \
- tspace = SPEED_TMP_ALLOC (tsize, s->align_wp2); \
- \
- SPEED_OPERAND_SRC (s, s->xp, s->size); \
- SPEED_OPERAND_DST (s, wp, 2*s->size); \
- SPEED_OPERAND_DST (s, tspace, tsize); \
- speed_cache_fill (s); \
- \
- speed_starttime (); \
- i = s->reps; \
- do \
- call; \
- while (--i != 0); \
- t = speed_endtime (); \
- \
- TMP_FREE (marker); \
- return t; \
+#define SPEED_ROUTINE_MPN_SQR_CALL(call) \
+ { \
+ mp_ptr wp; \
+ unsigned i; \
+ double t; \
+ TMP_DECL (marker); \
+ \
+ SPEED_RESTRICT_COND (s->size >= 1); \
+ \
+ TMP_MARK (marker); \
+ wp = SPEED_TMP_ALLOC_LIMBS (2*s->size, s->align_wp); \
+ \
+ speed_operand_src (s, s->xp, s->size); \
+ speed_operand_dst (s, wp, 2*s->size); \
+ speed_cache_fill (s); \
+ \
+ speed_starttime (); \
+ i = s->reps; \
+ do \
+ call; \
+ while (--i != 0); \
+ t = speed_endtime (); \
+ \
+ TMP_FREE (marker); \
+ return t; \
+ }
+
+#define SPEED_ROUTINE_MPN_SQR(function) \
+ SPEED_ROUTINE_MPN_SQR_CALL (function (wp, s->xp, s->size))
+
+
+#define SPEED_ROUTINE_MPN_SQR_TSPACE(call, tsize) \
+ { \
+ mp_ptr wp, tspace; \
+ unsigned i; \
+ double t; \
+ TMP_DECL (marker); \
+ \
+ SPEED_RESTRICT_COND (s->size >= 1); \
+ \
+ TMP_MARK (marker); \
+ wp = SPEED_TMP_ALLOC_LIMBS (2*s->size, s->align_wp); \
+ tspace = SPEED_TMP_ALLOC_LIMBS (tsize, s->align_wp2); \
+ \
+ speed_operand_src (s, s->xp, s->size); \
+ speed_operand_dst (s, wp, 2*s->size); \
+ speed_operand_dst (s, tspace, tsize); \
+ speed_cache_fill (s); \
+ \
+ speed_starttime (); \
+ i = s->reps; \
+ do \
+ call; \
+ while (--i != 0); \
+ t = speed_endtime (); \
+ \
+ TMP_FREE (marker); \
+ return t; \
}
-#define SPEED_ROUTINE_MPN_SQR(function) \
- SPEED_ROUTINE_MPN_SQR_CALL (function (wp, s->xp, s->size), 1)
+/* FIXME: size restrictions */
+#define SPEED_ROUTINE_MPN_KARA_SQR_N(function) \
+ SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace), \
+ MPN_KARA_SQR_N_TSIZE (s->size))
-#define SPEED_ROUTINE_GMPN_TOOM3_SQR_N(function) \
- SPEED_ROUTINE_MPN_SQR_CALL (function (wp, s->xp, s->size, tspace), \
- MPN_TOOM3_SQR_N_TSIZE (s->size))
+/* FIXME: size restrictions */
+#define SPEED_ROUTINE_MPN_TOOM3_SQR_N(function) \
+ SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace), \
+ MPN_TOOM3_SQR_N_TSIZE (s->size))
#define SPEED_ROUTINE_MPN_MOD_CALL(call) \
@@ -443,7 +531,7 @@ void mpz_set_n _PROTO ((mpz_ptr z, mp_srcptr p, mp_size_t size));
\
SPEED_RESTRICT_COND (s->size >= 0); \
\
- SPEED_OPERAND_SRC (s, s->xp, s->size); \
+ speed_operand_src (s, s->xp, s->size); \
speed_cache_fill (s); \
\
speed_starttime (); \
@@ -463,40 +551,40 @@ void mpz_set_n _PROTO ((mpz_ptr z, mp_srcptr p, mp_size_t size));
/* A division of 2*s->size by s->size limbs */
-#define SPEED_ROUTINE_MPN_BZ_DIVREM_CALL(call) \
- { \
- unsigned i; \
- mp_ptr a, d, q, r; \
- \
- SPEED_RESTRICT_COND (s->size >= 1); \
- \
- TMP_MARK (marker); \
- a = SPEED_TMP_ALLOC (2*s->size, s->align_xp); \
- d = SPEED_TMP_ALLOC (s->size, s->align_yp); \
- q = SPEED_TMP_ALLOC (s->size+1, s->align_wp); \
- r = SPEED_TMP_ALLOC (s->size, s->align_wp2); \
- \
- MPN_COPY (a, s->xp, s->size); \
- MPN_COPY (a+s->size, s->xp, s->size); \
- \
- MPN_COPY (d, s->yp, s->size); \
- \
- /* normalize the data */ \
- d[s->size-1] |= MP_LIMB_T_HIGHBIT; \
- a[2*s->size-1] = d[s->size-1] - 1; \
- \
- SPEED_OPERAND_SRC (s, a, 2*s->size); \
- SPEED_OPERAND_SRC (s, d, s->size); \
- SPEED_OPERAND_DST (s, q, s->size+1); \
- SPEED_OPERAND_DST (s, r, s->size); \
- speed_cache_fill (s); \
- \
- speed_starttime (); \
- i = s->reps; \
- do \
- call; \
- while (--i != 0); \
- return speed_endtime (); \
+#define SPEED_ROUTINE_MPN_BZ_DIVREM_CALL(call) \
+ { \
+ unsigned i; \
+ mp_ptr a, d, q, r; \
+ \
+ SPEED_RESTRICT_COND (s->size >= 1); \
+ \
+ TMP_MARK (marker); \
+ a = SPEED_TMP_ALLOC_LIMBS (2*s->size, s->align_xp); \
+ d = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_yp); \
+ q = SPEED_TMP_ALLOC_LIMBS (s->size+1, s->align_wp); \
+ r = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp2); \
+ \
+ MPN_COPY (a, s->xp, s->size); \
+ MPN_COPY (a+s->size, s->xp, s->size); \
+ \
+ MPN_COPY (d, s->yp, s->size); \
+ \
+ /* normalize the data */ \
+ d[s->size-1] |= MP_LIMB_T_HIGHBIT; \
+ a[2*s->size-1] = d[s->size-1] - 1; \
+ \
+ speed_operand_src (s, a, 2*s->size); \
+ speed_operand_src (s, d, s->size); \
+ speed_operand_dst (s, q, s->size+1); \
+ speed_operand_dst (s, r, s->size); \
+ speed_cache_fill (s); \
+ \
+ speed_starttime (); \
+ i = s->reps; \
+ do \
+ call; \
+ while (--i != 0); \
+ return speed_endtime (); \
}
#define SPEED_ROUTINE_MPN_BZ_DIVREM_N(function) \
@@ -506,8 +594,8 @@ void mpz_set_n _PROTO ((mpz_ptr z, mp_srcptr p, mp_size_t size));
SPEED_ROUTINE_MPN_BZ_DIVREM_CALL \
((*function) (q, a, 2*s->size, d, s->size))
-#define SPEED_ROUTINE_MPN_BZ_TDIV_QR(function) \
- SPEED_ROUTINE_MPN_BZ_DIVREM_CALL \
+/* Measure "function" through SPEED_ROUTINE_MPN_BZ_DIVREM_CALL, invoking it
+   as (*function) (q, r, 0, a, 2*s->size, d, s->size).  The names q, r, a
+   and d are the locals set up by SPEED_ROUTINE_MPN_BZ_DIVREM_CALL.  */
+#define SPEED_ROUTINE_MPN_BZ_TDIV_QR(function) \
+ SPEED_ROUTINE_MPN_BZ_DIVREM_CALL \
((*function) (q, r, 0, a, 2*s->size, d, s->size))
@@ -517,7 +605,7 @@ void mpz_set_n _PROTO ((mpz_ptr z, mp_srcptr p, mp_size_t size));
\
SPEED_RESTRICT_COND (s->size >= 1); \
\
- SPEED_OPERAND_SRC (s, s->xp, s->size); \
+ speed_operand_src (s, s->xp, s->size); \
speed_cache_fill (s); \
\
speed_starttime (); \
@@ -534,8 +622,8 @@ void mpz_set_n _PROTO ((mpz_ptr z, mp_srcptr p, mp_size_t size));
\
SPEED_RESTRICT_COND (s->size >= 1); \
\
- SPEED_OPERAND_SRC (s, s->xp, s->size); \
- SPEED_OPERAND_SRC (s, s->yp, s->size); \
+ speed_operand_src (s, s->xp, s->size); \
+ speed_operand_src (s, s->yp, s->size); \
speed_cache_fill (s); \
\
speed_starttime (); \
@@ -619,8 +707,8 @@ void mpz_set_n _PROTO ((mpz_ptr z, mp_srcptr p, mp_size_t size));
SPEED_RESTRICT_COND (s->size >= 0); \
\
TMP_MARK (marker); \
- wp = SPEED_TMP_ALLOC (s->size, s->align_wp); \
- wp2 = SPEED_TMP_ALLOC (s->size, s->align_wp2); \
+ wp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp); \
+ wp2 = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp2); \
xp = s->xp; \
yp = s->yp; \
\
@@ -637,10 +725,10 @@ void mpz_set_n _PROTO ((mpz_ptr z, mp_srcptr p, mp_size_t size));
if (xp != s->xp) MPN_COPY (xp, s->xp, s->size); \
if (yp != s->yp) MPN_COPY (yp, s->yp, s->size); \
\
- SPEED_OPERAND_SRC (s, xp, s->size); \
- SPEED_OPERAND_SRC (s, yp, s->size); \
- SPEED_OPERAND_DST (s, wp, s->size); \
- SPEED_OPERAND_DST (s, wp2, s->size); \
+ speed_operand_src (s, xp, s->size); \
+ speed_operand_src (s, yp, s->size); \
+ speed_operand_dst (s, wp, s->size); \
+ speed_operand_dst (s, wp2, s->size); \
speed_cache_fill (s); \
\
speed_starttime (); \
@@ -654,17 +742,15 @@ void mpz_set_n _PROTO ((mpz_ptr z, mp_srcptr p, mp_size_t size));
return t; \
}
-#define SPEED_ROUTINE_MPN_ADDSUB_N(function) \
- SPEED_ROUTINE_MPN_ADDSUB_CALL \
+/* Measure "function" through SPEED_ROUTINE_MPN_ADDSUB_CALL, invoking it as
+   function (wp, wp2, xp, yp, s->size).
+   NOTE(review): the trailing ";" is part of this macro's replacement text,
+   so every use expands to the CALL block followed by an empty statement or
+   declaration - confirm this is intended.  */
+#define SPEED_ROUTINE_MPN_ADDSUB_N(function) \
+ SPEED_ROUTINE_MPN_ADDSUB_CALL \
(function (wp, wp2, xp, yp, s->size));
-#define SPEED_ROUTINE_MPN_ADDSUB_NC(function) \
- SPEED_ROUTINE_MPN_ADDSUB_CALL \
+/* Measure "function" through SPEED_ROUTINE_MPN_ADDSUB_CALL, invoking it as
+   function (wp, wp2, xp, yp, s->size, 0), i.e. with a constant 0 as the
+   extra final argument.
+   NOTE(review): the trailing ";" is part of this macro's replacement text,
+   so every use expands to the CALL block followed by an empty statement or
+   declaration - confirm this is intended.  */
+#define SPEED_ROUTINE_MPN_ADDSUB_NC(function) \
+ SPEED_ROUTINE_MPN_ADDSUB_CALL \
(function (wp, wp2, xp, yp, s->size, 0));
-/* function (wp1, wp2, wp1, wp2, s->size); */ /*full*/
-
#define SPEED_ROUTINE_MPN_GCD_1xN(function) \
{ \
unsigned i; \
@@ -676,7 +762,7 @@ void mpz_set_n _PROTO ((mpz_ptr z, mp_srcptr p, mp_size_t size));
\
TMP_MARK (marker); \
\
- SPEED_OPERAND_SRC (s, s->xp, s->size); \
+ speed_operand_src (s, s->xp, s->size); \
speed_cache_fill (s); \
\
speed_starttime (); \
@@ -691,7 +777,7 @@ void mpz_set_n _PROTO ((mpz_ptr z, mp_srcptr p, mp_size_t size));
}
-/* SPEED_DATA_SIZE many one GCDs of s->size bits each. */
+/* SPEED_BLOCK_SIZE many one-limb GCDs, on operands of s->size bits each. */
#define SPEED_ROUTINE_MPN_GCD_1_CALL(setup, call) \
{ \
@@ -705,29 +791,29 @@ void mpz_set_n _PROTO ((mpz_ptr z, mp_srcptr p, mp_size_t size));
SPEED_RESTRICT_COND (s->size <= mp_bits_per_limb); \
\
TMP_MARK (marker); \
- px = SPEED_TMP_ALLOC (SPEED_DATA_SIZE, s->align_xp); \
- py = SPEED_TMP_ALLOC (SPEED_DATA_SIZE, s->align_yp); \
- MPN_COPY (px, s->xp, SPEED_DATA_SIZE); \
- MPN_COPY (py, s->yp, SPEED_DATA_SIZE); \
+ px = SPEED_TMP_ALLOC_LIMBS (SPEED_BLOCK_SIZE, s->align_xp); \
+ py = SPEED_TMP_ALLOC_LIMBS (SPEED_BLOCK_SIZE, s->align_yp); \
+ MPN_COPY (px, s->xp_block, SPEED_BLOCK_SIZE); \
+ MPN_COPY (py, s->yp_block, SPEED_BLOCK_SIZE); \
\
x_mask = MP_LIMB_T_LOWBITMASK (s->size); \
y_mask = MP_LIMB_T_LOWBITMASK (s->r != 0 ? s->r : s->size); \
- for (i = 0; i < SPEED_DATA_SIZE; i++) \
+ for (i = 0; i < SPEED_BLOCK_SIZE; i++) \
{ \
px[i] &= x_mask; px[i] += (px[i] == 0); \
py[i] &= y_mask; py[i] += (py[i] == 0); \
setup; \
} \
\
- SPEED_OPERAND_SRC (s, px, SPEED_DATA_SIZE); \
- SPEED_OPERAND_SRC (s, py, SPEED_DATA_SIZE); \
+ speed_operand_src (s, px, SPEED_BLOCK_SIZE); \
+ speed_operand_src (s, py, SPEED_BLOCK_SIZE); \
speed_cache_fill (s); \
\
speed_starttime (); \
i = s->reps; \
do \
{ \
- j = SPEED_DATA_SIZE; \
+ j = SPEED_BLOCK_SIZE; \
do \
{ \
call; \
@@ -739,7 +825,7 @@ void mpz_set_n _PROTO ((mpz_ptr z, mp_srcptr p, mp_size_t size));
\
TMP_FREE (marker); \
\
- s->time_divisor = SPEED_DATA_SIZE; \
+ s->time_divisor = SPEED_BLOCK_SIZE; \
return t; \
}
@@ -757,11 +843,10 @@ void mpz_set_n _PROTO ((mpz_ptr z, mp_srcptr p, mp_size_t size));
function (px[j-1], py[j-1], 0))
-/* SPEED_DATA_SIZE/s->size many GCDs of s->size limbs each.
+/* SPEED_BLOCK_SIZE/s->size many GCDs of s->size limbs each.
FIXME: It might be worth reducing the number of GCDs as s->size increases,
- after all GCD is an O(n^2) algorithm, even if the accelerated algorithm
- flattens this out a bit at smallish sizes. */
+ after all GCD is an O(n^2) algorithm. */
#define SPEED_ROUTINE_MPN_GCD_CALL(datadivisor, call) \
{ \
@@ -774,20 +859,20 @@ void mpz_set_n _PROTO ((mpz_ptr z, mp_srcptr p, mp_size_t size));
SPEED_RESTRICT_COND (s->size >= 1); \
\
TMP_MARK (marker); \
- xtmp = SPEED_TMP_ALLOC (s->size+1, s->align_xp); \
- ytmp = SPEED_TMP_ALLOC (s->size+1, s->align_yp); \
- wp = SPEED_TMP_ALLOC (s->size, s->align_wp); \
- wp2 = SPEED_TMP_ALLOC (s->size, s->align_wp2); \
+ xtmp = SPEED_TMP_ALLOC_LIMBS (s->size+1, s->align_xp); \
+ ytmp = SPEED_TMP_ALLOC_LIMBS (s->size+1, s->align_yp); \
+ wp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp); \
+ wp2 = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp2); \
\
- pieces = SPEED_DATA_SIZE / s->size / datadivisor; \
+ pieces = SPEED_BLOCK_SIZE / s->size / datadivisor; \
if (pieces == 0) \
pieces = 1; \
\
psize = pieces * s->size; \
px = TMP_ALLOC_LIMBS (psize); \
py = TMP_ALLOC_LIMBS (psize); \
- MPN_COPY (px, s->xp, psize); \
- MPN_COPY (py, s->yp, psize); \
+ MPN_COPY (px, pieces==1 ? s->xp : s->xp_block, psize); \
+ MPN_COPY (py, pieces==1 ? s->yp : s->yp_block, psize); \
\
/* y must be odd, x must have at least as many bits as y */ \
for (j = 0; j < pieces; j++) \
@@ -800,11 +885,11 @@ void mpz_set_n _PROTO ((mpz_ptr z, mp_srcptr p, mp_size_t size));
x[s->size-1] = MAX (x[s->size-1], y[s->size-1]); \
} \
\
- SPEED_OPERAND_SRC (s, px, psize); \
- SPEED_OPERAND_SRC (s, py, psize); \
- SPEED_OPERAND_DST (s, xtmp, s->size); \
- SPEED_OPERAND_DST (s, ytmp, s->size); \
- SPEED_OPERAND_DST (s, wp, s->size); \
+ speed_operand_src (s, px, psize); \
+ speed_operand_src (s, py, psize); \
+ speed_operand_dst (s, xtmp, s->size); \
+ speed_operand_dst (s, ytmp, s->size); \
+ speed_operand_dst (s, wp, s->size); \
speed_cache_fill (s); \
\
speed_starttime (); \
@@ -850,14 +935,14 @@ void mpz_set_n _PROTO ((mpz_ptr z, mp_srcptr p, mp_size_t size));
SPEED_RESTRICT_COND (s->size >= 1); \
\
TMP_MARK (marker); \
- xp = SPEED_TMP_ALLOC (s->size, s->align_xp); \
- wp = SPEED_TMP_ALLOC (s->size, s->align_wp); \
+ xp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_xp); \
+ wp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp); \
\
/* source is overwritten */ \
MPN_COPY (xp, s->xp, s->size); \
\
- SPEED_OPERAND_SRC (s, xp, s->size); \
- SPEED_OPERAND_DST (s, wp, s->size); \
+ speed_operand_src (s, xp, s->size); \
+ speed_operand_dst (s, wp, s->size); \
speed_cache_fill (s); \
\
speed_starttime (); \
@@ -872,37 +957,40 @@ void mpz_set_n _PROTO ((mpz_ptr z, mp_srcptr p, mp_size_t size));
}
-#define SPEED_ROUTINE_MODLIMB_INVERT(function) \
- { \
- unsigned i, j; \
- mp_ptr xp; \
- mp_limb_t n = 1; \
- double t; \
- \
- xp = s->xp - 1; \
- \
- speed_starttime (); \
- i = s->reps; \
- do \
- { \
- j = SPEED_DATA_SIZE; \
- do \
- { \
- /* randomized but successively dependent */ \
- n += (xp[j] << 1); \
- \
- function (n, n); \
- } \
- while (--j != 0); \
- } \
- while (--i != 0); \
- t = speed_endtime (); \
- \
- /* make sure the compiler won't optimize away n */ \
- noop_1 (n); \
- \
- s->time_divisor = SPEED_DATA_SIZE; \
- return t; \
+/* Time "function" over SPEED_BLOCK_SIZE invocations per repetition.
+
+   "function" is invoked as function (n, n) on a mp_limb_t local n.  Before
+   each invocation n is perturbed by a limb taken from s->xp_block, walking
+   the block from its last limb down to its first, so each operand depends
+   on the previous result.  The value added is shifted left by one and is
+   therefore even, so the addition itself never changes the parity of n.
+
+   s->time_divisor is set to SPEED_BLOCK_SIZE so the reported time is per
+   invocation.
+
+   The block is indexed as xp[j-1] for j = SPEED_BLOCK_SIZE ... 1.  This
+   reads the same limbs as the former "xp = s->xp_block-1; xp[j]" form, but
+   avoids forming a pointer to before the start of the block.  */
+#define SPEED_ROUTINE_MODLIMB_INVERT(function)                  \
+  {                                                             \
+    unsigned   i, j;                                            \
+    mp_ptr     xp;                                              \
+    mp_limb_t  n = 1;                                           \
+    double     t;                                               \
+                                                                \
+    xp = s->xp_block;                                           \
+                                                                \
+    speed_operand_src (s, s->xp_block, SPEED_BLOCK_SIZE);       \
+    speed_cache_fill (s);                                       \
+                                                                \
+    speed_starttime ();                                         \
+    i = s->reps;                                                \
+    do                                                          \
+      {                                                         \
+        j = SPEED_BLOCK_SIZE;                                   \
+        do                                                      \
+          {                                                     \
+            /* randomized but successively dependent */         \
+            n += (xp[j-1] << 1);                                \
+                                                                \
+            function (n, n);                                    \
+          }                                                     \
+        while (--j != 0);                                       \
+      }                                                         \
+    while (--i != 0);                                           \
+    t = speed_endtime ();                                       \
+                                                                \
+    /* make sure the compiler won't optimize away n */          \
+    noop_1 (n);                                                 \
+                                                                \
+    s->time_divisor = SPEED_BLOCK_SIZE;                         \
+    return t;                                                   \
}
#endif