diff options
author | Kevin Ryde <user42@zip.com.au> | 2000-06-30 00:42:38 +0200 |
---|---|---|
committer | Kevin Ryde <user42@zip.com.au> | 2000-06-30 00:42:38 +0200 |
commit | a557fe9d07c37497c572416aebe132536bbfbe52 (patch) | |
tree | 94818365113516d33e825367aef3e3c2ecdd670c /tune | |
parent | ef0f7391964d47fd032ce46c6cd52441a6b2b39e (diff) | |
download | gmp-a557fe9d07c37497c572416aebe132536bbfbe52.tar.gz |
* tune/*: Locate data to help direct-mapped caches, add measuring
of mpz_init/clear, mpz_add and mpz_bin_uiui, various cleanups.
Diffstat (limited to 'tune')
-rw-r--r-- | tune/common.c | 271 | ||||
-rw-r--r-- | tune/speed.c | 138 | ||||
-rw-r--r-- | tune/speed.h | 540 |
3 files changed, 641 insertions, 308 deletions
diff --git a/tune/common.c b/tune/common.c index 60910a83b..93c35849d 100644 --- a/tune/common.c +++ b/tune/common.c @@ -43,6 +43,10 @@ MA 02111-1307, USA. #define numberof(x) (sizeof (x) / sizeof ((x)[0])) typedef int (*qsort_function_t) _PROTO ((const void *, const void *)); + +int speed_option_addrs = 0; + + void pentium_wbinvd(void) { @@ -128,7 +132,7 @@ speed_measure (double (*fun) _PROTO ((struct speed_params *s)), memset (&s_dummy, '\0', sizeof (s_dummy)); s = &s_dummy; } - + s->reps = 1; s->time_divisor = 1.0; for (i = 0; i < numberof (t); i++) @@ -229,16 +233,74 @@ mpn_cache_fill_write (mp_ptr ptr, mp_size_t size) void +speed_operand_src (struct speed_params *s, mp_ptr ptr, mp_size_t size) +{ + if (s->src_num >= numberof (s->src)) + { + fprintf (stderr, "speed_operand_src: no room left in s->src[]\n"); + abort (); + } + s->src[s->src_num].ptr = ptr; + s->src[s->src_num].size = size; + s->src_num++; +} + + +void +speed_operand_dst (struct speed_params *s, mp_ptr ptr, mp_size_t size) +{ + if (s->dst_num >= numberof (s->dst)) + { + fprintf (stderr, "speed_operand_dst: no room left in s->dst[]\n"); + abort (); + } + s->dst[s->dst_num].ptr = ptr; + s->dst[s->dst_num].size = size; + s->dst_num++; +} + + +void speed_cache_fill (struct speed_params *s) { + static struct speed_params prev; int i; -#if 0 - for (i = 0; i < s->dst_num; i++) - printf ("dst %p %ld\n", s->dst[i].ptr, s->dst[i].size); - for (i = 0; i < s->src_num; i++) - printf ("src %p %ld\n", s->src[i].ptr, s->src[i].size); -#endif + /* FIXME: need a better way to get the format string for a pointer */ + + if (speed_option_addrs) + { + int different; + + different = (s->dst_num != prev.dst_num || s->src_num != prev.src_num); + for (i = 0; i < s->dst_num; i++) + different |= (s->dst[i].ptr != prev.dst[i].ptr); + for (i = 0; i < s->src_num; i++) + different |= (s->src[i].ptr != prev.src[i].ptr); + + if (different) + { + if (s->dst_num != 0) + { + printf ("dst"); + for (i = 0; i < s->dst_num; i++) + printf 
(" %08lX", (unsigned long) s->dst[i].ptr); + printf (" "); + } + + if (s->src_num != 0) + { + printf ("src"); + for (i = 0; i < s->src_num; i++) + printf (" %08lX", (unsigned long) s->src[i].ptr); + printf (" "); + } + printf (" (cf sp approx %08lX)\n", (unsigned long) &different); + + } + + memcpy (&prev, s, sizeof(prev)); + } switch (s->cache) { case 0: @@ -285,8 +347,8 @@ _mp_allocate_or_reallocate (void *ptr, size_t oldsize, size_t newsize) } -/* Adjust ptr to align to CACHE_LINE_SIZE plus "align". ptr needs to have - room for up to CACHE_LINE_SIZE-4 extra bytes. */ +/* Adjust ptr to align to CACHE_LINE_SIZE bytes plus "align" limbs. ptr + needs to have room for up to CACHE_LINE_SIZE-4 extra bytes. */ mp_ptr speed_tmp_alloc_adjust (void *ptr, mp_size_t align) @@ -316,6 +378,20 @@ mpz_set_n (mpz_ptr z, mp_srcptr p, mp_size_t size) } +/* Miscellaneous options accepted by tune and speed programs under -o. */ + +void +speed_option_set (const char *s) +{ + if (strcmp (s, "addrs") == 0) speed_option_addrs = 1; + else + { + printf ("Unrecognised -o option: %s\n", s); + exit (1); + } +} + + /* The following are basic speed running routines for various gmp functions. Many are very similar and use speed.h macros. @@ -338,7 +414,7 @@ mpz_set_n (mpz_ptr z, mp_srcptr p, mp_size_t size) using the routines will ensure s->xp and s->yp are aligned. Aligning onto a CACHE_LINE_SIZE boundary is suggested. s->align_wp and s->align_wp2 should be respected where it makes sense to do so. - SPEED_TMP_ALLOC is a good way to do this. + SPEED_TMP_ALLOC_LIMBS is a good way to do this. A loop of the following form can be expected to turn into good assembler code on most CPUs, thereby minimizing overhead in the measurement. 
It @@ -585,7 +661,7 @@ speed_mpn_sqr_n (struct speed_params *s) double speed_mpn_mul_n_sqr (struct speed_params *s) { - SPEED_ROUTINE_MPN_SQR_CALL (mpn_mul_n (wp, s->xp, s->xp, s->size), 1); + SPEED_ROUTINE_MPN_SQR_CALL (mpn_mul_n (wp, s->xp, s->xp, s->size)); } double @@ -600,32 +676,26 @@ speed_mpn_sqr_basecase (struct speed_params *s) SPEED_ROUTINE_MPN_SQR (mpn_sqr_basecase); } -/* FIXME: size restrictions on kara */ double speed_mpn_kara_mul_n (struct speed_params *s) { - SPEED_ROUTINE_MPN_MUL_N_CALL - (mpn_kara_mul_n (wp, s->xp, s->xp, s->size, tspace), - MPN_KARA_MUL_N_TSIZE (s->size)); + SPEED_ROUTINE_MPN_KARA_MUL_N (mpn_kara_mul_n); } double speed_mpn_kara_sqr_n (struct speed_params *s) { - SPEED_ROUTINE_MPN_SQR_CALL - (mpn_kara_sqr_n (wp, s->xp, s->size, tspace), - MPN_KARA_SQR_N_TSIZE (s->size)); + SPEED_ROUTINE_MPN_KARA_SQR_N (mpn_kara_sqr_n); } -/* FIXME: size restrictions on toom3 */ double speed_mpn_toom3_mul_n (struct speed_params *s) { - SPEED_ROUTINE_GMPN_TOOM3_MUL_N (mpn_toom3_mul_n); + SPEED_ROUTINE_MPN_TOOM3_MUL_N (mpn_toom3_mul_n); } double speed_mpn_toom3_sqr_n (struct speed_params *s) { - SPEED_ROUTINE_GMPN_TOOM3_SQR_N (mpn_toom3_sqr_n); + SPEED_ROUTINE_MPN_TOOM3_SQR_N (mpn_toom3_sqr_n); } @@ -736,3 +806,162 @@ speed_noop_wxys (struct speed_params *s) return t; } + +#define SPEED_ROUTINE_ALLOC_FREE(variables, calls) \ + { \ + unsigned i; \ + variables; \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + { \ + calls; \ + } \ + while (--i != 0); \ + return speed_endtime (); \ + } + + +/* Compare these to see how much malloc/free costs and then how much + _mp_default_allocate/free and mpz_init/clear add. mpz_init/clear or + mpq_init/clear will be doing a 1 limb allocate, so use that as the size + when including them in comparisons. 
*/ + +double +speed_malloc_free (struct speed_params *s) +{ + size_t bytes = s->size * BYTES_PER_MP_LIMB; + SPEED_ROUTINE_ALLOC_FREE (void *p, + p = malloc (bytes); + free (p)); +} + +double +speed_malloc_realloc_free (struct speed_params *s) +{ + size_t bytes = s->size * BYTES_PER_MP_LIMB; + SPEED_ROUTINE_ALLOC_FREE (void *p, + p = malloc (BYTES_PER_MP_LIMB); + p = realloc (p, bytes); + free (p)); +} + +double +speed_mp_allocate_free (struct speed_params *s) +{ + size_t bytes = s->size * BYTES_PER_MP_LIMB; + SPEED_ROUTINE_ALLOC_FREE (void *p, + p = (*_mp_allocate_func) (bytes); + (*_mp_free_func) (p, bytes)); +} + +double +speed_mp_allocate_reallocate_free (struct speed_params *s) +{ + size_t bytes = s->size * BYTES_PER_MP_LIMB; + SPEED_ROUTINE_ALLOC_FREE + (void *p, + p = (*_mp_allocate_func) (BYTES_PER_MP_LIMB); + p = (*_mp_reallocate_func) (p, bytes, BYTES_PER_MP_LIMB); + (*_mp_free_func) (p, bytes)); +} + +double +speed_mpz_init_clear (struct speed_params *s) +{ + SPEED_ROUTINE_ALLOC_FREE (mpz_t z, + mpz_init (z); + mpz_clear (z)); +} + +double +speed_mpz_init_realloc_clear (struct speed_params *s) +{ + SPEED_ROUTINE_ALLOC_FREE (mpz_t z, + mpz_init (z); + _mpz_realloc (z, s->size); + mpz_clear (z)); +} + +double +speed_mpq_init_clear (struct speed_params *s) +{ + SPEED_ROUTINE_ALLOC_FREE (mpq_t q, + mpq_init (q); + mpq_clear (q)); +} + +double +speed_mpf_init_clear (struct speed_params *s) +{ + SPEED_ROUTINE_ALLOC_FREE (mpf_t f, + mpf_init (f); + mpf_clear (f)); +} + + +/* Compare this to mpn_add_n to see how much overhead mpz_add adds. Note + that repeatedly calling mpz_add with the same data gives branch prediction + in it an advantage. 
*/ + +double +speed_mpz_add (struct speed_params *s) +{ + mpz_t w, x, y; + unsigned i; + double t; + + mpz_init (w); + mpz_init (x); + mpz_init (y); + + mpz_set_n (x, s->xp, s->size); + mpz_set_n (y, s->yp, s->size); + mpz_add (w, x, y); + + speed_starttime (); + i = s->reps; + do + { + mpz_add (w, x, y); + } + while (--i != 0); + t = speed_endtime (); + + mpz_clear (w); + mpz_clear (x); + mpz_clear (y); + return t; +} + + +/* If r==0, calculate (size,size/2), + otherwise calculate (size,r). */ + +double +speed_mpz_bin_uiui (struct speed_params *s) +{ + mpz_t w; + unsigned long k; + unsigned i; + double t; + + mpz_init (w); + if (s->r != 0) + k = s->r; + else + k = s->size/2; + + speed_starttime (); + i = s->reps; + do + { + mpz_bin_uiui (w, s->size, k); + } + while (--i != 0); + t = speed_endtime (); + + mpz_clear (w); + return t; +} diff --git a/tune/speed.c b/tune/speed.c index b4a081e0d..3779b7bc1 100644 --- a/tune/speed.c +++ b/tune/speed.c @@ -24,12 +24,12 @@ MA 02111-1307, USA. /* Usage message is in the code below, run with no arguments to print it. See README for interesting applications. - To add a new routine foo() to measure, create a speed_foo() function in - the style of the existing ones and add an entry in the routine[] array. - Put FLAG_R if the routine needs an "r" parameter. + To add a new routine foo(), create a speed_foo() function in the style of + the existing ones and add an entry in the routine[] array. Put FLAG_R if + speed_foo() wants an "r" parameter. The routines don't have help messages or descriptions, but most have - pretty suggestive names. See the source code for full details. + suggestive names. See the source code for full details. 
*/ @@ -67,10 +67,6 @@ SPEED_EXTRA_PROTOS #endif -#ifndef LONG_BIT -#define LONG_BIT (8 * sizeof(long)) -#endif - #define numberof(x) (sizeof (x) / sizeof ((x)[0])) #define MPN_FILL(ptr, size, n) \ @@ -109,6 +105,7 @@ struct size_array_t { mp_size_t size_num = 0; mp_size_t size_allocnum = 0; int option_resource_usage = 0; +long option_seed = 123456789; struct speed_params sp; @@ -172,10 +169,6 @@ const struct routine_t { { "mpn_popcount", speed_mpn_popcount }, { "mpn_hamdist", speed_mpn_hamdist }, - { "mpz_fac_ui", speed_mpz_fac_ui }, - { "mpz_fib_ui", speed_mpz_fib_ui }, - { "mpz_powm", speed_mpz_powm }, - { "mpn_gcdext", speed_mpn_gcdext }, { "mpn_gcd", speed_mpn_gcd }, { "mpn_gcd_1", speed_mpn_gcd_1, FLAG_R_OPTIONAL }, @@ -193,6 +186,12 @@ const struct routine_t { { "mpn_toom3_mul_n", speed_mpn_toom3_mul_n }, { "mpn_toom3_sqr_n", speed_mpn_toom3_sqr_n }, + { "mpz_add", speed_mpz_add }, + { "mpz_bin_uiui", speed_mpz_bin_uiui, FLAG_R_OPTIONAL }, + { "mpz_fac_ui", speed_mpz_fac_ui }, + { "mpz_fib_ui", speed_mpz_fib_ui }, + { "mpz_powm", speed_mpz_powm }, + { "MPN_COPY", speed_MPN_COPY }, { "MPN_COPY_INCR", speed_MPN_COPY_INCR }, { "MPN_COPY_DECR", speed_MPN_COPY_DECR }, @@ -200,6 +199,15 @@ const struct routine_t { { "modlimb_invert", speed_modlimb_invert }, + { "malloc_free", speed_malloc_free }, + { "malloc_realloc_free", speed_malloc_realloc_free }, + { "mp_allocate_free", speed_mp_allocate_free }, + { "mp_allocate_reallocate_free", speed_mp_allocate_reallocate_free }, + { "mpz_init_clear", speed_mpz_init_clear }, + { "mpq_init_clear", speed_mpq_init_clear }, + { "mpf_init_clear", speed_mpf_init_clear }, + { "mpz_init_realloc_clear", speed_mpz_init_realloc_clear }, + #ifdef SPEED_EXTRA_ROUTINES SPEED_EXTRA_ROUTINES #endif @@ -218,6 +226,32 @@ struct choice_t *choice; int num_choices = 0; +void +data_fill (mp_ptr ptr, mp_size_t size) +{ + switch (option_data) { + case DATA_RANDOM: + mpn_random (ptr, size); + break; + case DATA_RANDOM2: + mpn_random2 (ptr, size); + 
break; + case DATA_ZEROS: + MPN_ZERO (ptr, size); + break; + case DATA_FFS: + MPN_FILL (ptr, size, MP_LIMB_T_MAX); + break; + case DATA_2FD: + MPN_FILL (ptr, size, MP_LIMB_T_MAX); + ptr[0] -= 2; + break; + default: + abort(); + /*NOTREACHED*/ + } +} + /* The code here handling the various combinations of output options isn't too attractive, but it works and is fairly clean. */ @@ -230,8 +264,16 @@ void run_one (FILE *fp, struct speed_params *s, mp_size_t prev_size) { const char *first_open_fastest, *first_open_notfastest, *first_close; - int i, fastest; - double fastest_time; + int i, fastest; + double fastest_time; + TMP_DECL (marker); + + TMP_MARK (marker); + sp.xp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_xp); + sp.yp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_yp); + + data_fill (s->xp, s->size); + data_fill (s->yp, s->size); if (prev_size == -1 && option_cmp == CMP_DIFFPREV) { @@ -370,55 +412,23 @@ run_one (FILE *fp, struct speed_params *s, mp_size_t prev_size) } fprintf (fp, "\n"); } -} -void -data_fill (mp_ptr ptr, mp_size_t size) -{ - switch (option_data) { - case DATA_RANDOM: - mpn_random (ptr, size); - break; - case DATA_RANDOM2: - mpn_random2 (ptr, size); - break; - case DATA_ZEROS: - MPN_ZERO (ptr, size); - break; - case DATA_FFS: - MPN_FILL (ptr, size, MP_LIMB_T_MAX); - break; - case DATA_2FD: - MPN_FILL (ptr, size, MP_LIMB_T_MAX); - ptr[0] -= 2; - break; - default: - abort(); - /*NOTREACHED*/ - } + TMP_FREE (marker); } void run_all (FILE *fp) { - mp_size_t prev_size, max_size; + mp_size_t prev_size; int i; TMP_DECL (marker); - max_size = SPEED_DATA_SIZE; - for (i = 0; i < size_num; i++) - max_size = MAX (max_size, size_array[i].end); - - for (i = 0; i < num_choices; i++) - if (choice[i].p->flag & FLAG_RSIZE) - max_size = MAX (max_size, choice[i].r); - TMP_MARK (marker); - sp.xp = SPEED_TMP_ALLOC (max_size, sp.align_xp); - sp.yp = SPEED_TMP_ALLOC (max_size, sp.align_yp); + sp.xp_block = SPEED_TMP_ALLOC_LIMBS (SPEED_BLOCK_SIZE, sp.align_xp); + 
sp.yp_block = SPEED_TMP_ALLOC_LIMBS (SPEED_BLOCK_SIZE, sp.align_yp); - data_fill (sp.xp, max_size); - data_fill (sp.yp, max_size); + data_fill (sp.xp_block, SPEED_BLOCK_SIZE); + data_fill (sp.yp_block, SPEED_BLOCK_SIZE); for (i = 0; i < size_num; i++) { @@ -538,7 +548,7 @@ run_gnuplot (void) /* Return a long with n many one bits (starting from the least significant) */ #define LONG_ONES(n) \ - ((n) == LONG_BIT ? -1L : (n) == 0 ? 0L : (1L << (n)) - 1) + ((n) == BITS_PER_LONGINT ? -1L : (n) == 0 ? 0L : (1L << (n)) - 1) long r_string (const char *s) @@ -554,10 +564,10 @@ r_string (const char *s) if (strcmp (s, "bits") == 0) { mp_limb_t l; - if (n > LONG_BIT) + if (n > BITS_PER_LONGINT) { fprintf (stderr, "%ld bit parameter invalid (max %d bits)\n", - n, LONG_BIT); + n, BITS_PER_LONGINT); exit (1); } mpn_random (&l, 1); @@ -565,10 +575,10 @@ r_string (const char *s) } else if (strcmp (s, "ones") == 0) { - if (n > LONG_BIT) + if (n > BITS_PER_LONGINT) { fprintf (stderr, "%ld bit parameter invalid (max %d bits)\n", - n, LONG_BIT); + n, BITS_PER_LONGINT); exit (1); } return LONG_ONES (n); @@ -657,6 +667,7 @@ Times are in seconds, accuracy is shown.\n\ -P name output plot files \"name.gnuplot\" and \"name.data\"\n\ -a <type> use given data: random(default), random2, zeros, ffs\n\ -x, -y, -w, -W <align> specify data alignments, sources and dests\n\ + -o addrs print addresses of data blocks \n\ If both -t and -f are used, it means step by the factor or the step, whichever\n\ is greater.\n\ @@ -735,7 +746,7 @@ main (int argc, char *argv[]) and isn't lost if you kill the program half way. 
*/ setbuf (stdout, NULL); -#define OPTSTRING "a:CcDdEFf:p:P:rRs:t:ux:y:w:W:z" +#define OPTSTRING "a:CcDdEFf:o:p:P:rRs:t:ux:y:w:W:z" #if HAVE_GETOPT_LONG while ((opt = getopt_long(argc, argv, OPTSTRING, longopts, NULL)) != EOF) @@ -796,6 +807,9 @@ main (int argc, char *argv[]) exit (1); } break; + case 'o': + speed_option_set (optarg); + break; case 'P': option_gnuplot = 1; option_gnuplot_basename = optarg; @@ -804,9 +818,7 @@ main (int argc, char *argv[]) speed_precision = atoi (optarg); break; case 'R': - srand (time (NULL)); - srandom (time (NULL)); - srand48 (time (NULL)); + option_seed = time (NULL); break; case 'r': if (option_cmp != CMP_ABSOLUTE) @@ -892,6 +904,10 @@ main (int argc, char *argv[]) exit (1); } + srand (option_seed); + srandom (option_seed); + srand48 (option_seed); + choice = (struct choice_t *) (*_mp_allocate_func) ((argc - optind) * sizeof(choice[0])); for ( ; optind < argc; optind++) diff --git a/tune/speed.h b/tune/speed.h index e32e1e979..49078f77d 100644 --- a/tune/speed.h +++ b/tune/speed.h @@ -43,7 +43,7 @@ MA 02111-1307, USA. /* A mask of the least significant n bits. Note 1<<32 doesn't give zero on x86 family CPUs, hence the separate case for BITS_PER_MP_LIMB. */ #define MP_LIMB_T_LOWBITMASK(n) \ - ((n) == BITS_PER_MP_LIMB ? ~0 : ((mp_limb_t) 1 << (n)) - 1) + ((n) == BITS_PER_MP_LIMB ? MP_LIMB_T_MAX : ((mp_limb_t) 1 << (n)) - 1) /* align must be a power of 2 here, usually CACHE_LINE_SIZE is a good choice */ @@ -53,19 +53,29 @@ MA 02111-1307, USA. #define TMP_ALLOC_LIMBS_ALIGNED(limbs, align) \ ((mp_ptr) TMP_ALLOC_ALIGNED ((limbs)*sizeof(mp_limb_t), align)) -/* 32 is right for pentium family, need to configure this for other CPUs */ +/* 32 for pentium, 64 for athlon, might want to configure this for other + CPUs. In truth though nothing has yet shown up that cares about cache + line boundaries. The only practical effect of this is to restrict the + range that s->align_xp can take. Perhaps this could be a variable + instead. 
*/ #define CACHE_LINE_SIZE 64 /* bytes */ #define SPEED_TMP_ALLOC_ADJUST_MASK (CACHE_LINE_SIZE/BYTES_PER_MP_LIMB - 1) -#define SPEED_TMP_ALLOC(limbs, align) \ - (speed_tmp_alloc_adjust \ +#define SPEED_TMP_ALLOC_LIMBS(limbs, align) \ + (speed_tmp_alloc_adjust \ (TMP_ALLOC_LIMBS((limbs) + SPEED_TMP_ALLOC_ADJUST_MASK), (align))) -/* Minimum source data limbs available in s.xp and y.sp from speed program. - 512 means 2kbytes of data for xp and yp, making 4k total, which should - fit easily in any L1 data cache. */ -#define SPEED_DATA_SIZE 512 + +/* This is the size for s->xp_block and s->yp_block, used in certain + routines that want to run across many different data values and use + s->size for a different purpose, eg. SPEED_ROUTINE_MPN_GCD_1. + + 512 means 2kbytes of data for each of xp_block and yp_block, making 4k + total, which should fit easily in any L1 data cache. */ + +#define SPEED_BLOCK_SIZE 512 /* limbs */ + extern double speed_unittime; extern double speed_cycletime; @@ -76,32 +86,44 @@ void speed_starttime _PROTO ((void)); double speed_endtime _PROTO ((void)); struct speed_params { - unsigned reps; /* how many times to run the routine */ - mp_ptr xp; /* first argument */ - mp_ptr yp; /* second argument */ - mp_size_t size; /* size of both arguments */ - long r; /* user supplied parameter */ + unsigned reps; /* how many times to run the routine */ + mp_ptr xp; /* first argument */ + mp_ptr yp; /* second argument */ + mp_size_t size; /* size of both arguments */ + long r; /* user supplied parameter */ mp_size_t align_xp; /* alignment of xp */ mp_size_t align_yp; /* alignment of yp */ mp_size_t align_wp; /* intended alignment of wp */ mp_size_t align_wp2; /* intended alignment of wp2 */ + mp_ptr xp_block; /* first special SPEED_BLOCK_SIZE block */ + mp_ptr yp_block; /* second special SPEED_BLOCK_SIZE block */ double time_divisor; /* optionally set by the speed routine */ - + + /* used by the cache priming things */ int cache; unsigned src_num, dst_num; 
struct { mp_ptr ptr; mp_size_t size; - } src[2], dst[2]; + } src[2], dst[3]; }; typedef double (*speed_function_t) _PROTO ((struct speed_params *s)); -double speed_measure _PROTO ((double (*fun)_PROTO ((struct speed_params *s)), - struct speed_params *s)); +double speed_measure _PROTO ((speed_function_t fun, struct speed_params *s)); + +/* Prototypes for speed measuring routines */ + +double speed_malloc_free _PROTO ((struct speed_params *s)); +double speed_malloc_realloc_free _PROTO ((struct speed_params *s)); double speed_memcpy _PROTO ((struct speed_params *s)); double speed_modlimb_invert _PROTO ((struct speed_params *s)); +double speed_mp_allocate_free _PROTO ((struct speed_params *s)); +double speed_mp_allocate_reallocate_free _PROTO ((struct speed_params *s)); + +double speed_mpf_init_clear _PROTO ((struct speed_params *s)); + double speed_mpn_add_n _PROTO ((struct speed_params *s)); double speed_mpn_add_n_self _PROTO ((struct speed_params *s)); double speed_mpn_add_n_inplace _PROTO ((struct speed_params *s)); @@ -156,16 +178,23 @@ double speed_mpn_toom3_sqr_n _PROTO ((struct speed_params *s)); double speed_mpn_xnor_n _PROTO ((struct speed_params *s)); double speed_mpn_xor_n _PROTO ((struct speed_params *s)); +double speed_mpq_init_clear _PROTO ((struct speed_params *s)); + +double speed_mpz_add _PROTO ((struct speed_params *s)); +double speed_mpz_bin_uiui _PROTO ((struct speed_params *s)); double speed_mpz_fac_ui _PROTO ((struct speed_params *s)); double speed_mpz_fib_ui _PROTO ((struct speed_params *s)); +double speed_mpz_init_clear _PROTO ((struct speed_params *s)); +double speed_mpz_init_realloc_clear _PROTO ((struct speed_params *s)); double speed_mpz_powm _PROTO ((struct speed_params *s)); -double speed_mpn_jacobi_base _PROTO ((struct speed_params *s)); -double speed_mpn_jacobi_base_division _PROTO ((struct speed_params *s)); double speed_noop _PROTO ((struct speed_params *s)); double speed_noop_wxs _PROTO ((struct speed_params *s)); double 
speed_noop_wxys _PROTO ((struct speed_params *s)); + +/* Prototypes for other routines */ + /* low 32-bits in p[0], high 32-bits in p[1] */ void speed_cyclecounter _PROTO ((unsigned p[2])); @@ -184,23 +213,14 @@ void *_mp_allocate_or_reallocate _PROTO ((void *ptr, void *align_pointer _PROTO ((void *p, size_t align)); void *_mp_allocate_func_aligned _PROTO ((size_t bytes, size_t align)); void speed_cache_fill _PROTO ((struct speed_params *s)); +void speed_operand_src _PROTO ((struct speed_params *s, + mp_ptr ptr, mp_size_t size)); +void speed_operand_dst _PROTO ((struct speed_params *s, + mp_ptr ptr, mp_size_t size)); void mpz_set_n _PROTO ((mpz_ptr z, mp_srcptr p, mp_size_t size)); - - -#define SPEED_OPERAND_SRC(s,p,sz) \ - do { \ - (s)->src[(s)->src_num].ptr = (p); \ - (s)->src[(s)->src_num].size = (sz); \ - (s)->src_num++; \ - } while (0) - -#define SPEED_OPERAND_DST(s,p,sz) \ - do { \ - (s)->dst[(s)->dst_num].ptr = (p); \ - (s)->dst[(s)->dst_num].size = (sz); \ - (s)->dst_num++; \ - } while (0) +extern int speed_option_addrs; +void speed_option_set _PROTO((const char *s)); #define SPEED_RESTRICT_COND(cond) if (!(cond)) return -1.0; @@ -216,10 +236,10 @@ void mpz_set_n _PROTO ((mpz_ptr z, mp_srcptr p, mp_size_t size)); SPEED_RESTRICT_COND (s->size >= 0); \ \ TMP_MARK (marker); \ - wp = SPEED_TMP_ALLOC (s->size, s->align_wp); \ + wp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp); \ \ - SPEED_OPERAND_SRC (s, s->xp, s->size); \ - SPEED_OPERAND_DST (s, wp, s->size); \ + speed_operand_src (s, s->xp, s->size); \ + speed_operand_dst (s, wp, s->size); \ speed_cache_fill (s); \ \ speed_starttime (); \ @@ -250,11 +270,11 @@ void mpz_set_n _PROTO ((mpz_ptr z, mp_srcptr p, mp_size_t size)); SPEED_RESTRICT_COND (s->size >= 1); \ \ TMP_MARK (marker); \ - wp = SPEED_TMP_ALLOC (s->size, s->align_wp); \ + wp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp); \ \ - SPEED_OPERAND_SRC (s, s->xp, s->size); \ - SPEED_OPERAND_SRC (s, s->yp, s->size); \ - SPEED_OPERAND_DST (s, wp, 
s->size); \ + speed_operand_src (s, s->xp, s->size); \ + speed_operand_src (s, s->yp, s->size); \ + speed_operand_dst (s, wp, s->size); \ speed_cache_fill (s); \ \ speed_starttime (); \ @@ -292,10 +312,10 @@ void mpz_set_n _PROTO ((mpz_ptr z, mp_srcptr p, mp_size_t size)); SPEED_RESTRICT_COND (s->size >= 1); \ \ TMP_MARK (marker); \ - wp = SPEED_TMP_ALLOC (s->size, s->align_wp); \ + wp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp); \ \ - SPEED_OPERAND_SRC (s, s->xp, s->size); \ - SPEED_OPERAND_DST (s, wp, s->size); \ + speed_operand_src (s, s->xp, s->size); \ + speed_operand_dst (s, wp, s->size); \ speed_cache_fill (s); \ \ speed_starttime (); \ @@ -343,11 +363,11 @@ void mpz_set_n _PROTO ((mpz_ptr z, mp_srcptr p, mp_size_t size)); SPEED_RESTRICT_COND (size1 >= s->size); \ \ TMP_MARK (marker); \ - wp = SPEED_TMP_ALLOC (size1 + s->size, s->align_wp); \ + wp = SPEED_TMP_ALLOC_LIMBS (size1 + s->size, s->align_wp); \ \ - SPEED_OPERAND_SRC (s, s->xp, size1); \ - SPEED_OPERAND_SRC (s, s->yp, s->size); \ - SPEED_OPERAND_DST (s, wp, size1 + s->size); \ + speed_operand_src (s, s->xp, size1); \ + speed_operand_src (s, s->yp, s->size); \ + speed_operand_dst (s, wp, size1 + s->size); \ speed_cache_fill (s); \ \ speed_starttime (); \ @@ -362,79 +382,147 @@ void mpz_set_n _PROTO ((mpz_ptr z, mp_srcptr p, mp_size_t size)); } -#define SPEED_ROUTINE_MPN_MUL_N_CALL(call, tsize) \ - { \ - mp_ptr wp, tspace; \ - unsigned i; \ - double t; \ - TMP_DECL (marker); \ - \ - SPEED_RESTRICT_COND (s->size >= 1); \ - \ - TMP_MARK (marker); \ - wp = SPEED_TMP_ALLOC (2*s->size, s->align_wp); \ - tspace = SPEED_TMP_ALLOC (tsize, s->align_wp2); \ - \ - SPEED_OPERAND_SRC (s, s->xp, s->size); \ - SPEED_OPERAND_SRC (s, s->yp, s->size); \ - SPEED_OPERAND_DST (s, wp, 2*s->size); \ - SPEED_OPERAND_DST (s, tspace, tsize); \ - speed_cache_fill (s); \ - \ - speed_starttime (); \ - i = s->reps; \ - do \ - call; \ - while (--i != 0); \ - t = speed_endtime (); \ - \ - TMP_FREE (marker); \ - return t; \ 
+#define SPEED_ROUTINE_MPN_MUL_N(function) \ + { \ + mp_ptr wp; \ + unsigned i; \ + double t; \ + TMP_DECL (marker); \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + \ + TMP_MARK (marker); \ + wp = SPEED_TMP_ALLOC_LIMBS (2*s->size, s->align_wp); \ + \ + speed_operand_src (s, s->xp, s->size); \ + speed_operand_src (s, s->yp, s->size); \ + speed_operand_dst (s, wp, 2*s->size); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + function (wp, s->xp, s->yp, s->size); \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE (marker); \ + return t; \ + } + + +#define SPEED_ROUTINE_MPN_MUL_N_TSPACE(call, tsize) \ + { \ + mp_ptr wp, tspace; \ + unsigned i; \ + double t; \ + TMP_DECL (marker); \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + \ + TMP_MARK (marker); \ + wp = SPEED_TMP_ALLOC_LIMBS (2*s->size, s->align_wp); \ + tspace = SPEED_TMP_ALLOC_LIMBS (tsize, s->align_wp2); \ + \ + speed_operand_src (s, s->xp, s->size); \ + speed_operand_src (s, s->yp, s->size); \ + speed_operand_dst (s, wp, 2*s->size); \ + speed_operand_dst (s, tspace, tsize); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + call; \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE (marker); \ + return t; \ } -#define SPEED_ROUTINE_MPN_MUL_N(function) \ - SPEED_ROUTINE_MPN_MUL_N_CALL (function (wp, s->xp, s->yp, s->size), 1) +/* FIXME: size restrictions */ +#define SPEED_ROUTINE_MPN_KARA_MUL_N(function) \ + SPEED_ROUTINE_MPN_MUL_N_TSPACE \ + (function (wp, s->xp, s->xp, s->size, tspace), \ + MPN_KARA_MUL_N_TSIZE (s->size)) -#define SPEED_ROUTINE_GMPN_TOOM3_MUL_N(function) \ - SPEED_ROUTINE_MPN_MUL_N_CALL (function (wp, s->xp, s->yp, s->size, tspace), \ - MPN_TOOM3_MUL_N_TSIZE (s->size)) +/* FIXME: size restrictions */ +#define SPEED_ROUTINE_MPN_TOOM3_MUL_N(function) \ + SPEED_ROUTINE_MPN_MUL_N_TSPACE \ + (function (wp, s->xp, s->yp, s->size, tspace), \ + MPN_TOOM3_MUL_N_TSIZE (s->size)) -#define 
SPEED_ROUTINE_MPN_SQR_CALL(call, tsize) \ - { \ - mp_ptr wp, tspace; \ - unsigned i; \ - double t; \ - TMP_DECL (marker); \ - \ - SPEED_RESTRICT_COND (s->size >= 1); \ - \ - TMP_MARK (marker); \ - wp = SPEED_TMP_ALLOC (2*s->size, s->align_wp); \ - tspace = SPEED_TMP_ALLOC (tsize, s->align_wp2); \ - \ - SPEED_OPERAND_SRC (s, s->xp, s->size); \ - SPEED_OPERAND_DST (s, wp, 2*s->size); \ - SPEED_OPERAND_DST (s, tspace, tsize); \ - speed_cache_fill (s); \ - \ - speed_starttime (); \ - i = s->reps; \ - do \ - call; \ - while (--i != 0); \ - t = speed_endtime (); \ - \ - TMP_FREE (marker); \ - return t; \ +#define SPEED_ROUTINE_MPN_SQR_CALL(call) \ + { \ + mp_ptr wp; \ + unsigned i; \ + double t; \ + TMP_DECL (marker); \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + \ + TMP_MARK (marker); \ + wp = SPEED_TMP_ALLOC_LIMBS (2*s->size, s->align_wp); \ + \ + speed_operand_src (s, s->xp, s->size); \ + speed_operand_dst (s, wp, 2*s->size); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + call; \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE (marker); \ + return t; \ + } + +#define SPEED_ROUTINE_MPN_SQR(function) \ + SPEED_ROUTINE_MPN_SQR_CALL (function (wp, s->xp, s->size)) + + +#define SPEED_ROUTINE_MPN_SQR_TSPACE(call, tsize) \ + { \ + mp_ptr wp, tspace; \ + unsigned i; \ + double t; \ + TMP_DECL (marker); \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + \ + TMP_MARK (marker); \ + wp = SPEED_TMP_ALLOC_LIMBS (2*s->size, s->align_wp); \ + tspace = SPEED_TMP_ALLOC_LIMBS (tsize, s->align_wp2); \ + \ + speed_operand_src (s, s->xp, s->size); \ + speed_operand_dst (s, wp, 2*s->size); \ + speed_operand_dst (s, tspace, tsize); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + call; \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + TMP_FREE (marker); \ + return t; \ } -#define SPEED_ROUTINE_MPN_SQR(function) \ - SPEED_ROUTINE_MPN_SQR_CALL (function (wp, s->xp, s->size), 1) +/* FIXME: size restrictions 
*/ +#define SPEED_ROUTINE_MPN_KARA_SQR_N(function) \ + SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace), \ + MPN_KARA_SQR_N_TSIZE (s->size)) -#define SPEED_ROUTINE_GMPN_TOOM3_SQR_N(function) \ - SPEED_ROUTINE_MPN_SQR_CALL (function (wp, s->xp, s->size, tspace), \ - MPN_TOOM3_SQR_N_TSIZE (s->size)) +/* FIXME: size restrictions */ +#define SPEED_ROUTINE_MPN_TOOM3_SQR_N(function) \ + SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace), \ + MPN_TOOM3_SQR_N_TSIZE (s->size)) #define SPEED_ROUTINE_MPN_MOD_CALL(call) \ @@ -443,7 +531,7 @@ void mpz_set_n _PROTO ((mpz_ptr z, mp_srcptr p, mp_size_t size)); \ SPEED_RESTRICT_COND (s->size >= 0); \ \ - SPEED_OPERAND_SRC (s, s->xp, s->size); \ + speed_operand_src (s, s->xp, s->size); \ speed_cache_fill (s); \ \ speed_starttime (); \ @@ -463,40 +551,40 @@ void mpz_set_n _PROTO ((mpz_ptr z, mp_srcptr p, mp_size_t size)); /* A division of 2*s->size by s->size limbs */ -#define SPEED_ROUTINE_MPN_BZ_DIVREM_CALL(call) \ - { \ - unsigned i; \ - mp_ptr a, d, q, r; \ - \ - SPEED_RESTRICT_COND (s->size >= 1); \ - \ - TMP_MARK (marker); \ - a = SPEED_TMP_ALLOC (2*s->size, s->align_xp); \ - d = SPEED_TMP_ALLOC (s->size, s->align_yp); \ - q = SPEED_TMP_ALLOC (s->size+1, s->align_wp); \ - r = SPEED_TMP_ALLOC (s->size, s->align_wp2); \ - \ - MPN_COPY (a, s->xp, s->size); \ - MPN_COPY (a+s->size, s->xp, s->size); \ - \ - MPN_COPY (d, s->yp, s->size); \ - \ - /* normalize the data */ \ - d[s->size-1] |= MP_LIMB_T_HIGHBIT; \ - a[2*s->size-1] = d[s->size-1] - 1; \ - \ - SPEED_OPERAND_SRC (s, a, 2*s->size); \ - SPEED_OPERAND_SRC (s, d, s->size); \ - SPEED_OPERAND_DST (s, q, s->size+1); \ - SPEED_OPERAND_DST (s, r, s->size); \ - speed_cache_fill (s); \ - \ - speed_starttime (); \ - i = s->reps; \ - do \ - call; \ - while (--i != 0); \ - return speed_endtime (); \ +#define SPEED_ROUTINE_MPN_BZ_DIVREM_CALL(call) \ + { \ + unsigned i; \ + mp_ptr a, d, q, r; \ + \ + SPEED_RESTRICT_COND (s->size >= 1); \ + \ + TMP_MARK 
(marker); \ + a = SPEED_TMP_ALLOC_LIMBS (2*s->size, s->align_xp); \ + d = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_yp); \ + q = SPEED_TMP_ALLOC_LIMBS (s->size+1, s->align_wp); \ + r = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp2); \ + \ + MPN_COPY (a, s->xp, s->size); \ + MPN_COPY (a+s->size, s->xp, s->size); \ + \ + MPN_COPY (d, s->yp, s->size); \ + \ + /* normalize the data */ \ + d[s->size-1] |= MP_LIMB_T_HIGHBIT; \ + a[2*s->size-1] = d[s->size-1] - 1; \ + \ + speed_operand_src (s, a, 2*s->size); \ + speed_operand_src (s, d, s->size); \ + speed_operand_dst (s, q, s->size+1); \ + speed_operand_dst (s, r, s->size); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + call; \ + while (--i != 0); \ + return speed_endtime (); \ } #define SPEED_ROUTINE_MPN_BZ_DIVREM_N(function) \ @@ -506,8 +594,8 @@ void mpz_set_n _PROTO ((mpz_ptr z, mp_srcptr p, mp_size_t size)); SPEED_ROUTINE_MPN_BZ_DIVREM_CALL \ ((*function) (q, a, 2*s->size, d, s->size)) -#define SPEED_ROUTINE_MPN_BZ_TDIV_QR(function) \ - SPEED_ROUTINE_MPN_BZ_DIVREM_CALL \ +#define SPEED_ROUTINE_MPN_BZ_TDIV_QR(function) \ + SPEED_ROUTINE_MPN_BZ_DIVREM_CALL \ ((*function) (q, r, 0, a, 2*s->size, d, s->size)) @@ -517,7 +605,7 @@ void mpz_set_n _PROTO ((mpz_ptr z, mp_srcptr p, mp_size_t size)); \ SPEED_RESTRICT_COND (s->size >= 1); \ \ - SPEED_OPERAND_SRC (s, s->xp, s->size); \ + speed_operand_src (s, s->xp, s->size); \ speed_cache_fill (s); \ \ speed_starttime (); \ @@ -534,8 +622,8 @@ void mpz_set_n _PROTO ((mpz_ptr z, mp_srcptr p, mp_size_t size)); \ SPEED_RESTRICT_COND (s->size >= 1); \ \ - SPEED_OPERAND_SRC (s, s->xp, s->size); \ - SPEED_OPERAND_SRC (s, s->yp, s->size); \ + speed_operand_src (s, s->xp, s->size); \ + speed_operand_src (s, s->yp, s->size); \ speed_cache_fill (s); \ \ speed_starttime (); \ @@ -619,8 +707,8 @@ void mpz_set_n _PROTO ((mpz_ptr z, mp_srcptr p, mp_size_t size)); SPEED_RESTRICT_COND (s->size >= 0); \ \ TMP_MARK (marker); \ - wp = SPEED_TMP_ALLOC (s->size, 
s->align_wp); \ - wp2 = SPEED_TMP_ALLOC (s->size, s->align_wp2); \ + wp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp); \ + wp2 = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp2); \ xp = s->xp; \ yp = s->yp; \ \ @@ -637,10 +725,10 @@ void mpz_set_n _PROTO ((mpz_ptr z, mp_srcptr p, mp_size_t size)); if (xp != s->xp) MPN_COPY (xp, s->xp, s->size); \ if (yp != s->yp) MPN_COPY (yp, s->yp, s->size); \ \ - SPEED_OPERAND_SRC (s, xp, s->size); \ - SPEED_OPERAND_SRC (s, yp, s->size); \ - SPEED_OPERAND_DST (s, wp, s->size); \ - SPEED_OPERAND_DST (s, wp2, s->size); \ + speed_operand_src (s, xp, s->size); \ + speed_operand_src (s, yp, s->size); \ + speed_operand_dst (s, wp, s->size); \ + speed_operand_dst (s, wp2, s->size); \ speed_cache_fill (s); \ \ speed_starttime (); \ @@ -654,17 +742,15 @@ void mpz_set_n _PROTO ((mpz_ptr z, mp_srcptr p, mp_size_t size)); return t; \ } -#define SPEED_ROUTINE_MPN_ADDSUB_N(function) \ - SPEED_ROUTINE_MPN_ADDSUB_CALL \ +#define SPEED_ROUTINE_MPN_ADDSUB_N(function) \ + SPEED_ROUTINE_MPN_ADDSUB_CALL \ (function (wp, wp2, xp, yp, s->size)); -#define SPEED_ROUTINE_MPN_ADDSUB_NC(function) \ - SPEED_ROUTINE_MPN_ADDSUB_CALL \ +#define SPEED_ROUTINE_MPN_ADDSUB_NC(function) \ + SPEED_ROUTINE_MPN_ADDSUB_CALL \ (function (wp, wp2, xp, yp, s->size, 0)); -/* function (wp1, wp2, wp1, wp2, s->size); */ /*full*/ - #define SPEED_ROUTINE_MPN_GCD_1xN(function) \ { \ unsigned i; \ @@ -676,7 +762,7 @@ void mpz_set_n _PROTO ((mpz_ptr z, mp_srcptr p, mp_size_t size)); \ TMP_MARK (marker); \ \ - SPEED_OPERAND_SRC (s, s->xp, s->size); \ + speed_operand_src (s, s->xp, s->size); \ speed_cache_fill (s); \ \ speed_starttime (); \ @@ -691,7 +777,7 @@ void mpz_set_n _PROTO ((mpz_ptr z, mp_srcptr p, mp_size_t size)); } -/* SPEED_DATA_SIZE many one GCDs of s->size bits each. */ +/* SPEED_BLOCK_SIZE many one GCDs of s->size bits each. 
*/ #define SPEED_ROUTINE_MPN_GCD_1_CALL(setup, call) \ { \ @@ -705,29 +791,29 @@ void mpz_set_n _PROTO ((mpz_ptr z, mp_srcptr p, mp_size_t size)); SPEED_RESTRICT_COND (s->size <= mp_bits_per_limb); \ \ TMP_MARK (marker); \ - px = SPEED_TMP_ALLOC (SPEED_DATA_SIZE, s->align_xp); \ - py = SPEED_TMP_ALLOC (SPEED_DATA_SIZE, s->align_yp); \ - MPN_COPY (px, s->xp, SPEED_DATA_SIZE); \ - MPN_COPY (py, s->yp, SPEED_DATA_SIZE); \ + px = SPEED_TMP_ALLOC_LIMBS (SPEED_BLOCK_SIZE, s->align_xp); \ + py = SPEED_TMP_ALLOC_LIMBS (SPEED_BLOCK_SIZE, s->align_yp); \ + MPN_COPY (px, s->xp_block, SPEED_BLOCK_SIZE); \ + MPN_COPY (py, s->yp_block, SPEED_BLOCK_SIZE); \ \ x_mask = MP_LIMB_T_LOWBITMASK (s->size); \ y_mask = MP_LIMB_T_LOWBITMASK (s->r != 0 ? s->r : s->size); \ - for (i = 0; i < SPEED_DATA_SIZE; i++) \ + for (i = 0; i < SPEED_BLOCK_SIZE; i++) \ { \ px[i] &= x_mask; px[i] += (px[i] == 0); \ py[i] &= y_mask; py[i] += (py[i] == 0); \ setup; \ } \ \ - SPEED_OPERAND_SRC (s, px, SPEED_DATA_SIZE); \ - SPEED_OPERAND_SRC (s, py, SPEED_DATA_SIZE); \ + speed_operand_src (s, px, SPEED_BLOCK_SIZE); \ + speed_operand_src (s, py, SPEED_BLOCK_SIZE); \ speed_cache_fill (s); \ \ speed_starttime (); \ i = s->reps; \ do \ { \ - j = SPEED_DATA_SIZE; \ + j = SPEED_BLOCK_SIZE; \ do \ { \ call; \ @@ -739,7 +825,7 @@ void mpz_set_n _PROTO ((mpz_ptr z, mp_srcptr p, mp_size_t size)); \ TMP_FREE (marker); \ \ - s->time_divisor = SPEED_DATA_SIZE; \ + s->time_divisor = SPEED_BLOCK_SIZE; \ return t; \ } @@ -757,11 +843,10 @@ void mpz_set_n _PROTO ((mpz_ptr z, mp_srcptr p, mp_size_t size)); function (px[j-1], py[j-1], 0)) -/* SPEED_DATA_SIZE/s->size many GCDs of s->size limbs each. +/* SPEED_BLOCK_SIZE/s->size many GCDs of s->size limbs each. FIXME: It might be worth reducing the number of GCDs as s->size increases, - after all GCD is an O(n^2) algorithm, even if the accelerated algorithm - flattens this out a bit at smallish sizes. */ + after all GCD is an O(n^2) algorithm. 
*/ #define SPEED_ROUTINE_MPN_GCD_CALL(datadivisor, call) \ { \ @@ -774,20 +859,20 @@ void mpz_set_n _PROTO ((mpz_ptr z, mp_srcptr p, mp_size_t size)); SPEED_RESTRICT_COND (s->size >= 1); \ \ TMP_MARK (marker); \ - xtmp = SPEED_TMP_ALLOC (s->size+1, s->align_xp); \ - ytmp = SPEED_TMP_ALLOC (s->size+1, s->align_yp); \ - wp = SPEED_TMP_ALLOC (s->size, s->align_wp); \ - wp2 = SPEED_TMP_ALLOC (s->size, s->align_wp2); \ + xtmp = SPEED_TMP_ALLOC_LIMBS (s->size+1, s->align_xp); \ + ytmp = SPEED_TMP_ALLOC_LIMBS (s->size+1, s->align_yp); \ + wp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp); \ + wp2 = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp2); \ \ - pieces = SPEED_DATA_SIZE / s->size / datadivisor; \ + pieces = SPEED_BLOCK_SIZE / s->size / datadivisor; \ if (pieces == 0) \ pieces = 1; \ \ psize = pieces * s->size; \ px = TMP_ALLOC_LIMBS (psize); \ py = TMP_ALLOC_LIMBS (psize); \ - MPN_COPY (px, s->xp, psize); \ - MPN_COPY (py, s->yp, psize); \ + MPN_COPY (px, pieces==1 ? s->xp : s->xp_block, psize); \ + MPN_COPY (py, pieces==1 ? 
s->yp : s->yp_block, psize); \ \ /* y must be odd, x must have at least as many bits as y */ \ for (j = 0; j < pieces; j++) \ @@ -800,11 +885,11 @@ void mpz_set_n _PROTO ((mpz_ptr z, mp_srcptr p, mp_size_t size)); x[s->size-1] = MAX (x[s->size-1], y[s->size-1]); \ } \ \ - SPEED_OPERAND_SRC (s, px, psize); \ - SPEED_OPERAND_SRC (s, py, psize); \ - SPEED_OPERAND_DST (s, xtmp, s->size); \ - SPEED_OPERAND_DST (s, ytmp, s->size); \ - SPEED_OPERAND_DST (s, wp, s->size); \ + speed_operand_src (s, px, psize); \ + speed_operand_src (s, py, psize); \ + speed_operand_dst (s, xtmp, s->size); \ + speed_operand_dst (s, ytmp, s->size); \ + speed_operand_dst (s, wp, s->size); \ speed_cache_fill (s); \ \ speed_starttime (); \ @@ -850,14 +935,14 @@ void mpz_set_n _PROTO ((mpz_ptr z, mp_srcptr p, mp_size_t size)); SPEED_RESTRICT_COND (s->size >= 1); \ \ TMP_MARK (marker); \ - xp = SPEED_TMP_ALLOC (s->size, s->align_xp); \ - wp = SPEED_TMP_ALLOC (s->size, s->align_wp); \ + xp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_xp); \ + wp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp); \ \ /* source is overwritten */ \ MPN_COPY (xp, s->xp, s->size); \ \ - SPEED_OPERAND_SRC (s, xp, s->size); \ - SPEED_OPERAND_DST (s, wp, s->size); \ + speed_operand_src (s, xp, s->size); \ + speed_operand_dst (s, wp, s->size); \ speed_cache_fill (s); \ \ speed_starttime (); \ @@ -872,37 +957,40 @@ void mpz_set_n _PROTO ((mpz_ptr z, mp_srcptr p, mp_size_t size)); } -#define SPEED_ROUTINE_MODLIMB_INVERT(function) \ - { \ - unsigned i, j; \ - mp_ptr xp; \ - mp_limb_t n = 1; \ - double t; \ - \ - xp = s->xp - 1; \ - \ - speed_starttime (); \ - i = s->reps; \ - do \ - { \ - j = SPEED_DATA_SIZE; \ - do \ - { \ - /* randomized but successively dependent */ \ - n += (xp[j] << 1); \ - \ - function (n, n); \ - } \ - while (--j != 0); \ - } \ - while (--i != 0); \ - t = speed_endtime (); \ - \ - /* make sure the compiler won't optimize away n */ \ - noop_1 (n); \ - \ - s->time_divisor = SPEED_DATA_SIZE; \ - return t; \ 
+#define SPEED_ROUTINE_MODLIMB_INVERT(function) \ + { \ + unsigned i, j; \ + mp_ptr xp; \ + mp_limb_t n = 1; \ + double t; \ + \ + xp = s->xp_block-1; \ + \ + speed_operand_src (s, s->xp_block, SPEED_BLOCK_SIZE); \ + speed_cache_fill (s); \ + \ + speed_starttime (); \ + i = s->reps; \ + do \ + { \ + j = SPEED_BLOCK_SIZE; \ + do \ + { \ + /* randomized but successively dependent */ \ + n += (xp[j] << 1); \ + \ + function (n, n); \ + } \ + while (--j != 0); \ + } \ + while (--i != 0); \ + t = speed_endtime (); \ + \ + /* make sure the compiler won't optimize away n */ \ + noop_1 (n); \ + \ + s->time_divisor = SPEED_BLOCK_SIZE; \ + return t; \ } #endif |