#include #include "gmp.h" #include "gmp-impl.h" #include #include const double BETA = 0.7; const mp_size_t SHORT_MUL_THRESHOLD = 1; // 2*KARATSUBA_MUL_THRESHOLD; #include int cputime () { struct rusage rus; getrusage (0, &rus); return rus.ru_utime.tv_sec * 1000 + rus.ru_utime.tv_usec / 1000; } /* Low short multiplication using the classical multiplication scheme. Adds the h least significant limbs of the product of (s1p,size1) and (s2p,size2) to (rp,h). An eventual carry is ignored. size1>=size2 h<=size1+size2 */ mpn_addlowshortmul_clas(rp, s1p, size1, s2p, size2, h) mp_ptr rp, s1p, s2p; mp_size_t size1, size2, h; { mp_ptr p2=s2p, p3=rp, s2end = s2p+size2, msl, mslp; mp_size_t s; if (size2>=h) { for ( ; h; p2++, p3++, h--) mpn_addmul_1(p3, s1p, h, *p2); return; } if (size1>=h) { for ( ; p2=size2 h<=size1+size2 */ mpn_addlowshortmul_fast(rp, s1p, size1, s2p, size2, h) mp_ptr rp, s1p, s2p; mp_size_t size1, size2, h; { mp_size_t ds; mp_ptr prod; if (h<=size2) { mpn_addlowshortmul_fast_1(rp, s1p, s2p, h); return; } if (h<=size1) { ds = h-size2; TMP_DECL (marker); TMP_MARK (marker); prod = (mp_ptr) TMP_ALLOC(h*BYTES_PER_MP_LIMB); if (ds>=size2) mpn_mul(prod, s1p, ds, s2p, size2); else mpn_mul(prod, s2p, size2, s1p, ds); mpn_addlowshortmul_fast_1(prod+ds, s1p+ds, s2p, size2); mpn_add_n(rp, rp, prod, h); TMP_FREE (marker); return; } ds = h-size2; TMP_DECL (marker); TMP_MARK (marker); prod = (mp_ptr) TMP_ALLOC(h*BYTES_PER_MP_LIMB); if (ds>=size2) mpn_mul(prod, s1p, ds, s2p, size2); else mpn_mul(prod, s2p, size2, s1p, ds); mpn_addlowshortmul_fast(prod+ds, s2p, size2, s1p+ds, size1-ds, size2); mpn_add_n(rp, rp, prod, h); TMP_FREE (marker); } /* Low short multiplication. Returns the h least significant limbs of the product of (s1p,size1) and (s2p,size2). sp has size at least h. */ void mpn_lowshortmul(sp, s1p, size1, s2p, size2, h) mp_ptr sp; mp_srcptr s1p, s2p; mp_size_t size1, size2, h; { if (h>=size1+size2) if (size1>=size2) mpn_mul(sp, s1p, size1, s2p, size2); else mpn_mul(sp, s2p, size2, s1p, size1); else { MPN_ZERO(sp, h); if (size1>=size2) mpn_addlowshortmul_fast(sp, s1p, size1, s2p, size2, h); else mpn_addlowshortmul_fast(sp, s2p, size2, s1p, size1, h); } } /* High short multiplication using the classical multiplication scheme. Let hsp be the high short product of (s1p,size1) and (s2p,size2) with bound h, i.e. hsp is the sum of all s1p[i]*s2p[j]*beta^(i+j) with i+j>=h. Adds hsp[h]+hsp[h+1]*beta+hsp[h+2]*beta^2+... to (rp,size1+size2-h). Returns carry, i.e. 0 or 1. size1>=size2 h=size1-1) { TMP_DECL (marker); TMP_MARK (marker); msl = (mp_ptr) TMP_ALLOC((size2-(h-(size1-1)))*BYTES_PER_MP_LIMB); mslp = msl; for (p1=s1p+size1-1, p2=s2p+(h-(size1-1)), s=1; p2=size2-1) { TMP_DECL (marker); TMP_MARK (marker); msl = (mp_ptr) TMP_ALLOC(size2*BYTES_PER_MP_LIMB); for (mslp=msl, p1=s1p+h, p2=s2p, s=size1-h; p2=s1p; p1--, p2++, mslp++, s++) *mslp = mpn_addmul_1(rp, p1, s, *p2); for (pr=rp+1; p2=size. Adds N[size]+N[size+1]*beta+N[size+2]*beta^2+... to (rp,size), where hsp <= N <= p. Returns carry, i.e. 0 or 1. Here, the formulation with N is needed because you have no control over the carries in the recursive calls. */ mp_limb_t mpn_addhighshortmul_fast_1(rp, s1p, s2p, size) mp_ptr rp, s1p, s2p; mp_size_t size; { mp_ptr fp; if (size=h. Adds N[h]+N[h+1]*beta+N[h+2]*beta^2+... to (rp,size1+size2-h), where hsp <= N <= p. Returns carry, i.e. 0 or 1. size1>=size2 h=size1) { s = size1+size2-h; return mpn_addhighshortmul_fast_1(rp, s1p+size1-s, s2p+size2-s, s); } if (h>=size2) { ds = size1-h; TMP_DECL (marker); TMP_MARK (marker); prod = (mp_ptr) TMP_ALLOC((ds+size2)*BYTES_PER_MP_LIMB); if (ds>=size2) mpn_mul(prod, s1p+h, ds, s2p, size2); else mpn_mul(prod, s2p, size2, s1p+h, ds); msl = mpn_addhighshortmul_fast_1(prod, s1p+(h-size2), s2p, size2); mpn_add_1(prod+size2, prod+size2, ds, msl); retval = mpn_add_n(rp, rp, prod, size1+size2-h); TMP_FREE (marker); return retval; } ds = size1-h; TMP_DECL (marker); TMP_MARK (marker); prod = (mp_ptr) TMP_ALLOC((ds+size2)*BYTES_PER_MP_LIMB); if (ds>=size2) mpn_mul(prod, s1p+h, ds, s2p, size2); else mpn_mul(prod, s2p, size2, s1p+h, ds); msl = mpn_addhighshortmul_fast(prod, s2p, size2, s1p, h, h); mpn_add_1(prod+size2, prod+size2, ds, msl); retval = mpn_add_n(rp, rp, prod, size1+size2-h); TMP_FREE (marker); return retval; } /* High short multiplication. Let p be the product of (s1p,size1) and (s2p,size2). Let hsp be the high short product of (s1p,size1) and (s2p,size2) with bound h, i.e. hsp is the sum of all s1p[i]*s2p[j]*beta^(i+j) with i+j>=h. Returns N[h]+N[h+1]*beta+N[h+2]*beta^2+..., where hsp <= N <= p. The result contains size1+size2-h limbs. If h>=size1+size2, the result contains one zero limb. Here, the formulation with N is needed because you have no control over the carries in the recursive calls. */ mp_ptr mpn_highshortmul_approx(s1p, size1, s2p, size2, h) mp_ptr s1p, s2p; mp_size_t size1, size2, h; { mp_ptr sp; if (h>=size1+size2) { sp = alloca(BYTES_PER_MP_LIMB); MPN_ZERO(sp,1); } sp = alloca((size1+size2-h)*BYTES_PER_MP_LIMB); MPN_ZERO(sp,size1+size2-h); if (size1>=size2) mpn_addhighshortmul_fast(sp, s1p, size1, s2p, size2, h); else mpn_addhighshortmul_fast(sp, s2p, size2, s1p, size1, h); return sp; } /* High short multiplication. Let p be the product of (s1p,size1) and (s2p,size2). Returns p[h]+p[h+1]*beta+p[h+2]*beta^2+.... The result contains size1+size2-h limbs. If h>=size1+size2, the result contains one zero limb. */ mp_ptr mpn_highshortmul(s1p, size1, s2p, size2, h) mp_ptr s1p, s2p; mp_size_t size1, size2, h; { mp_ptr sp, prod; mp_size_t s; if (h>=size1+size2) { sp = (mp_ptr) calloc(1, sizeof(mp_limb_t)); return sp; } if (h<2) { sp = (mp_ptr) calloc(size1+size2, sizeof(mp_limb_t)); if (size1>=size2) mpn_mul(sp, s1p, size1, s2p, size2); else mpn_mul(sp, s2p, size2, s1p, size1); return sp+h; } sp = (mp_ptr) calloc(size1+size2-h+2, sizeof(mp_limb_t)); s = MAX(size1, size2); if (size1>=size2) mpn_addhighshortmul_fast(sp, s1p, size1, s2p, size2, h-2); else mpn_addhighshortmul_fast(sp, s2p, size2, s1p, size1, h-2); if (sp[1]+s=size2) mpn_mul(sp, s1p, size1, s2p, size2); else mpn_mul(sp, s2p, size2, s1p, size1); return sp+h; } return sp+2; } // TESTING test_mpn_addlowshortmul_clas(size1, size2, h) mp_size_t size1, size2, h; { mp_ptr s1p, s2p, rp1, rp2, prod; mp_size_t i; printf("test_mpn_addlowshortmul_clas: "); fflush(stdout); s1p = alloca(size1*BYTES_PER_MP_LIMB); mpn_random(s1p, size1); s2p = alloca(size2*BYTES_PER_MP_LIMB); mpn_random(s2p, size2); rp1 = alloca(h*BYTES_PER_MP_LIMB); mpn_random(rp1, h); rp2 = alloca(h*BYTES_PER_MP_LIMB); for (i=0; i=0) { elapse.tv_sec = now.tv_sec-timer.tv_sec; elapse.tv_usec = now.tv_usec-timer.tv_usec; } else { elapse.tv_sec = now.tv_sec-timer.tv_sec-1; elapse.tv_usec= 1000000+now.tv_usec-timer.tv_usec; } return elapse; } time_mpn_addlowshortmul_clas(size1, size2, h, c) mp_size_t size1, size2, h; int c; { mp_ptr s1p, s2p, rp, prod; struct timeval timer, elapse; int i; printf("time_mpn_addlowshortmul_clas:\n"); s1p = alloca(size1*BYTES_PER_MP_LIMB); mpn_random(s1p, size1); s2p = alloca(size2*BYTES_PER_MP_LIMB); mpn_random(s2p, size2); rp = alloca(h*BYTES_PER_MP_LIMB); mpn_random(rp, h); start_timer(&timer); for (i=c ; i; i--) mpn_addlowshortmul_clas(rp, s1p, size1, s2p, size2, h); elapse = elapsed(timer); printf(" addlowshortmul_clas: %d.%.6d\n",elapse.tv_sec,elapse.tv_usec); start_timer(&timer); for (i=c ; i; i--) { TMP_DECL (marker); TMP_MARK (marker); prod = (mp_ptr) TMP_ALLOC((size1+size2)*BYTES_PER_MP_LIMB); mpn_mul(prod, s1p, size1, s2p, size2); TMP_FREE (marker); } elapse = elapsed(timer); printf(" addfullmul: %d.%.6d\n",elapse.tv_sec,elapse.tv_usec); } time_mpn_addlowshortmul_fast(size1, size2, h, c) mp_size_t size1, size2, h; int c; { mp_ptr s1p, s2p, rp, prod; struct timeval timer, elapse; int i; printf("time_mpn_addlowshortmul_fast:\n"); s1p = alloca(size1*BYTES_PER_MP_LIMB); mpn_random(s1p, size1); s2p = alloca(size2*BYTES_PER_MP_LIMB); mpn_random(s2p, size2); rp = alloca(h*BYTES_PER_MP_LIMB); mpn_random(rp, h); start_timer(&timer); for (i=c ; i; i--) mpn_addlowshortmul_fast(rp, s1p, size1, s2p, size2, h); elapse = elapsed(timer); printf(" addlowshortmul_fast: %d.%.6d\n",elapse.tv_sec,elapse.tv_usec); start_timer(&timer); for (i=c ; i; i--) { TMP_DECL (marker); TMP_MARK (marker); prod = (mp_ptr) TMP_ALLOC((size1+size2)*BYTES_PER_MP_LIMB); mpn_mul(prod, s1p, size1, s2p, size2); TMP_FREE (marker); } elapse = elapsed(timer); printf(" addfullmul: %d.%.6d\n",elapse.tv_sec,elapse.tv_usec); } time_mpn_addhighshortmul_clas(size1, size2, h, c) mp_size_t h; int c; { mp_ptr s1p, s2p, rp, prod; struct timeval timer, elapse; int i; printf("time_mpn_addhighshortmul_clas:\n"); s1p = alloca(size1*BYTES_PER_MP_LIMB); mpn_random(s1p, size1); s2p = alloca(size2*BYTES_PER_MP_LIMB); mpn_random(s2p, size2); rp = alloca((size1+size2-h)*BYTES_PER_MP_LIMB); MPN_ZERO(rp,size1+size2-h); start_timer(&timer); for (i=c ; i; i--) mpn_addhighshortmul_clas(rp, s1p, size1, s2p, size2, h); elapse = elapsed(timer); printf(" addhighshortmul_clas: %d.%.6d\n",elapse.tv_sec,elapse.tv_usec); start_timer(&timer); for (i=c ; i; i--) { TMP_DECL (marker); TMP_MARK (marker); prod = (mp_ptr) TMP_ALLOC((size1+size2)*BYTES_PER_MP_LIMB); mpn_mul(prod, s1p, size1, s2p, size2); TMP_FREE (marker); } elapse = elapsed(timer); printf(" addfullmul: %d.%.6d\n",elapse.tv_sec,elapse.tv_usec); } time_mpn_addhighshortmul_fast(size1, size2, h, c) mp_size_t h; int c; { mp_ptr s1p, s2p, rp, prod; struct timeval timer, elapse; int i; printf("time_mpn_addhighshortmul_fast:\n"); s1p = alloca(size1*BYTES_PER_MP_LIMB); mpn_random(s1p, size1); s2p = alloca(size2*BYTES_PER_MP_LIMB); mpn_random(s2p, size2); rp = alloca((size1+size2-h)*BYTES_PER_MP_LIMB); MPN_ZERO(rp,size1+size2-h); start_timer(&timer); for (i=c ; i; i--) mpn_addhighshortmul_fast(rp, s1p, size1, s2p, size2, h); elapse = elapsed(timer); printf(" addhighshortmul_fast: %d.%.6d\n",elapse.tv_sec,elapse.tv_usec); start_timer(&timer); for (i=c ; i; i--) { TMP_DECL (marker); TMP_MARK (marker); prod = (mp_ptr) TMP_ALLOC((size1+size2)*BYTES_PER_MP_LIMB); mpn_mul(prod, s1p, size1, s2p, size2); TMP_FREE (marker); } elapse = elapsed(timer); printf(" addfullmul: %d.%.6d\n",elapse.tv_sec,elapse.tv_usec); } time_mpn_lowshortmul(size1, size2, h, c) mp_size_t size1, size2, h; int c; { mp_ptr s1p, s2p, sp, prod; int i, st1, st2; printf("time_mpn_lowshortmul:\n"); s1p = alloca(size1*BYTES_PER_MP_LIMB); mpn_random(s1p, size1); s2p = alloca(size2*BYTES_PER_MP_LIMB); mpn_random(s2p, size2); TMP_DECL (marker); TMP_MARK (marker); prod = (mp_ptr) TMP_ALLOC((size1+size2)*BYTES_PER_MP_LIMB); st1 = cputime(); for (i=c ; i; i--) mpn_lowshortmul(prod, s1p, size1, s2p, size2, h); st1 = cputime()-st1; printf(" lowshortmul: %dms\n", st1); st2 = cputime(); for (i=c ; i; i--) mpn_mul(prod, s1p, size1, s2p, size2); st2 = cputime()-st2; TMP_FREE (marker); printf(" fullmul: %dms\n", st2); printf("lowshortmul/fullmul=%1.2f\n", (double) st1 / (double) st2); } time_mpn_lowshortmul_all(size1, size2, h, c) mp_size_t size1, size2, h; int c; { mp_ptr s1p, s2p, sp, prod; int i, st1, st2, st3, newsize, opt_newsize; if (size1