summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Jussi Kivilinna <jussi.kivilinna@iki.fi> 2023-04-10 22:24:43 +0300
committer Jussi Kivilinna <jussi.kivilinna@iki.fi> 2023-04-23 21:19:09 +0300
commit fdf2e8ba654a4dcfee25586dd7e0749f2b7a92c0 (patch)
tree 2f7e68db261f9fc9bde89efc3e5fe190a6316028
parent ad4ee8d52f7199ba8bdee767044337060529069f (diff)
download libgcrypt-fdf2e8ba654a4dcfee25586dd7e0749f2b7a92c0.tar.gz
mpi: optimize mpi_rshift and mpi_lshift to avoid extra MPI copying

* mpi/mpi-bit.c (_gcry_mpi_rshift): Refactor so that _gcry_mpih_rshift
is used to do the copying along with shifting when copying is needed
and refactor so that same code-path is used for both in-place and
copying operation.
(_gcry_mpi_lshift): Refactor so that _gcry_mpih_lshift is used to do
the copying along with shifting when copying is needed and refactor so
that same code-path is used for both in-place and copying operation.
--

Benchmark on AMD Ryzen 9 7900X:

Before:
          |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 rshift3  |     0.039 ns/B     24662 MiB/s     0.182 c/B      4700
 lshift3  |     0.108 ns/B      8832 MiB/s     0.508 c/B      4700
 rshift65 |     0.137 ns/B      6968 MiB/s     0.643 c/B      4700
 lshift65 |     0.109 ns/B      8776 MiB/s     0.511 c/B      4700

After:
          |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 rshift3  |     0.038 ns/B     25049 MiB/s     0.179 c/B      4700
 lshift3  |     0.039 ns/B     24709 MiB/s     0.181 c/B      4700
 rshift65 |     0.038 ns/B     24942 MiB/s     0.180 c/B      4700
 lshift65 |     0.040 ns/B     23671 MiB/s     0.189 c/B      4700

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
-rw-r--r-- mpi/mpi-bit.c 138
1 file changed, 51 insertions, 87 deletions
diff --git a/mpi/mpi-bit.c b/mpi/mpi-bit.c
index e2170401..7313a9d4 100644
--- a/mpi/mpi-bit.c
+++ b/mpi/mpi-bit.c
@@ -251,10 +251,11 @@ _gcry_mpi_rshift_limbs( gcry_mpi_t a, unsigned int count )
void
_gcry_mpi_rshift ( gcry_mpi_t x, gcry_mpi_t a, unsigned int n )
{
- mpi_size_t xsize;
- unsigned int i;
unsigned int nlimbs = (n/BITS_PER_MPI_LIMB);
unsigned int nbits = (n%BITS_PER_MPI_LIMB);
+ unsigned int i;
+ mpi_size_t alimbs;
+ mpi_ptr_t xp, ap;
if (mpi_is_immutable (x))
{
@@ -262,75 +263,42 @@ _gcry_mpi_rshift ( gcry_mpi_t x, gcry_mpi_t a, unsigned int n )
return;
}
- if ( x == a )
- {
- /* In-place operation. */
- if ( nlimbs >= x->nlimbs )
- {
- x->nlimbs = 0;
- return;
- }
+ alimbs = a->nlimbs;
- if (nlimbs)
- {
- for (i=0; i < x->nlimbs - nlimbs; i++ )
- x->d[i] = x->d[i+nlimbs];
- x->d[i] = 0;
- x->nlimbs -= nlimbs;
-
- }
- if ( x->nlimbs && nbits )
- _gcry_mpih_rshift ( x->d, x->d, x->nlimbs, nbits );
- }
- else if ( nlimbs )
+ if (x != a)
{
- /* Copy and shift by more or equal bits than in a limb. */
- xsize = a->nlimbs;
+ RESIZE_IF_NEEDED (x, alimbs);
+ x->nlimbs = alimbs;
+ x->flags = a->flags;
x->sign = a->sign;
- RESIZE_IF_NEEDED (x, xsize);
- x->nlimbs = xsize;
- for (i=0; i < a->nlimbs; i++ )
- x->d[i] = a->d[i];
- x->nlimbs = i;
-
- if ( nlimbs >= x->nlimbs )
- {
- x->nlimbs = 0;
- return;
- }
+ }
+
+ /* In-place operation. */
+ if (nlimbs >= alimbs)
+ {
+ x->nlimbs = 0;
+ return;
+ }
+
+ xp = x->d;
+ ap = a->d;
+ if (alimbs && nbits)
+ {
+ _gcry_mpih_rshift (xp, ap + nlimbs, alimbs - nlimbs, nbits);
if (nlimbs)
- {
- for (i=0; i < x->nlimbs - nlimbs; i++ )
- x->d[i] = x->d[i+nlimbs];
- x->d[i] = 0;
- x->nlimbs -= nlimbs;
- }
-
- if ( x->nlimbs && nbits )
- _gcry_mpih_rshift ( x->d, x->d, x->nlimbs, nbits );
+ xp[alimbs - nlimbs] = 0;
+ x->nlimbs -= nlimbs;
}
- else
+ else if (nlimbs || (x != a))
{
- /* Copy and shift by less than bits in a limb. */
- xsize = a->nlimbs;
- x->sign = a->sign;
- RESIZE_IF_NEEDED (x, xsize);
- x->nlimbs = xsize;
-
- if ( xsize )
- {
- if (nbits )
- _gcry_mpih_rshift (x->d, a->d, x->nlimbs, nbits );
- else
- {
- /* The rshift helper function is not specified for
- NBITS==0, thus we do a plain copy here. */
- for (i=0; i < x->nlimbs; i++ )
- x->d[i] = a->d[i];
- }
- }
+ for (i = 0; i < alimbs - nlimbs; i++ )
+ xp[i] = ap[i + nlimbs];
+ if (nlimbs)
+ xp[i] = 0;
+ x->nlimbs -= nlimbs;
}
+
MPN_NORMALIZE (x->d, x->nlimbs);
}
@@ -368,6 +336,9 @@ _gcry_mpi_lshift ( gcry_mpi_t x, gcry_mpi_t a, unsigned int n )
{
unsigned int nlimbs = (n/BITS_PER_MPI_LIMB);
unsigned int nbits = (n%BITS_PER_MPI_LIMB);
+ mpi_size_t alimbs;
+ mpi_ptr_t xp, ap;
+ int i;
if (mpi_is_immutable (x))
{
@@ -378,34 +349,27 @@ _gcry_mpi_lshift ( gcry_mpi_t x, gcry_mpi_t a, unsigned int n )
if (x == a && !n)
return; /* In-place shift with an amount of zero. */
- if ( x != a )
- {
- /* Copy A to X. */
- unsigned int alimbs = a->nlimbs;
- int asign = a->sign;
- mpi_ptr_t xp, ap;
-
- RESIZE_IF_NEEDED (x, alimbs+nlimbs+1);
- xp = x->d;
- ap = a->d;
- MPN_COPY (xp, ap, alimbs);
- x->nlimbs = alimbs;
- x->flags = a->flags;
- x->sign = asign;
- }
+ /* Note: might be in-place operation, so a==x or a!=x. */
+
+ alimbs = a->nlimbs;
- if (nlimbs && !nbits)
+ RESIZE_IF_NEEDED (x, alimbs + nlimbs + 1);
+ xp = x->d;
+ ap = a->d;
+ if (nbits && alimbs)
{
- /* Shift a full number of limbs. */
- _gcry_mpi_lshift_limbs (x, nlimbs);
+ x->nlimbs = alimbs + nlimbs + 1;
+ xp[alimbs + nlimbs] = _gcry_mpih_lshift (xp + nlimbs, ap, alimbs, nbits);
}
- else if (n)
+ else
{
- /* We use a very dump approach: Shift left by the number of
- limbs plus one and than fix it up by an rshift. */
- _gcry_mpi_lshift_limbs (x, nlimbs+1);
- mpi_rshift (x, x, BITS_PER_MPI_LIMB - nbits);
+ x->nlimbs = alimbs + nlimbs;
+ for (i = alimbs - 1; i >= 0; i--)
+ xp[i + nlimbs] = ap[i];
}
-
+ for (i = 0; i < nlimbs; i++)
+ xp[i] = 0;
+ x->flags = a->flags;
+ x->sign = a->sign;
MPN_NORMALIZE (x->d, x->nlimbs);
}