summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNicolas Boichat <drinkcat@chromium.org>2018-05-24 14:33:06 +0800
committerChromeOS Commit Bot <chromeos-commit-bot@chromium.org>2018-06-01 06:22:07 +0000
commitc85c8ae8a2a2158b9b9bd41e792f0ee574454ca7 (patch)
tree1c64676a1bf374679b120f669f8baa5d6fe2a4cd
parent6cd014b2658f2a709f22b16d06982ff5444cd6db (diff)
downloadchrome-ec-c85c8ae8a2a2158b9b9bd41e792f0ee574454ca7.tar.gz
rsa: Further optimization of multiplications for Cortex-M0
In RSA, we often need to actually compute (a*b)+c+d: provide some assembly optimized functions for that. With -O3, 3072-bit exponent, lower verification time from 104 ms to 88 ms on STM32F072 @48Mhz. BRANCH=poppy BUG=b:35647963 BUG=b:77608104 TEST=On staff, flash, verification successful TEST=make test-rsa, make test-rsa3 TEST=make BOARD=hammer test-utils test-rsa3, test on board Change-Id: I80e8a7258d091e4f6adea11797729ac657dfd85d Signed-off-by: Nicolas Boichat <drinkcat@chromium.org> Reviewed-on: https://chromium-review.googlesource.com/1071411 Reviewed-by: Vincent Palatin <vpalatin@chromium.org> Reviewed-on: https://chromium-review.googlesource.com/1080584
-rw-r--r--common/rsa.c6
-rw-r--r--core/cortex-m0/mula.S37
-rw-r--r--include/util.h16
-rw-r--r--test/utils.c10
4 files changed, 60 insertions, 9 deletions
diff --git a/common/rsa.c b/common/rsa.c
index 3d8518e9ef..8b166e9dfb 100644
--- a/common/rsa.c
+++ b/common/rsa.c
@@ -56,8 +56,8 @@ static void mont_mul_add(const struct rsa_public_key *key,
uint32_t i;
for (i = 1; i < RSANUMWORDS; ++i) {
- A = (A >> 32) + mula32(a, b[i], c[i]);
- B = (B >> 32) + mula32(d0, key->n[i], A);
+ A = mulaa32(a, b[i], c[i], A >> 32);
+ B = mulaa32(d0, key->n[i], A, B >> 32);
c[i - 1] = (uint32_t)B;
}
@@ -82,7 +82,7 @@ static void mont_mul_add_0(const struct rsa_public_key *key,
uint32_t i;
for (i = 1; i < RSANUMWORDS; ++i) {
- B = (B >> 32) + mula32(d0, key->n[i], c[i]);
+ B = mulaa32(d0, key->n[i], c[i], B >> 32);
c[i - 1] = (uint32_t)B;
}
diff --git a/core/cortex-m0/mula.S b/core/cortex-m0/mula.S
index fc0a6f3ee0..02e617c328 100644
--- a/core/cortex-m0/mula.S
+++ b/core/cortex-m0/mula.S
@@ -2,7 +2,7 @@
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*
- * Cortex-M0 multiply-accumulate functions
+ * Cortex-M0 multiply-accumulate[-accumulate] functions
*/
.syntax unified
@@ -44,3 +44,38 @@ mula32:
pop {r4, r5}
bx lr
+@ uint64_t mulaa32(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
+@
+@ Multiply a (r0) and b (r1), add c (r2), add d (r3) and return the product in
+@ r1:r0
+ .thumb_func
+ .section .text.mulaa32
+ .global mulaa32
+mulaa32:
+
+ push {r4, r5, r6}
+ uxth r5, r0 /* r5 = a.lo16 */
+ uxth r6, r1 /* r6 = b.lo16 */
+ uxth r4, r2 /* r4 = c.lo16 */
+ muls r5, r6 /* r5 = a.lo16 * b.lo16 */
+ adds r5, r4 /* r5 = a.lo16 * b.lo16 + c.lo16 == r.lo32 */
+ lsrs r4, r0, 16 /* r4 = a.hi16 */
+ lsrs r2, r2, 16 /* r2 = c.hi16 */
+ muls r6, r4 /* r6 = a.hi16 * b.lo16 */
+ adds r6, r2 /* r6 = a.hi16 * b.lo16 + c.hi16 == r.mid32.1 */
+ uxth r2, r0 /* r2 = a.lo16 */
+ lsrs r1, r1, 16 /* r1 = b.hi16 */
+ muls r2, r1 /* r2 = a.lo16 * b.hi16 == r.mid32.2 */
+ muls r1, r4 /* r1 = b.hi16 * a.hi16 == r.hi32 */
+ movs r4, 0 /* r4 = 0 */
+ adds r6, r2 /* r6 = (r.mid32.1 + r.mid32.2).lo32 == r.mid.lo32 */
+ adcs r4, r4 /* r4 = (r.mid32.1 + r.mid32.2).hi32 == r.mid.hi32 */
+ lsls r0, r6, 16 /* r0 = r.mid.lo32.lo16 << 16 == r.mid.inpos.lo32 */
+ lsrs r6, r6, 16 /* r6 = r.mid.lo32.hi16 >> 16 */
+ lsls r4, r4, 16 /* r4 = r.mid.hi.lo16 << 16 */
+ adds r0, r3 /* r0 = r.mid.inposition.lo32 + d */
+ adcs r4, r6 /* r4 = r6 + r4 + carry = r.mid.inpos.hi32 */
+ adds r0, r5 /* r0 = r.lo32 */
+ adcs r1, r4 /* r1 = r.hi32 */
+ pop {r4, r5, r6}
+ bx lr
diff --git a/include/util.h b/include/util.h
index 2670d4e5ad..c8ae09db93 100644
--- a/include/util.h
+++ b/include/util.h
@@ -184,10 +184,11 @@ int parse_offset_size(int argc, char **argv, int shift,
#ifdef CONFIG_ASSEMBLY_MULA32
/*
- * Compute (a*b)+c, where a, b, c are 32-bit integers, and the result is
- * 64-bit long.
+ * Compute (a*b)+c[+d], where a, b, c[, d] are 32-bit integers, and the result
+ * is 64-bit long.
*/
uint64_t mula32(uint32_t a, uint32_t b, uint32_t c);
+uint64_t mulaa32(uint32_t a, uint32_t b, uint32_t c, uint32_t d);
#else
static inline uint64_t mula32(uint32_t a, uint32_t b, uint32_t c)
{
@@ -198,6 +199,17 @@ static inline uint64_t mula32(uint32_t a, uint32_t b, uint32_t c)
return ret;
}
+
+static inline uint64_t mulaa32(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
+{
+ uint64_t ret = a;
+
+ ret *= b;
+ ret += c;
+ ret += d;
+
+ return ret;
+}
#endif
#endif /* __CROS_EC_UTIL_H */
diff --git a/test/utils.c b/test/utils.c
index a5a53a5eb9..f63dbeec58 100644
--- a/test/utils.c
+++ b/test/utils.c
@@ -360,6 +360,7 @@ static int test_cond_t(void)
static int test_mula32(void)
{
uint64_t r = 0x0;
+ uint64_t r2 = 0x0;
uint32_t b = 0x1;
uint32_t c = 0x1;
uint32_t i;
@@ -368,15 +369,18 @@ static int test_mula32(void)
t0 = get_time();
for (i = 0; i < 5000000; i++) {
r = mula32(b, c, r + (r >> 32));
+ r2 = mulaa32(b, c, r2 >> 32, r2);
b = (b << 13) ^ (b >> 2) ^ i;
c = (c << 16) ^ (c >> 7) ^ i;
watchdog_reload();
}
t1 = get_time();
- ccprintf("After %d iterations, r=%08x%08x (time: %d)\n", i,
- (uint32_t)(r >> 32), (uint32_t)r, t1.le.lo-t0.le.lo);
- TEST_ASSERT(r == 0x9df59b9fb0ab9d96L);
+ ccprintf("After %d iterations, r=%08x%08x, r2=%08x%08x (time: %d)\n",
+ i, (uint32_t)(r >> 32), (uint32_t)r,
+ (uint32_t)(r2 >> 32), (uint32_t)r2, t1.le.lo-t0.le.lo);
+ TEST_ASSERT(r == 0x9df59b9fb0ab9d96L);
+ TEST_ASSERT(r2 == 0x9df59b9fb0beabd6L);
/* well okay then */
return EC_SUCCESS;