diff options
author    Jussi Kivilinna <jussi.kivilinna@iki.fi>    2021-08-13 16:50:34 +0300
committer Jussi Kivilinna <jussi.kivilinna@iki.fi>    2021-08-26 20:30:31 +0300
commit   47e425e07995454573e28c13c08229d2f8a75642 (patch)
tree     fe18a88110106a2eb6a69d8cffc54ee0942b0a1b /cipher
parent   33aebb30d210768d510a2843d9cc0c0ecd4237d1 (diff)
download libgcrypt-47e425e07995454573e28c13c08229d2f8a75642.tar.gz
Add ARMv8-CE HW acceleration for GCM-SIV counter mode
* cipher/rijndael-armv8-aarch32-ce.S
(_gcry_aes_ctr32le_enc_armv8_ce): New.
* cipher/rijndael-armv8-aarch64-ce.S
(_gcry_aes_ctr32le_enc_armv8_ce): New.
* cipher/rijndael-armv8-ce.c
(_gcry_aes_ctr32le_enc_armv8_ce)
(_gcry_aes_armv8_ce_ctr32le_enc): New.
* cipher/rijndael.c
(_gcry_aes_armv8_ce_ctr32le_enc): New prototype.
(do_setkey): Add setup of 'bulk_ops->ctr32le_enc' for ARMv8-CE.
--
Benchmark on Cortex-A53 (aarch64):
Before:
AES | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
GCM-SIV enc | 11.77 ns/B 81.03 MiB/s 7.63 c/B 647.9
GCM-SIV dec | 11.92 ns/B 79.98 MiB/s 7.73 c/B 647.9
GCM-SIV auth | 2.99 ns/B 318.9 MiB/s 1.94 c/B 648.0
After (~2.4x faster):
AES | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
GCM-SIV enc | 4.66 ns/B 204.5 MiB/s 3.02 c/B 647.9
GCM-SIV dec | 4.82 ns/B 198.0 MiB/s 3.12 c/B 647.9
GCM-SIV auth | 3.00 ns/B 318.4 MiB/s 1.94 c/B 648.0
GnuPG-bug-id: T4485
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher')
-rw-r--r--  cipher/rijndael-armv8-aarch32-ce.S | 121
-rw-r--r--  cipher/rijndael-armv8-aarch64-ce.S | 109
-rw-r--r--  cipher/rijndael-armv8-ce.c         |  17
-rw-r--r--  cipher/rijndael.c                  |   5
4 files changed, 252 insertions, 0 deletions
diff --git a/cipher/rijndael-armv8-aarch32-ce.S b/cipher/rijndael-armv8-aarch32-ce.S index 66440bd4..6d78af0a 100644 --- a/cipher/rijndael-armv8-aarch32-ce.S +++ b/cipher/rijndael-armv8-aarch32-ce.S @@ -1017,6 +1017,127 @@ _gcry_aes_ctr_enc_armv8_ce: /* + * void _gcry_aes_ctr32le_enc_armv8_ce (const void *keysched, + * unsigned char *outbuf, + * const unsigned char *inbuf, + * unsigned char *iv, + * unsigned int nrounds); + */ + +.align 3 +.globl _gcry_aes_ctr32le_enc_armv8_ce +.type _gcry_aes_ctr32le_enc_armv8_ce,%function; +_gcry_aes_ctr32le_enc_armv8_ce: + /* input: + * r0: keysched + * r1: outbuf + * r2: inbuf + * r3: iv + * %st+0: nblocks => r4 + * %st+4: nrounds => r5 + */ + + vpush {q4-q7} + push {r4-r12,lr} /* 4*16 + 4*10 = 104b */ + ldr r4, [sp, #(104+0)] + ldr r5, [sp, #(104+4)] + cmp r4, #0 + beq .Lctr32le_enc_skip + + cmp r5, #12 + vld1.8 {q0}, [r3] /* load IV */ + + aes_preload_keys(r0, r6); + + beq .Lctr32le_enc_entry_192 + bhi .Lctr32le_enc_entry_256 + +#define CTR_ENC(bits, ...) 
\ + .Lctr32le_enc_entry_##bits: \ + cmp r4, #4; \ + blo .Lctr32le_enc_loop_##bits; \ + \ + .Lctr32le_enc_loop4_##bits: \ + veor q2, q2; \ + sub r4, r4, #4; \ + vmov.i64 d4, #0xffffffff; /* q2 <= -1:0:0:0 */ \ + vmov q1, q0; \ + vadd.u32 q3, q2, q2; /* q3 <= -2:0:0:0 */ \ + vadd.u32 q0, q3, q3; /* q0 <= -4:0:0:0 */ \ + vadd.u32 q4, q3, q2; /* q4 <= -3:0:0:0 */ \ + vsub.u32 q0, q1, q0; \ + vsub.u32 q2, q1, q2; \ + vst1.8 {q0}, [r3]; \ + vsub.u32 q3, q1, q3; \ + vsub.u32 q4, q1, q4; \ + \ + cmp r4, #4; \ + vld1.8 {q0}, [r2]!; /* load ciphertext */ \ + \ + do_aes_4_##bits(e, mc, q1, q2, q3, q4, ##__VA_ARGS__); \ + \ + veor q1, q1, q0; \ + vld1.8 {q0}, [r2]!; /* load ciphertext */ \ + vst1.8 {q1}, [r1]!; /* store plaintext */ \ + vld1.8 {q1}, [r2]!; /* load ciphertext */ \ + veor q2, q2, q0; \ + veor q3, q3, q1; \ + vld1.8 {q0}, [r2]!; /* load ciphertext */ \ + vst1.8 {q2}, [r1]!; /* store plaintext */ \ + veor q4, q4, q0; \ + vld1.8 {q0}, [r3]; /* reload IV */ \ + vst1.8 {q3-q4}, [r1]!; /* store plaintext */ \ + \ + bhs .Lctr32le_enc_loop4_##bits; \ + cmp r4, #0; \ + beq .Lctr32le_enc_done; \ + \ + .Lctr32le_enc_loop_##bits: \ + \ + veor q2, q2; \ + vmov q1, q0; \ + vmov.i64 d4, #0xffffffff; /* q2 <= -1:0:0:0 */ \ + subs r4, r4, #1; \ + vsub.u32 q0, q0, q2; \ + vld1.8 {q2}, [r2]!; /* load ciphertext */ \ + \ + do_aes_one##bits(e, mc, q1, q1, ##__VA_ARGS__); \ + \ + veor q1, q2, q1; \ + vst1.8 {q1}, [r1]!; /* store plaintext */ \ + \ + bne .Lctr32le_enc_loop_##bits; \ + b .Lctr32le_enc_done; + + CTR_ENC(128) + CTR_ENC(192, r0, r6) + CTR_ENC(256, r0, r6) + +#undef CTR_ENC + +.Lctr32le_enc_done: + vst1.8 {q0}, [r3] /* store IV */ + + CLEAR_REG(q0) + CLEAR_REG(q1) + CLEAR_REG(q2) + CLEAR_REG(q3) + CLEAR_REG(q8) + CLEAR_REG(q9) + CLEAR_REG(q10) + CLEAR_REG(q11) + CLEAR_REG(q12) + CLEAR_REG(q13) + CLEAR_REG(q14) + +.Lctr32le_enc_skip: + pop {r4-r12,lr} + vpop {q4-q7} + bx lr +.size _gcry_aes_ctr32le_enc_armv8_ce,.-_gcry_aes_ctr32le_enc_armv8_ce; + + +/* * void 
_gcry_aes_ocb_enc_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, diff --git a/cipher/rijndael-armv8-aarch64-ce.S b/cipher/rijndael-armv8-aarch64-ce.S index 3af29e0d..a87d2ca5 100644 --- a/cipher/rijndael-armv8-aarch64-ce.S +++ b/cipher/rijndael-armv8-aarch64-ce.S @@ -677,6 +677,115 @@ ELF(.size _gcry_aes_ctr_enc_armv8_ce,.-_gcry_aes_ctr_enc_armv8_ce;) /* + * void _gcry_aes_ctr32le_enc_armv8_ce (const void *keysched, + * unsigned char *outbuf, + * const unsigned char *inbuf, + * unsigned char *iv, + * unsigned int nrounds); + */ + +.align 3 +.globl _gcry_aes_ctr32le_enc_armv8_ce +ELF(.type _gcry_aes_ctr32le_enc_armv8_ce,%function;) +_gcry_aes_ctr32le_enc_armv8_ce: + /* input: + * r0: keysched + * r1: outbuf + * r2: inbuf + * r3: iv + * x4: nblocks + * w5: nrounds + */ + CFI_STARTPROC(); + + cbz x4, .Lctr32le_enc_skip + + mov w6, #1 + movi v16.16b, #0 + mov v16.S[0], w6 + + /* load IV */ + ld1 {v0.16b}, [x3] + + aes_preload_keys(x0, w5); + + b.eq .Lctr32le_enc_entry_192 + b.hi .Lctr32le_enc_entry_256 + +#define CTR_ENC(bits) \ + .Lctr32le_enc_entry_##bits: \ + cmp x4, #4; \ + b.lo .Lctr32le_enc_loop_##bits; \ + \ + .Lctr32le_enc_loop4_##bits: \ + sub x4, x4, #4; \ + \ + add v3.4s, v16.4s, v16.4s; /* 2 */ \ + mov v1.16b, v0.16b; \ + add v2.4s, v0.4s, v16.4s; \ + add v4.4s, v3.4s, v16.4s; /* 3 */ \ + add v6.4s, v3.4s, v3.4s; /* 4 */ \ + add v3.4s, v0.4s, v3.4s; \ + add v4.4s, v0.4s, v4.4s; \ + add v0.4s, v0.4s, v6.4s; \ + \ + cmp x4, #4; \ + ld1 {v5.16b-v7.16b}, [x2], #48; /* preload ciphertext */ \ + \ + do_aes_4_##bits(e, mc, v1, v2, v3, v4); \ + \ + eor v1.16b, v1.16b, v5.16b; \ + ld1 {v5.16b}, [x2], #16; /* load ciphertext */ \ + eor v2.16b, v2.16b, v6.16b; \ + eor v3.16b, v3.16b, v7.16b; \ + eor v4.16b, v4.16b, v5.16b; \ + st1 {v1.16b-v4.16b}, [x1], #64; /* store plaintext */ \ + \ + b.hs .Lctr32le_enc_loop4_##bits; \ + CLEAR_REG(v3); \ + CLEAR_REG(v4); \ + CLEAR_REG(v5); \ + CLEAR_REG(v6); \ + CLEAR_REG(v7); \ + cbz x4, 
.Lctr32le_enc_done; \ + \ + .Lctr32le_enc_loop_##bits: \ + \ + mov v1.16b, v0.16b; \ + ld1 {v2.16b}, [x2], #16; /* load ciphertext */ \ + sub x4, x4, #1; \ + add v0.4s, v0.4s, v16.4s; \ + \ + do_aes_one##bits(e, mc, v1, v1); \ + \ + eor v1.16b, v2.16b, v1.16b; \ + st1 {v1.16b}, [x1], #16; /* store plaintext */ \ + \ + cbnz x4, .Lctr32le_enc_loop_##bits; \ + b .Lctr32le_enc_done; + + CTR_ENC(128) + CTR_ENC(192) + CTR_ENC(256) + +#undef CTR_ENC + +.Lctr32le_enc_done: + aes_clear_keys(w5) + + st1 {v0.16b}, [x3] /* store IV */ + + CLEAR_REG(v0) + CLEAR_REG(v1) + CLEAR_REG(v2) + +.Lctr32le_enc_skip: + ret + CFI_ENDPROC(); +ELF(.size _gcry_aes_ctr32le_enc_armv8_ce,.-_gcry_aes_ctr32le_enc_armv8_ce;) + + +/* * void _gcry_aes_cfb_enc_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, diff --git a/cipher/rijndael-armv8-ce.c b/cipher/rijndael-armv8-ce.c index 6e46830e..b24ae3e9 100644 --- a/cipher/rijndael-armv8-ce.c +++ b/cipher/rijndael-armv8-ce.c @@ -75,6 +75,12 @@ extern void _gcry_aes_ctr_enc_armv8_ce (const void *keysched, unsigned char *iv, size_t nblocks, unsigned int nrounds); +extern void _gcry_aes_ctr32le_enc_armv8_ce (const void *keysched, + unsigned char *outbuf, + const unsigned char *inbuf, + unsigned char *iv, size_t nblocks, + unsigned int nrounds); + extern void _gcry_aes_ocb_enc_armv8_ce (const void *keysched, unsigned char *outbuf, const unsigned char *inbuf, @@ -345,6 +351,17 @@ _gcry_aes_armv8_ce_ctr_enc (RIJNDAEL_context *ctx, unsigned char *iv, _gcry_aes_ctr_enc_armv8_ce(keysched, outbuf, inbuf, iv, nblocks, nrounds); } +void +_gcry_aes_armv8_ce_ctr32le_enc (RIJNDAEL_context *ctx, unsigned char *iv, + unsigned char *outbuf, + const unsigned char *inbuf, size_t nblocks) +{ + const void *keysched = ctx->keyschenc32; + unsigned int nrounds = ctx->rounds; + + _gcry_aes_ctr32le_enc_armv8_ce(keysched, outbuf, inbuf, iv, nblocks, nrounds); +} + size_t _gcry_aes_armv8_ce_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, const 
void *inbuf_arg, size_t nblocks, diff --git a/cipher/rijndael.c b/cipher/rijndael.c index c096321f..df41b911 100644 --- a/cipher/rijndael.c +++ b/cipher/rijndael.c @@ -209,6 +209,10 @@ extern void _gcry_aes_armv8_ce_cbc_enc (void *context, unsigned char *iv, extern void _gcry_aes_armv8_ce_ctr_enc (void *context, unsigned char *ctr, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); +extern void _gcry_aes_armv8_ce_ctr32le_enc (void *context, unsigned char *ctr, + void *outbuf_arg, + const void *inbuf_arg, + size_t nblocks); extern void _gcry_aes_armv8_ce_cfb_dec (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); @@ -570,6 +574,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen, bulk_ops->cbc_enc = _gcry_aes_armv8_ce_cbc_enc; bulk_ops->cbc_dec = _gcry_aes_armv8_ce_cbc_dec; bulk_ops->ctr_enc = _gcry_aes_armv8_ce_ctr_enc; + bulk_ops->ctr32le_enc = _gcry_aes_armv8_ce_ctr32le_enc; bulk_ops->ocb_crypt = _gcry_aes_armv8_ce_ocb_crypt; bulk_ops->ocb_auth = _gcry_aes_armv8_ce_ocb_auth; bulk_ops->xts_crypt = _gcry_aes_armv8_ce_xts_crypt; |