commit    14c8a593ede42f51f567ed7ba77b53124151aa38
author    Jussi Kivilinna <jussi.kivilinna@iki.fi>  2019-04-26 19:28:11 +0300
committer Jussi Kivilinna <jussi.kivilinna@iki.fi>  2019-04-26 19:28:11 +0300
tree      03a649e8ee9be5efde4b5a313e521814c059b9af
parent    b878a986f3ab2c35aff89c7f66f137a91542ed5b
Add 64-bit ARMv8/CE PMULL implementation of CRC
* cipher/Makefile.am: Add 'crc-armv8-ce.c' and
'crc-armv8-aarch64-ce.S'.
* cipher/asm-common-aarch64.h [HAVE_GCC_ASM_CFI_DIRECTIVES]: Add CFI
helper macros.
* cipher/crc-armv8-aarch64-ce.S: New.
* cipher/crc-armv8-ce.c: New.
* cipher/crc.c (USE_ARM_PMULL): New.
(CRC_CONTEXT) [USE_ARM_PMULL]: Add 'use_pmull'.
[USE_ARM_PMULL] (_gcry_crc32_armv8_ce_pmull)
(_gcry_crc24rfc2440_armv8_ce_pmull): New prototypes.
(crc32_init, crc32rfc1510_init, crc24rfc2440_init): Enable ARM PMULL
implementations if supported by HW features.
(crc32_write, crc24rfc2440_write) [USE_ARM_PMULL]: Use ARM PMULL
implementations if enabled.
* configure.ac: Add 'crc-armv8-ce.lo' and 'crc-armv8-aarch64-ce.lo'.
--
Benchmark on Cortex-A53 (at 1104 MHz):
Before:
                |  nanosecs/byte   mebibytes/sec   cycles/byte
 CRC32          |      2.89 ns/B     330.2 MiB/s      3.19 c/B
 CRC32RFC1510   |      2.89 ns/B     330.2 MiB/s      3.19 c/B
 CRC24RFC2440   |      2.72 ns/B     350.8 MiB/s      3.00 c/B

After (crc32 ~8.4x faster, crc24 ~6.8x faster):
                |  nanosecs/byte   mebibytes/sec   cycles/byte
 CRC32          |     0.341 ns/B      2796 MiB/s     0.377 c/B
 CRC32RFC1510   |     0.342 ns/B      2792 MiB/s     0.377 c/B
 CRC24RFC2440   |     0.398 ns/B      2396 MiB/s     0.439 c/B
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
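
[For context: the accelerated path is selected at context-initialization time
and is transparent to callers, so any program computing a CRC32 through the
regular gcry_md interface picks it up automatically on capable hardware. A
minimal caller — a sketch against the public libgcrypt API only, not part of
this commit — could look like this:

    #include <stdio.h>
    #include <gcrypt.h>

    int
    main (void)
    {
      gcry_md_hd_t hd;
      const unsigned char *digest;

      /* Libgcrypt requires a version check before first use.  */
      if (!gcry_check_version (NULL))
        return 1;
      gcry_control (GCRYCTL_INITIALIZATION_FINISHED, 0);

      if (gcry_md_open (&hd, GCRY_MD_CRC32, 0))
        return 1;
      gcry_md_write (hd, "123456789", 9);
      digest = gcry_md_read (hd, GCRY_MD_CRC32);  /* 4 bytes, big endian */
      /* Standard CRC-32 check value; expected output: cbf43926 */
      printf ("%02x%02x%02x%02x\n",
              digest[0], digest[1], digest[2], digest[3]);
      gcry_md_close (hd);
      return 0;
    }

On an AArch64 CPU that reports NEON and PMULL this exercises the new
_gcry_crc32_armv8_ce_pmull() path; elsewhere it falls back to the
table-driven code.]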
 cipher/Makefile.am            |   3 +-
 cipher/crc-armv8-aarch64-ce.S | 492 ++++++++++++++++++++++++++++++++++++++
 cipher/crc-armv8-ce.c         | 229 +++++++++++++++++
 cipher/crc.c                  |  51 ++++-
 configure.ac                  |   5 +
 5 files changed, 776 insertions(+), 4 deletions(-)
diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 3f00ed4a..2acd7cb3 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -78,7 +78,8 @@ EXTRA_libcipher_la_SOURCES = \
 	cast5.c cast5-amd64.S cast5-arm.S \
 	chacha20.c chacha20-amd64-ssse3.S chacha20-amd64-avx2.S \
 	chacha20-armv7-neon.S chacha20-aarch64.S \
-	crc.c crc-intel-pclmul.c \
+	crc.c crc-intel-pclmul.c crc-armv8-ce.c \
+	crc-armv8-aarch64-ce.S \
 	des.c des-amd64.S \
 	dsa.c \
 	elgamal.c \
diff --git a/cipher/crc-armv8-aarch64-ce.S b/cipher/crc-armv8-aarch64-ce.S
new file mode 100644
index 00000000..497d0055
--- /dev/null
+++ b/cipher/crc-armv8-aarch64-ce.S
@@ -0,0 +1,492 @@
+/* crc-armv8-aarch64-ce.S - ARMv8/CE PMULL accelerated CRC implementation
+ * Copyright (C) 2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "asm-common-aarch64.h"
+
+#if defined(__AARCH64EL__) && \
+    defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
+    defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO)
+
+.cpu generic+simd+crypto
+
+.text
+
+#define GET_DATA_POINTER(reg, name) \
+	adrp reg, :got:name ; \
+	ldr reg, [reg, #:got_lo12:name] ;
+
+/* Structure of crc32_consts_s */
+
+#define consts_k(idx)    ((idx) * 8)
+#define consts_my_p(idx) (consts_k(6) + (idx) * 8)
+
+/* Constants */
+
+.align 6
+.Lcrc32_constants:
+.Lcrc32_partial_fold_input_mask:
+	.byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+	.byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+.Lcrc32_refl_shuf_shift:
+	.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+	.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+	.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+	.byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+.Lcrc32_shuf_shift:
+	.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+	.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+.Lcrc32_bswap_shuf:
+	.byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
+	.byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
+	.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+	.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+
+
+/*
+ * void _gcry_crc32r_armv8_ce_bulk (u32 *pcrc, const byte *inbuf, size_t inlen,
+ *                                  const struct crc32_consts_s *consts);
+ */
+.align 3
+.globl _gcry_crc32r_armv8_ce_bulk
+ELF(.type _gcry_crc32r_armv8_ce_bulk,%function;)
+_gcry_crc32r_armv8_ce_bulk:
+	/* input:
+	 *    x0: pcrc
+	 *    x1: inbuf
+	 *    x2: inlen
+	 *    x3: consts
+	 */
+
+	GET_DATA_POINTER(x7, .Lcrc32_constants)
+	add x9, x3, #consts_k(5 - 1)
+	cmp x2, #128
+
+	b.lo .Lcrc32r_fold_by_one_setup
+
+	eor v4.16b, v4.16b, v4.16b
+	add x4, x3, #consts_k(1 - 1)
+	ld1 {v4.s}[0], [x0]             /* load pcrc */
+	ld1 {v0.16b-v3.16b}, [x1], #64  /* load 64 bytes of input */
+	sub x2, x2, #64
+	ld1 {v6.16b}, [x4]
+	eor v0.16b, v0.16b, v4.16b
+
+	add x4, x3, #consts_k(3 - 1)
+	add x5, x3, #consts_my_p(0)
+
+.Lcrc32r_fold_by_four:
+
+	/* Fold by 4. */
+	ld1 {v16.16b-v19.16b}, [x1], #64 /* load 64 bytes of input */
+	sub x2, x2, #64
+	pmull v20.1q, v0.1d, v6.1d
+	pmull v21.1q, v1.1d, v6.1d
+	pmull v22.1q, v2.1d, v6.1d
+	pmull v23.1q, v3.1d, v6.1d
+	cmp x2, #64
+	pmull2 v24.1q, v0.2d, v6.2d
+	pmull2 v25.1q, v1.2d, v6.2d
+	pmull2 v26.1q, v2.2d, v6.2d
+	pmull2 v27.1q, v3.2d, v6.2d
+	eor v0.16b, v20.16b, v16.16b
+	eor v1.16b, v21.16b, v17.16b
+	eor v2.16b, v22.16b, v18.16b
+	eor v3.16b, v23.16b, v19.16b
+	eor v0.16b, v0.16b, v24.16b
+	eor v1.16b, v1.16b, v25.16b
+	eor v2.16b, v2.16b, v26.16b
+	eor v3.16b, v3.16b, v27.16b
+	b.hs .Lcrc32r_fold_by_four
+
+	ld1 {v6.16b}, [x4]
+	ld1 {v5.16b}, [x5]
+
+	cmp x2, #16
+
+	/* Fold 4 to 1. */
+
+	pmull v16.1q, v0.1d, v6.1d
+	pmull2 v4.1q, v0.2d, v6.2d
+	eor v0.16b, v16.16b, v1.16b
+	eor v0.16b, v0.16b, v4.16b
+
+	pmull v16.1q, v0.1d, v6.1d
+	pmull2 v4.1q, v0.2d, v6.2d
+	eor v0.16b, v16.16b, v2.16b
+	eor v0.16b, v0.16b, v4.16b
+
+	pmull v16.1q, v0.1d, v6.1d
+	pmull2 v4.1q, v0.2d, v6.2d
+	eor v0.16b, v16.16b, v3.16b
+	eor v0.16b, v0.16b, v4.16b
+
+	b.lo .Lcrc32r_fold_by_one_done
+	b .Lcrc32r_fold_by_one
+
+.Lcrc32r_fold_by_one_setup:
+
+	eor v1.16b, v1.16b, v1.16b
+	add x4, x3, #consts_k(3 - 1)
+	add x5, x3, #consts_my_p(0)
+	sub x2, x2, #16
+	ld1 {v1.s}[0], [x0]             /* load pcrc */
+	ld1 {v0.16b}, [x1], #16         /* load 16 bytes of input */
+	cmp x2, #16
+	ld1 {v6.16b}, [x4]              /* load k3k4 */
+	ld1 {v5.16b}, [x5]              /* load my_p */
+	eor v0.16b, v0.16b, v1.16b
+	b.lo .Lcrc32r_fold_by_one_done
+
+.Lcrc32r_fold_by_one:
+	sub x2, x2, #16
+	ld1 {v2.16b}, [x1], #16         /* load 16 bytes of input */
+	pmull v3.1q, v0.1d, v6.1d
+	pmull2 v1.1q, v0.2d, v6.2d
+	cmp x2, #16
+	eor v0.16b, v3.16b, v2.16b
+	eor v0.16b, v0.16b, v1.16b
+
+	b.hs .Lcrc32r_fold_by_one
+
+.Lcrc32r_fold_by_one_done:
+
+	cmp x2, #0
+	b.eq .Lcrc32r_final_fold
+
+	/* Partial fold. */
+
+	add x4, x7, #.Lcrc32_refl_shuf_shift - .Lcrc32_constants
+	add x5, x7, #.Lcrc32_refl_shuf_shift - .Lcrc32_constants + 16
+	add x6, x7, #.Lcrc32_partial_fold_input_mask - .Lcrc32_constants
+	sub x8, x2, #16
+	add x4, x4, x2
+	add x5, x5, x2
+	add x6, x6, x2
+	add x8, x1, x8
+
+	/* Load last input and add padding zeros. */
+	ld1 {v4.16b}, [x4]
+	eor x2, x2, x2
+	ld1 {v3.16b}, [x5]
+	ld1 {v2.16b}, [x6]
+	tbl v30.16b, {v0.16b}, v4.16b
+	ld1 {v4.16b}, [x8]
+	tbl v1.16b, {v0.16b}, v3.16b
+
+	pmull v0.1q, v30.1d, v6.1d
+	and v2.16b, v2.16b, v4.16b
+	pmull2 v31.1q, v30.2d, v6.2d
+	orr v2.16b, v2.16b, v1.16b
+	eor v0.16b, v0.16b, v31.16b
+	eor v0.16b, v0.16b, v2.16b
+
+.Lcrc32r_final_fold:
+
+	/* Final fold. */
+
+	eor v2.16b, v2.16b, v2.16b      /* zero reg */
+	ld1 {v7.16b}, [x9]
+
+	/* reduce 128-bits to 96-bits */
+	ext v6.16b, v6.16b, v6.16b, #8  /* swap high and low parts */
+	mov v1.16b, v0.16b
+	pmull v0.1q, v0.1d, v6.1d
+	ext v6.16b, v5.16b, v5.16b, #8  /* swap high and low parts */
+	ext v1.16b, v1.16b, v2.16b, #8  /* high to low, high zeroed */
+	eor v3.16b, v0.16b, v1.16b
+
+	/* reduce 96-bits to 64-bits */
+	eor v1.16b, v1.16b, v1.16b
+	ext v0.16b, v3.16b, v2.16b, #4  /* [00][00][x2][x1] */
+	mov v1.s[0], v3.s[0]            /* [00][00][00][x0] */
+	eor v3.16b, v3.16b, v3.16b
+	pmull v1.1q, v1.1d, v7.1d       /* [00][00][xx][xx] */
+	eor v0.16b, v0.16b, v1.16b      /* top 64-bit are zero */
+
+	/* barrett reduction */
+	mov v3.s[1], v0.s[0]            /* [00][00][x1][00] */
+	ext v0.16b, v2.16b, v0.16b, #12 /* [??][x1][??][00] */
+	pmull v1.1q, v3.1d, v5.1d       /* [00][xx][xx][00] */
+	pmull v1.1q, v1.1d, v6.1d       /* [00][xx][xx][00] */
+	eor v0.16b, v0.16b, v1.16b
+
+	/* store CRC */
+	st1 {v0.s}[2], [x0]
+
+	ret
+ELF(.size _gcry_crc32r_armv8_ce_bulk,.-_gcry_crc32r_armv8_ce_bulk;)
+
+/*
+ * u32 _gcry_crc32r_armv8_ce_reduction_4 (u32 data, u32 crc,
+ *                                        const struct crc32_consts_s *consts);
+ */
+.align 3
+.globl _gcry_crc32r_armv8_ce_reduction_4
+ELF(.type _gcry_crc32r_armv8_ce_reduction_4,%function;)
+_gcry_crc32r_armv8_ce_reduction_4:
+	/* input:
+	 *    w0: data
+	 *    w1: crc
+	 *    x2: crc32 constants
+	 */
+
+	eor v0.16b, v0.16b, v0.16b
+	add x2, x2, #consts_my_p(0)
+	eor v1.16b, v1.16b, v1.16b
+	ld1 {v5.16b}, [x2]
+
+	mov v0.s[0], w0
+	pmull v0.1q, v0.1d, v5.1d       /* [00][00][xx][xx] */
+	mov v1.s[1], w1
+	mov v0.s[2], v0.s[0]            /* [00][x0][x1][x0] */
+	pmull2 v0.1q, v0.2d, v5.2d      /* [00][00][xx][xx] */
+	eor v0.16b, v0.16b, v1.16b
+
+	mov w0, v0.s[1]
+
+	ret
+ELF(.size _gcry_crc32r_armv8_ce_reduction_4,.-_gcry_crc32r_armv8_ce_reduction_4;)
+
+/*
+ * void _gcry_crc32_armv8_ce_bulk (u32 *pcrc, const byte *inbuf, size_t inlen,
+ *                                 const struct crc32_consts_s *consts);
+ */
+.align 3
+.globl _gcry_crc32_armv8_ce_bulk
+ELF(.type _gcry_crc32_armv8_ce_bulk,%function;)
+_gcry_crc32_armv8_ce_bulk:
+	/* input:
+	 *    x0: pcrc
+	 *    x1: inbuf
+	 *    x2: inlen
+	 *    x3: consts
+	 */
+
+	GET_DATA_POINTER(x7, .Lcrc32_constants)
+	add x4, x7, #.Lcrc32_bswap_shuf - .Lcrc32_constants
+	cmp x2, #128
+	ld1 {v7.16b}, [x4]
+
+	b.lo .Lcrc32_fold_by_one_setup
+
+	eor v4.16b, v4.16b, v4.16b
+	add x4, x3, #consts_k(1 - 1)
+	ld1 {v4.s}[0], [x0]             /* load pcrc */
+	ld1 {v0.16b-v3.16b}, [x1], #64  /* load 64 bytes of input */
+	sub x2, x2, #64
+	ld1 {v6.16b}, [x4]
+	eor v0.16b, v0.16b, v4.16b
+	ext v4.16b, v6.16b, v6.16b, #8
+	tbl v0.16b, { v0.16b }, v7.16b  /* byte swap */
+	tbl v1.16b, { v1.16b }, v7.16b  /* byte swap */
+	tbl v2.16b, { v2.16b }, v7.16b  /* byte swap */
+	tbl v3.16b, { v3.16b }, v7.16b  /* byte swap */
+
+	add x4, x3, #consts_k(3 - 1)
+	add x5, x3, #consts_my_p(0)
+
+.Lcrc32_fold_by_four:
+
+	/* Fold by 4. */
+	ld1 {v16.16b-v19.16b}, [x1], #64 /* load 64 bytes of input */
+	sub x2, x2, #64
+	tbl v16.16b, { v16.16b }, v7.16b /* byte swap */
+	tbl v17.16b, { v17.16b }, v7.16b /* byte swap */
+	tbl v18.16b, { v18.16b }, v7.16b /* byte swap */
+	tbl v19.16b, { v19.16b }, v7.16b /* byte swap */
+	cmp x2, #64
+	pmull2 v20.1q, v0.2d, v4.2d
+	pmull2 v21.1q, v1.2d, v4.2d
+	pmull2 v22.1q, v2.2d, v4.2d
+	pmull2 v23.1q, v3.2d, v4.2d
+	pmull v24.1q, v0.1d, v4.1d
+	pmull v25.1q, v1.1d, v4.1d
+	pmull v26.1q, v2.1d, v4.1d
+	pmull v27.1q, v3.1d, v4.1d
+	eor v0.16b, v20.16b, v16.16b
+	eor v1.16b, v21.16b, v17.16b
+	eor v2.16b, v22.16b, v18.16b
+	eor v3.16b, v23.16b, v19.16b
+	eor v0.16b, v0.16b, v24.16b
+	eor v1.16b, v1.16b, v25.16b
+	eor v2.16b, v2.16b, v26.16b
+	eor v3.16b, v3.16b, v27.16b
+	b.hs .Lcrc32_fold_by_four
+
+	ld1 {v6.16b}, [x4]
+	ld1 {v5.16b}, [x5]
+	ext v6.16b, v6.16b, v6.16b, #8
+	ext v5.16b, v5.16b, v5.16b, #8
+
+	cmp x2, #16
+
+	/* Fold 4 to 1. */
+
+	pmull2 v16.1q, v0.2d, v6.2d
+	pmull v4.1q, v0.1d, v6.1d
+	eor v0.16b, v16.16b, v1.16b
+	eor v0.16b, v0.16b, v4.16b
+
+	pmull2 v16.1q, v0.2d, v6.2d
+	pmull v4.1q, v0.1d, v6.1d
+	eor v0.16b, v16.16b, v2.16b
+	eor v0.16b, v0.16b, v4.16b
+
+	pmull2 v16.1q, v0.2d, v6.2d
+	pmull v4.1q, v0.1d, v6.1d
+	eor v0.16b, v16.16b, v3.16b
+	eor v0.16b, v0.16b, v4.16b
+
+	b.lo .Lcrc32_fold_by_one_done
+	b .Lcrc32_fold_by_one
+
+.Lcrc32_fold_by_one_setup:
+
+	eor v1.16b, v1.16b, v1.16b
+	add x4, x3, #consts_k(3 - 1)
+	add x5, x3, #consts_my_p(0)
+	ld1 {v1.s}[0], [x0]             /* load pcrc */
+	sub x2, x2, #16
+	ld1 {v0.16b}, [x1], #16         /* load 16 bytes of input */
+	ld1 {v6.16b}, [x4]              /* load k3k4 */
+	ld1 {v5.16b}, [x5]              /* load my_p */
+	eor v0.16b, v0.16b, v1.16b
+	cmp x2, #16
+	ext v6.16b, v6.16b, v6.16b, #8  /* swap high and low parts */
+	ext v5.16b, v5.16b, v5.16b, #8  /* swap high and low parts */
+	tbl v0.16b, { v0.16b }, v7.16b  /* byte swap */
+	b.lo .Lcrc32_fold_by_one_done
+
+.Lcrc32_fold_by_one:
+	sub x2, x2, #16
+	ld1 {v2.16b}, [x1], #16         /* load 16 bytes of input */
+	pmull2 v3.1q, v0.2d, v6.2d
+	tbl v2.16b, { v2.16b }, v7.16b  /* byte swap */
+	pmull v1.1q, v0.1d, v6.1d
+	cmp x2, #16
+	eor v0.16b, v3.16b, v2.16b
+	eor v0.16b, v0.16b, v1.16b
+
+	b.hs .Lcrc32_fold_by_one
+
+.Lcrc32_fold_by_one_done:
+
+	cmp x2, #0
+	b.eq .Lcrc32_final_fold
+
+	/* Partial fold. */
+
+	add x4, x7, #.Lcrc32_refl_shuf_shift - .Lcrc32_constants + 32
+	add x5, x7, #.Lcrc32_shuf_shift - .Lcrc32_constants + 16
+	add x6, x7, #.Lcrc32_partial_fold_input_mask - .Lcrc32_constants
+	sub x8, x2, #16
+	sub x4, x4, x2
+	add x5, x5, x2
+	add x6, x6, x2
+	add x8, x1, x8
+
+	/* Load last input and add padding zeros. */
+	ld1 {v4.16b}, [x4]
+	eor x2, x2, x2
+	ld1 {v3.16b}, [x5]
+	ld1 {v2.16b}, [x6]
+	tbl v30.16b, {v0.16b}, v4.16b
+	ld1 {v4.16b}, [x8]
+	tbl v1.16b, {v0.16b}, v3.16b
+	and v2.16b, v2.16b, v4.16b
+
+	pmull2 v0.1q, v30.2d, v6.2d
+	orr v2.16b, v2.16b, v1.16b
+	pmull v1.1q, v30.1d, v6.1d
+	tbl v2.16b, {v2.16b}, v7.16b    /* byte swap */
+	eor v0.16b, v0.16b, v1.16b
+	eor v0.16b, v0.16b, v2.16b
+
+.Lcrc32_final_fold:
+
+	/* Final fold. */
+
+	eor v2.16b, v2.16b, v2.16b      /* zero reg */
+
+	/* reduce 128-bits to 96-bits */
+	add x4, x3, #consts_k(4)
+	ext v3.16b, v6.16b, v6.16b, #8  /* swap high and low parts */
+	eor v6.16b, v6.16b, v6.16b
+	mov v1.16b, v0.16b
+	pmull2 v0.1q, v0.2d, v3.2d
+	ld1 {v6.d}[1], [x4]             /* load k4 */
+	ext v1.16b, v2.16b, v1.16b, #8  /* low to high, low zeroed */
+	eor v3.16b, v0.16b, v1.16b      /* bottom 32-bit are zero */
+
+	/* reduce 96-bits to 64-bits */
+	eor v0.16b, v0.16b, v0.16b
+	eor v1.16b, v1.16b, v1.16b
+	mov v0.s[1], v3.s[1]            /* [00][00][x1][00] */
+	mov v1.s[2], v3.s[3]            /* [00][x3][00][00] */
+	mov v0.s[2], v3.s[2]            /* [00][x2][x1][00] */
+	eor v3.16b, v3.16b, v3.16b
+	pmull2 v1.1q, v1.2d, v6.2d      /* [00][xx][xx][00] */
+	eor v0.16b, v0.16b, v1.16b      /* top and bottom 32-bit are zero */
+
+	/* barrett reduction */
+	mov v3.s[0], v0.s[1]            /* [00][00][00][x1] */
+	pmull2 v0.1q, v0.2d, v5.2d      /* [00][xx][xx][xx] */
+	ext v0.16b, v0.16b, v2.16b, #4  /* [00][00][xx][xx] */
+	pmull v0.1q, v0.1d, v5.1d
+	eor v0.16b, v0.16b, v3.16b
+
+	/* store CRC in input endian */
+	rev32 v0.8b, v0.8b              /* byte swap */
+	st1 {v0.s}[0], [x0]
+
+	ret
+ELF(.size _gcry_crc32_armv8_ce_bulk,.-_gcry_crc32_armv8_ce_bulk;)
+
+/*
+ * u32 _gcry_crc32_armv8_ce_reduction_4 (u32 data, u32 crc,
+ *                                       const struct crc32_consts_s *consts);
+ */
+.align 3
+.globl _gcry_crc32_armv8_ce_reduction_4
+ELF(.type _gcry_crc32_armv8_ce_reduction_4,%function;)
+_gcry_crc32_armv8_ce_reduction_4:
+	/* input:
+	 *    w0: data
+	 *    w1: crc
+	 *    x2: crc32 constants
+	 */
+
+	eor v0.16b, v0.16b, v0.16b
+	add x2, x2, #consts_my_p(0)
+	eor v1.16b, v1.16b, v1.16b
+	ld1 {v5.16b}, [x2]
+
+	mov v0.s[1], w0
+	pmull v0.1q, v0.1d, v5.1d       /* [00][xx][xx][00] */
+	mov v1.s[0], w1
+	pmull2 v0.1q, v0.2d, v5.2d      /* [00][00][xx][xx] */
+	eor v0.16b, v0.16b, v1.16b
+
+	rev32 v0.8b, v0.8b              /* Return in input endian */
+	mov w0, v0.s[0]
+
+	ret
+ELF(.size _gcry_crc32_armv8_ce_reduction_4,.-_gcry_crc32_armv8_ce_reduction_4;)
+
+#endif
diff --git a/cipher/crc-armv8-ce.c b/cipher/crc-armv8-ce.c
new file mode 100644
index 00000000..8dd07cce
--- /dev/null
+++ b/cipher/crc-armv8-ce.c
@@ -0,0 +1,229 @@
+/* crc-armv8-ce.c - ARMv8-CE PMULL accelerated CRC implementation
+ * Copyright (C) 2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ *
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "g10lib.h"
+
+#include "bithelp.h"
+#include "bufhelp.h"
+
+
+#if defined(ENABLE_ARM_CRYPTO_SUPPORT) && defined(__AARCH64EL__) && \
+    defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
+    defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO)
+
+
+#define ALIGNED_16 __attribute__ ((aligned (16)))
+
+
+struct u16_unaligned_s
+{
+  u16 a;
+} __attribute__((packed, aligned (1), may_alias));
+
+struct u32_unaligned_s
+{
+  u32 a;
+} __attribute__((packed, aligned (1), may_alias));
+
+
+/* Constants structure for generic reflected/non-reflected CRC32 PMULL
+ * functions. */
+struct crc32_consts_s
+{
+  /* k: { x^(32*17), x^(32*15), x^(32*5), x^(32*3), x^(32*2), 0 } mod P(x) */
+  u64 k[6];
+  /* my_p: { floor(x^64 / P(x)), P(x) } */
+  u64 my_p[2];
+};
+
+/* PMULL constants for CRC32 and CRC32RFC1510. */
+static const struct crc32_consts_s crc32_consts ALIGNED_16 =
+{
+  { /* k[6] = reverse_33bits( x^(32*y) mod P(x) ) */
+    U64_C(0x154442bd4), U64_C(0x1c6e41596), /* y = { 17, 15 } */
+    U64_C(0x1751997d0), U64_C(0x0ccaa009e), /* y = { 5, 3 } */
+    U64_C(0x163cd6124), 0                   /* y = 2 */
+  },
+  { /* my_p[2] = reverse_33bits ( { floor(x^64 / P(x)), P(x) } ) */
+    U64_C(0x1f7011641), U64_C(0x1db710641)
+  }
+};
+
+/* PMULL constants for CRC24RFC2440 (polynomial multiplied by x⁸). */
+static const struct crc32_consts_s crc24rfc2440_consts ALIGNED_16 =
+{
+  { /* k[6] = x^(32*y) mod P(x) << 32 */
+    U64_C(0x08289a00) << 32, U64_C(0x74b44a00) << 32, /* y = { 17, 15 } */
+    U64_C(0xc4b14d00) << 32, U64_C(0xfd7e0c00) << 32, /* y = { 5, 3 } */
+    U64_C(0xd9fe8c00) << 32, 0                        /* y = 2 */
+  },
+  { /* my_p[2] = { floor(x^64 / P(x)), P(x) } */
+    U64_C(0x1f845fe24), U64_C(0x1864cfb00)
+  }
+};
+
+
+u32 _gcry_crc32r_armv8_ce_reduction_4 (u32 data, u32 crc,
+                                       const struct crc32_consts_s *consts);
+void _gcry_crc32r_armv8_ce_bulk (u32 *pcrc, const byte *inbuf, size_t inlen,
+                                 const struct crc32_consts_s *consts);
+
+u32 _gcry_crc32_armv8_ce_reduction_4 (u32 data, u32 crc,
+                                      const struct crc32_consts_s *consts);
+void _gcry_crc32_armv8_ce_bulk (u32 *pcrc, const byte *inbuf, size_t inlen,
+                                const struct crc32_consts_s *consts);
+
+
+static inline void
+crc32r_less_than_16 (u32 *pcrc, const byte *inbuf, size_t inlen,
+                     const struct crc32_consts_s *consts)
+{
+  u32 crc = *pcrc;
+  u32 data;
+
+  while (inlen >= 4)
+    {
+      data = ((const struct u32_unaligned_s *)inbuf)->a;
+      data ^= crc;
+
+      inlen -= 4;
+      inbuf += 4;
+
+      crc = _gcry_crc32r_armv8_ce_reduction_4 (data, 0, consts);
+    }
+
+  switch (inlen)
+    {
+    case 0:
+      break;
+    case 1:
+      data = inbuf[0];
+      data ^= crc;
+      data <<= 24;
+      crc >>= 8;
+      crc = _gcry_crc32r_armv8_ce_reduction_4 (data, crc, consts);
+      break;
+    case 2:
+      data = ((const struct u16_unaligned_s *)inbuf)->a;
+      data ^= crc;
+      data <<= 16;
+      crc >>= 16;
+      crc = _gcry_crc32r_armv8_ce_reduction_4 (data, crc, consts);
+      break;
+    case 3:
+      data = ((const struct u16_unaligned_s *)inbuf)->a;
+      data |= inbuf[2] << 16;
+      data ^= crc;
+      data <<= 8;
+      crc >>= 24;
+      crc = _gcry_crc32r_armv8_ce_reduction_4 (data, crc, consts);
+      break;
+    }
+
+  *pcrc = crc;
+}
+
+static inline void
+crc32_less_than_16 (u32 *pcrc, const byte *inbuf, size_t inlen,
+                    const struct crc32_consts_s *consts)
+{
+  u32 crc = *pcrc;
+  u32 data;
+
+  while (inlen >= 4)
+    {
+      data = ((const struct u32_unaligned_s *)inbuf)->a;
+      data ^= crc;
+      data = _gcry_bswap32(data);
+
+      inlen -= 4;
+      inbuf += 4;
+
+      crc = _gcry_crc32_armv8_ce_reduction_4 (data, 0, consts);
+    }
+
+  switch (inlen)
+    {
+    case 0:
+      break;
+    case 1:
+      data = inbuf[0];
+      data ^= crc;
+      data = data & 0xffU;
+      crc = _gcry_bswap32(crc >> 8);
+      crc = _gcry_crc32_armv8_ce_reduction_4 (data, crc, consts);
+      break;
+    case 2:
+      data = ((const struct u16_unaligned_s *)inbuf)->a;
+      data ^= crc;
+      data = _gcry_bswap32(data << 16);
+      crc = _gcry_bswap32(crc >> 16);
+      crc = _gcry_crc32_armv8_ce_reduction_4 (data, crc, consts);
+      break;
+    case 3:
+      data = ((const struct u16_unaligned_s *)inbuf)->a;
+      data |= inbuf[2] << 16;
+      data ^= crc;
+      data = _gcry_bswap32(data << 8);
+      crc = crc & 0xff000000U;
+      crc = _gcry_crc32_armv8_ce_reduction_4 (data, crc, consts);
+      break;
+    }
+
+  *pcrc = crc;
+}
+
+void
+_gcry_crc32_armv8_ce_pmull (u32 *pcrc, const byte *inbuf, size_t inlen)
+{
+  const struct crc32_consts_s *consts = &crc32_consts;
+
+  if (!inlen)
+    return;
+
+  if (inlen >= 16)
+    _gcry_crc32r_armv8_ce_bulk (pcrc, inbuf, inlen, consts);
+  else
+    crc32r_less_than_16 (pcrc, inbuf, inlen, consts);
+}
+
+void
+_gcry_crc24rfc2440_armv8_ce_pmull (u32 *pcrc, const byte *inbuf, size_t inlen)
+{
+  const struct crc32_consts_s *consts = &crc24rfc2440_consts;
+
+  if (!inlen)
+    return;
+
+  /* Note: *pcrc in input endian. */
+
+  if (inlen >= 16)
+    _gcry_crc32_armv8_ce_bulk (pcrc, inbuf, inlen, consts);
+  else
+    crc32_less_than_16 (pcrc, inbuf, inlen, consts);
+}
+
+#endif /* ENABLE_ARM_CRYPTO_SUPPORT */
diff --git a/cipher/crc.c b/cipher/crc.c
index 4457ff62..2abbab28 100644
--- a/cipher/crc.c
+++ b/cipher/crc.c
@@ -42,6 +42,15 @@
 # endif
 #endif /* USE_INTEL_PCLMUL */
 
+/* USE_ARM_PMULL indicates whether to compile CRC with ARMv8 PMULL code. */
+#undef USE_ARM_PMULL
+#if defined(ENABLE_ARM_CRYPTO_SUPPORT)
+# if defined(__AARCH64EL__) && \
+     defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
+     defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO)
+#  define USE_ARM_PMULL 1
+# endif
+#endif /* USE_ARM_PMULL */
 
 typedef struct
 {
@@ -49,6 +58,9 @@ typedef struct
 #ifdef USE_INTEL_PCLMUL
   unsigned int use_pclmul:1;           /* Intel PCLMUL shall be used.  */
 #endif
+#ifdef USE_ARM_PMULL
+  unsigned int use_pmull:1;            /* ARMv8 PMULL shall be used. */
+#endif
   byte buf[4];
 }
 CRC_CONTEXT;
@@ -61,6 +73,13 @@ void _gcry_crc24rfc2440_intel_pclmul (u32 *pcrc, const byte *inbuf,
                                       size_t inlen);
 #endif
 
+#ifdef USE_ARM_PMULL
+/*-- crc-armv8-ce.c --*/
+void _gcry_crc32_armv8_ce_pmull (u32 *pcrc, const byte *inbuf, size_t inlen);
+void _gcry_crc24rfc2440_armv8_ce_pmull (u32 *pcrc, const byte *inbuf,
+                                        size_t inlen);
+#endif
+
 
 /*
  * Code generated by universal_crc by Danjel McGougan
@@ -361,13 +380,17 @@ static void
 crc32_init (void *context, unsigned int flags)
 {
   CRC_CONTEXT *ctx = (CRC_CONTEXT *) context;
-#ifdef USE_INTEL_PCLMUL
   u32 hwf = _gcry_get_hw_features ();
 
+#ifdef USE_INTEL_PCLMUL
   ctx->use_pclmul = (hwf & HWF_INTEL_SSE4_1) && (hwf & HWF_INTEL_PCLMUL);
 #endif
+#ifdef USE_ARM_PMULL
+  ctx->use_pmull = (hwf & HWF_ARM_NEON) && (hwf & HWF_ARM_PMULL);
+#endif
 
   (void)flags;
+  (void)hwf;
 
   ctx->CRC = 0 ^ 0xffffffffL;
 }
@@ -386,6 +409,13 @@ crc32_write (void *context, const void *inbuf_arg, size_t inlen)
       return;
     }
 #endif
+#ifdef USE_ARM_PMULL
+  if (ctx->use_pmull)
+    {
+      _gcry_crc32_armv8_ce_pmull(&ctx->CRC, inbuf, inlen);
+      return;
+    }
+#endif
 
   if (!inbuf || !inlen)
     return;
@@ -439,13 +469,17 @@ static void
 crc32rfc1510_init (void *context, unsigned int flags)
 {
   CRC_CONTEXT *ctx = (CRC_CONTEXT *) context;
-#ifdef USE_INTEL_PCLMUL
   u32 hwf = _gcry_get_hw_features ();
 
+#ifdef USE_INTEL_PCLMUL
   ctx->use_pclmul = (hwf & HWF_INTEL_SSE4_1) && (hwf & HWF_INTEL_PCLMUL);
 #endif
+#ifdef USE_ARM_PMULL
+  ctx->use_pmull = (hwf & HWF_ARM_NEON) && (hwf & HWF_ARM_PMULL);
+#endif
 
   (void)flags;
+  (void)hwf;
 
   ctx->CRC = 0;
 }
@@ -769,12 +803,16 @@ static void
 crc24rfc2440_init (void *context, unsigned int flags)
 {
   CRC_CONTEXT *ctx = (CRC_CONTEXT *) context;
-#ifdef USE_INTEL_PCLMUL
   u32 hwf = _gcry_get_hw_features ();
 
+#ifdef USE_INTEL_PCLMUL
   ctx->use_pclmul = (hwf & HWF_INTEL_SSE4_1) && (hwf & HWF_INTEL_PCLMUL);
 #endif
+#ifdef USE_ARM_PMULL
+  ctx->use_pmull = (hwf & HWF_ARM_NEON) && (hwf & HWF_ARM_PMULL);
+#endif
+
+  (void)hwf;
   (void)flags;
 
   ctx->CRC = crc24_init();
@@ -794,6 +832,13 @@ crc24rfc2440_write (void *context, const void *inbuf_arg, size_t inlen)
       return;
     }
 #endif
+#ifdef USE_ARM_PMULL
+  if (ctx->use_pmull)
+    {
+      _gcry_crc24rfc2440_armv8_ce_pmull(&ctx->CRC, inbuf, inlen);
+      return;
+    }
+#endif
 
   if (!inbuf || !inlen)
     return;
diff --git a/configure.ac b/configure.ac
index 1aafc320..aa23a501 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2409,6 +2409,11 @@ if test "$found" = "1" ; then
          # Build with the assembly implementation
          GCRYPT_DIGESTS="$GCRYPT_DIGESTS crc-intel-pclmul.lo"
       ;;
+      aarch64-*-*)
+         # Build with the assembly implementation
+         GCRYPT_CIPHERS="$GCRYPT_CIPHERS crc-armv8-ce.lo"
+         GCRYPT_CIPHERS="$GCRYPT_CIPHERS crc-armv8-aarch64-ce.lo"
+      ;;
    esac
 fi
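
[For readers unfamiliar with CRC folding: each 16-byte fold step multiplies
the two 64-bit halves of the running state by precomputed constants (powers
of x modulo the CRC polynomial) using carry-less multiplication, then XORs in
the next block of input. The following ACLE-intrinsics sketch of a single
fold-by-one step is illustrative only — the helper name fold16 and its
calling convention are invented for this note, and it omits the byte
swapping, partial-block handling, and final Barrett reduction done above:

    /* Build with e.g.: gcc -O2 -march=armv8-a+crypto -c fold.c */
    #include <arm_neon.h>

    static uint8x16_t
    fold16 (uint8x16_t state, uint8x16_t new_data, poly64x2_t k)
    {
      /* Carry-less multiply the low and high 64-bit halves of the state
       * by the precomputed fold constants...  */
      poly64x2_t s = vreinterpretq_p64_u8 (state);
      uint8x16_t lo = vreinterpretq_u8_p128 (
          vmull_p64 (vgetq_lane_p64 (s, 0), vgetq_lane_p64 (k, 0)));
      uint8x16_t hi = vreinterpretq_u8_p128 (vmull_high_p64 (s, k));

      /* ...then XOR both 128-bit products with the next 16 bytes of
       * input; this is what each pmull/pmull2/eor triplet above does.  */
      return veorq_u8 (veorq_u8 (lo, hi), new_data);
    }

This corresponds to the body of the .Lcrc32r_fold_by_one loop; the
fold-by-four variant keeps four such states in flight to hide the latency of
the pmull instructions, which is where most of the ~8x speedup comes from.]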