diff options
author | Niels Möller <nisse@lysator.liu.se> | 2022-02-21 17:39:31 +0100 |
---|---|---|
committer | Niels Möller <nisse@lysator.liu.se> | 2022-02-21 18:35:13 +0100 |
commit | b16455558886d47c9bdcd818b44b0843639640a2 (patch) | |
tree | 623660acc5e4ccc536fcc2c9cd747c6215948e76 | |
parent | 6b80b8894049731f78163ca38fc6625230d054d0 (diff) | |
download | nettle-b16455558886d47c9bdcd818b44b0843639640a2.tar.gz |
arm64: Update pclmul ghash to new organization.
-rw-r--r-- | ChangeLog | 6 | ||||
-rw-r--r-- | arm64/crypto/ghash-set-key.asm (renamed from arm64/crypto/gcm-hash.asm) | 185 | ||||
-rw-r--r-- | arm64/crypto/ghash-update.asm | 180 |
3 files changed, 199 insertions(+), 172 deletions(-)
@@ -1,3 +1,9 @@ +2022-02-21 Niels Möller <nisse@lysator.liu.se> + + * arm64/crypto/gcm-hash.asm: Deleted, split into two new files... + * arm64/crypto/ghash-set-key.asm: New file. + * arm64/crypto/ghash-update.asm: New file. + 2022-02-19 Niels Möller <nisse@lysator.liu.se> * fat-x86_64.c (fat_init): Update fat init for new _ghash_set_key diff --git a/arm64/crypto/gcm-hash.asm b/arm64/crypto/ghash-set-key.asm index 3e4c98d8..7e09bb43 100644 --- a/arm64/crypto/gcm-hash.asm +++ b/arm64/crypto/ghash-set-key.asm @@ -1,4 +1,4 @@ -C arm64/crypto/gcm-hash.asm +C arm64/crypto/ghash-set-key.asm ifelse(` Copyright (C) 2020 Niels Möller and Mamone Tarsha @@ -30,14 +30,11 @@ ifelse(` not, see http://www.gnu.org/licenses/. ') -.file "gcm-hash.asm" +.file "ghash-set-key.asm" .arch armv8-a+crypto .text -C gcm_set_key() assigns H value in the middle element of the table -define(`H_Idx', `128') - C common SIMD register usage: define(`POLY', `v6') C temporary register that assist the reduction procedure @@ -75,7 +72,7 @@ define(`REDUCTION', m4_assert_numargs(1)` eor $1.16b,F.16b,R.16b ') - C void gcm_init_key (union gcm_block *table) + C void _ghash_set_key (struct gcm_key *ctx, const union nettle_block16 *key) C This function populates the gcm table as the following layout C ******************************************************************************* @@ -92,8 +89,9 @@ C | H4M = (H3 div x⁶⁴)||((H4 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷) C | H4L = (H3 mod x⁶⁴)||(((H4 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H4 div x⁶⁴) | C ******************************************************************************* -C gcm_init_key register usage: -define(`TABLE', `x0') +C Register usage: +define(`CTX', `x0') +define(`KEY', `x1') define(`EMSB', `v0') define(`B', `v1') @@ -122,14 +120,13 @@ define(`PMUL_PARAM', m4_assert_numargs(3)` ext $2.16b,$2.16b,$2.16b,#8 ') -PROLOGUE(_nettle_gcm_init_key) - add x1,TABLE,#16*H_Idx - ld1 {H.2d},[x1] +PROLOGUE(_nettle_ghash_set_key) + ld1 {H.2d},[KEY] C we treat data as 
big-endian doublewords for processing. Since there is no C endianness-neutral MSB-first load operation we need to restore our desired C byte order on little-endian systems. The same holds true for DATA below - C but not our own internal precalculated TABLE (see below). + C but not our own internal precalculated CTX (see below). IF_LE(` rev64 H.16b,H.16b ') @@ -162,9 +159,9 @@ IF_LE(` PMUL_PARAM(H2,H2M,H2L) C we store to the table as doubleword-vectors in current memory endianness - C because it's our own strictly internal data structure and what gcm_hash + C because it's our own strictly internal data structure and what ghash_update C can most naturally use - st1 {H1M.2d,H1L.2d,H2M.2d,H2L.2d},[TABLE],#64 + st1 {H1M.2d,H1L.2d,H2M.2d,H2L.2d},[CTX],#64 C --- calculate H^3 = H^1 × H^2 --- @@ -182,163 +179,7 @@ IF_LE(` PMUL_PARAM(H4,H4M,H4L) - st1 {H3M.2d,H3L.2d,H4M.2d,H4L.2d},[TABLE] - - ret -EPILOGUE(_nettle_gcm_init_key) - -C gcm_hash register usage: -define(`TABLE', `x0') -define(`X', `x1') -define(`LENGTH', `x2') -define(`DATA', `x3') - -define(`D', `v0') -define(`C0', `v1') -define(`C1', `v2') -define(`C2', `v3') -define(`C3', `v4') -define(`R2', `v20') -define(`F2', `v21') -define(`R3', `v22') -define(`F3', `v23') -define(`H1M', `v24') -define(`H1L', `v25') -define(`H2M', `v26') -define(`H2L', `v27') -define(`H3M', `v28') -define(`H3L', `v29') -define(`H4M', `v30') -define(`H4L', `v31') - -C PMUL_SUM(in, param1, param2) -define(`PMUL_SUM', m4_assert_numargs(3)` - pmull F2.1q,$3.1d,$1.1d - pmull2 F3.1q,$3.2d,$1.2d - pmull R2.1q,$2.1d,$1.1d - pmull2 R3.1q,$2.2d,$1.2d - eor F2.16b,F2.16b,F3.16b - eor R2.16b,R2.16b,R3.16b - eor F.16b,F.16b,F2.16b - eor R.16b,R.16b,R2.16b -') - -C Load the final partial block into SIMD register, -C stored in little-endian order for each 64-bit part -C LOAD_REV_PARTIAL_BLOCK(out) -define(`LOAD_REV_PARTIAL_BLOCK', m4_assert_numargs(1)` - tbz LENGTH,3,Lless_8_bytes - ldr `d'substr($1,1,len($1)),[DATA],#8 -IF_LE(` - rev64 $1.16b,$1.16b 
-') - mov x7,#0 - mov $1.d[1],x7 - tst LENGTH,#7 - b.eq Lload_done -Lless_8_bytes: - mov x6,#0 - mov x5,#64 - and x4,LENGTH,#7 -Lload_byte_loop: - mov x7,#0 - ldrb w7,[DATA],#1 - sub x5,x5,#8 - lsl x7,x7,x5 - orr x6,x6,x7 - subs x4,x4,#1 - b.ne Lload_byte_loop - tbz LENGTH,3,Lstore_hi_dw - mov $1.d[1],x6 - b Lload_done -Lstore_hi_dw: - mov x7,#0 - mov $1.d[0],x6 - mov $1.d[1],x7 -Lload_done: -') - - C void gcm_hash (const struct gcm_key *key, union gcm_block *x, - C size_t length, const uint8_t *data) - -PROLOGUE(_nettle_gcm_hash) - mov x4,#0xC200000000000000 - mov POLY.d[0],x4 - - ld1 {D.2d},[X] -IF_LE(` - rev64 D.16b,D.16b -') - - ands x4,LENGTH,#-64 - b.eq L1_block - - add x5,TABLE,#64 - ld1 {H1M.2d,H1L.2d,H2M.2d,H2L.2d},[TABLE] - ld1 {H3M.2d,H3L.2d,H4M.2d,H4L.2d},[x5] - -L4_blocks_loop: - ld1 {C0.2d,C1.2d,C2.2d,C3.2d},[DATA],#64 -IF_LE(` - rev64 C0.16b,C0.16b - rev64 C1.16b,C1.16b - rev64 C2.16b,C2.16b - rev64 C3.16b,C3.16b -') - - eor C0.16b,C0.16b,D.16b - - PMUL(C1,H3M,H3L) - PMUL_SUM(C2,H2M,H2L) - PMUL_SUM(C3,H1M,H1L) - PMUL_SUM(C0,H4M,H4L) + st1 {H3M.2d,H3L.2d,H4M.2d,H4L.2d},[CTX] - REDUCTION(D) - - subs x4,x4,#64 - b.ne L4_blocks_loop - - and LENGTH,LENGTH,#63 - -L1_block: - ands x4,LENGTH,#-16 - b.eq Lpartial - - ld1 {H1M.2d,H1L.2d},[TABLE] - -L1_block_loop: - ld1 {C0.2d},[DATA],#16 -IF_LE(` - rev64 C0.16b,C0.16b -') - - eor C0.16b,C0.16b,D.16b - - PMUL(C0,H1M,H1L) - - REDUCTION(D) - - subs x4,x4,#16 - b.ne L1_block_loop - -Lpartial: - tst LENGTH,#15 - b.eq Lghash_done - - ld1 {H1M.2d,H1L.2d},[TABLE] - - LOAD_REV_PARTIAL_BLOCK(C0) - - eor C0.16b,C0.16b,D.16b - - PMUL(C0,H1M,H1L) - - REDUCTION(D) - -Lghash_done: -IF_LE(` - rev64 D.16b,D.16b -') - st1 {D.2d},[X] ret -EPILOGUE(_nettle_gcm_hash) +EPILOGUE(_nettle_ghash_set_key) diff --git a/arm64/crypto/ghash-update.asm b/arm64/crypto/ghash-update.asm new file mode 100644 index 00000000..b0e7ea99 --- /dev/null +++ b/arm64/crypto/ghash-update.asm @@ -0,0 +1,180 @@ +C arm64/crypto/ghash-update.asm + +ifelse(` + 
Copyright (C) 2020 Niels Möller and Mamone Tarsha + Copyright (C) 2021 Michael Weiser + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. 
+') + +.file "ghash-update.asm" +.arch armv8-a+crypto + +.text + +C common SIMD register usage: +define(`POLY', `v6') +C temporary register that assist the reduction procedure +define(`T', `v7') +C permenant register that hold the 16-byte result of pmull +define(`F', `v16') +C permenant register that hold the 16-byte result of pmull2, +C its value is accumulated on 'F' register immediately +define(`F1', `v17') +C permenant register that hold the 16-byte result of pmull +define(`R', `v18') +C permenant register that hold the 16-byte result of pmull2, +C its value is accumulated on 'F' register immediately +define(`R1', `v19') + +C common macros: +C long multiply of six 64-bit polynomials and sum +C R = (in.l × param2.l) + (in.h × param2.h) +C F = (in.l × param3.l) + (in.h × param3.h) +C PMUL(in, param1, param2) +define(`PMUL', m4_assert_numargs(3)` + pmull F.1q,$3.1d,$1.1d + pmull2 F1.1q,$3.2d,$1.2d + pmull R.1q,$2.1d,$1.1d + pmull2 R1.1q,$2.2d,$1.2d + eor F.16b,F.16b,F1.16b + eor R.16b,R.16b,R1.16b +') +C Reduce 'R' and 'F' values to 128-bit output +C REDUCTION(out) +define(`REDUCTION', m4_assert_numargs(1)` + pmull T.1q,F.1d,POLY.1d + eor R.16b,R.16b,T.16b + ext R.16b,R.16b,R.16b,#8 + eor $1.16b,F.16b,R.16b +') + +C register usage: +define(`CTX', `x0') +define(`X', `x1') +define(`BLOCKS', `x2') +define(`DATA', `x3') + +define(`D', `v0') +define(`C0', `v1') +define(`C1', `v2') +define(`C2', `v3') +define(`C3', `v4') +define(`R2', `v20') +define(`F2', `v21') +define(`R3', `v22') +define(`F3', `v23') +define(`H1M', `v24') +define(`H1L', `v25') +define(`H2M', `v26') +define(`H2L', `v27') +define(`H3M', `v28') +define(`H3L', `v29') +define(`H4M', `v30') +define(`H4L', `v31') + +C PMUL_SUM(in, param1, param2) +define(`PMUL_SUM', m4_assert_numargs(3)` + pmull F2.1q,$3.1d,$1.1d + pmull2 F3.1q,$3.2d,$1.2d + pmull R2.1q,$2.1d,$1.1d + pmull2 R3.1q,$2.2d,$1.2d + eor F2.16b,F2.16b,F3.16b + eor R2.16b,R2.16b,R3.16b + eor F.16b,F.16b,F2.16b + eor R.16b,R.16b,R2.16b +') + + C 
const uint8_t *_ghash_update (const struct gcm_key *key, + C union nettle_block16 *x, + C size_t blocks, const uint8_t *data) + +PROLOGUE(_nettle_ghash_update) + mov x4,#0xC200000000000000 + mov POLY.d[0],x4 + + ld1 {D.2d},[X] +IF_LE(` + rev64 D.16b,D.16b +') + + ands x4,BLOCKS,#-4 + b.eq L1_block + + add x5,CTX,#64 + ld1 {H1M.2d,H1L.2d,H2M.2d,H2L.2d},[CTX] + ld1 {H3M.2d,H3L.2d,H4M.2d,H4L.2d},[x5] + +L4_blocks_loop: + ld1 {C0.2d,C1.2d,C2.2d,C3.2d},[DATA],#64 +IF_LE(` + rev64 C0.16b,C0.16b + rev64 C1.16b,C1.16b + rev64 C2.16b,C2.16b + rev64 C3.16b,C3.16b +') + + eor C0.16b,C0.16b,D.16b + + PMUL(C1,H3M,H3L) + PMUL_SUM(C2,H2M,H2L) + PMUL_SUM(C3,H1M,H1L) + PMUL_SUM(C0,H4M,H4L) + + REDUCTION(D) + + subs x4,x4,#4 + b.ne L4_blocks_loop + +L1_block: + ands BLOCKS,BLOCKS,#3 + b.eq Lghash_done + + ld1 {H1M.2d,H1L.2d},[CTX] + +L1_block_loop: + ld1 {C0.2d},[DATA],#16 +IF_LE(` + rev64 C0.16b,C0.16b +') + + eor C0.16b,C0.16b,D.16b + + PMUL(C0,H1M,H1L) + + REDUCTION(D) + + subs BLOCKS, BLOCKS, #1 + b.ne L1_block_loop + +Lghash_done: +IF_LE(` + rev64 D.16b,D.16b +') + st1 {D.2d},[X] + mov x0, DATA + ret +EPILOGUE(_nettle_ghash_update) |