summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNiels Möller <nisse@lysator.liu.se>2022-02-21 17:39:31 +0100
committerNiels Möller <nisse@lysator.liu.se>2022-02-21 18:35:13 +0100
commitb16455558886d47c9bdcd818b44b0843639640a2 (patch)
tree623660acc5e4ccc536fcc2c9cd747c6215948e76
parent6b80b8894049731f78163ca38fc6625230d054d0 (diff)
downloadnettle-b16455558886d47c9bdcd818b44b0843639640a2.tar.gz
arm64: Update pclmul ghash to new organization.
-rw-r--r--ChangeLog6
-rw-r--r--arm64/crypto/ghash-set-key.asm (renamed from arm64/crypto/gcm-hash.asm)185
-rw-r--r--arm64/crypto/ghash-update.asm180
3 files changed, 199 insertions, 172 deletions
diff --git a/ChangeLog b/ChangeLog
index 4d4921f8..4d6efb22 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2022-02-21 Niels Möller <nisse@lysator.liu.se>
+
+ * arm64/crypto/gcm-hash.asm: Deleted, split into two new files...
+ * arm64/crypto/ghash-set-key.asm: New file.
+ * arm64/crypto/ghash-update.asm: New file.
+
2022-02-19 Niels Möller <nisse@lysator.liu.se>
* fat-x86_64.c (fat_init): Update fat init for new _ghash_set_key
diff --git a/arm64/crypto/gcm-hash.asm b/arm64/crypto/ghash-set-key.asm
index 3e4c98d8..7e09bb43 100644
--- a/arm64/crypto/gcm-hash.asm
+++ b/arm64/crypto/ghash-set-key.asm
@@ -1,4 +1,4 @@
-C arm64/crypto/gcm-hash.asm
+C arm64/crypto/ghash-set-key.asm
ifelse(`
Copyright (C) 2020 Niels Möller and Mamone Tarsha
@@ -30,14 +30,11 @@ ifelse(`
not, see http://www.gnu.org/licenses/.
')
-.file "gcm-hash.asm"
+.file "ghash-set-key.asm"
.arch armv8-a+crypto
.text
-C gcm_set_key() assigns H value in the middle element of the table
-define(`H_Idx', `128')
-
C common SIMD register usage:
define(`POLY', `v6')
C temporary register that assist the reduction procedure
@@ -75,7 +72,7 @@ define(`REDUCTION', m4_assert_numargs(1)`
eor $1.16b,F.16b,R.16b
')
- C void gcm_init_key (union gcm_block *table)
+ C void _ghash_set_key (struct gcm_key *ctx, const union nettle_block16 *key)
C This function populates the gcm table as the following layout
C *******************************************************************************
@@ -92,8 +89,9 @@ C | H4M = (H3 div x⁶⁴)||((H4 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)
C | H4L = (H3 mod x⁶⁴)||(((H4 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H4 div x⁶⁴) |
C *******************************************************************************
-C gcm_init_key register usage:
-define(`TABLE', `x0')
+C Register usage:
+define(`CTX', `x0')
+define(`KEY', `x1')
define(`EMSB', `v0')
define(`B', `v1')
@@ -122,14 +120,13 @@ define(`PMUL_PARAM', m4_assert_numargs(3)`
ext $2.16b,$2.16b,$2.16b,#8
')
-PROLOGUE(_nettle_gcm_init_key)
- add x1,TABLE,#16*H_Idx
- ld1 {H.2d},[x1]
+PROLOGUE(_nettle_ghash_set_key)
+ ld1 {H.2d},[KEY]
C we treat data as big-endian doublewords for processing. Since there is no
C endianness-neutral MSB-first load operation we need to restore our desired
C byte order on little-endian systems. The same holds true for DATA below
- C but not our own internal precalculated TABLE (see below).
+ C but not our own internal precalculated CTX (see below).
IF_LE(`
rev64 H.16b,H.16b
')
@@ -162,9 +159,9 @@ IF_LE(`
PMUL_PARAM(H2,H2M,H2L)
C we store to the table as doubleword-vectors in current memory endianness
- C because it's our own strictly internal data structure and what gcm_hash
+ C because it's our own strictly internal data structure and what ghash_update
C can most naturally use
- st1 {H1M.2d,H1L.2d,H2M.2d,H2L.2d},[TABLE],#64
+ st1 {H1M.2d,H1L.2d,H2M.2d,H2L.2d},[CTX],#64
C --- calculate H^3 = H^1 × H^2 ---
@@ -182,163 +179,7 @@ IF_LE(`
PMUL_PARAM(H4,H4M,H4L)
- st1 {H3M.2d,H3L.2d,H4M.2d,H4L.2d},[TABLE]
-
- ret
-EPILOGUE(_nettle_gcm_init_key)
-
-C gcm_hash register usage:
-define(`TABLE', `x0')
-define(`X', `x1')
-define(`LENGTH', `x2')
-define(`DATA', `x3')
-
-define(`D', `v0')
-define(`C0', `v1')
-define(`C1', `v2')
-define(`C2', `v3')
-define(`C3', `v4')
-define(`R2', `v20')
-define(`F2', `v21')
-define(`R3', `v22')
-define(`F3', `v23')
-define(`H1M', `v24')
-define(`H1L', `v25')
-define(`H2M', `v26')
-define(`H2L', `v27')
-define(`H3M', `v28')
-define(`H3L', `v29')
-define(`H4M', `v30')
-define(`H4L', `v31')
-
-C PMUL_SUM(in, param1, param2)
-define(`PMUL_SUM', m4_assert_numargs(3)`
- pmull F2.1q,$3.1d,$1.1d
- pmull2 F3.1q,$3.2d,$1.2d
- pmull R2.1q,$2.1d,$1.1d
- pmull2 R3.1q,$2.2d,$1.2d
- eor F2.16b,F2.16b,F3.16b
- eor R2.16b,R2.16b,R3.16b
- eor F.16b,F.16b,F2.16b
- eor R.16b,R.16b,R2.16b
-')
-
-C Load the final partial block into SIMD register,
-C stored in little-endian order for each 64-bit part
-C LOAD_REV_PARTIAL_BLOCK(out)
-define(`LOAD_REV_PARTIAL_BLOCK', m4_assert_numargs(1)`
- tbz LENGTH,3,Lless_8_bytes
- ldr `d'substr($1,1,len($1)),[DATA],#8
-IF_LE(`
- rev64 $1.16b,$1.16b
-')
- mov x7,#0
- mov $1.d[1],x7
- tst LENGTH,#7
- b.eq Lload_done
-Lless_8_bytes:
- mov x6,#0
- mov x5,#64
- and x4,LENGTH,#7
-Lload_byte_loop:
- mov x7,#0
- ldrb w7,[DATA],#1
- sub x5,x5,#8
- lsl x7,x7,x5
- orr x6,x6,x7
- subs x4,x4,#1
- b.ne Lload_byte_loop
- tbz LENGTH,3,Lstore_hi_dw
- mov $1.d[1],x6
- b Lload_done
-Lstore_hi_dw:
- mov x7,#0
- mov $1.d[0],x6
- mov $1.d[1],x7
-Lload_done:
-')
-
- C void gcm_hash (const struct gcm_key *key, union gcm_block *x,
- C size_t length, const uint8_t *data)
-
-PROLOGUE(_nettle_gcm_hash)
- mov x4,#0xC200000000000000
- mov POLY.d[0],x4
-
- ld1 {D.2d},[X]
-IF_LE(`
- rev64 D.16b,D.16b
-')
-
- ands x4,LENGTH,#-64
- b.eq L1_block
-
- add x5,TABLE,#64
- ld1 {H1M.2d,H1L.2d,H2M.2d,H2L.2d},[TABLE]
- ld1 {H3M.2d,H3L.2d,H4M.2d,H4L.2d},[x5]
-
-L4_blocks_loop:
- ld1 {C0.2d,C1.2d,C2.2d,C3.2d},[DATA],#64
-IF_LE(`
- rev64 C0.16b,C0.16b
- rev64 C1.16b,C1.16b
- rev64 C2.16b,C2.16b
- rev64 C3.16b,C3.16b
-')
-
- eor C0.16b,C0.16b,D.16b
-
- PMUL(C1,H3M,H3L)
- PMUL_SUM(C2,H2M,H2L)
- PMUL_SUM(C3,H1M,H1L)
- PMUL_SUM(C0,H4M,H4L)
+ st1 {H3M.2d,H3L.2d,H4M.2d,H4L.2d},[CTX]
- REDUCTION(D)
-
- subs x4,x4,#64
- b.ne L4_blocks_loop
-
- and LENGTH,LENGTH,#63
-
-L1_block:
- ands x4,LENGTH,#-16
- b.eq Lpartial
-
- ld1 {H1M.2d,H1L.2d},[TABLE]
-
-L1_block_loop:
- ld1 {C0.2d},[DATA],#16
-IF_LE(`
- rev64 C0.16b,C0.16b
-')
-
- eor C0.16b,C0.16b,D.16b
-
- PMUL(C0,H1M,H1L)
-
- REDUCTION(D)
-
- subs x4,x4,#16
- b.ne L1_block_loop
-
-Lpartial:
- tst LENGTH,#15
- b.eq Lghash_done
-
- ld1 {H1M.2d,H1L.2d},[TABLE]
-
- LOAD_REV_PARTIAL_BLOCK(C0)
-
- eor C0.16b,C0.16b,D.16b
-
- PMUL(C0,H1M,H1L)
-
- REDUCTION(D)
-
-Lghash_done:
-IF_LE(`
- rev64 D.16b,D.16b
-')
- st1 {D.2d},[X]
ret
-EPILOGUE(_nettle_gcm_hash)
+EPILOGUE(_nettle_ghash_set_key)
diff --git a/arm64/crypto/ghash-update.asm b/arm64/crypto/ghash-update.asm
new file mode 100644
index 00000000..b0e7ea99
--- /dev/null
+++ b/arm64/crypto/ghash-update.asm
@@ -0,0 +1,180 @@
+C arm64/crypto/ghash-update.asm
+
+ifelse(`
+ Copyright (C) 2020 Niels Möller and Mamone Tarsha
+ Copyright (C) 2021 Michael Weiser
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+')
+
+.file "ghash-update.asm"
+.arch armv8-a+crypto
+
+.text
+
+C common SIMD register usage:
+define(`POLY', `v6')
+C temporary register that assists the reduction procedure
+define(`T', `v7')
+C permanent register that holds the 16-byte result of pmull
+define(`F', `v16')
+C permanent register that holds the 16-byte result of pmull2,
+C its value is accumulated on 'F' register immediately
+define(`F1', `v17')
+C permanent register that holds the 16-byte result of pmull
+define(`R', `v18')
+C permanent register that holds the 16-byte result of pmull2,
+C its value is accumulated on 'R' register immediately
+define(`R1', `v19')
+
+C common macros:
+C long multiply of six 64-bit polynomials and sum
+C R = (in.l × param2.l) + (in.h × param2.h)
+C F = (in.l × param3.l) + (in.h × param3.h)
+C PMUL(in, param2, param3)
+define(`PMUL', m4_assert_numargs(3)`
+ pmull F.1q,$3.1d,$1.1d
+ pmull2 F1.1q,$3.2d,$1.2d
+ pmull R.1q,$2.1d,$1.1d
+ pmull2 R1.1q,$2.2d,$1.2d
+ eor F.16b,F.16b,F1.16b
+ eor R.16b,R.16b,R1.16b
+')
+C Reduce 'R' and 'F' values to 128-bit output
+C REDUCTION(out)
+define(`REDUCTION', m4_assert_numargs(1)`
+ pmull T.1q,F.1d,POLY.1d
+ eor R.16b,R.16b,T.16b
+ ext R.16b,R.16b,R.16b,#8
+ eor $1.16b,F.16b,R.16b
+')
+
+C register usage:
+define(`CTX', `x0')
+define(`X', `x1')
+define(`BLOCKS', `x2')
+define(`DATA', `x3')
+
+define(`D', `v0')
+define(`C0', `v1')
+define(`C1', `v2')
+define(`C2', `v3')
+define(`C3', `v4')
+define(`R2', `v20')
+define(`F2', `v21')
+define(`R3', `v22')
+define(`F3', `v23')
+define(`H1M', `v24')
+define(`H1L', `v25')
+define(`H2M', `v26')
+define(`H2L', `v27')
+define(`H3M', `v28')
+define(`H3L', `v29')
+define(`H4M', `v30')
+define(`H4L', `v31')
+
+C PMUL_SUM(in, param1, param2)
+define(`PMUL_SUM', m4_assert_numargs(3)`
+ pmull F2.1q,$3.1d,$1.1d
+ pmull2 F3.1q,$3.2d,$1.2d
+ pmull R2.1q,$2.1d,$1.1d
+ pmull2 R3.1q,$2.2d,$1.2d
+ eor F2.16b,F2.16b,F3.16b
+ eor R2.16b,R2.16b,R3.16b
+ eor F.16b,F.16b,F2.16b
+ eor R.16b,R.16b,R2.16b
+')
+
+ C const uint8_t *_ghash_update (const struct gcm_key *key,
+ C union nettle_block16 *x,
+ C size_t blocks, const uint8_t *data)
+
+PROLOGUE(_nettle_ghash_update)
+ mov x4,#0xC200000000000000
+ mov POLY.d[0],x4
+
+ ld1 {D.2d},[X]
+IF_LE(`
+ rev64 D.16b,D.16b
+')
+
+ ands x4,BLOCKS,#-4
+ b.eq L1_block
+
+ add x5,CTX,#64
+ ld1 {H1M.2d,H1L.2d,H2M.2d,H2L.2d},[CTX]
+ ld1 {H3M.2d,H3L.2d,H4M.2d,H4L.2d},[x5]
+
+L4_blocks_loop:
+ ld1 {C0.2d,C1.2d,C2.2d,C3.2d},[DATA],#64
+IF_LE(`
+ rev64 C0.16b,C0.16b
+ rev64 C1.16b,C1.16b
+ rev64 C2.16b,C2.16b
+ rev64 C3.16b,C3.16b
+')
+
+ eor C0.16b,C0.16b,D.16b
+
+ PMUL(C1,H3M,H3L)
+ PMUL_SUM(C2,H2M,H2L)
+ PMUL_SUM(C3,H1M,H1L)
+ PMUL_SUM(C0,H4M,H4L)
+
+ REDUCTION(D)
+
+ subs x4,x4,#4
+ b.ne L4_blocks_loop
+
+L1_block:
+ ands BLOCKS,BLOCKS,#3
+ b.eq Lghash_done
+
+ ld1 {H1M.2d,H1L.2d},[CTX]
+
+L1_block_loop:
+ ld1 {C0.2d},[DATA],#16
+IF_LE(`
+ rev64 C0.16b,C0.16b
+')
+
+ eor C0.16b,C0.16b,D.16b
+
+ PMUL(C0,H1M,H1L)
+
+ REDUCTION(D)
+
+ subs BLOCKS, BLOCKS, #1
+ b.ne L1_block_loop
+
+Lghash_done:
+IF_LE(`
+ rev64 D.16b,D.16b
+')
+ st1 {D.2d},[X]
+ mov x0, DATA
+ ret
+EPILOGUE(_nettle_ghash_update)