diff options
author | Niels Möller <nisse@lysator.liu.se> | 2013-04-18 14:07:20 +0200 |
---|---|---|
committer | Niels Möller <nisse@lysator.liu.se> | 2013-04-18 14:17:49 +0200 |
commit | ade7779c98a5426c7d86c8a01bbd7ad65980c9b9 (patch) | |
tree | 0235c694ed12a49037d62e4b05ada53472c804ad /arm/neon/sha3-permute.asm | |
parent | b7c953630bf9a05eca5b744c89eb643049eeb700 (diff) | |
download | nettle-ade7779c98a5426c7d86c8a01bbd7ad65980c9b9.tar.gz |
Reorganization of ARM assembly.
Renamed directory armv7 to arm. New subdirectory arm/neon, for files
using neon instructions. configure.ac hacked to make use of neon
configurable.
Diffstat (limited to 'arm/neon/sha3-permute.asm')
-rw-r--r-- | arm/neon/sha3-permute.asm | 266 |
1 files changed, 266 insertions, 0 deletions
diff --git a/arm/neon/sha3-permute.asm b/arm/neon/sha3-permute.asm new file mode 100644 index 00000000..beee09f7 --- /dev/null +++ b/arm/neon/sha3-permute.asm @@ -0,0 +1,266 @@ +C nettle, low-level cryptographics library +C +C Copyright (C) 2013 Niels Möller +C +C The nettle library is free software; you can redistribute it and/or modify +C it under the terms of the GNU Lesser General Public License as published by +C the Free Software Foundation; either version 2.1 of the License, or (at your +C option) any later version. +C +C The nettle library is distributed in the hope that it will be useful, but +C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +C License for more details. +C +C You should have received a copy of the GNU Lesser General Public License +C along with the nettle library; see the file COPYING.LIB. If not, write to +C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, +C MA 02111-1301, USA. + + .file "sha3-permute.asm" + .fpu neon + +define(<CTX>, <r0>) +define(<COUNT>, <r1>) +define(<RC>, <r2>) +C First column +define(<A0>, <d0>) +define(<A5>, <d2>) +define(<A10>, <d3>) +define(<A15>, <d4>) +define(<A20>, <d5>) + +define(<A1>, <d6>) +define(<A2>, <d7>) +define(<A3>, <d8>) +define(<A4>, <d9>) + +define(<A6>, <d16>) +define(<A7>, <d17>) +define(<A8>, <d18>) +define(<A9>, <d19>) + +define(<A11>, <d20>) +define(<A12>, <d21>) +define(<A13>, <d22>) +define(<A14>, <d23>) + +define(<A16>, <d24>) +define(<A17>, <d25>) +define(<A18>, <d26>) +define(<A19>, <d27>) + +define(<A21>, <d28>) +define(<A22>, <d29>) +define(<A23>, <d30>) +define(<A24>, <d31>) + +define(<T0>, <d10>) +define(<T1>, <d11>) + +define(<C0>, <d1>) +define(<C1>, <d12>) +define(<C2>, <d13>) +define(<C3>, <d14>) +define(<C4>, <d15>) + + +C ROL(DST, SRC, COUNT) +C Must have SRC != DST +define(<ROL>, < + vshr.u64 $1, $2, #eval(64-$3) + vsli.i64 $1, $2, #$3 + >) +C sha3_permute(struct sha3_ctx *ctx) + + .text + .align 3 +.Lrc: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808A + .quad 0x8000000080008000 + .quad 0x000000000000808B + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008A + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000A + .quad 0x000000008000808B + .quad 0x800000000000008B + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800A + .quad 0x800000008000000A + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +PROLOGUE(nettle_sha3_permute) + vpush {d8-d15} + + vld1.64 {A0}, [CTX]! + vldm CTX!, {A1,A2,A3,A4} + vld1.64 {A5}, [CTX]! + vldm CTX!, {A6,A7,A8,A9} + vld1.64 {A10}, [CTX]! + vldm CTX!, {A11,A12,A13,A14} + vld1.64 {A15}, [CTX]! + vldm CTX!, {A16,A17,A18,A19} + vld1.64 {A20}, [CTX]! + vldm CTX, {A21,A22,A23,A24} + sub CTX, CTX, #168 + + mov COUNT, #24 + adr RC, .Lrc + + .align 3 +.Loop: + veor QREG(T0), QREG(A5), QREG(A15) + veor C0, A0, T0 + veor C0, C0, T1 + veor QREG(C1), QREG(A1), QREG(A6) + veor QREG(C1), QREG(C1), QREG(A11) + veor QREG(C1), QREG(C1), QREG(A16) + veor QREG(C1), QREG(C1), QREG(A21) + + veor QREG(C3), QREG(A3), QREG(A8) + veor QREG(C3), QREG(C3), QREG(A13) + veor QREG(C3), QREG(C3), QREG(A18) + veor QREG(C3), QREG(C3), QREG(A23) + + C D0 = C4 ^ (C1 <<< 1) + C NOTE: Using ROL macro (and vsli) is slightly slower. + vshl.i64 T0, C1, #1 + vshr.u64 T1, C1, #63 + veor T0, T0, C4 + veor T0, T0, T1 + vmov T1, T0 + veor A0, A0, T0 + veor QREG(A5), QREG(A5), QREG(T0) + veor QREG(A15), QREG(A15), QREG(T0) + + C D1 = C0 ^ (C2 <<< 1) + C D2 = C1 ^ (C3 <<< 1) + ROL(T0, C2, 1) + ROL(T1, C3, 1) + veor T0, T0, C0 + veor T1, T1, C1 + veor QREG(A1), QREG(A1), QREG(T0) + veor QREG(A6), QREG(A6), QREG(T0) + veor QREG(A11), QREG(A11), QREG(T0) + veor QREG(A16), QREG(A16), QREG(T0) + veor QREG(A21), QREG(A21), QREG(T0) + + C D3 = C2 ^ (C4 <<< 1) + C D4 = C3 ^ (C0 <<< 1) + ROL(T0, C4, 1) + ROL(T1, C0, 1) + veor T0, T0, C2 + veor T1, T1, C3 + veor QREG(A3), QREG(A3), QREG(T0) + veor QREG(A8), QREG(A8), QREG(T0) + veor QREG(A13), QREG(A13), QREG(T0) + veor QREG(A18), QREG(A18), QREG(T0) + veor QREG(A23), QREG(A23), QREG(T0) + + ROL( T0, A1, 1) + ROL( A1, A6, 44) + ROL( A6, A9, 20) + ROL( A9, A22, 61) + ROL(A22, A14, 39) + ROL(A14, A20, 18) + ROL(A20, A2, 62) + ROL( A2, A12, 43) + ROL(A12, A13, 25) + ROL(A13, A19, 8) + ROL(A19, A23, 56) + ROL(A23, A15, 41) + ROL(A15, A4, 27) + ROL( A4, A24, 14) + ROL(A24, A21, 2) + ROL(A21, A8, 55) + ROL( A8, A16, 45) + ROL(A16, A5, 36) + ROL( A5, A3, 28) + ROL( A3, A18, 21) + ROL(A18, A17, 15) + ROL(A17, A11, 10) + ROL(A11, A7, 6) + ROL( A7, A10, 3) + C New A10 value left in T0 + + vbic C0, A2, A1 + vbic C1, A3, A2 + vbic C2, A4, A3 + vbic C3, A0, A4 + vbic C4, A1, A0 + + veor A0, A0, C0 + vld1.64 {C0}, [RC :64]! + veor QREG(A1), QREG(A1), QREG(C1) + veor QREG(A3), QREG(A3), QREG(C3) + veor A0, A0, C0 + + vbic C0, A7, A6 + vbic C1, A8, A7 + vbic C2, A9, A8 + vbic C3, A5, A9 + vbic C4, A6, A5 + + veor A5, A5, C0 + veor QREG(A6), QREG(A6), QREG(C1) + veor QREG(A8), QREG(A8), QREG(C3) + + vbic C0, A12, A11 + vbic C1, A13, A12 + vbic C2, A14, A13 + vbic C3, T0, A14 + vbic C4, A11, T0 + + veor A10, T0, C0 + veor QREG(A11), QREG(A11), QREG(C1) + veor QREG(A13), QREG(A13), QREG(C3) + + vbic C0, A17, A16 + vbic C1, A18, A17 + vbic C2, A19, A18 + vbic C3, A15, A19 + vbic C4, A16, A15 + + veor A15, A15, C0 + veor QREG(A16), QREG(A16), QREG(C1) + veor QREG(A18), QREG(A18), QREG(C3) + + vbic C0, A22, A21 + vbic C1, A23, A22 + vbic C2, A24, A23 + vbic C3, A20, A24 + vbic C4, A21, A20 + + subs COUNT, COUNT, #1 + veor A20, A20, C0 + veor QREG(A21), QREG(A21), QREG(C1) + veor QREG(A23), QREG(A23), QREG(C3) + + bne .Loop + + vst1.64 {A0}, [CTX]! + vstm CTX!, {A1,A2,A3,A4} + vst1.64 {A5}, [CTX]! + vstm CTX!, {A6,A7,A8,A9} + vst1.64 {A10}, [CTX]! + vstm CTX!, {A11,A12,A13,A14} + vst1.64 {A15}, [CTX]! + vstm CTX!, {A16,A17,A18,A19} + vst1.64 {A20}, [CTX]! + vstm CTX, {A21,A22,A23,A24} + + vpop {d8-d15} + bx lr +EPILOGUE(nettle_sha3_permute) |