diff options
author | Niels Möller <nisse@lysator.liu.se> | 2013-09-22 20:48:57 +0200 |
---|---|---|
committer | Niels Möller <nisse@lysator.liu.se> | 2013-09-22 20:48:57 +0200 |
commit | 4611f79cf2c2d863797de6497e543b7baac8b636 (patch) | |
tree | cc98bec1751b8ccfd7dfbf7ec09dbf96116344a1 /x86_64 | |
parent | a99c33b48df8a48e1c0be5e11301bfdfd51fa110 (diff) | |
download | nettle-4611f79cf2c2d863797de6497e543b7baac8b636.tar.gz |
New x86_64 assembly for gcm hashing.
Diffstat (limited to 'x86_64')
-rw-r--r-- | x86_64/gcm-hash8.asm (renamed from x86_64/gcm-gf-mul-8.asm) | 197 |
1 file changed, 146 insertions, 51 deletions
diff --git a/x86_64/gcm-gf-mul-8.asm b/x86_64/gcm-hash8.asm index 2296ba87..3a3f012e 100644 --- a/x86_64/gcm-gf-mul-8.asm +++ b/x86_64/gcm-hash8.asm @@ -1,17 +1,17 @@ C nettle, low-level cryptographics library -C +C C Copyright (C) 2013, Niels Möller -C +C C The nettle library is free software; you can redistribute it and/or modify C it under the terms of the GNU Lesser General Public License as published by C the Free Software Foundation; either version 2.1 of the License, or (at your C option) any later version. -C +C C The nettle library is distributed in the hope that it will be useful, but C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public C License for more details. -C +C C You should have received a copy of the GNU Lesser General Public License C along with the nettle library; see the file COPYING.LIB. If not, write to C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, @@ -19,47 +19,51 @@ C MA 02111-1301, USA. C Register usage: -define(<XP>, <%rdi>) -define(<TABLE>, <%rsi>) -define(<XW>, <%rax>) -define(<CNT>, <%ecx>) -define(<Z0>, <%rdx>) -define(<Z1>, <%r8>) -define(<T0>, <%r9>) -define(<T1>, <%r10>) -define(<T2>, <%r11>) -define(<SHIFT_TABLE>, <%rbx>) - -C The C code is 12.5 c/byte, slower than sha1 (10.6), while this code runs -C at 10.2, slightly faster. Benchmarked on a low-end AMD E-350. 
- - .file "gcm-gf-mul-8.asm" - - C void _gcm_gf_mul_8(union gcm_block *x, const union gcm_block *table) +define(<KEY>, <%rdi>) +define(<XP>, <%rsi>) +define(<LENGTH>, <%rdx>) +define(<SRC>, <%rcx>) +define(<X0>, <%rax>) +define(<X1>, <%rbx>) +define(<CNT>, <%ebp>) +define(<T0>, <%r8>) +define(<T1>, <%r9>) +define(<T2>, <%r10>) +define(<Z0>, <%r11>) +define(<Z1>, <%r12>) +define(<SHIFT_TABLE>, <%r13>) + + .file "gcm-hash8.asm" + + C void gcm_hash (const struct gcm_key *key, union gcm_block *x, + C size_t length, const uint8_t *data) + .text ALIGN(16) -PROLOGUE(_nettle_gcm_gf_mul_8) - W64_ENTRY(2, 0) +PROLOGUE(_nettle_gcm_hash8) + W64_ENTRY(4, 0) push %rbx - mov 8(XP), XW - rol $8, XW - movzbl LREG(XW), XREG(T0) - shl $4, T0 - mov (TABLE, T0), Z0 - mov 8(TABLE, T0), Z1 + push %rbp + push %r12 + push %r13 + sub $16, LENGTH lea .Lshift_table(%rip), SHIFT_TABLE - movl $7, CNT - call .Lmul_word - mov (XP), XW - movl $8, CNT - call .Lmul_word - mov Z0, (XP) - mov Z1, 8(XP) - W64_EXIT(2, 0) - pop %rbx - ret + mov (XP), X0 + mov 8(XP), X1 + jc .Lfinal +ALIGN(16) +.Lblock_loop: + + xor (SRC), X0 + xor 8(SRC), X1 + +.Lblock_mul: + rol $8, X1 + movzbl LREG(X1), XREG(T1) + shl $4, T1 + mov (KEY, T1), Z0 + mov 8(KEY, T1), Z1 -.Lmul_word: C shift Z1, Z0, transforming C +-----------------------+-----------------------+ C |15 14 13 12 11 10 09 08|07 06 05 04 03 02 01 00| @@ -70,25 +74,118 @@ PROLOGUE(_nettle_gcm_gf_mul_8) C +-----------------------+-----------------+-----+ C xor |T[15]| C +-----+ + + mov $7, CNT + +ALIGN(16) +.Loop_X1: mov Z1, T1 + shr $56, T1 + shl $8, Z1 mov Z0, T0 - shl $8, Z1 C Use shld? 
shl $8, Z0 - shr $56, T1 shr $56, T0 movzwl (SHIFT_TABLE, T1, 2), XREG(T1) - rol $8, XW + xor T1, Z0 + rol $8, X1 + movzbl LREG(X1), XREG(T2) + shl $4, T2 + xor (KEY, T2), Z0 add T0, Z1 + xor 8(KEY, T2), Z1 + decl CNT + jne .Loop_X1 + + mov $7, CNT + +ALIGN(16) +.Loop_X0: + mov Z1, T1 + shr $56, T1 + shl $8, Z1 + mov Z0, T0 + shl $8, Z0 + shr $56, T0 + movzwl (SHIFT_TABLE, T1, 2), XREG(T1) xor T1, Z0 - movzbl LREG(XW), XREG(T2) + rol $8, X0 + movzbl LREG(X0), XREG(T2) shl $4, T2 - xor (TABLE, T2), Z0 - xor 8(TABLE, T2), Z1 + xor (KEY, T2), Z0 + add T0, Z1 + xor 8(KEY, T2), Z1 decl CNT - jne .Lmul_word + jne .Loop_X0 + + mov Z1, T1 + shr $56, T1 + shl $8, Z1 + mov Z0, T0 + shl $8, Z0 + shr $56, T0 + movzwl (SHIFT_TABLE, T1, 2), XREG(T1) + xor T1, Z0 + rol $8, X0 + movzbl LREG(X0), XREG(T2) + shl $4, T2 + mov (KEY, T2), X0 + xor Z0, X0 + add T0, Z1 + mov 8(KEY, T2), X1 + xor Z1, X1 + + add $16, SRC + sub $16, LENGTH + jnc .Lblock_loop + +.Lfinal: + add $16, LENGTH + jnz .Lpartial + + mov X0, (XP) + mov X1, 8(XP) + + pop %r13 + pop %r12 + pop %rbp + pop %rbx + W64_EXIT(2, 0) + ret + +.Lpartial: + C Read and xor partial block, then jump back into the loop + C with LENGTH == 0. 
+ + cmp $8, LENGTH + jc .Llt8 + + C 8 <= LENGTH < 16 + xor (SRC), X0 + add $8, SRC + sub $8, LENGTH + jz .Lblock_mul + call .Lread_bytes + xor T0, X1 + jmp .Lblock_mul + +.Llt8: C 0 < LENGTH < 8 + call .Lread_bytes + xor T0, X0 + jmp .Lblock_mul + +C Read 0 < LENGTH < 8 bytes at SRC, result in T0 +.Lread_bytes: + xor T0, T0 + sub $1, SRC +ALIGN(16) +.Lread_loop: + shl $8, T0 + orb (SRC, LENGTH), LREG(T0) +.Lread_next: + sub $1, LENGTH + jnz .Lread_loop ret - -EPILOGUE(_nettle_gcm_gf_mul_8) +EPILOGUE(_nettle_gcm_hash8) define(<W>, <0x$2$1>) .section .rodata @@ -126,5 +223,3 @@ define(<W>, <0x$2$1>) .hword W(a7,d0),W(a6,12),W(a4,54),W(a5,96),W(a0,d8),W(a1,1a),W(a3,5c),W(a2,9e) .hword W(b5,e0),W(b4,22),W(b6,64),W(b7,a6),W(b2,e8),W(b3,2a),W(b1,6c),W(b0,ae) .hword W(bb,f0),W(ba,32),W(b8,74),W(b9,b6),W(bc,f8),W(bd,3a),W(bf,7c),W(be,be) - - |