C -*- mode: asm; asm-comment-char: ?C; -*-
C nettle, low-level cryptographic library
C
C Copyright (C) 2010, Niels Möller
C
C The nettle library is free software; you can redistribute it and/or modify
C it under the terms of the GNU Lesser General Public License as published by
C the Free Software Foundation; either version 2.1 of the License, or (at your
C option) any later version.
C
C The nettle library is distributed in the hope that it will be useful, but
C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
C or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
C License for more details.
C
C You should have received a copy of the GNU Lesser General Public License
C along with the nettle library; see the file COPYING.LIB.  If not, write to
C the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
C MA 02111-1307, USA.

C Register usage:
define(<DST>, <%rax>) C Originally in %rdi
define(<AP>, <%rsi>)
define(<BP>, <%rdx>)
define(<N>, <%r10>)
define(<TMP>, <%r8>)
define(<TMP2>, <%r9>)
define(<CNT>, <%rdi>)
define(<S0>, <%r11>)
define(<S1>, <%rdi>) C Overlaps with CNT

define(<USE_SSE2>, <no>) C Set to <yes> to enable the SSE2 loop below

	.file "memxor.asm"

	.text

	C memxor(uint8_t *dst, const uint8_t *src, size_t n)
	C                 %rdi               %rsi      %rdx
	ALIGN(4)

PROLOGUE(memxor)
	W64_ENTRY(3, 0)
	C memxor(dst, src, n) is equivalent to memxor3(dst, src, dst, n)
	mov	%rdx, %r10
	mov	%rdi, %rdx
	jmp	.Lmemxor3_entry
EPILOGUE(memxor)

	C memxor3(uint8_t *dst, const uint8_t *a, const uint8_t *b, size_t n)
	C                  %rdi              %rsi              %rdx      %rcx
	ALIGN(4)

PROLOGUE(memxor3)
	W64_ENTRY(4, 0)
	C %cl is needed for the shift count, so move N out of the way
	mov	%rcx, N
.Lmemxor3_entry:
	test	N, N
	C Get number of unaligned bytes at the end
	C %rdi is used as CNT, %rax as DST and as the return value
	mov	%rdi, %rax
	jz	.Ldone
	add	N, CNT
	and	$7, CNT
	jz	.Laligned

	cmp	$8, N
	jc	.Lfinal_next

	C FIXME: Instead of this loop, could try cmov with memory
	C destination, as a sequence of one 8-bit, one 16-bit and one
	C 32-bit operation. (Except that cmov can't do 8-bit ops, so
	C that step has to use a conditional.)
.Lalign_loop:
	sub	$1, N
	movb	(AP, N), LREG(TMP)
	xorb	(BP, N), LREG(TMP)
	movb	LREG(TMP), (DST, N)
	sub	$1, CNT
	jnz	.Lalign_loop

.Laligned:
ifelse(USE_SSE2, yes, <
	cmp	$16, N
	jnc	.Lsse2_case
>)
	C Check for the case that AP and BP have the same alignment,
	C but different from DST.
	mov	AP, TMP
	sub	BP, TMP
	test	$7, TMP
	jnz	.Lno_shift_case
	mov	AP, %rcx
	sub	DST, %rcx
	and	$7, %rcx
	jz	.Lno_shift_case
	C Align AP and BP down to word boundaries; %cl holds the
	C misalignment as a bit count (byte offset * 8).
	sub	%rcx, AP
	sub	%rcx, BP
	shl	$3, %rcx

	C Unrolling, with aligned values alternating in S0 and S1
	test	$8, N
	jnz	.Lshift_odd
	mov	(AP, N), S1
	xor	(BP, N), S1
	jmp	.Lshift_next

.Lshift_odd:
	mov	-8(AP, N), S1
	mov	(AP, N), S0
	xor	-8(BP, N), S1
	xor	(BP, N), S0
	C Combine the two aligned words: TMP = (S1 >> cnt) | (S0 << (64 - cnt)).
	C x86 shift counts are taken mod 64, so negating %cl gives the
	C complementary count.
	mov	S1, TMP
	shr	%cl, TMP
	neg	%cl
	shl	%cl, S0
	neg	%cl

	or	S0, TMP
	mov	TMP, -8(DST, N)
	sub	$8, N

	jz	.Ldone
	jmp	.Lshift_next

	ALIGN(4)

.Lshift_loop:
	C Same combine as above, with the roles of S0 and S1 alternating
	mov	8(AP, N), S0
	xor	8(BP, N), S0
	mov	S0, TMP
	shr	%cl, TMP
	neg	%cl
	shl	%cl, S1
	neg	%cl
	or	S1, TMP
	mov	TMP, 8(DST, N)

	mov	(AP, N), S1
	xor	(BP, N), S1
	mov	S1, TMP
	shr	%cl, TMP
	neg	%cl
	shl	%cl, S0
	neg	%cl
	or	S0, TMP
	mov	TMP, (DST, N)
.Lshift_next:
	sub	$16, N
	C FIXME: Handle the case N == 16 specially,
	C like in the non-shifted case?
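	C After the subtraction, a clear carry flag means at least 16
	C more bytes remain, so a single jnc covers both cases of the
	C ja/jz pair kept below for reference.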
	C ja	.Lshift_loop
	C jz	.Ldone
	jnc	.Lshift_loop

	add	$15, N
	jnc	.Ldone

	C Undo the bit-count scaling and the word alignment of AP and BP,
	C then handle the remaining 1-15 bytes one at a time.
	shr	$3, %rcx
	add	%rcx, AP
	add	%rcx, BP
	jmp	.Lfinal_loop

.Lno_shift_case:
	C Next destination word is -8(DST, N)
	C Setup for unrolling
	test	$8, N
	jz	.Lword_next

	sub	$8, N
	jz	.Lone_word

	mov	(AP, N), TMP
	xor	(BP, N), TMP
	mov	TMP, (DST, N)

	jmp	.Lword_next

	ALIGN(4)

.Lword_loop:
	mov	8(AP, N), TMP
	mov	(AP, N), TMP2
	xor	8(BP, N), TMP
	xor	(BP, N), TMP2
	mov	TMP, 8(DST, N)
	mov	TMP2, (DST, N)

.Lword_next:
	sub	$16, N
	ja	.Lword_loop	C Not zero and no carry
	jnz	.Lfinal

	C Final operation is word aligned
	mov	8(AP, N), TMP
	xor	8(BP, N), TMP
	mov	TMP, 8(DST, N)

.Lone_word:
	mov	(AP, N), TMP
	xor	(BP, N), TMP
	mov	TMP, (DST, N)

	C ENTRY might have been 3 args, too, but it doesn't matter for the exit
	W64_EXIT(4, 0)
	ret

.Lfinal:
	add	$15, N

.Lfinal_loop:
	movb	(AP, N), LREG(TMP)
	xorb	(BP, N), LREG(TMP)
	movb	LREG(TMP), (DST, N)
.Lfinal_next:
	sub	$1, N
	jnc	.Lfinal_loop

.Ldone:
	C ENTRY might have been 3 args, too, but it doesn't matter for the exit
	W64_EXIT(4, 0)
	ret

ifelse(USE_SSE2, yes, <
.Lsse2_case:
	lea	(DST, N), TMP
	test	$8, TMP
	jz	.Lsse2_next
	sub	$8, N
	mov	(AP, N), TMP
	xor	(BP, N), TMP
	mov	TMP, (DST, N)
	jmp	.Lsse2_next

	ALIGN(4)
.Lsse2_loop:
	movdqu	(AP, N), %xmm0
	movdqu	(BP, N), %xmm1
	pxor	%xmm0, %xmm1
	movdqa	%xmm1, (DST, N)
.Lsse2_next:
	sub	$16, N
	ja	.Lsse2_loop

	C FIXME: See if we can do a full word first, before the
	C byte-wise final loop.
	jnz	.Lfinal

	C Final operation is aligned
	movdqu	(AP), %xmm0
	movdqu	(BP), %xmm1
	pxor	%xmm0, %xmm1
	movdqa	%xmm1, (DST)
	C ENTRY might have been 3 args, too, but it doesn't matter for the exit
	W64_EXIT(4, 0)
	ret
>)

EPILOGUE(memxor3)
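
C For reference, a minimal C sketch of the semantics implemented above,
C using only the prototypes quoted in the comments (an illustration, not
C nettle's portable fallback implementation). Like the assembly, it walks
C the buffers from the end towards the start:
C
C   uint8_t *
C   memxor3(uint8_t *dst, const uint8_t *a, const uint8_t *b, size_t n)
C   {
C     while (n-- > 0)
C       dst[n] = a[n] ^ b[n];
C     return dst;
C   }
C
C   uint8_t *
C   memxor(uint8_t *dst, const uint8_t *src, size_t n)
C   {
C     return memxor3(dst, src, dst, n);
C   }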