author     Niels Möller <nisse@lysator.liu.se>   2013-02-06 11:45:55 +0100
committer  Niels Möller <nisse@lysator.liu.se>   2013-02-06 11:45:55 +0100
commit     fc5801a729a6482e37c4fb2172016843496537e8
tree       38d1e2a6e15fbdffcad87ac4bee9bf25b98cca41
parent     d56b44107aa1f74330005c1128b43986b8a2e8a9
download   nettle-fc5801a729a6482e37c4fb2172016843496537e8.tar.gz
Setup for using ARM assembly. Initial memxor for ARM.
-rw-r--r--  ChangeLog          5
-rw-r--r--  armv7/README      41
-rw-r--r--  armv7/machine.m4   0
-rw-r--r--  armv7/memxor.asm  382
-rw-r--r--  configure.ac       3
5 files changed, 431 insertions, 0 deletions
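
Before the diff itself: armv7/memxor.asm implements the memxor and memxor3 primitives that nettle otherwise provides in portable C. The sketch below only illustrates the byte-level contract those routines satisfy, using the argument lists given in the assembly comments; the _ref names are invented for this sketch, the real nettle API also returns the destination pointer, and nettle's C fallback works a word at a time rather than with this naive loop.

  #include <stddef.h>
  #include <stdint.h>

  /* Reference semantics only; not nettle's actual C implementation. */
  static void
  memxor_ref(uint8_t *dst, const uint8_t *src, size_t n)
  {
    /* XOR the source area into the destination area, byte by byte. */
    for (size_t i = 0; i < n; i++)
      dst[i] ^= src[i];
  }

  static void
  memxor3_ref(uint8_t *dst, const uint8_t *a, const uint8_t *b, size_t n)
  {
    /* Write the XOR of two source areas to the destination area.
       The assembly walks from the high addresses downwards; this
       sketch ignores any overlap considerations. */
    while (n-- > 0)
      dst[n] = a[n] ^ b[n];
  }
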
@@ -1,5 +1,9 @@
 2013-02-06  Niels Möller  <nisse@lysator.liu.se>
 
+	* armv7/README: New file.
+	* armv7/machine.m4: New (empty) file.
+	* armv7/memxor.asm: Initial assembly implementation.
+
 	* config.m4.in: Substitute ASM_TYPE_PROGBITS as TYPE_PROGBITS.
 	* config.make.in: Added .s to the suffix list.
 
@@ -17,6 +21,7 @@
 	(ASM_TYPE_PROGBITS): New substituted variable, set in the same
 	way as ASM_TYPE_FUNCTION.
 	(ASM_MARK_NOEXEC_STACK): Use TYPE_PROGBITS.
+	(asm_path): Set up asm_path for armv7.
 
 	* asm.m4: Use changecom to disable m4 quoting. Use divert to
 	suppress output.
diff --git a/armv7/README b/armv7/README
new file mode 100644
index 00000000..33715662
--- /dev/null
+++ b/armv7/README
@@ -0,0 +1,41 @@
+Currently, code in this directory is written for arm cortex-a9.
+
+ABI gnueabi(hf) (not depending on the floating point conventions)
+
+Registers       May be          Argument
+                clobbered       number
+
+r0              Y               1
+r1              Y               2
+r2              Y               3
+r3              Y               4
+r4              N
+r5              N
+r6              N
+r7              N
+r8              N
+r9 (sl)
+r10             N
+r11             N
+r12 (ip)        Y
+r13 (sp)
+r14 (lr)
+r15 (pc)
+
+q0 (d0, d1)     Y               1 (for "hf" abi)
+q1 (d2, d3)     Y               2
+q2 (d4, d5)     Y               3
+q3 (d6, d7)     Y               4
+q4 (d8, d9)     N
+q5 (d10, d11)   N
+q6 (d12, d13)   N
+q7 (d14, d15)   N
+q8 (d16, d17)   Y
+q9 (d18, d19)   Y
+q10 (d20, d21)  Y
+q11 (d22, d23)  Y
+q12 (d24, d25)  Y
+q13 (d26, d27)  Y
+q14 (d28, d29)  Y
+q15 (d30, d31)  Y
+
diff --git a/armv7/machine.m4 b/armv7/machine.m4
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/armv7/machine.m4
diff --git a/armv7/memxor.asm b/armv7/memxor.asm
new file mode 100644
index 00000000..78762d03
--- /dev/null
+++ b/armv7/memxor.asm
@@ -0,0 +1,382 @@
+C -*- mode: asm; asm-comment-char: ?C; -*-
+C nettle, low-level cryptographic library
+C
+C Copyright (C) 2013, Niels Möller
+C
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+C License for more details.
+C
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB. If not, write to
+C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+C MA 02111-1301, USA.
+
+C Register usage:
+
+define(<DST>, <r0>)
+define(<SRC>, <r1>)
+define(<N>, <r2>)
+define(<CNT>, <r6>)
+define(<TNC>, <r7>)
+
+        .syntax unified
+
+        .file "memxor.asm"
+
+        .text
+        .arm
+
+        C memxor(uint8_t *dst, const uint8_t *src, size_t n)
+        .align 2
+PROLOGUE(memxor)
+        cmp     N, #0
+        beq     .Lmemxor_ret
+
+        push    {r4, r5, r6, r7}
+
+        cmp     N, #7
+        bcs     .Lmemxor_large
+
+        C Simple byte loop
+.Lmemxor_bytes:
+        ldrb    r3, [SRC], #+1
+        ldrb    r4, [DST]
+        eor     r3, r4
+        strb    r3, [DST], #+1
+        subs    N, #1
+        bne     .Lmemxor_bytes
+
+.Lmemxor_done:
+        pop     {r4,r5,r6,r7}
+.Lmemxor_ret:
+        bx      lr
+
+.Lmemxor_align_loop:
+        ldrb    r3, [SRC], #+1
+        ldrb    r4, [DST]
+        eor     r3, r4
+        strb    r3, [DST], #+1
+        sub     N, #1
+
+.Lmemxor_large:
+        tst     DST, #3
+        bne     .Lmemxor_align_loop
+
+        C We have at least 4 bytes left to do here.
+        sub     N, #4
+
+        ands    CNT, SRC, #3
+        beq     .Lmemxor_same
+
+        C Different alignment case.
+        C           v original SRC
+        C   +-------+------+
+        C   |SRC    |SRC+4 |
+        C   +---+---+------+
+        C       |DST    |
+        C       +-------+
+        C
+        C With little-endian, we need to do
+        C DST[i] ^= (SRC[i] >> CNT) ^ (SRC[i+1] << TNC)
+
+        lsl     CNT, #3
+        bic     SRC, #3
+        rsb     TNC, CNT, #32
+
+        ldr     r4, [SRC], #+4
+
+        tst     N, #4
+        itet    eq
+        moveq   r5, r4
+        subne   N, #4
+        beq     .Lmemxor_odd
+
+.Lmemxor_word_loop:
+        ldr     r5, [SRC], #+4
+        ldr     r3, [DST]
+        eor     r3, r3, r4, lsr CNT
+        eor     r3, r3, r5, lsl TNC
+        str     r3, [DST], #+4
+.Lmemxor_odd:
+        ldr     r4, [SRC], #+4
+        ldr     r3, [DST]
+        eor     r3, r3, r5, lsr CNT
+        eor     r3, r3, r4, lsl TNC
+        str     r3, [DST], #+4
+        subs    N, #8
+        bcs     .Lmemxor_word_loop
+        adds    N, #8
+        beq     .Lmemxor_done
+
+        C We have TNC/8 left-over bytes in r4, high end
+        lsr     r4, CNT
+        ldr     r3, [DST]
+        eor     r3, r4
+
+        C Store bytes, one by one.
+.Lmemxor_leftover:
+        strb    r3, [DST], #+1
+        subs    N, #1
+        beq     .Lmemxor_done
+        subs    TNC, #8
+        lsr     r3, #8
+        bne     .Lmemxor_leftover
+
+        b       .Lmemxor_bytes
+
+.Lmemxor_same:
+        ldr     r3, [SRC], #+4
+        ldr     r4, [DST]
+        eor     r3, r4
+        str     r3, [DST], #+4
+
+        subs    N, #4
+        bcs     .Lmemxor_same
+        adds    N, #4
+        beq     .Lmemxor_done
+        b       .Lmemxor_bytes
+EPILOGUE(memxor)
+
+define(<DST>, <r0>)
+define(<AP>, <r1>)
+define(<BP>, <r2>)
+define(<N>, <r3>)
+undefine(<CNT>)
+undefine(<TNC>)
+
+C Temporaries r4-r7
+define(<ACNT>, <r8>)
+define(<ATNC>, <r10>)
+define(<BCNT>, <r11>)
+define(<BTNC>, <r12>)
+
+        C memxor3(uint8_t *dst, const uint8_t *a, const uint8_t *b, size_t n)
+        .align 2
+PROLOGUE(memxor3)
+        cmp     N, #0
+        beq     .Lmemxor3_ret
+
+        push    {r4,r5,r6,r7,r8,r10,r11}
+        cmp     N, #7
+
+        add     AP, N
+        add     BP, N
+        add     DST, N
+
+        bcs     .Lmemxor3_large
+
+        C Simple byte loop
+.Lmemxor3_bytes:
+        ldrb    r4, [AP, #-1]!
+        ldrb    r5, [BP, #-1]!
+        eor     r4, r5
+        strb    r4, [DST, #-1]!
+        subs    N, #1
+        bne     .Lmemxor3_bytes
+
+.Lmemxor3_done:
+        pop     {r4,r5,r6,r7,r8,r10,r11}
+.Lmemxor3_ret:
+        bx      lr
+
+.Lmemxor3_align_loop:
+        ldrb    r4, [AP, #-1]!
+        ldrb    r5, [BP, #-1]!
+        eor     r5, r4
+        strb    r5, [DST, #-1]!
+        sub     N, #1
+
+.Lmemxor3_large:
+        tst     DST, #3
+        bne     .Lmemxor3_align_loop
+
+        C We have at least 4 bytes left to do here.
+        sub     N, #4
+        ands    ACNT, AP, #3
+        lsl     ACNT, #3
+        beq     .Lmemxor3_a_aligned
+
+        ands    BCNT, BP, #3
+        lsl     BCNT, #3
+        bne     .Lmemxor3_uu
+
+        C Swap
+        mov     r4, AP
+        mov     AP, BP
+        mov     BP, r4
+        mov     BCNT, ACNT
+
+.Lmemxor3_au:
+        C FIXME: Switch roles of A and B
+
+        C AP is aligned, BP is not
+        C           v original SRC
+        C   +-------+------+
+        C   |SRC-4  |SRC   |
+        C   +---+---+------+
+        C       |DST-4  |
+        C       +-------+
+        C
+        C With little-endian, we need to do
+        C DST[i-1] ^= (SRC[i-1] >> CNT) ^ (SRC[i] << TNC)
+        rsb     BTNC, BCNT, #32
+        bic     BP, #3
+
+        ldr     r4, [BP]
+
+        tst     N, #4
+        itet    eq
+        moveq   r5, r4
+        subne   N, #4
+        beq     .Lmemxor3_au_odd
+
+.Lmemxor3_au_loop:
+        ldr     r5, [BP, #-4]!
+        ldr     r6, [AP, #-4]!
+        eor     r6, r6, r4, lsl BTNC
+        eor     r6, r6, r5, lsr BCNT
+        str     r6, [DST, #-4]!
+.Lmemxor3_au_odd:
+        ldr     r4, [BP, #-4]!
+        ldr     r6, [AP, #-4]!
+        eor     r6, r6, r5, lsl BTNC
+        eor     r6, r6, r4, lsr BCNT
+        str     r6, [DST, #-4]!
+        subs    N, #8
+        bcs     .Lmemxor3_au_loop
+        adds    N, #8
+        beq     .Lmemxor3_done
+
+        C Leftover bytes in r4, low end
+        ldr     r5, [AP, #-4]
+        C FIXME: Do this with a single shift/rotate?
+        lsr     r5, BTNC
+        eor     r4, r5
+        ror     r4, BCNT
+
+.Lmemxor3_au_leftover:
+        C Store a byte at a time
+        ror     r4, #24
+        strb    r4, [DST, #-1]!
+        subs    N, #1
+        beq     .Lmemxor3_done
+        subs    BCNT, #8
+        sub     AP, #1
+        bne     .Lmemxor3_au_leftover
+        b       .Lmemxor3_bytes
+
+.Lmemxor3_a_aligned:
+        ands    BCNT, BP, #3
+        lsl     BCNT, #3
+        bne     .Lmemxor3_au
+
+        C a, b and dst all have the same alignment.
+
+.Lmemxor3_aligned_word_loop:
+        ldr     r4, [AP, #-4]!
+        ldr     r5, [BP, #-4]!
+        eor     r4, r5
+        str     r4, [DST, #-4]!
+        subs    N, #4
+        bcs     .Lmemxor3_aligned_word_loop
+        adds    N, #4
+        beq     .Lmemxor3_done
+        b       .Lmemxor3_bytes
+
+.Lmemxor3_uu:
+
+        cmp     ACNT, BCNT
+        bic     AP, #3
+        bic     BP, #3
+        rsb     ATNC, ACNT, #32
+
+        bne     .Lmemxor3_uud
+
+        C AP and BP are unaligned in the same way
+
+        ldr     r4, [AP]
+        ldr     r6, [BP]
+        eor     r4, r6
+
+        tst     N, #4
+        itet    eq
+        moveq   r5, r4
+        subne   N, #4
+        beq     .Lmemxor3_uu_odd
+
+.Lmemxor3_uu_loop:
+        ldr     r5, [AP, #-4]!
+        ldr     r6, [BP, #-4]!
+        eor     r5, r6
+        lsl     r4, ATNC
+        eor     r4, r4, r5, lsr ACNT
+        str     r4, [DST, #-4]!
+.Lmemxor3_uu_odd:
+        ldr     r4, [AP, #-4]!
+        ldr     r6, [BP, #-4]!
+        eor     r4, r6
+        lsl     r5, ATNC
+        eor     r5, r5, r4, lsr ACNT
+        str     r5, [DST, #-4]!
+        subs    N, #8
+        bcs     .Lmemxor3_uu_loop
+        adds    N, #8
+        beq     .Lmemxor3_done
+
+        C Leftover bytes in r4, low end
+        ror     r4, ACNT
+.Lmemxor3_uu_leftover:
+        ror     r4, #24
+        strb    r4, [DST, #-1]!
+        subs    N, #1
+        beq     .Lmemxor3_done
+        subs    ACNT, #8
+        bne     .Lmemxor3_uu_leftover
+        b       .Lmemxor3_bytes
+
+.Lmemxor3_uud:
+        C Both AP and BP unaligned, and in different ways
+        rsb     BTNC, BCNT, #32
+
+        ldr     r4, [AP]
+        ldr     r6, [BP]
+
+        tst     N, #4
+        ittet   eq
+        moveq   r5, r4
+        moveq   r7, r6
+        subne   N, #4
+        beq     .Lmemxor3_uud_odd
+
+.Lmemxor3_uud_loop:
+        ldr     r5, [AP, #-4]!
+        ldr     r7, [BP, #-4]!
+        lsl     r4, ATNC
+        eor     r4, r4, r6, lsl BTNC
+        eor     r4, r4, r5, lsr ACNT
+        eor     r4, r4, r7, lsr BCNT
+        str     r4, [DST, #-4]!
+.Lmemxor3_uud_odd:
+        ldr     r4, [AP, #-4]!
+        ldr     r6, [BP, #-4]!
+        lsl     r5, ATNC
+        eor     r5, r5, r7, lsl BTNC
+        eor     r5, r5, r4, lsr ACNT
+        eor     r5, r5, r6, lsr BCNT
+        str     r5, [DST, #-4]!
+        subs    N, #8
+        bcs     .Lmemxor3_uud_loop
+        adds    N, #8
+        beq     .Lmemxor3_done
+
+        C FIXME: More clever left-over handling? For now, just adjust pointers.
+        add     AP, AP, ACNT, lsr #3
+        add     BP, BP, BCNT, lsr #3
+        b       .Lmemxor3_bytes
+EPILOGUE(memxor3)
diff --git a/configure.ac b/configure.ac
index 577b29dc..3dda4612 100644
--- a/configure.ac
+++ b/configure.ac
@@ -229,6 +229,9 @@ if test "x$enable_assembler" = xyes ; then
 	asm_path=sparc32
       fi
       ;;
+    armv7l*)
+      asm_path=armv7
+      ;;
     *)
       enable_assembler=no
       ;;
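
The word loops in memxor.asm handle a misaligned source by rounding it down to a 4-byte boundary and rebuilding each source word from two adjacent aligned words with the CNT/TNC shift counts, as the "DST[i] ^= (SRC[i] >> CNT) ^ (SRC[i+1] << TNC)" comment describes. A rough little-endian C rendering of that trick is sketched below. The function and variable names are invented for illustration only; the word-granular loads past the unaligned ends and the cast-based access are fine at the assembly level but are not strictly portable C, and cnt == 0 is assumed to have been routed to the aligned path, as the assembly does.

  #include <stddef.h>
  #include <stdint.h>

  /* Sketch: dst is word aligned, src is not (cnt != 0). */
  static void
  xor_unaligned_words(uint32_t *dst, const uint8_t *src, size_t nwords)
  {
    unsigned cnt = 8 * ((uintptr_t) src & 3);   /* misalignment in bits */
    unsigned tnc = 32 - cnt;
    /* Round src down to an aligned word; the low cnt bits of w0 belong
       to bytes before src and are shifted out below. */
    const uint32_t *p = (const uint32_t *) ((uintptr_t) src & ~(uintptr_t) 3);
    uint32_t w0 = p[0];

    for (size_t i = 0; i < nwords; i++)
      {
        uint32_t w1 = p[i + 1];
        /* The two shifted halves occupy disjoint bits, so XORing them
           together reassembles the source word starting at src + 4*i. */
        dst[i] ^= (w0 >> cnt) ^ (w1 << tnc);
        w0 = w1;
      }
  }

In the assembly, the shifts are folded into the eor instructions through the barrel shifter (eor r3, r3, r4, lsr CNT; eor r3, r3, r5, lsl TNC), so the recombination adds no separate shift instructions to the loop.
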