author     Niels Möller <nisse@lysator.liu.se>   2013-02-06 11:45:55 +0100
committer  Niels Möller <nisse@lysator.liu.se>   2013-02-06 11:45:55 +0100
commit     fc5801a729a6482e37c4fb2172016843496537e8
tree       38d1e2a6e15fbdffcad87ac4bee9bf25b98cca41
parent     d56b44107aa1f74330005c1128b43986b8a2e8a9
download   nettle-fc5801a729a6482e37c4fb2172016843496537e8.tar.gz
Setup for using ARM assembly. Initial memxor for ARM.
-rw-r--r--  ChangeLog          5
-rw-r--r--  armv7/README      41
-rw-r--r--  armv7/machine.m4   0
-rw-r--r--  armv7/memxor.asm 382
-rw-r--r--  configure.ac       3
5 files changed, 431 insertions, 0 deletions
diff --git a/ChangeLog b/ChangeLog
index f9445b29..e05dedbf 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,9 @@
2013-02-06 Niels Möller <nisse@lysator.liu.se>
+ * armv7/README: New file.
+ * armv7/machine.m4: New (empty) file.
+ * armv7/memxor.asm: Initial assembly implementation.
+
* config.m4.in: Substitute ASM_TYPE_PROGBITS as TYPE_PROGBITS.
* config.make.in: Added .s to the suffix list.
@@ -17,6 +21,7 @@
(ASM_TYPE_PROGBITS): New substituted variable, set in the same way
as ASM_TYPE_FUNCTION.
(ASM_MARK_NOEXEC_STACK): Use TYPE_PROGBITS.
+ (asm_path): Set up asm_path for armv7.
* asm.m4: Use changecom to disable m4 quoting. Use divert to
suppress output.
diff --git a/armv7/README b/armv7/README
new file mode 100644
index 00000000..33715662
--- /dev/null
+++ b/armv7/README
@@ -0,0 +1,41 @@
+Currently, the code in this directory is written for the ARM Cortex-A9.
+
+ABI: gnueabi(hf) (the code does not depend on the floating-point calling conventions)
+
+Registers       May be      Argument
+                clobbered   number
+
+r0              Y           1
+r1              Y           2
+r2              Y           3
+r3              Y           4
+r4              N
+r5              N
+r6              N
+r7              N
+r8              N
+r9 (sl)
+r10             N
+r11             N
+r12 (ip)        Y
+r13 (sp)
+r14 (lr)
+r15 (pc)
+
+q0 (d0, d1)     Y           1 (for "hf" abi)
+q1 (d2, d3)     Y           2
+q2 (d4, d5)     Y           3
+q3 (d6, d7)     Y           4
+q4 (d8, d9)     N
+q5 (d10, d11)   N
+q6 (d12, d13)   N
+q7 (d14, d15)   N
+q8 (d16, d17)   Y
+q9 (d18, d19)   Y
+q10 (d20, d21)  Y
+q11 (d22, d23)  Y
+q12 (d24, d25)  Y
+q13 (d26, d27)  Y
+q14 (d28, d29)  Y
+q15 (d30, d31)  Y
+
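As a cross-reference (an illustrative sketch, not part of this patch): under the convention tabulated above, the first four integer/pointer arguments are passed in r0-r3, which is how the two routines added in armv7/memxor.asm receive their operands, and registers marked N must be preserved, which is why the assembly pushes them before use. The declarations below simply mirror the prototypes quoted in the assembly comments, with the return type left as void for simplicity (nettle's real header may differ).

    #include <stddef.h>
    #include <stdint.h>

    /* Argument-to-register mapping per the table above (base AAPCS):
       dst -> r0, src -> r1, n -> r2. */
    void memxor(uint8_t *dst, const uint8_t *src, size_t n);

    /* dst -> r0, a -> r1, b -> r2, n -> r3. */
    void memxor3(uint8_t *dst, const uint8_t *a, const uint8_t *b, size_t n);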
diff --git a/armv7/machine.m4 b/armv7/machine.m4
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/armv7/machine.m4
diff --git a/armv7/memxor.asm b/armv7/memxor.asm
new file mode 100644
index 00000000..78762d03
--- /dev/null
+++ b/armv7/memxor.asm
@@ -0,0 +1,382 @@
+C -*- mode: asm; asm-comment-char: ?C; -*-
+C nettle, low-level cryptographic library
+C
+C Copyright (C) 2013, Niels Möller
+C
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+C License for more details.
+C
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB. If not, write to
+C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+C MA 02110-1301, USA.
+
+C Register usage:
+
+define(<DST>, <r0>)
+define(<SRC>, <r1>)
+define(<N>, <r2>)
+define(<CNT>, <r6>)
+define(<TNC>, <r7>)
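+C CNT holds 8*(SRC & 3), the bit offset of SRC within an aligned word,
+C and TNC its 32-bit complement; both are callee-saved (see armv7/README)
+C and are pushed together with the r4/r5 temporaries.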
+
+ .syntax unified
+
+ .file "memxor.asm"
+
+ .text
+ .arm
+
+ C memxor(uint8_t *dst, const uint8_t *src, size_t n)
+ .align 2
+PROLOGUE(memxor)
+ cmp N, #0
+ beq .Lmemxor_ret
+
+ push {r4, r5, r6, r7}
+
+ cmp N, #7
+ bcs .Lmemxor_large
+
+ C Simple byte loop
+.Lmemxor_bytes:
+ ldrb r3, [SRC], #+1
+ ldrb r4, [DST]
+ eor r3, r4
+ strb r3, [DST], #+1
+ subs N, #1
+ bne .Lmemxor_bytes
+
+.Lmemxor_done:
+ pop {r4,r5,r6,r7}
+.Lmemxor_ret:
+ bx lr
+
+.Lmemxor_align_loop:
+ ldrb r3, [SRC], #+1
+ ldrb r4, [DST]
+ eor r3, r4
+ strb r3, [DST], #+1
+ sub N, #1
+
+.Lmemxor_large:
+ tst DST, #3
+ bne .Lmemxor_align_loop
+
+ C We have at least 4 bytes left to do here.
+ sub N, #4
+
+ ands CNT, SRC, #3
+ beq .Lmemxor_same
+
+ C Different alignment case.
+ C     v original SRC
+ C +-------+------+
+ C |SRC    |SRC+4 |
+ C +---+---+------+
+ C     |DST    |
+ C     +-------+
+ C
+ C With little-endian, we need to do
+ C DST[i] ^= (SRC[i] >> CNT) ^ (SRC[i+1] << TNC)
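+ C
+ C For example, if SRC is one byte past a word boundary (CNT = 8,
+ C TNC = 24), each destination word xors in the last three bytes of one
+ C aligned source word and the first byte of the next.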
+
+ lsl CNT, #3
+ bic SRC, #3
+ rsb TNC, CNT, #32
+
+ ldr r4, [SRC], #+4
+
+ tst N, #4
+ itet eq
+ moveq r5, r4
+ subne N, #4
+ beq .Lmemxor_odd
+
+.Lmemxor_word_loop:
+ ldr r5, [SRC], #+4
+ ldr r3, [DST]
+ eor r3, r3, r4, lsr CNT
+ eor r3, r3, r5, lsl TNC
+ str r3, [DST], #+4
+.Lmemxor_odd:
+ ldr r4, [SRC], #+4
+ ldr r3, [DST]
+ eor r3, r3, r5, lsr CNT
+ eor r3, r3, r4, lsl TNC
+ str r3, [DST], #+4
+ subs N, #8
+ bcs .Lmemxor_word_loop
+ adds N, #8
+ beq .Lmemxor_done
+
+ C We have TNC/8 left-over bytes in r4, high end
+ lsr r4, CNT
+ ldr r3, [DST]
+ eor r3, r4
+ C Store bytes, one by one.
+.Lmemxor_leftover:
+ strb r3, [DST], #+1
+ subs N, #1
+ beq .Lmemxor_done
+ subs TNC, #8
+ lsr r3, #8
+ bne .Lmemxor_leftover
+
+ b .Lmemxor_bytes
+
+.Lmemxor_same:
+ ldr r3, [SRC], #+4
+ ldr r4, [DST]
+ eor r3, r4
+ str r3, [DST], #+4
+
+ subs N, #4
+ bcs .Lmemxor_same
+ adds N, #4
+ beq .Lmemxor_done
+ b .Lmemxor_bytes
+EPILOGUE(memxor)
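As a reading aid, here is an illustrative little-endian C rendering of the unaligned word loop above (a sketch only, not nettle's portable C implementation). It assumes dst is already word-aligned and src is misaligned so that cnt is nonzero, and it leaves out the leftover-byte tail that the assembly finishes with byte operations.

    #include <stddef.h>
    #include <stdint.h>

    /* Sketch of the CNT/TNC word-combining loop.  Assumes little-endian,
       dst word-aligned and (src & 3) != 0; the trailing bytes that the
       assembly handles in .Lmemxor_leftover / .Lmemxor_bytes are omitted. */
    static void
    memxor_unaligned_words(uint8_t *dst, const uint8_t *src, size_t n)
    {
      unsigned cnt = 8 * ((uintptr_t) src & 3);            /* CNT */
      unsigned tnc = 32 - cnt;                             /* TNC */
      const uint32_t *s
        = (const uint32_t *) ((uintptr_t) src & ~(uintptr_t) 3);
      uint32_t *d = (uint32_t *) dst;
      uint32_t prev = *s++;       /* word containing the first source byte */

      for (; n >= 4; n -= 4, d++)
        {
          uint32_t next = *s++;
          /* DST[i] ^= (SRC[i] >> CNT) ^ (SRC[i+1] << TNC) */
          *d ^= (prev >> cnt) ^ (next << tnc);
          prev = next;
        }
    }

The assembly keeps the prev/next roles in r4 and r5 and unrolls this two words per iteration (.Lmemxor_word_loop / .Lmemxor_odd), but the data flow is the same.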
+
+define(<DST>, <r0>)
+define(<AP>, <r1>)
+define(<BP>, <r2>)
+define(<N>, <r3>)
+undefine(<CNT>)
+undefine(<TNC>)
+
+C Temporaries r4-r7
+define(<ACNT>, <r8>)
+define(<ATNC>, <r10>)
+define(<BCNT>, <r11>)
+define(<BTNC>, <r12>)
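+C ACNT/ATNC and BCNT/BTNC are the bit offsets of AP and BP within an
+C aligned word and their 32-bit complements.  ACNT, ATNC and BCNT live in
+C callee-saved registers and are included in the push below; BTNC uses
+C r12 (ip), which armv7/README lists as freely clobberable.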
+
+ C memxor3(uint8_t *dst, const uint8_t *a, const uint8_t *b, size_t n)
+ .align 2
+PROLOGUE(memxor3)
+ cmp N, #0
+ beq .Lmemxor3_ret
+
+ push {r4,r5,r6,r7,r8,r10,r11}
+ cmp N, #7
+
+ add AP, N
+ add BP, N
+ add DST, N
+
+ bcs .Lmemxor3_large
+
+ C Simple byte loop
+.Lmemxor3_bytes:
+ ldrb r4, [AP, #-1]!
+ ldrb r5, [BP, #-1]!
+ eor r4, r5
+ strb r4, [DST, #-1]!
+ subs N, #1
+ bne .Lmemxor3_bytes
+
+.Lmemxor3_done:
+ pop {r4,r5,r6,r7,r8,r10,r11}
+.Lmemxor3_ret:
+ bx lr
+
+.Lmemxor3_align_loop:
+ ldrb r4, [AP, #-1]!
+ ldrb r5, [BP, #-1]!
+ eor r5, r4
+ strb r5, [DST, #-1]!
+ sub N, #1
+
+.Lmemxor3_large:
+ tst DST, #3
+ bne .Lmemxor3_align_loop
+
+ C We have at least 4 bytes left to do here.
+ sub N, #4
+ ands ACNT, AP, #3
+ lsl ACNT, #3
+ beq .Lmemxor3_a_aligned
+
+ ands BCNT, BP, #3
+ lsl BCNT, #3
+ bne .Lmemxor3_uu
+
+ C Swap
+ mov r4, AP
+ mov AP, BP
+ mov BP, r4
+ mov BCNT, ACNT
+
+.Lmemxor3_au:
+ C FIXME: Switch roles of A and B
+
+ C AP is aligned, BP is not
+ C             v original SRC
+ C +-------+------+
+ C |SRC-4  |SRC   |
+ C +---+---+------+
+ C     |DST-4  |
+ C     +-------+
+ C
+ C With little-endian, we need to do
+ C DST[i-1] ^= (SRC[i-1] >> CNT) ^ (SRC[i] << TNC)
+ rsb BTNC, BCNT, #32
+ bic BP, #3
+
+ ldr r4, [BP]
+
+ tst N, #4
+ itet eq
+ moveq r5, r4
+ subne N, #4
+ beq .Lmemxor3_au_odd
+
+.Lmemxor3_au_loop:
+ ldr r5, [BP, #-4]!
+ ldr r6, [AP, #-4]!
+ eor r6, r6, r4, lsl BTNC
+ eor r6, r6, r5, lsr BCNT
+ str r6, [DST, #-4]!
+.Lmemxor3_au_odd:
+ ldr r4, [BP, #-4]!
+ ldr r6, [AP, #-4]!
+ eor r6, r6, r5, lsl BTNC
+ eor r6, r6, r4, lsr BCNT
+ str r6, [DST, #-4]!
+ subs N, #8
+ bcs .Lmemxor3_au_loop
+ adds N, #8
+ beq .Lmemxor3_done
+
+ C Leftover bytes in r4, low end
+ ldr r5, [AP, #-4]
+ C FIXME: Do this with a single shift/rotate?
+ lsr r5, BTNC
+ eor r4, r5
+ ror r4, BCNT
+
+.Lmemxor3_au_leftover:
+ C Store a byte at a time
+ ror r4, #24
+ strb r4, [DST, #-1]!
+ subs N, #1
+ beq .Lmemxor3_done
+ subs BCNT, #8
+ sub AP, #1
+ bne .Lmemxor3_au_leftover
+ b .Lmemxor3_bytes
+
+.Lmemxor3_a_aligned:
+ ands BCNT, BP, #3
+ lsl BCNT, #3
+ bne .Lmemxor3_au
+
+ C a, b and dst all have the same alignment.
+
+.Lmemxor3_aligned_word_loop:
+ ldr r4, [AP, #-4]!
+ ldr r5, [BP, #-4]!
+ eor r4, r5
+ str r4, [DST, #-4]!
+ subs N, #4
+ bcs .Lmemxor3_aligned_word_loop
+ adds N, #4
+ beq .Lmemxor3_done
+ b .Lmemxor3_bytes
+
+.Lmemxor3_uu:
+
+ cmp ACNT, BCNT
+ bic AP, #3
+ bic BP, #3
+ rsb ATNC, ACNT, #32
+
+ bne .Lmemxor3_uud
+
+ C AP and BP are unaligned in the same way
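+ C (ACNT == BCNT), so the two misaligned words can be xored together
+ C first and then shifted into place with a single ACNT/ATNC combine,
+ C just like the single-source unaligned loop in memxor above.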
+
+ ldr r4, [AP]
+ ldr r6, [BP]
+ eor r4, r6
+
+ tst N, #4
+ itet eq
+ moveq r5, r4
+ subne N, #4
+ beq .Lmemxor3_uu_odd
+
+.Lmemxor3_uu_loop:
+ ldr r5, [AP, #-4]!
+ ldr r6, [BP, #-4]!
+ eor r5, r6
+ lsl r4, ATNC
+ eor r4, r4, r5, lsr ACNT
+ str r4, [DST, #-4]!
+.Lmemxor3_uu_odd:
+ ldr r4, [AP, #-4]!
+ ldr r6, [BP, #-4]!
+ eor r4, r6
+ lsl r5, ATNC
+ eor r5, r5, r4, lsr ACNT
+ str r5, [DST, #-4]!
+ subs N, #8
+ bcs .Lmemxor3_uu_loop
+ adds N, #8
+ beq .Lmemxor3_done
+
+ C Leftover bytes in r4, low end
+ ror r4, ACNT
+.Lmemxor3_uu_leftover:
+ ror r4, #24
+ strb r4, [DST, #-1]!
+ subs N, #1
+ beq .Lmemxor3_done
+ subs ACNT, #8
+ bne .Lmemxor3_uu_leftover
+ b .Lmemxor3_bytes
+
+.Lmemxor3_uud:
+ C Both AP and BP unaligned, and in different ways
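+ C Each destination word is therefore assembled from four shifted
+ C pieces: the previously loaded and the newly loaded A words (shifted
+ C by ATNC and ACNT) and likewise the two B words (BTNC and BCNT).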
+ rsb BTNC, BCNT, #32
+
+ ldr r4, [AP]
+ ldr r6, [BP]
+
+ tst N, #4
+ ittet eq
+ moveq r5, r4
+ moveq r7, r6
+ subne N, #4
+ beq .Lmemxor3_uud_odd
+
+.Lmemxor3_uud_loop:
+ ldr r5, [AP, #-4]!
+ ldr r7, [BP, #-4]!
+ lsl r4, ATNC
+ eor r4, r4, r6, lsl BTNC
+ eor r4, r4, r5, lsr ACNT
+ eor r4, r4, r7, lsr BCNT
+ str r4, [DST, #-4]!
+.Lmemxor3_uud_odd:
+ ldr r4, [AP, #-4]!
+ ldr r6, [BP, #-4]!
+ lsl r5, ATNC
+ eor r5, r5, r7, lsl BTNC
+ eor r5, r5, r4, lsr ACNT
+ eor r5, r5, r6, lsr BCNT
+ str r5, [DST, #-4]!
+ subs N, #8
+ bcs .Lmemxor3_uud_loop
+ adds N, #8
+ beq .Lmemxor3_done
+
+ C FIXME: More clever left-over handling? For now, just adjust pointers.
+ add AP, AP, ACNT, lsr #3
+ add BP, BP, BCNT, lsr #3
+ b .Lmemxor3_bytes
+EPILOGUE(memxor3)
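For comparison, a byte-wise C sketch of the three-operand operation (illustration only, not nettle's C memxor3): like the assembly, it first advances all three pointers to the end of the areas and then works downwards; the aligned and partially aligned word loops above are all refinements of this loop.

    #include <stddef.h>
    #include <stdint.h>

    /* Byte-wise reference for memxor3: dst[i] = a[i] ^ b[i] for i < n,
       processed from the high addresses down, as in the assembly above. */
    static void
    memxor3_bytewise(uint8_t *dst, const uint8_t *a, const uint8_t *b,
                     size_t n)
    {
      dst += n;
      a += n;
      b += n;
      while (n-- > 0)
        *--dst = *--a ^ *--b;
    }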
diff --git a/configure.ac b/configure.ac
index 577b29dc..3dda4612 100644
--- a/configure.ac
+++ b/configure.ac
@@ -229,6 +229,9 @@ if test "x$enable_assembler" = xyes ; then
asm_path=sparc32
fi
;;
+ armv7l*)
+ asm_path=armv7
+ ;;
*)
enable_assembler=no
;;