summaryrefslogtreecommitdiff
path: root/powerpc64/p9/poly1305.m4
diff options
context:
space:
mode:
Diffstat (limited to 'powerpc64/p9/poly1305.m4')
-rw-r--r--powerpc64/p9/poly1305.m4105
1 files changed, 58 insertions, 47 deletions
diff --git a/powerpc64/p9/poly1305.m4 b/powerpc64/p9/poly1305.m4
index 3cb63f82..6a07ed6f 100644
--- a/powerpc64/p9/poly1305.m4
+++ b/powerpc64/p9/poly1305.m4
@@ -2,64 +2,75 @@ C Threshold of processing multiple blocks in parallel
C of a multiple of 4
define(`POLY1305_BLOCK_THRESHOLD', `12')
-C Argments
-define(`CTX', `r3')
-define(`DATA', `r4')
-define(`PADBYTE', `r5') C Padding byte register
-define(`LEN', `r6')
-
+C DEFINES_BLOCK_R64(GPR0, VR0)
define(`DEFINES_BLOCK_R64', `
- define(`T0', `r9')
- define(`T1', `r10')
- define(`T2', `r8')
- define(`T2A', `r9')
- define(`T2S', `r10')
- define(`RZ', `r6')
- define(`IDX', `r10')
-
- define(`ZERO', `v0')
- define(`F0S', `v3')
- define(`F11', `v4')
- define(`T', `v5')
-
- define(`R', `v6')
- define(`S', `v7')
-
- define(`T00', `v8')
- define(`T10', `v9')
- define(`T11', `v10')
- define(`MU0', `v11')
- define(`MU1', `v12')
+ define(`H0', `eval(0+$1)')
+ define(`H1', `eval(1+$1)')
+ define(`H2', `eval(2+$1)')
+
+ define(`T0', `eval(3+$1)')
+ define(`T1', `eval(4+$1)')
+ define(`T2', `eval(2+$1)')
+ define(`T2A', `eval(3+$1)')
+ define(`T2S', `eval(4+$1)')
+ define(`RZ', `eval(0+$1)')
+ define(`IDX', `eval(4+$1)')
+
+ define(`F0', `eval(0+$2)')
+ define(`F1', `eval(1+$2)')
+
+ define(`ZERO', `eval(2+$2)')
+ define(`F0S', `eval(3+$2)')
+ define(`F11', `eval(4+$2)')
+ define(`T', `eval(5+$2)')
+
+ define(`R', `eval(6+$2)')
+ define(`S', `eval(7+$2)')
+
+ define(`T00', `eval(8+$2)')
+ define(`T10', `eval(9+$2)')
+ define(`T11', `eval(10+$2)')
+ define(`MU0', `eval(11+$2)')
+ define(`MU1', `eval(12+$2)')
')
-C Inputs H0, H1, H2 are general-puropse registers of previous state radix 2^64
-C Outputs F0, F1 are vector registers of result state radix 2^64 sorted as follows
+C CTX is the address of context where key and pre-computed values are stored
+C DATA is the address of input block
+C PADBYTE is padding byte for input block
+C GPR0 is the first of a run of sequential general-purpose registers
+C used by the macro, laid out as follows:
+C GPR0, GPR1, GPR2 are inputs representing the previous state radix 2^64
+C GPR3, GPR4 are temporary registers
+C VR0 is the first of a run of sequential vector registers used by
+C the macro, laid out as follows:
+C VR0, VR1 are outputs representing the result state radix 2^64 sorted as follows
C (low 64-bit of F0) + (low 64-bit of F1) + (high 64-bit of F1)
-C BLOCK_R64(F0, F1, H0, H1, H2)
+C VR2..VR12 are temporary registers
+C BLOCK_R64(CTX, DATA, PADBYTE, GPR0, VR0)
define(`BLOCK_R64', `
- DEFINES_BLOCK_R64()
+ DEFINES_BLOCK_R64($4,$5)
C Load 128-bit input block
IF_LE(`
- ld T0, 0(DATA)
- ld T1, 8(DATA)
+ ld T0, 0($2)
+ ld T1, 8($2)
')
IF_BE(`
li IDX, 8
- ldbrx T1, IDX, DATA
- ldbrx T0, 0, DATA
+ ldbrx T1, IDX, $2
+ ldbrx T0, 0, $2
')
C Combine state with input block, latter is padded to 17-bytes
C by low-order byte of PADBYTE register
- addc T0, T0, $3
- adde T1, T1, $4
- adde T2, PADBYTE, $5
+ addc T0, T0, H0
+ adde T1, T1, H1
+ adde T2, $3, H2
mtvsrdd VSR(T), T0, T1
C Load key and pre-computed values
li IDX, 16
- lxvd2x VSR(R), 0, CTX
- lxvd2x VSR(S), IDX, CTX
+ lxvd2x VSR(R), 0, $1
+ lxvd2x VSR(S), IDX, $1
andi. T2A, T2, 3
srdi T2S, T2, 2
@@ -75,17 +86,17 @@ IF_BE(`
mtvsrdd VSR(T10), 0, T2
C Mutiplicate key by combined state and block
- vmsumudm $1, T, MU0, ZERO
- vmsumudm $2, T, MU1, ZERO
+ vmsumudm F0, T, MU0, ZERO
+ vmsumudm F1, T, MU1, ZERO
vmsumudm F11, T11, MU1, ZERO
- vmsumudm $1, T00, S, $1
- vmsumudm $2, T10, MU0, $2
+ vmsumudm F0, T00, S, F0
+ vmsumudm F1, T10, MU0, F1
C Product addition
xxmrgld VSR(F11), VSR(F11), VSR(ZERO)
- vadduqm $2, $2, F11
+ vadduqm F1, F1, F11
- xxmrghd VSR(F0S), VSR(ZERO), VSR($1)
- vadduqm $2, $2, F0S
+ xxmrghd VSR(F0S), VSR(ZERO), VSR(F0)
+ vadduqm F1, F1, F0S
')