diff options
Diffstat (limited to 'powerpc64/p9/poly1305.m4')
-rw-r--r-- | powerpc64/p9/poly1305.m4 | 105 |
1 files changed, 58 insertions, 47 deletions
diff --git a/powerpc64/p9/poly1305.m4 b/powerpc64/p9/poly1305.m4 index 3cb63f82..6a07ed6f 100644 --- a/powerpc64/p9/poly1305.m4 +++ b/powerpc64/p9/poly1305.m4 @@ -2,64 +2,75 @@ C Threshold of processing multiple blocks in parallel C of a multiple of 4 define(`POLY1305_BLOCK_THRESHOLD', `12') -C Argments -define(`CTX', `r3') -define(`DATA', `r4') -define(`PADBYTE', `r5') C Padding byte register -define(`LEN', `r6') - +C DEFINES_BLOCK_R64(GPR0, VR0) define(`DEFINES_BLOCK_R64', ` - define(`T0', `r9') - define(`T1', `r10') - define(`T2', `r8') - define(`T2A', `r9') - define(`T2S', `r10') - define(`RZ', `r6') - define(`IDX', `r10') - - define(`ZERO', `v0') - define(`F0S', `v3') - define(`F11', `v4') - define(`T', `v5') - - define(`R', `v6') - define(`S', `v7') - - define(`T00', `v8') - define(`T10', `v9') - define(`T11', `v10') - define(`MU0', `v11') - define(`MU1', `v12') + define(`H0', `eval(0+$1)') + define(`H1', `eval(1+$1)') + define(`H2', `eval(2+$1)') + + define(`T0', `eval(3+$1)') + define(`T1', `eval(4+$1)') + define(`T2', `eval(2+$1)') + define(`T2A', `eval(3+$1)') + define(`T2S', `eval(4+$1)') + define(`RZ', `eval(0+$1)') + define(`IDX', `eval(4+$1)') + + define(`F0', `eval(0+$2)') + define(`F1', `eval(1+$2)') + + define(`ZERO', `eval(2+$2)') + define(`F0S', `eval(3+$2)') + define(`F11', `eval(4+$2)') + define(`T', `eval(5+$2)') + + define(`R', `eval(6+$2)') + define(`S', `eval(7+$2)') + + define(`T00', `eval(8+$2)') + define(`T10', `eval(9+$2)') + define(`T11', `eval(10+$2)') + define(`MU0', `eval(11+$2)') + define(`MU1', `eval(12+$2)') ') -C Inputs H0, H1, H2 are general-puropse registers of previous state radix 2^64 -C Outputs F0, F1 are vector registers of result state radix 2^64 sorted as follows +C CTX is the address of context where key and pre-computed values are stored +C DATA is the address of input block +C PADBYTE is padding byte for input block +C GPR0 is the starting register of sequential general-purpose registers +C used in the macro of following layout +C GPR0, GPR1, GPR2 are inputs representing the previous state radix 2^64 +C GPR3, GPR4 are temporary registers +C VR0 is the starting register of sequential vector resigers used in +C the macro of following layout +C VR0, VR1 are outputs representing the result state radix 2^64 sorted as follows C (low 64-bit of F0) + (low 64-bit of F1) + (high 64-bit of F1) -C BLOCK_R64(F0, F1, H0, H1, H2) +C VR2..VR12 are temporary registers +C BLOCK_R64(CTX, DATA, PADBYTE, GPR0, VR0) define(`BLOCK_R64', ` - DEFINES_BLOCK_R64() + DEFINES_BLOCK_R64($4,$5) C Load 128-bit input block IF_LE(` - ld T0, 0(DATA) - ld T1, 8(DATA) + ld T0, 0($2) + ld T1, 8($2) ') IF_BE(` li IDX, 8 - ldbrx T1, IDX, DATA - ldbrx T0, 0, DATA + ldbrx T1, IDX, $2 + ldbrx T0, 0, $2 ') C Combine state with input block, latter is padded to 17-bytes C by low-order byte of PADBYTE register - addc T0, T0, $3 - adde T1, T1, $4 - adde T2, PADBYTE, $5 + addc T0, T0, H0 + adde T1, T1, H1 + adde T2, $3, H2 mtvsrdd VSR(T), T0, T1 C Load key and pre-computed values li IDX, 16 - lxvd2x VSR(R), 0, CTX - lxvd2x VSR(S), IDX, CTX + lxvd2x VSR(R), 0, $1 + lxvd2x VSR(S), IDX, $1 andi. T2A, T2, 3 srdi T2S, T2, 2 @@ -75,17 +86,17 @@ IF_BE(` mtvsrdd VSR(T10), 0, T2 C Mutiplicate key by combined state and block - vmsumudm $1, T, MU0, ZERO - vmsumudm $2, T, MU1, ZERO + vmsumudm F0, T, MU0, ZERO + vmsumudm F1, T, MU1, ZERO vmsumudm F11, T11, MU1, ZERO - vmsumudm $1, T00, S, $1 - vmsumudm $2, T10, MU0, $2 + vmsumudm F0, T00, S, F0 + vmsumudm F1, T10, MU0, F1 C Product addition xxmrgld VSR(F11), VSR(F11), VSR(ZERO) - vadduqm $2, $2, F11 + vadduqm F1, F1, F11 - xxmrghd VSR(F0S), VSR(ZERO), VSR($1) - vadduqm $2, $2, F0S + xxmrghd VSR(F0S), VSR(ZERO), VSR(F0) + vadduqm F1, F1, F0S ') |