diff options
author | Maamoun TK <maamoun.tk@googlemail.com> | 2022-11-06 08:00:38 +0200 |
---|---|---|
committer | Maamoun TK <maamoun.tk@googlemail.com> | 2022-11-06 08:00:38 +0200 |
commit | 2d9f46878d4c6de044e6e0e4820d681e62851283 (patch) | |
tree | 4de683092fb2660ef2d3682229278d957a229558 | |
parent | aec77fd3b29663ca5b2b7094b3b590e4262da6b4 (diff) | |
download | nettle-2d9f46878d4c6de044e6e0e4820d681e62851283.tar.gz |
[PowerPC] Move register allocation from poly1305.m4
-rw-r--r-- | powerpc64/p9/poly1305-blocks.asm | 50 | ||||
-rw-r--r-- | powerpc64/p9/poly1305-internal.asm | 27 | ||||
-rw-r--r-- | powerpc64/p9/poly1305.m4 | 105 |
3 files changed, 92 insertions, 90 deletions
diff --git a/powerpc64/p9/poly1305-blocks.asm b/powerpc64/p9/poly1305-blocks.asm index 3f729e98..cbd03505 100644 --- a/powerpc64/p9/poly1305-blocks.asm +++ b/powerpc64/p9/poly1305-blocks.asm @@ -37,15 +37,12 @@ C Register usage: define(`SP', `r1') define(`TOCP', `r2') -define(`DEFINES_BLOCK_ARG_R64', ` - C State inputs - define(`H0', `r6') - define(`H1', `r7') - define(`H2', `r8') - C State outputs - define(`F0', `v1') - define(`F1', `v2') - ') +C Arguments +define(`CTX', `r3') +define(`BLOCKS', `r4') +define(`DATA', `r5') + +define(`PADBYTE', `r6') C Padding byte register define(`DEFINES_BLOCK_R44', ` define(`R0', `v0') @@ -203,17 +200,15 @@ PROLOGUE(_nettle_poly1305_blocks) stxv VSR(v21),-192(SP) stxv VSR(v20),-208(SP) - mr LEN, r4 - mr DATA, r5 C Initialize padding byte register li PADBYTE, 1 C Process data blocks of number of multiple 4 DEFINES_BLOCK_R44() - cmpldi LEN, POLY1305_BLOCK_THRESHOLD + cmpldi BLOCKS, POLY1305_BLOCK_THRESHOLD blt Ldata_r64 - srdi r9, LEN, 2 - andi. LEN, LEN, 3 + srdi r9, BLOCKS, 2 + andi. 
BLOCKS, BLOCKS, 3 mtctr r9 C Initialize constants @@ -384,24 +379,23 @@ IF_BE(` stxsd H2, 48(CTX) Ldata_r64: - DEFINES_BLOCK_ARG_R64() - C COUNTER = LEN / 16 - cmpldi LEN, 0 + cmpldi BLOCKS, 0 beq Ldone - mtctr LEN - ld H0, P1305_H0 (CTX) - ld H1, P1305_H1 (CTX) - ld H2, P1305_H2 (CTX) + mtctr BLOCKS + mr r4, PADBYTE + ld r6, P1305_H0 (CTX) + ld r7, P1305_H1 (CTX) + ld r8, P1305_H2 (CTX) L1B_loop: - BLOCK_R64(F0,F1,H0,H1,H2) - mfvsrld H0, VSR(F0) - mfvsrld H1, VSR(F1) - mfvsrd H2, VSR(F1) + BLOCK_R64(CTX,DATA,r4,r6,v0) + mfvsrld r6, VSR(v0) + mfvsrld r7, VSR(v1) + mfvsrd r8, VSR(v1) addi DATA, DATA, 16 bdnz L1B_loop - std H0, P1305_H0 (CTX) - std H1, P1305_H1 (CTX) - std H2, P1305_H2 (CTX) + std r6, P1305_H0 (CTX) + std r7, P1305_H1 (CTX) + std r8, P1305_H2 (CTX) Ldone: C Restore non-volatile vector registers diff --git a/powerpc64/p9/poly1305-internal.asm b/powerpc64/p9/poly1305-internal.asm index a1e46e8f..c23e16fd 100644 --- a/powerpc64/p9/poly1305-internal.asm +++ b/powerpc64/p9/poly1305-internal.asm @@ -37,13 +37,10 @@ C Register usage: define(`SP', `r1') define(`TOCP', `r2') -C State inputs -define(`H0', `r6') -define(`H1', `r7') -define(`H2', `r8') -C State outputs -define(`F0', `v1') -define(`F1', `v2') +C Arguments +define(`CTX', `r3') +define(`DATA', `r4') +define(`PADBYTE', `r5') C Padding byte register .text @@ -91,17 +88,17 @@ EPILOGUE(_nettle_poly1305_set_key) C void _nettle_poly1305_block(struct poly1305_ctx *ctx, const uint8_t *m, unsigned m128) define(`FUNC_ALIGN', `5') PROLOGUE(_nettle_poly1305_block) - ld H0, P1305_H0 (CTX) - ld H1, P1305_H1 (CTX) - ld H2, P1305_H2 (CTX) + ld r6, P1305_H0 (CTX) + ld r7, P1305_H1 (CTX) + ld r8, P1305_H2 (CTX) - BLOCK_R64(F0,F1,H0,H1,H2) + BLOCK_R64(CTX,DATA,PADBYTE,r6,v0) li r10, P1305_H1 - xxswapd VSR(F0), VSR(F0) - xxswapd VSR(F1), VSR(F1) - stxsd F0, P1305_H0 (CTX) - stxvd2x VSR(F1), r10, CTX + xxswapd VSR(v0), VSR(v0) + xxswapd VSR(v1), VSR(v1) + stxsd v0, P1305_H0 (CTX) + stxvd2x VSR(v1), r10, CTX blr 
EPILOGUE(_nettle_poly1305_block) diff --git a/powerpc64/p9/poly1305.m4 b/powerpc64/p9/poly1305.m4 index 3cb63f82..6a07ed6f 100644 --- a/powerpc64/p9/poly1305.m4 +++ b/powerpc64/p9/poly1305.m4 @@ -2,64 +2,75 @@ C Threshold of processing multiple blocks in parallel C of a multiple of 4 define(`POLY1305_BLOCK_THRESHOLD', `12') -C Argments -define(`CTX', `r3') -define(`DATA', `r4') -define(`PADBYTE', `r5') C Padding byte register -define(`LEN', `r6') - +C DEFINES_BLOCK_R64(GPR0, VR0) define(`DEFINES_BLOCK_R64', ` - define(`T0', `r9') - define(`T1', `r10') - define(`T2', `r8') - define(`T2A', `r9') - define(`T2S', `r10') - define(`RZ', `r6') - define(`IDX', `r10') - - define(`ZERO', `v0') - define(`F0S', `v3') - define(`F11', `v4') - define(`T', `v5') - - define(`R', `v6') - define(`S', `v7') - - define(`T00', `v8') - define(`T10', `v9') - define(`T11', `v10') - define(`MU0', `v11') - define(`MU1', `v12') + define(`H0', `eval(0+$1)') + define(`H1', `eval(1+$1)') + define(`H2', `eval(2+$1)') + + define(`T0', `eval(3+$1)') + define(`T1', `eval(4+$1)') + define(`T2', `eval(2+$1)') + define(`T2A', `eval(3+$1)') + define(`T2S', `eval(4+$1)') + define(`RZ', `eval(0+$1)') + define(`IDX', `eval(4+$1)') + + define(`F0', `eval(0+$2)') + define(`F1', `eval(1+$2)') + + define(`ZERO', `eval(2+$2)') + define(`F0S', `eval(3+$2)') + define(`F11', `eval(4+$2)') + define(`T', `eval(5+$2)') + + define(`R', `eval(6+$2)') + define(`S', `eval(7+$2)') + + define(`T00', `eval(8+$2)') + define(`T10', `eval(9+$2)') + define(`T11', `eval(10+$2)') + define(`MU0', `eval(11+$2)') + define(`MU1', `eval(12+$2)') ') -C Inputs H0, H1, H2 are general-puropse registers of previous state radix 2^64 -C Outputs F0, F1 are vector registers of result state radix 2^64 sorted as follows +C CTX is the address of context where key and pre-computed values are stored +C DATA is the address of input block +C PADBYTE is padding byte for input block +C GPR0 is the starting register of sequential general-purpose 
registers +C used in the macro of following layout +C GPR0, GPR1, GPR2 are inputs representing the previous state radix 2^64 +C GPR3, GPR4 are temporary registers +C VR0 is the starting register of sequential vector registers used in +C the macro of following layout +C VR0, VR1 are outputs representing the result state radix 2^64 sorted as follows C (low 64-bit of F0) + (low 64-bit of F1) + (high 64-bit of F1) -C BLOCK_R64(F0, F1, H0, H1, H2) +C VR2..VR12 are temporary registers +C BLOCK_R64(CTX, DATA, PADBYTE, GPR0, VR0) define(`BLOCK_R64', ` - DEFINES_BLOCK_R64() + DEFINES_BLOCK_R64($4,$5) C Load 128-bit input block IF_LE(` - ld T0, 0(DATA) - ld T1, 8(DATA) + ld T0, 0($2) + ld T1, 8($2) ') IF_BE(` li IDX, 8 - ldbrx T1, IDX, DATA - ldbrx T0, 0, DATA + ldbrx T1, IDX, $2 + ldbrx T0, 0, $2 ') C Combine state with input block, latter is padded to 17-bytes C by low-order byte of PADBYTE register - addc T0, T0, $3 - adde T1, T1, $4 - adde T2, PADBYTE, $5 + addc T0, T0, H0 + adde T1, T1, H1 + adde T2, $3, H2 mtvsrdd VSR(T), T0, T1 C Load key and pre-computed values li IDX, 16 - lxvd2x VSR(R), 0, CTX - lxvd2x VSR(S), IDX, CTX + lxvd2x VSR(R), 0, $1 + lxvd2x VSR(S), IDX, $1 andi. T2A, T2, 3 srdi T2S, T2, 2 @@ -75,17 +86,17 @@ IF_BE(` mtvsrdd VSR(T10), 0, T2 C Mutiplicate key by combined state and block - vmsumudm $1, T, MU0, ZERO - vmsumudm $2, T, MU1, ZERO + vmsumudm F0, T, MU0, ZERO + vmsumudm F1, T, MU1, ZERO vmsumudm F11, T11, MU1, ZERO - vmsumudm $1, T00, S, $1 - vmsumudm $2, T10, MU0, $2 + vmsumudm F0, T00, S, F0 + vmsumudm F1, T10, MU0, F1 C Product addition xxmrgld VSR(F11), VSR(F11), VSR(ZERO) - vadduqm $2, $2, F11 + vadduqm F1, F1, F11 - xxmrghd VSR(F0S), VSR(ZERO), VSR($1) - vadduqm $2, $2, F0S + xxmrghd VSR(F0S), VSR(ZERO), VSR(F0) + vadduqm F1, F1, F0S ') |