diff options
author | Niels Möller <nisse@lysator.liu.se> | 2009-09-15 19:22:03 +0200 |
---|---|---|
committer | Niels Möller <nisse@lysator.liu.se> | 2009-09-15 19:22:03 +0200 |
commit | 60dfd8d41d8909f8535b29b686edc73b2e2efee6 (patch) | |
tree | e9b2c37372e9dd265d88505fdac9d00944ae710b /x86/sha1-compress.asm | |
parent | d8e65e43cf1e2177b81bf5c1f5294ac80da71298 (diff) | |
download | nettle-60dfd8d41d8909f8535b29b686edc73b2e2efee6.tar.gz |
Cleanup, removing old cruft. Slight improvement to ROUND_F1_NOEXP.
Rev: nettle/x86/sha1-compress.asm:1.8
Diffstat (limited to 'x86/sha1-compress.asm')
-rw-r--r-- | x86/sha1-compress.asm | 196 |
1 files changed, 27 insertions, 169 deletions
diff --git a/x86/sha1-compress.asm b/x86/sha1-compress.asm index 51bee9e0..1e4a9110 100644 --- a/x86/sha1-compress.asm +++ b/x86/sha1-compress.asm @@ -26,7 +26,6 @@ define(<SE>,<%ebp>) define(<DATA>,<%esp>) define(<T1>,<%edi>) define(<T2>,<%esi>) C Used by SWAP -define(<KVALUE>,<%esi>) C Used by rounds C Constants define(<K1VALUE>, <0x5A827999>) C Rounds 0-19 @@ -42,23 +41,6 @@ define(<SWAP>, < movl $2, OFFSET($1) (DATA) >)dnl -C expand(i) is the expansion function -C -C W[i] = (W[i - 16] ^ W[i - 14] ^ W[i - 8] ^ W[i - 3]) <<< 1 -C -C where W[i] is stored in DATA[i mod 16]. -C -C Result is stored back in W[i], and also left in T1, the only -C register that is used. -define(<EXPAND>, < - movl OFFSET(eval($1 % 16)) (DATA), T1 - xorl OFFSET(eval(($1 + 2) % 16)) (DATA), T1 - xorl OFFSET(eval(($1 + 8) % 16)) (DATA), T1 - xorl OFFSET(eval(($1 + 13) % 16)) (DATA), T1 - roll <$>1, T1 - movl T1, OFFSET(eval($1 % 16)) (DATA)>)dnl -define(<NOEXPAND>, <OFFSET($1) (DATA)>)dnl - C The f functions, C C f1(x,y,z) = z ^ (x & (y ^ z)) @@ -103,18 +85,18 @@ define(<ROUND_F1>, < add T2, $5 >) -C FIXME: Seems to be a slow sequence. +dnl ROUND_F1_NOEXP(a, b, c, d, e, i) define(<ROUND_F1_NOEXP>, < mov $4, T2 xor $3, T2 + mov $1, T1 and $2, T2 + add OFFSET($6) (DATA), $5 xor $4, T2 - add OFFSET($6) (DATA), T2 + add T2, $5 rol <$>30, $2 - mov $1, T1 rol <$>5, T1 lea K1VALUE (T1, $5), $5 - add T2, $5 >) dnl ROUND_F2(a, b, c, d, e, i, k) @@ -158,11 +140,6 @@ define(<ROUND_F3>, < add T2, $5 >) - -C As suggested by George Spelvin, write the F3 function as -C (x&y) | (y&z) | (x&z) == (x & (y^z)) + (y&z). Then, we can compute -C and add each term to e, using a single temporary. - .file "sha1-compress.asm" C _nettle_sha1_compress(uint32_t *state, uint8_t *data) @@ -179,7 +156,6 @@ PROLOGUE(_nettle_sha1_compress) pushl %esi C 68(%esp) pushl %edi C 64(%esp) - C FIXME: Trim to 64 subl $64, %esp C %esp = W C Load and byteswap data @@ -270,29 +246,29 @@ PROLOGUE(_nettle_sha1_compress) ROUND_F3(SC, SD, SE, SA, SB, 58) ROUND_F3(SB, SC, SD, SE, SA, 59) - ROUND_F2(SA, SB, SC, SD, SE, 60, K4VALUE) - ROUND_F2(SE, SA, SB, SC, SD, 61, K4VALUE) - ROUND_F2(SD, SE, SA, SB, SC, 62, K4VALUE) - ROUND_F2(SC, SD, SE, SA, SB, 63, K4VALUE) - ROUND_F2(SB, SC, SD, SE, SA, 64, K4VALUE) - - ROUND_F2(SA, SB, SC, SD, SE, 65, K4VALUE) - ROUND_F2(SE, SA, SB, SC, SD, 66, K4VALUE) - ROUND_F2(SD, SE, SA, SB, SC, 67, K4VALUE) - ROUND_F2(SC, SD, SE, SA, SB, 68, K4VALUE) - ROUND_F2(SB, SC, SD, SE, SA, 69, K4VALUE) - - ROUND_F2(SA, SB, SC, SD, SE, 70, K4VALUE) - ROUND_F2(SE, SA, SB, SC, SD, 71, K4VALUE) - ROUND_F2(SD, SE, SA, SB, SC, 72, K4VALUE) - ROUND_F2(SC, SD, SE, SA, SB, 73, K4VALUE) - ROUND_F2(SB, SC, SD, SE, SA, 74, K4VALUE) - - ROUND_F2(SA, SB, SC, SD, SE, 75, K4VALUE) - ROUND_F2(SE, SA, SB, SC, SD, 76, K4VALUE) - ROUND_F2(SD, SE, SA, SB, SC, 77, K4VALUE) - ROUND_F2(SC, SD, SE, SA, SB, 78, K4VALUE) - ROUND_F2(SB, SC, SD, SE, SA, 79, K4VALUE) + ROUND_F2(SA, SB, SC, SD, SE, 60, K4VALUE) + ROUND_F2(SE, SA, SB, SC, SD, 61, K4VALUE) + ROUND_F2(SD, SE, SA, SB, SC, 62, K4VALUE) + ROUND_F2(SC, SD, SE, SA, SB, 63, K4VALUE) + ROUND_F2(SB, SC, SD, SE, SA, 64, K4VALUE) + + ROUND_F2(SA, SB, SC, SD, SE, 65, K4VALUE) + ROUND_F2(SE, SA, SB, SC, SD, 66, K4VALUE) + ROUND_F2(SD, SE, SA, SB, SC, 67, K4VALUE) + ROUND_F2(SC, SD, SE, SA, SB, 68, K4VALUE) + ROUND_F2(SB, SC, SD, SE, SA, 69, K4VALUE) + + ROUND_F2(SA, SB, SC, SD, SE, 70, K4VALUE) + ROUND_F2(SE, SA, SB, SC, SD, 71, K4VALUE) + ROUND_F2(SD, SE, SA, SB, SC, 72, K4VALUE) + ROUND_F2(SC, SD, SE, SA, SB, 73, K4VALUE) + ROUND_F2(SB, SC, SD, SE, SA, 74, K4VALUE) + + ROUND_F2(SA, SB, SC, SD, SE, 75, K4VALUE) + ROUND_F2(SE, SA, SB, SC, SD, 76, K4VALUE) + ROUND_F2(SD, SE, SA, SB, SC, 77, K4VALUE) + ROUND_F2(SC, SD, SE, SA, SB, 78, K4VALUE) + ROUND_F2(SB, SC, SD, SE, SA, 79, K4VALUE) C Update the state vector movl 84(%esp),T1 @@ -309,121 +285,3 @@ PROLOGUE(_nettle_sha1_compress) popl %ebx ret EPILOGUE(_nettle_sha1_compress) - -C George Spelvin also suggested using lea, with an immediate offset -C for the magic constants. This frees one register, which can be used -C for loosen up dependencies and to more operations in parallel. For -C example, take the rounds involving f2, the simplest round function. -C Currently, we have -C -C movl 16(%esp), T1 -C xorl 24(%esp), T1 -C xorl 48(%esp), T1 -C xorl 4(%esp), T1 -C roll $1, T1 -C movl T1, 16(%esp) -C addl KVALUE, SE C 0 -C addl T1, SE C 1 -C movl SB, T1 C 0 -C xorl SC, T1 C 1 -C xorl SD, T1 C 2 -C addl T1, SE C 3 -C movl SA, T1 C 0 -C roll $5, T1 C 1 -C addl T1, SE C 4 -C roll $30, SB C 0 - -C These 16 instructions could be executed in 5.33 cycles if there were -C no dependencies. The crucial dependencies are from (previous) SE to -C use SA, and (previous) result SB to use SC. (What does this say -C about recurrency chain? Ought to unroll 5 times to see it). - -C It would be preferable to accumulate the terms in two or more -C registers, to make dependencies shallower. Something like - -C ...expand, put data in W -C movl SD, T1 C 0 -C leal K1VALUE(SE, W), SE C 0 -C movl SA, T2 C 0 -C xorl SC, T1 C 1 -C roll $5, T2 C 1 -C xorl SB, T1 C 2 -C addl T2, T1 C 3 -C addl T1, SE C 4 -C a + b + c + d + e = ((((a + b) + c) + d) + e), latency 4 -C a + b + c + d + e = ((a + b) + c) + (d + e) -C the out-of-order execution. Next iteration -C -C ...expand... -C roll $1, T1 C 4 -C movl T1, 16(%esp) C 5 -C addl KVALUE, SD C 0 -C addl T1, SD C 5 -C movl SA, T1 C 0 -C xorl SB, T1 C 1 -C xorl SC, T1 C 2 -C addl T1, SD C 6 -C movl SE, T1 C 8 -C roll $5, T1 C 9 -C addl T1, SD C 7 -C roll $30, SA C 0 -C -C Lets look at the latency. Next iteration will operate on (E, A, B, C, D), so we have recurrencies: - -C from result SA to use of SE (none, SA not modified) -C from result of SB to use of SA, result of SC to use of SB - -C It's possible to shave of half of the stores to tmp in the evaluation of f3, -C although it's probably not worth the effort. This is the trick: -C -C round(a,b,c,d,e,f,k) modifies only b,e. -C -C round(a,b,c,d,e,f3,k) -C round(e,a,b,c,d,f3,k) -C -C ; f3(b,c,d) = (b & c) | (d & (b | c)) -C -C movl b, tmp -C andl c, tmp -C movl tmp, tmp2 -C movl b, tmp -C orl c, tmp -C andl d, tmp -C orl tmp2, tmp -C -C and corresponding code for f3(a,b,c) -C -C Use the register allocated for c as a temporary? -C -C movl c, tmp2 -C ; f3(b,c,d) = (b & c) | (d & (b | c)) -C movl b, tmp -C orl c, tmp -C andl b, c -C andl d, tmp -C orl c, tmp -C -C ; f3(a,b,c) = (a & b) | (c & (a | b)) -C movl b, tmp -C andl a, tmp -C movl a, c -C orl b, c -C andl tmp2, c -C orl c, tmp -C -C movl tmp2, c -C -C Before: 14 instr, 2 store, 2 load -C After: 13 instr, 1 store, 2 load -C -C Final load can be folded into the next round, -C -C round(d,e,a,b,c,f3,k) -C -C c += d <<< 5 + f(e, a, b) + k + w -C -C if we arrange to have w placed directly into the register -C corresponding to w. That way we save one more instruction, total save -C of two instructions, one of which is a store, per two rounds. For the -C twenty rounds involving f3, that's 20 instructions, 10 of which are -C stores, or about 1.5 %. |