Cleanup, removing old cruft. Slight improvement to ROUND_F1_NOEXP.

Rev: nettle/x86/sha1-compress.asm:1.8
author: Niels Möller <nisse@lysator.liu.se> 2009-09-15 19:22:03 +0200
committer: Niels Möller <nisse@lysator.liu.se> 2009-09-15 19:22:03 +0200
commit: 60dfd8d41d8909f8535b29b686edc73b2e2efee6 (patch)
tree: e9b2c37372e9dd265d88505fdac9d00944ae710b /x86/sha1-compress.asm
parent: d8e65e43cf1e2177b81bf5c1f5294ac80da71298 (diff)
download: nettle-60dfd8d41d8909f8535b29b686edc73b2e2efee6.tar.gz
1 files changed, 27 insertions, 169 deletions
diff --git a/x86/sha1-compress.asm b/x86/sha1-compress.asm
index 51bee9e0..1e4a9110 100644
--- a/x86/sha1-compress.asm
+++ b/x86/sha1-compress.asm
@@ -26,7 +26,6 @@ define(<SE>,<%ebp>)
 define(<DATA>,<%esp>)
 define(<T1>,<%edi>)
 define(<T2>,<%esi>)				C  Used by SWAP
-define(<KVALUE>,<%esi>)				C  Used by rounds
 	
 C Constants
 define(<K1VALUE>, <0x5A827999>)		C  Rounds  0-19
@@ -42,23 +41,6 @@ define(<SWAP>, <
 	movl	$2, OFFSET($1) (DATA)
 >)dnl
 
-C expand(i) is the expansion function
-C
-C   W[i] = (W[i - 16] ^ W[i - 14] ^ W[i - 8] ^ W[i - 3]) <<< 1
-C
-C where W[i] is stored in DATA[i mod 16].
-C
-C Result is stored back in W[i], and also left in T1, the only
-C register that is used.
-define(<EXPAND>, <
-	movl	OFFSET(eval($1 % 16)) (DATA), T1
-	xorl	OFFSET(eval(($1 +  2) % 16)) (DATA), T1
-	xorl	OFFSET(eval(($1 +  8) % 16)) (DATA), T1
-	xorl	OFFSET(eval(($1 + 13) % 16)) (DATA), T1
-	roll	<$>1, T1
-	movl	T1, OFFSET(eval($1 % 16)) (DATA)>)dnl
-define(<NOEXPAND>, <OFFSET($1) (DATA)>)dnl
-
 C The f functions,
 C
 C  f1(x,y,z) = z ^ (x & (y ^ z))
@@ -103,18 +85,18 @@ define(<ROUND_F1>, <
 	add	T2, $5
 >)
 
-C FIXME: Seems to be a slow sequence.
+dnl ROUND_F1_NOEXP(a, b, c, d, e, i)
 define(<ROUND_F1_NOEXP>, <
 	mov	$4, T2
 	xor	$3, T2
+	mov	$1, T1
 	and	$2, T2
+	add	OFFSET($6) (DATA), $5
 	xor	$4, T2
-	add	OFFSET($6) (DATA), T2
+	add	T2, $5
 	rol	<$>30, $2
-	mov	$1, T1
 	rol	<$>5, T1
 	lea	K1VALUE (T1, $5), $5
-	add	T2, $5
 >)
 
 dnl ROUND_F2(a, b, c, d, e, i, k)
@@ -158,11 +140,6 @@ define(<ROUND_F3>, <
 	add	T2, $5
 >)
 
-
-C As suggested by George Spelvin, write the F3 function as
-C (x&y) | (y&z) | (x&z) == (x & (y^z)) + (y&z). Then, we can compute
-C and add each term to e, using a single temporary.
-
 	.file "sha1-compress.asm"
 
 	C _nettle_sha1_compress(uint32_t *state, uint8_t *data)
@@ -179,7 +156,6 @@ PROLOGUE(_nettle_sha1_compress)
 	pushl	%esi		C  68(%esp)
 	pushl	%edi		C  64(%esp)
 
-	C FIXME: Trim to 64
 	subl	$64, %esp	C  %esp = W
 
 	C Load and byteswap data
@@ -270,29 +246,29 @@ PROLOGUE(_nettle_sha1_compress)
 	ROUND_F3(SC, SD, SE, SA, SB, 58)
 	ROUND_F3(SB, SC, SD, SE, SA, 59)
 
- 	ROUND_F2(SA, SB, SC, SD, SE, 60, K4VALUE)
- 	ROUND_F2(SE, SA, SB, SC, SD, 61, K4VALUE)
- 	ROUND_F2(SD, SE, SA, SB, SC, 62, K4VALUE)
- 	ROUND_F2(SC, SD, SE, SA, SB, 63, K4VALUE)
- 	ROUND_F2(SB, SC, SD, SE, SA, 64, K4VALUE)
- 
- 	ROUND_F2(SA, SB, SC, SD, SE, 65, K4VALUE)
- 	ROUND_F2(SE, SA, SB, SC, SD, 66, K4VALUE)
- 	ROUND_F2(SD, SE, SA, SB, SC, 67, K4VALUE)
- 	ROUND_F2(SC, SD, SE, SA, SB, 68, K4VALUE)
- 	ROUND_F2(SB, SC, SD, SE, SA, 69, K4VALUE)
- 
- 	ROUND_F2(SA, SB, SC, SD, SE, 70, K4VALUE)
- 	ROUND_F2(SE, SA, SB, SC, SD, 71, K4VALUE)
- 	ROUND_F2(SD, SE, SA, SB, SC, 72, K4VALUE)
- 	ROUND_F2(SC, SD, SE, SA, SB, 73, K4VALUE)
- 	ROUND_F2(SB, SC, SD, SE, SA, 74, K4VALUE)
- 
- 	ROUND_F2(SA, SB, SC, SD, SE, 75, K4VALUE)
- 	ROUND_F2(SE, SA, SB, SC, SD, 76, K4VALUE)
- 	ROUND_F2(SD, SE, SA, SB, SC, 77, K4VALUE)
- 	ROUND_F2(SC, SD, SE, SA, SB, 78, K4VALUE)
- 	ROUND_F2(SB, SC, SD, SE, SA, 79, K4VALUE)
+	ROUND_F2(SA, SB, SC, SD, SE, 60, K4VALUE)
+	ROUND_F2(SE, SA, SB, SC, SD, 61, K4VALUE)
+	ROUND_F2(SD, SE, SA, SB, SC, 62, K4VALUE)
+	ROUND_F2(SC, SD, SE, SA, SB, 63, K4VALUE)
+	ROUND_F2(SB, SC, SD, SE, SA, 64, K4VALUE)
+
+	ROUND_F2(SA, SB, SC, SD, SE, 65, K4VALUE)
+	ROUND_F2(SE, SA, SB, SC, SD, 66, K4VALUE)
+	ROUND_F2(SD, SE, SA, SB, SC, 67, K4VALUE)
+	ROUND_F2(SC, SD, SE, SA, SB, 68, K4VALUE)
+	ROUND_F2(SB, SC, SD, SE, SA, 69, K4VALUE)
+
+	ROUND_F2(SA, SB, SC, SD, SE, 70, K4VALUE)
+	ROUND_F2(SE, SA, SB, SC, SD, 71, K4VALUE)
+	ROUND_F2(SD, SE, SA, SB, SC, 72, K4VALUE)
+	ROUND_F2(SC, SD, SE, SA, SB, 73, K4VALUE)
+	ROUND_F2(SB, SC, SD, SE, SA, 74, K4VALUE)
+
+	ROUND_F2(SA, SB, SC, SD, SE, 75, K4VALUE)
+	ROUND_F2(SE, SA, SB, SC, SD, 76, K4VALUE)
+	ROUND_F2(SD, SE, SA, SB, SC, 77, K4VALUE)
+	ROUND_F2(SC, SD, SE, SA, SB, 78, K4VALUE)
+	ROUND_F2(SB, SC, SD, SE, SA, 79, K4VALUE)
 
 	C Update the state vector
 	movl	84(%esp),T1
@@ -309,121 +285,3 @@ PROLOGUE(_nettle_sha1_compress)
 	popl	%ebx
 	ret
 EPILOGUE(_nettle_sha1_compress)
-
-C George Spelvin also suggested using lea, with an immediate offset
-C for the magic constants. This frees one register, which can be used
-C for loosen up dependencies and to more operations in parallel. For
-C example, take the rounds involving f2, the simplest round function.
-C Currently, we have
-C 
-C 	movl	16(%esp), T1
-C 	xorl	24(%esp), T1
-C 	xorl	48(%esp), T1
-C 	xorl	4(%esp), T1
-C 	roll	$1, T1
-C 	movl	T1, 16(%esp)
-C 	addl	KVALUE, SE	C 0
-C 	addl	T1, SE		C 1
-C 	movl	SB, T1		C 0
-C 	xorl	SC, T1		C 1
-C 	xorl	SD, T1		C 2
-C 	addl	T1, SE		C 3
-C 	movl	SA, T1		C 0
-C 	roll	$5, T1		C 1
-C 	addl	T1, SE		C 4
-C 	roll	$30, SB		C 0
-	
-C These 16 instructions could be executed in 5.33 cycles if there were
-C no dependencies. The crucial dependencies are from (previous) SE to
-C use SA, and (previous) result SB to use SC. (What does this say
-C about recurrency chain? Ought to unroll 5 times to see it).
-
-C It would be preferable to accumulate the terms in two or more
-C registers, to make dependencies shallower. Something like
-
-C	...expand, put data in W
-C	movl	SD, T1			C 0
-C	leal	K1VALUE(SE, W), SE	C 0
-C	movl	SA, T2		C 0
-C	xorl	SC, T1			C 1
-C	roll	$5, T2		C 1
-C 	xorl	SB, T1			C 2
-C	addl	T2, T1		C 3
-C	addl	T1, SE			C 4
-C a + b + c + d + e = ((((a + b) + c) + d) + e), latency 4
-C a + b + c + d + e = ((a + b) + c) + (d + e)
-C the out-of-order execution. Next iteration
-C
-C 	...expand...
-C 	roll	$1, T1		C 4
-C 	movl	T1, 16(%esp)	C 5
-C 	addl	KVALUE, SD	C 0
-C 	addl	T1, SD		C 5
-C 	movl	SA, T1		C 0
-C 	xorl	SB, T1		C 1
-C 	xorl	SC, T1		C 2
-C 	addl	T1, SD		C 6
-C 	movl	SE, T1		C 8
-C 	roll	$5, T1		C 9
-C 	addl	T1, SD		C 7
-C 	roll	$30, SA		C 0
-C
-C Lets look at the latency. Next iteration will operate on (E, A, B, C, D), so we have recurrencies:
-
-C from result SA to use of SE (none, SA not modified)
-C from result of SB to use of SA, result of SC to use of SB
-
-C It's possible to shave of half of the stores to tmp in the evaluation of f3,
-C  although it's probably not worth the effort. This is the trick: 
-C  
-C  round(a,b,c,d,e,f,k) modifies only b,e.
-C  
-C  round(a,b,c,d,e,f3,k)
-C  round(e,a,b,c,d,f3,k)
-C  
-C  ; f3(b,c,d) = (b & c) | (d & (b | c))
-C  
-C    movl b, tmp
-C    andl c, tmp
-C    movl tmp, tmp2
-C    movl b, tmp
-C    orl  c, tmp
-C    andl d, tmp
-C    orl tmp2, tmp
-C  
-C  and corresponding code for f3(a,b,c)
-C  
-C  Use the register allocated for c as a temporary?
-C  
-C    movl c, tmp2
-C  ; f3(b,c,d) = (b & c) | (d & (b | c))
-C    movl b, tmp
-C    orl  c, tmp
-C    andl b, c
-C    andl d, tmp
-C    orl  c, tmp
-C  
-C  ; f3(a,b,c) = (a & b) | (c & (a | b))
-C    movl b, tmp
-C    andl a, tmp
-C    movl a, c
-C    orl  b, c
-C    andl tmp2, c
-C    orl  c, tmp
-C  
-C    movl tmp2, c
-C  
-C  Before: 14 instr, 2 store, 2 load
-C  After: 13 instr, 1 store, 2 load
-C  
-C  Final load can be folded into the next round,
-C  
-C  round(d,e,a,b,c,f3,k)
-C  
-C    c += d <<< 5 + f(e, a, b) + k + w
-C  
-C  if we arrange to have w placed directly into the register
-C  corresponding to w. That way we save one more instruction, total save
-C  of two instructions, one of which is a store, per two rounds. For the
-C  twenty rounds involving f3, that's 20 instructions, 10 of which are
-C  stores, or about 1.5 %.
author	Niels Möller <nisse@lysator.liu.se>	2009-09-15 19:22:03 +0200
committer	Niels Möller <nisse@lysator.liu.se>	2009-09-15 19:22:03 +0200
commit	60dfd8d41d8909f8535b29b686edc73b2e2efee6 (patch)
tree	e9b2c37372e9dd265d88505fdac9d00944ae710b /x86/sha1-compress.asm
parent	d8e65e43cf1e2177b81bf5c1f5294ac80da71298 (diff)
download	nettle-60dfd8d41d8909f8535b29b686edc73b2e2efee6.tar.gz