crypto/rc4/asm/rc4-ia64.S


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157

// ====================================================================
// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
// project.
//
// Rights for redistribution and usage in source and binary forms are
// granted according to the OpenSSL license. Warranty of any kind is
// disclaimed.
// ====================================================================

.ident  "rc4-ia64.S, Version 1.1"
.ident  "IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"

// What's wrong with compiler generated code? Because of the nature of
// the C language, a compiler doesn't [dare to] reorder loads and stores.
// But being memory-bound, RC4 should benefit from reordering [on an
// in-order-execution core such as IA-64]. But what can we reorder? At the very
// least we can safely reorder references to key schedule in respect
// to input and output streams. Secondly, from the first [close] glance
// it appeared that it's possible to pull up some references to
// elements of the key schedule itself. Original rationale ["prior
// loads are not safe only for "degenerated" key schedule, when some
// elements equal to the same value"] was kind of sloppy. I should have
// formulated as it really was: if we assume that pulling up reference
// to key[x+1] is not safe, then it would mean that key schedule would
// "degenerate," which is never the case. The problem is that this
// holds true in respect to references to key[x], but not to key[y].
// Legitimate "collisions" do occur within every 256^2 bytes window.
// Fortunately there're enough free instruction slots to keep prior
// reference to key[x+1], detect "collision" and compensate for it.
// All this without sacrificing a single clock cycle:-)
// Furthermore, in order to compress the loop body to the minimum, I
// chose to deploy the deposit instruction, which substitutes for the whole
// key->data+((x&255)<<log2(sizeof(key->data[0]))). This unfortunately
// requires key->data to be aligned at sizeof(key->data) boundary.
// This is why you'll find "RC4_INT pad[512-256-2];" addendum to RC4_KEY
// and "d=(RC4_INT *)(((size_t)(d+255))&~(sizeof(key->data)-1));" in
// rc4_skey.c [and rc4_enc.c, where it's retained for debugging
// purposes]. Throughput is ~210MBps on 900MHz CPU, which is >3x
// faster than gcc generated code and +30% - if compared to HP-UX C.
// Unrolling loop below should give >30% on top of that...

.text
// NOTE(review): .explicit is understood to select the assembler's
// explicit mode, in which instruction-group stops (;;) are supplied by
// the programmer rather than inserted automatically - confirm against
// the assembler manual in use.
.explicit

// ADDP is used below to copy the pointer arguments (key, inp, out) into
// working registers. On 32-bit HP-UX (_HPUX_SOURCE without _LP64) it
// expands to addp4, otherwise to a plain add. The "[expanded]" remarks
// on the register aliases further down refer to this step.
#if defined(_HPUX_SOURCE) && !defined(_LP64)
# define ADDP	addp4
#else
# define ADDP	add
#endif

// SZ selects the width of one key schedule element and, through it,
// the load/store mnemonics (LDKEY/STKEY) and OFF, which equals
// log2(SZ). OFF is the bit position at which the 8-bit index is
// deposited into ksch to form &key->data[index & 255].
#define SZ	4	// this is set to sizeof(RC4_INT)
// SZ==4 seems to be optimal. At least SZ==8 is not any faster, not for
// assembler implementation, while SZ==1 code is ~30% slower.
#if SZ==1	// RC4_INT is unsigned char
# define	LDKEY	ld1
# define	STKEY	st1
# define	OFF	0
#elif SZ==4	// RC4_INT is unsigned int
# define	LDKEY	ld4
# define	STKEY	st4
# define	OFF	2
#elif SZ==8	// RC4_INT is unsigned long
# define	LDKEY	ld8
# define	STKEY	st8
# define	OFF	3
#endif

// Symbolic names for the static registers used by RC4 below.
out=r8;		// [expanded] output pointer
inp=r9;		// [expanded] input pointer
prsave=r10;	// copy of the predicate registers taken at entry
key=r28;	// [expanded] pointer to RC4_KEY
ksch=r29;	// (key->data+255)[&~(sizeof(key->data)-1)]
xx=r30;		// working copy of key->x (kept one ahead inside the loop)
yy=r31;		// working copy of key->y

// void RC4(RC4_KEY *key,size_t len,const void *inp,void *out);
//
// Arguments arrive in the stacked input registers:
//	in0 = key	in1 = len	in2 = inp	in3 = out
// If len is zero the routine returns right after the check below,
// before key, inp or out are dereferenced. Otherwise len bytes are read
// from inp, XORed with the generated key stream and written to out,
// and the updated x/y indices are stored back through key at .Lexit.
//
// Registers written: r2 (ar.pfs), r3 (ar.lc copy), r8-r10, r28-r31, the
// rotating registers named by .rotr below, predicates, ar.lc and
// ar.ec. pr and ar.lc are restored from prsave/r3 before returning.
.global	RC4#
.proc	RC4#
.align	32
// NOTE(review): RC4 starts 16 bytes past a 32-byte boundary; together
// with the seven 16-byte bundles that precede .Ltop this appears to be
// what places the loop head on a 32-byte boundary - confirm before
// adding or removing bundles in the prologue.
.skip	16
RC4:
	.prologue
	.fframe 0
	.save   ar.pfs,r2
	.save	ar.lc,r3
	.save	pr,prsave
// Frame: 4 inputs, 12 locals, 0 outputs, 16 rotating registers.
{ .mii;	alloc	r2=ar.pfs,4,12,0,16
	mov	prsave=pr
	ADDP	key=0,in0		};;
{ .mib;	cmp.eq	p6,p0=0,in1			// len==0?
	mov	r3=ar.lc
(p6)	br.ret.spnt.many	b0	};;	// emergency exit

	.body
	// Names for the rotating registers; 4+4+2+2+2+1 = 15 of the 16
	// rotating registers allocated above. name[n] in one loop trip
	// is name[n+1] in the next one.
	.rotr	dat[4],key_x[4],tx[2],rnd[2],key_y[2],ty[1];

// The post-increment leaves key pointing at key->y for the load below
// and for the store at .Lexit.
{ .mib;	LDKEY	xx=[key],SZ			// load key->x
	add	in1=-1,in1			// adjust len for loop counter
	nop.b	0			}
{ .mib;	ADDP	inp=0,in2
	ADDP	out=0,in3
	brp.loop.imp	.Ltop,.Lexit-16	};;	// prediction hint for the
						// loop-closing branch
{ .mmi;	LDKEY	yy=[key]			// load key->y
	add	ksch=(255+1)*SZ,key		// as ksch will be used with
						// deposit instruction only,
						// I don't have to &~255...
	mov	ar.lc=in1		}	// loop count = len-1
{ .mmi;	mov	key_y[1]=r0			// guarantee inequality
						// in first iteration
	add	xx=1,xx				// first x++ is done up front
	mov	pr.rot=1<<16		};;	// p16=1, other rotating
						// predicates cleared
{ .mii;	nop.m	0
	dep	key_x[1]=xx,ksch,OFF,8		// &key[xx&255] for the first trip
	mov	ar.ec=3			};;	// note that epilogue counter
						// is off by 1. I compensate
						// for this at exit...
.Ltop:
// The loop is scheduled for 3*(n+2) spin-rate on Itanium 2, which
// theoretically gives asymptotic performance of clock frequency
// divided by 3 bytes per second, or 500MBps on 1.5GHz CPU. Measured
// performance however is distinctly lower than 1/4:-( The culprit
// seems to be *(out++)=dat, which inadvertently splits the bundle,
// even though there is M-port available... Unrolling is due...
// Unrolled loop should collect output with variable shift instruction
// in order to avoid starvation for integer shifter... It should be
// possible to get pretty close to theoretical peak...
//
// p16..p19 are the stage predicates of the software pipeline; every
// byte passes through the stages on consecutive loop trips:
//	p16: load tx=key[xx] and the input byte, advance xx, update yy
//	p17: load ty=key[yy], swap key[xx]/key[yy], form tx+ty
//	p18: load rnd=key[(tx+ty)&255] and XOR it into dat
//	p19: store the output byte
//
// Collision handling: the p16 load of key[xx] is issued before the
// p17 stage of the previous byte has stored into key[yy]. The compare
// sets p21 when both addresses coincide; in that case the freshly
// loaded tx[0] is stale and tx[1], the value that store writes, is
// used instead. p20 is the ordinary no-collision case.
{ .mmi;	(p16)	LDKEY	tx[0]=[key_x[1]]		// tx=key[xx]
	(p17)	LDKEY	ty[0]=[key_y[1]]		// ty=key[yy]
	(p18)	dep	rnd[1]=rnd[1],ksch,OFF,8}	// &key[(tx+ty)&255]
{ .mmi;	(p19)	st1	[out]=dat[3],1			// *(out++)=dat
	(p16)	add	xx=1,xx				// x++
	(p16)	cmp.ne.unc p20,p21=key_x[1],key_y[1]	};;	// collision?
{ .mmi;	(p18)	LDKEY	rnd[1]=[rnd[1]]			// rnd=key[(tx+ty)&255]
	(p16)	ld1	dat[0]=[inp],1			// dat=*(inp++)
	(p16)	dep	key_x[0]=xx,ksch,OFF,8	}	// &key[xx&255]
.pred.rel	"mutex",p20,p21
{ .mmi;	(p21)	add	yy=yy,tx[1]			// (p16) y+=tx, collision case
	(p20)	add	yy=yy,tx[0]			// (p16) y+=tx
	(p21)	mov	tx[0]=tx[1]		};;	// (p16) replace stale tx
{ .mmi;	(p17)	STKEY	[key_y[1]]=tx[1]		// key[yy]=tx
	(p17)	STKEY	[key_x[2]]=ty[0]		// key[xx]=ty
	(p16)	dep	key_y[0]=yy,ksch,OFF,8	}	// &key[yy&255]
{ .mmb;	(p17)	add	rnd[0]=tx[1],ty[0]		// rnd=tx+ty
	(p18)	xor	dat[2]=dat[2],rnd[1]		// dat^=rnd
	br.ctop.sptk	.Ltop			};;
.Lexit:
// key still points at key->y here; the post-decrement moves it back to
// key->x for the final store.
{ .mib;	STKEY	[key]=yy,-SZ			// save key->y
	mov	pr=prsave,0x1ffff		// restore predicates saved at entry
	nop.b	0			}
{ .mib;	st1	[out]=dat[3],1			// compensate for truncated
						// epilogue counter
	add	xx=-1,xx			// undo the x++ done ahead of use
	nop.b	0			};;
{ .mib;	STKEY	[key]=xx			// save key->x
	mov	ar.lc=r3			// restore caller's loop counter
	br.ret.sptk.many	b0	};;
.endp	RC4#