diff options
Diffstat (limited to 'crypto/rc4')
-rw-r--r-- | crypto/rc4/Makefile.ssl | 55 | ||||
-rw-r--r-- | crypto/rc4/asm/rc4-586.pl | 114 | ||||
-rwxr-xr-x | crypto/rc4/asm/rc4-amd64.pl | 176 | ||||
-rw-r--r-- | crypto/rc4/rc4.c | 3 | ||||
-rw-r--r-- | crypto/rc4/rc4.h | 4 | ||||
-rw-r--r-- | crypto/rc4/rc4_enc.c | 4 | ||||
-rw-r--r-- | crypto/rc4/rc4_locl.h | 1 | ||||
-rw-r--r-- | crypto/rc4/rc4_skey.c | 51 | ||||
-rw-r--r-- | crypto/rc4/rc4test.c | 36 |
9 files changed, 310 insertions, 134 deletions
diff --git a/crypto/rc4/Makefile.ssl b/crypto/rc4/Makefile.ssl index a965c02d94..7e98de885a 100644 --- a/crypto/rc4/Makefile.ssl +++ b/crypto/rc4/Makefile.ssl @@ -18,14 +18,10 @@ MAKEFILE= Makefile.ssl AR= ar r RC4_ENC=rc4_enc.o -# or use -#RC4_ENC=asm/rx86-elf.o -#RC4_ENC=asm/rx86-out.o -#RC4_ENC=asm/rx86-sol.o -#RC4_ENC=asm/rx86bdsi.o CFLAGS= $(INCLUDES) $(CFLAG) ASFLAGS= $(INCLUDES) $(ASFLAG) +AFLAGS= $(ASFLAGS) GENERAL=Makefile TEST=rc4test.c @@ -52,22 +48,20 @@ lib: $(LIBOBJ) $(RANLIB) $(LIB) || echo Never mind. @touch lib -# elf -asm/rx86-elf.s: asm/rc4-586.pl ../perlasm/x86asm.pl - (cd asm; $(PERL) rc4-586.pl elf $(CFLAGS) > rx86-elf.s) - +# ELF +rx86-elf.s: asm/rc4-586.pl ../perlasm/x86asm.pl + (cd asm; $(PERL) rc4-586.pl elf $(CFLAGS) > ../$@) +# COFF +rx86-cof.s: asm/rc4-586.pl ../perlasm/x86asm.pl + (cd asm; $(PERL) rc4-586.pl coff $(CFLAGS) > ../$@) # a.out -asm/rx86-out.o: asm/rx86unix.cpp - $(CPP) -DOUT asm/rx86unix.cpp | as -o asm/rx86-out.o - -# bsdi -asm/rx86bsdi.o: asm/rx86unix.cpp - $(CPP) -DBSDI asm/rx86unix.cpp | sed 's/ :/:/' | as -o asm/rx86bsdi.o +rx86-out.s: asm/rc4-586.pl ../perlasm/x86asm.pl + (cd asm; $(PERL) rc4-586.pl a.out $(CFLAGS) > ../$@) -asm/rx86unix.cpp: asm/rc4-586.pl ../perlasm/x86asm.pl - (cd asm; $(PERL) rc4-586.pl cpp >rx86unix.cpp) +rc4-amd64.s: asm/rc4-amd64.pl; $(PERL) $< $@ -asm/rc4-amd64.s: asm/rc4-amd64.pl; $(PERL) $< $@ +rc4-ia64.s: asm/rc4-ia64.S + $(CC) $(CFLAGS) -E asm/rc4-ia64.S > $@ files: $(PERL) $(TOP)/util/files.pl Makefile.ssl >> $(TOP)/MINFO @@ -79,7 +73,7 @@ links: @$(PERL) $(TOP)/util/mklink.pl ../../apps $(APPS) install: - @for i in $(EXHEADER) ; \ + @headerlist="$(EXHEADER)"; for i in $$headerlist ; \ do \ (cp $$i $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i; \ chmod 644 $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i ); \ @@ -101,12 +95,23 @@ dclean: mv -f Makefile.new $(MAKEFILE) clean: - rm -f asm/rx86unix.cpp asm/*-elf.* *.o *.obj lib tags core .pure .nfs* *.old *.bak fluff asm/*.o + rm -f *.s *.o *.obj lib tags core .pure .nfs* *.old *.bak fluff # DO NOT DELETE THIS LINE -- make depend depends on it. -rc4_enc.o: ../../include/openssl/opensslconf.h ../../include/openssl/rc4.h -rc4_enc.o: rc4_enc.c rc4_locl.h -rc4_skey.o: ../../include/openssl/opensslconf.h -rc4_skey.o: ../../include/openssl/opensslv.h ../../include/openssl/rc4.h -rc4_skey.o: rc4_locl.h rc4_skey.c +rc4_enc.o: ../../e_os.h ../../include/openssl/bio.h +rc4_enc.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h +rc4_enc.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h +rc4_enc.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h +rc4_enc.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h +rc4_enc.o: ../../include/openssl/rc4.h ../../include/openssl/safestack.h +rc4_enc.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h +rc4_enc.o: ../cryptlib.h rc4_enc.c rc4_locl.h +rc4_skey.o: ../../e_os.h ../../include/openssl/bio.h +rc4_skey.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h +rc4_skey.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h +rc4_skey.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h +rc4_skey.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h +rc4_skey.o: ../../include/openssl/rc4.h ../../include/openssl/safestack.h +rc4_skey.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h +rc4_skey.o: ../cryptlib.h rc4_locl.h rc4_skey.c diff --git a/crypto/rc4/asm/rc4-586.pl b/crypto/rc4/asm/rc4-586.pl index 7ef889e5a1..d6e98f0811 100644 --- a/crypto/rc4/asm/rc4-586.pl +++ b/crypto/rc4/asm/rc4-586.pl @@ -1,16 +1,37 @@ #!/usr/local/bin/perl -# define for pentium pro friendly version +# At some point it became apparent that the original SSLeay RC4 +# assembler implementation performs suboptimaly on latest IA-32 +# microarchitectures. After re-tuning performance has changed as +# following: +# +# Pentium +0% +# Pentium III +17% +# AMD +52%(*) +# P4 +180%(**) +# +# (*) This number is actually a trade-off:-) It's possible to +# achieve +72%, but at the cost of -48% off PIII performance. +# In other words code performing further 13% faster on AMD +# would perform almost 2 times slower on Intel PIII... +# For reference! This code delivers ~80% of rc4-amd64.pl +# performance on the same Opteron machine. +# (**) This number requires compressed key schedule set up by +# RC4_set_key and therefore doesn't apply to 0.9.7 [option for +# compressed key schedule is implemented in 0.9.8 and later, +# see commentary section in rc4_skey.c for further details]. +# +# <appro@fy.chalmers.se> push(@INC,"perlasm","../../perlasm"); require "x86asm.pl"; &asm_init($ARGV[0],"rc4-586.pl"); -$tx="eax"; -$ty="ebx"; -$x="ecx"; -$y="edx"; +$x="eax"; +$y="ebx"; +$tx="ecx"; +$ty="edx"; $in="esi"; $out="edi"; $d="ebp"; @@ -31,7 +52,7 @@ sub RC4_loop { &mov($ty, &swtmp(2)); &cmp($ty, $in); - &jle(&label("finished")); + &jbe(&label("finished")); &inc($in); } else @@ -39,27 +60,23 @@ sub RC4_loop &add($ty, 8); &inc($in); &cmp($ty, $in); - &jl(&label("finished")); + &jb(&label("finished")); &mov(&swtmp(2), $ty); } } # Moved out # &mov( $tx, &DWP(0,$d,$x,4)) if $p < 0; - &add( $y, $tx); - &and( $y, 0xff); - &inc( $x); # NEXT ROUND + &add( &LB($y), &LB($tx)); &mov( $ty, &DWP(0,$d,$y,4)); # XXX - &mov( &DWP(-4,$d,$x,4),$ty); # AGI + &mov( &DWP(0,$d,$x,4),$ty); &add( $ty, $tx); - &and( $x, 0xff); # NEXT ROUND - &and( $ty, 0xff); &mov( &DWP(0,$d,$y,4),$tx); - &nop(); - &mov( $ty, &DWP(0,$d,$ty,4)); - &mov( $tx, &DWP(0,$d,$x,4)) if $p < 1; # NEXT ROUND - # XXX + &and( $ty, 0xff); + &inc( &LB($x)); # NEXT ROUND + &mov( $tx, &DWP(0,$d,$x,4)) if $p < 1; # NEXT ROUND + &mov( $ty, &DWP(0,$d,$ty,4)); if (!$char) { @@ -88,35 +105,47 @@ sub RC4 &function_begin_B($name,""); + &mov($ty,&wparam(1)); # len + &cmp($ty,0); + &jne(&label("proceed")); + &ret(); + &set_label("proceed"); + &comment(""); &push("ebp"); &push("ebx"); - &mov( $d, &wparam(0)); # key - &mov( $ty, &wparam(1)); # num &push("esi"); - &push("edi"); + &xor( $x, $x); # avoid partial register stalls + &push("edi"); + &xor( $y, $y); # avoid partial register stalls + &mov( $d, &wparam(0)); # key + &mov( $in, &wparam(2)); - &mov( $x, &DWP(0,$d,"",1)); - &mov( $y, &DWP(4,$d,"",1)); + &movb( &LB($x), &BP(0,$d,"",1)); + &movb( &LB($y), &BP(4,$d,"",1)); - &mov( $in, &wparam(2)); - &inc( $x); + &mov( $out, &wparam(3)); + &inc( &LB($x)); &stack_push(3); # 3 temp variables &add( $d, 8); - &and( $x, 0xff); + + # detect compressed schedule, see commentary section in rc4_skey.c... + # in 0.9.7 context ~50 bytes below RC4_CHAR label remain redundant, + # as compressed key schedule is set up in 0.9.8 and later. + &cmp(&DWP(256,$d),-1); + &je(&label("RC4_CHAR")); &lea( $ty, &DWP(-8,$ty,$in)); # check for 0 length input - &mov( $out, &wparam(3)); &mov( &swtmp(2), $ty); # this is now address to exit at &mov( $tx, &DWP(0,$d,$x,4)); &cmp( $ty, $in); - &jl( &label("end")); # less than 8 bytes + &jb( &label("end")); # less than 8 bytes &set_label("start"); @@ -148,7 +177,7 @@ sub RC4 &mov( &DWP(-4,$out,"",0), $tx); &mov( $tx, &DWP(0,$d,$x,4)); &cmp($in, $ty); - &jle(&label("start")); + &jbe(&label("start")); &set_label("end"); @@ -162,10 +191,37 @@ sub RC4 &RC4_loop(5,0,1); &RC4_loop(6,1,1); + &jmp(&label("finished")); + + &align(16); + # this is essentially Intel P4 specific codepath, see rc4_skey.c, + # and is engaged in 0.9.8 and later context... + &set_label("RC4_CHAR"); + + &lea ($ty,&DWP(0,$in,$ty)); + &mov (&swtmp(2),$ty); + + # strangely enough unrolled loop performs over 20% slower... + &set_label("RC4_CHAR_loop"); + &movz ($tx,&BP(0,$d,$x)); + &add (&LB($y),&LB($tx)); + &movz ($ty,&BP(0,$d,$y)); + &movb (&BP(0,$d,$y),&LB($tx)); + &movb (&BP(0,$d,$x),&LB($ty)); + &add (&LB($ty),&LB($tx)); + &movz ($ty,&BP(0,$d,$ty)); + &xorb (&LB($ty),&BP(0,$in)); + &movb (&BP(0,$out),&LB($ty)); + &inc (&LB($x)); + &inc ($in); + &inc ($out); + &cmp ($in,&swtmp(2)); + &jb (&label("RC4_CHAR_loop")); + &set_label("finished"); &dec( $x); &stack_pop(3); - &mov( &DWP(-4,$d,"",0),$y); + &movb( &BP(-4,$d,"",0),&LB($y)); &movb( &BP(-8,$d,"",0),&LB($x)); &function_end($name); diff --git a/crypto/rc4/asm/rc4-amd64.pl b/crypto/rc4/asm/rc4-amd64.pl index 767a4018fc..9e0da8af99 100755 --- a/crypto/rc4/asm/rc4-amd64.pl +++ b/crypto/rc4/asm/rc4-amd64.pl @@ -13,18 +13,34 @@ # Presumably it has everything to do with AMD cache architecture and # RAW or whatever penalties. Once again! The module *requires* config # line *without* RC4_CHAR! As for coding "secret," I bet on partial -# register arithmetics. For example instead 'inc %r8; and $255,%r8' +# register arithmetics. For example instead of 'inc %r8; and $255,%r8' # I simply 'inc %r8b'. Even though optimization manual discourages # to operate on partial registers, it turned out to be the best bet. # At least for AMD... How IA32E would perform remains to be seen... +# As was shown by Marc Bevand reordering of couple of load operations +# results in even higher performance gain of 3.3x:-) At least on +# Opteron... For reference, 1x in this case is RC4_CHAR C-code +# compiled with gcc 3.3.2, which performs at ~54MBps per 1GHz clock. +# Latter means that if you want to *estimate* what to expect from +# *your* CPU, then multiply 54 by 3.3 and clock frequency in GHz. + +# Intel P4 EM64T core was found to run the AMD64 code really slow... +# The only way to achieve comparable performance on P4 is to keep +# RC4_CHAR. Kind of ironic, huh? As it's apparently impossible to +# compose blended code, which would perform even within 30% marginal +# on either AMD and Intel platforms, I implement both cases. See +# rc4_skey.c for further details... This applies to 0.9.8 and later. +# In 0.9.7 context RC4_CHAR codepath is never engaged and ~70 bytes +# of code remain redundant. + $output=shift; -$win64=1 if ($output =~ /win64.[s|asm]/); +$win64a=1 if ($output =~ /win64a.[s|asm]/); open STDOUT,">$output" || die "can't open $output: $!"; -if (defined($win64)) { +if (defined($win64a)) { $dat="%rcx"; # arg1 $len="%rdx"; # arg2 $inp="%rsi"; # r8, arg3 moves here @@ -43,8 +59,9 @@ $TY="%r9"; sub PTR() { my $ret=shift; - if (defined($win64)) { - $ret =~ s/\[([\S]+)\+([\S]+)\]/[$2+$1]/g; # [%rN+%rM*4]->[%rM*4+%rN] + if (defined($win64a)) { + $ret =~ s/\[([\S]+)\+([\S]+)\]/[$2+$1]/g; # [%rN+%rM*4]->[%rM*4+%rN] + $ret =~ s/:([^\[]+)\[([^\]]+)\]/:[$2+$1]/g; # :off[ea]->:[ea+off] } else { $ret =~ s/[\+\*]/,/g; # [%rN+%rM*4]->[%rN,%rM,4] $ret =~ s/\[([^\]]+)\]/($1)/g; # [%rN]->(%rN) @@ -52,7 +69,7 @@ sub PTR() { $ret; } -$code=<<___ if (!defined($win64)); +$code=<<___ if (!defined($win64a)); .text .globl RC4 @@ -60,130 +77,151 @@ $code=<<___ if (!defined($win64)); .align 16 RC4: or $len,$len jne .Lentry - .byte 0xF3,0xC3 # repz ret, 2-byte ret + repret .Lentry: ___ -$code=<<___ if (defined($win64)); -TEXT SEGMENT +$code=<<___ if (defined($win64a)); +_TEXT SEGMENT PUBLIC RC4 ALIGN 16 -RC4 PROC NEAR +RC4 PROC or $len,$len jne .Lentry - DB F3h,C3h ; repz ret, 2-byte ret + repret .Lentry: - push %edi - push %esi - sub \$40,%esp + push %rdi + push %rsi + sub \$40,%rsp mov %r8,$inp mov %r9,$out ___ $code.=<<___; add \$8,$dat - movl `&PTR("DWORD-8[$dat]")`,$XX#d - movl `&PTR("DWORD-4[$dat]")`,$YY#d + movl `&PTR("DWORD:-8[$dat]")`,$XX#d + movl `&PTR("DWORD:-4[$dat]")`,$YY#d + cmpl \$-1,`&PTR("DWORD:256[$dat]")` + je .LRC4_CHAR test \$-8,$len jz .Lloop1 .align 16 .Lloop8: - movq `&PTR("QWORD[$inp]")`,%rax - inc $XX#b - movl `&PTR("DWORD[$dat+$XX*4]")`,$TX#d + movl `&PTR("DWORD:[$dat+$XX*4]")`,$TX#d add $TX#b,$YY#b - movl `&PTR("DWORD[$dat+$YY*4]")`,$TY#d - movl $TX#d,`&PTR("DWORD[$dat+$YY*4]")` - movl $TY#d,`&PTR("DWORD[$dat+$XX*4]")` - add $TY#b,$TX#b + movl `&PTR("DWORD:[$dat+$YY*4]")`,$TY#d + movl $TX#d,`&PTR("DWORD:[$dat+$YY*4]")` + movl $TY#d,`&PTR("DWORD:[$dat+$XX*4]")` + add $TX#b,$TY#b inc $XX#b - movl `&PTR("DWORD[$dat+$TX*4]")`,$TY#d - xor $TY,%rax + movl `&PTR("DWORD:[$dat+$XX*4]")`,$TX#d + movb `&PTR("BYTE:[$dat+$TY*4]")`,%al ___ for ($i=1;$i<=6;$i++) { $code.=<<___; - movl `&PTR("DWORD[$dat+$XX*4]")`,$TX#d add $TX#b,$YY#b - movl `&PTR("DWORD[$dat+$YY*4]")`,$TY#d - movl $TX#d,`&PTR("DWORD[$dat+$YY*4]")` - movl $TY#d,`&PTR("DWORD[$dat+$XX*4]")` - add $TY#b,$TX#b - movl `&PTR("DWORD[$dat+$TX*4]")`,$TY#d - shl \$`8*$i`,$TY + ror \$8,%rax + movl `&PTR("DWORD:[$dat+$YY*4]")`,$TY#d + movl $TX#d,`&PTR("DWORD:[$dat+$YY*4]")` + movl $TY#d,`&PTR("DWORD:[$dat+$XX*4]")` + add $TX#b,$TY#b inc $XX#b - xor $TY,%rax + movl `&PTR("DWORD:[$dat+$XX*4]")`,$TX#d + movb `&PTR("BYTE:[$dat+$TY*4]")`,%al ___ } $code.=<<___; - movl `&PTR("DWORD[$dat+$XX*4]")`,$TX#d add $TX#b,$YY#b - movl `&PTR("DWORD[$dat+$YY*4]")`,$TY#d - movl $TX#d,`&PTR("DWORD[$dat+$YY*4]")` - movl $TY#d,`&PTR("DWORD[$dat+$XX*4]")` + ror \$8,%rax + movl `&PTR("DWORD:[$dat+$YY*4]")`,$TY#d + movl $TX#d,`&PTR("DWORD:[$dat+$YY*4]")` + movl $TY#d,`&PTR("DWORD:[$dat+$XX*4]")` sub \$8,$len add $TY#b,$TX#b - add \$8,$out - movl `&PTR("DWORD[$dat+$TX*4]")`,$TY#d - shl \$56,$TY + movb `&PTR("BYTE:[$dat+$TX*4]")`,%al + ror \$8,%rax add \$8,$inp - xor $TY,%rax + add \$8,$out - mov %rax,`&PTR("QWORD-8[$out]")` + xor `&PTR("QWORD:-8[$inp]")`,%rax + mov %rax,`&PTR("QWORD:-8[$out]")` test \$-8,$len jnz .Lloop8 cmp \$0,$len jne .Lloop1 .Lexit: - movl $XX#d,`&PTR("DWORD-8[$dat]")` - movl $YY#d,`&PTR("DWORD-4[$dat]")` -___ -$code.=<<___ if (defined($win64)); - add \$40,%esp - pop %esi - pop %edi - DB F3h,C3h ; retz ret, 2-byte ret + movl $XX#d,`&PTR("DWORD:-8[$dat]")` + movl $YY#d,`&PTR("DWORD:-4[$dat]")` ___ -$code.=<<___ if (!defined($win64)); - .byte 0xF3,0xC3 # repz ret, 2-byte ret +$code.=<<___ if (defined($win64a)); + add \$40,%rsp + pop %rsi + pop %rdi ___ $code.=<<___; + repret .align 16 .Lloop1: - movzb `&PTR("BYTE[$inp]")`,%rax + movzb `&PTR("BYTE:[$inp]")`,%eax inc $XX#b - nop - movl `&PTR("DWORD[$dat+$XX*4]")`,$TX#d + movl `&PTR("DWORD:[$dat+$XX*4]")`,$TX#d add $TX#b,$YY#b - movl `&PTR("DWORD[$dat+$YY*4]")`,$TY#d - movl $TX#d,`&PTR("DWORD[$dat+$YY*4]")` - movl $TY#d,`&PTR("DWORD[$dat+$XX*4]")` + movl `&PTR("DWORD:[$dat+$YY*4]")`,$TY#d + movl $TX#d,`&PTR("DWORD:[$dat+$YY*4]")` + movl $TY#d,`&PTR("DWORD:[$dat+$XX*4]")` add $TY#b,$TX#b - movl `&PTR("DWORD[$dat+$TX*4]")`,$TY#d + movl `&PTR("DWORD:[$dat+$TX*4]")`,$TY#d xor $TY,%rax inc $inp - movb %al,`&PTR("BYTE[$out]")` + movb %al,`&PTR("BYTE:[$out]")` inc $out dec $len jnz .Lloop1 jmp .Lexit + +.align 16 +.LRC4_CHAR: + inc $XX#b + movzb `&PTR("BYTE:[$dat+$XX]")`,$TX#d + add $TX#b,$YY#b + movzb `&PTR("BYTE:[$dat+$YY]")`,$TY#d + movb $TX#b,`&PTR("BYTE:[$dat+$YY]")` + movb $TY#b,`&PTR("BYTE:[$dat+$XX]")` + add $TX#b,$TY#b + movzb `&PTR("BYTE:[$dat+$TY]")`,$TY#d + xorb `&PTR("BYTE:[$inp]")`,$TY#b + movb $TY#b,`&PTR("BYTE:[$out]")` + inc $inp + inc $out + dec $len + jnz .LRC4_CHAR + jmp .Lexit +___ +$code.=<<___ if (defined($win64a)); +RC4 ENDP +_TEXT ENDS +END +___ +$code.=<<___ if (!defined($win64a)); +.size RC4,.-RC4 ___ -if (defined($win64)) { - $code.="RC4 ENDP\n"; -} else { - $code.=".size RC4,.-RC4\n" -} $code =~ s/#([bwd])/$1/gm; $code =~ s/\`([^\`]*)\`/eval $1/gem; -if (defined($win64)) { +if (defined($win64a)) { $code =~ s/\.align/ALIGN/gm; $code =~ s/[\$%]//gm; $code =~ s/\.L/\$L/gm; $code =~ s/([\w]+)([\s]+)([\S]+),([\S]+)/$1$2$4,$3/gm; - $code =~ s/([QD]*WORD|BYTE)/$1 PTR /gm; - $code =~ s/(mov[z]*)[bwlq]/$1/gm; + $code =~ s/([QD]*WORD|BYTE):/$1 PTR/gm; + $code =~ s/mov[bwlq]/mov/gm; + $code =~ s/movzb/movzx/gm; + $code =~ s/repret/DB\t0F3h,0C3h/gm; + $code =~ s/cmpl/cmp/gm; + $code =~ s/xorb/xor/gm; } else { - $code =~ s/[QD]*WORD|BYTE//gm; + $code =~ s/([QD]*WORD|BYTE)://gm; + $code =~ s/repret/.byte\t0xF3,0xC3/gm; } print $code; diff --git a/crypto/rc4/rc4.c b/crypto/rc4/rc4.c index b39c070292..c900b26055 100644 --- a/crypto/rc4/rc4.c +++ b/crypto/rc4/rc4.c @@ -60,6 +60,7 @@ #include <stdlib.h> #include <string.h> #include <openssl/rc4.h> +#include <openssl/evp.h> char *usage[]={ "usage: rc4 args\n", @@ -162,7 +163,7 @@ bad: keystr=buf; } - EVP_Digest((unsigned char *)keystr,(unsigned long)strlen(keystr),md,NULL,EVP_md5()); + EVP_Digest((unsigned char *)keystr,strlen(keystr),md,NULL,EVP_md5(),NULL); OPENSSL_cleanse(keystr,strlen(keystr)); RC4_set_key(&key,MD5_DIGEST_LENGTH,md); diff --git a/crypto/rc4/rc4.h b/crypto/rc4/rc4.h index 7aec04fe93..c24a5b1281 100644 --- a/crypto/rc4/rc4.h +++ b/crypto/rc4/rc4.h @@ -72,6 +72,10 @@ typedef struct rc4_key_st { RC4_INT x,y; RC4_INT data[256]; +#if defined(__ia64) || defined(__ia64__) || defined(_M_IA64) + /* see crypto/rc4/asm/rc4-ia64.S for further details... */ + RC4_INT pad[512-256-2]; +#endif } RC4_KEY; diff --git a/crypto/rc4/rc4_enc.c b/crypto/rc4/rc4_enc.c index a0f71d004a..7d0bf92f03 100644 --- a/crypto/rc4/rc4_enc.c +++ b/crypto/rc4/rc4_enc.c @@ -77,6 +77,10 @@ void RC4(RC4_KEY *key, unsigned long len, const unsigned char *indata, x=key->x; y=key->y; d=key->data; +#if defined(__ia64) || defined(__ia64__) || defined(_M_IA64) + /* see crypto/rc4/asm/rc4-ia64.S for further details... */ + d=(RC4_INT *)(((size_t)(d+255))&~(sizeof(key->data)-1)); +#endif #if defined(RC4_CHUNK) /* diff --git a/crypto/rc4/rc4_locl.h b/crypto/rc4/rc4_locl.h index 3bb80b6ce9..c712e1632e 100644 --- a/crypto/rc4/rc4_locl.h +++ b/crypto/rc4/rc4_locl.h @@ -1,4 +1,5 @@ #ifndef HEADER_RC4_LOCL_H #define HEADER_RC4_LOCL_H #include <openssl/opensslconf.h> +#include <cryptlib.h> #endif diff --git a/crypto/rc4/rc4_skey.c b/crypto/rc4/rc4_skey.c index bb10c1ebe2..083b53dfb8 100644 --- a/crypto/rc4/rc4_skey.c +++ b/crypto/rc4/rc4_skey.c @@ -93,25 +93,62 @@ void RC4_set_key(RC4_KEY *key, int len, const unsigned char *data) unsigned int i; d= &(key->data[0]); - for (i=0; i<256; i++) - d[i]=i; +#if defined(__ia64) || defined(__ia64__) || defined(_M_IA64) + /* see crypto/rc4/asm/rc4-ia64.S for further details... */ + d=(RC4_INT *)(((size_t)(d+255))&~(sizeof(key->data)-1)); +#endif key->x = 0; key->y = 0; id1=id2=0; -#define SK_LOOP(n) { \ +#define SK_LOOP(d,n) { \ tmp=d[(n)]; \ id2 = (data[id1] + tmp + id2) & 0xff; \ if (++id1 == len) id1=0; \ d[(n)]=d[id2]; \ d[id2]=tmp; } +#if defined(OPENSSL_CPUID_OBJ) && !defined(OPENSSL_NO_ASM) +# if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \ + defined(__INTEL__) || \ + defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) + if (sizeof(RC4_INT) > 1) { + /* + * Unlike all other x86 [and x86_64] implementations, + * Intel P4 core [including EM64T] was found to perform + * poorly with wider RC4_INT. Performance improvement + * for IA-32 hand-coded assembler turned out to be 2.8x + * if re-coded for RC4_CHAR! It's however inappropriate + * to just switch to RC4_CHAR for x86[_64], as non-P4 + * implementations suffer from significant performance + * losses then, e.g. PIII exhibits >2x deterioration, + * and so does Opteron. In order to assure optimal + * all-round performance, let us [try to] detect P4 at + * run-time by checking upon HTT bit in CPU capability + * vector and set up compressed key schedule, which is + * recognized by correspondingly updated assembler + * module... + * <appro@fy.chalmers.se> + */ + if (OPENSSL_ia32cap_P & (1<<28)) { + unsigned char *cp=(unsigned char *)d; + + for (i=0;i<256;i++) cp[i]=i; + for (i=0;i<256;i++) SK_LOOP(cp,i); + /* mark schedule as compressed! */ + d[256/sizeof(RC4_INT)]=-1; + return; + } + } +# endif +#endif + for (i=0; i < 256; i++) d[i]=i; for (i=0; i < 256; i+=4) { - SK_LOOP(i+0); - SK_LOOP(i+1); - SK_LOOP(i+2); - SK_LOOP(i+3); + SK_LOOP(d,i+0); + SK_LOOP(d,i+1); + SK_LOOP(d,i+2); + SK_LOOP(d,i+3); } } diff --git a/crypto/rc4/rc4test.c b/crypto/rc4/rc4test.c index 18154025eb..49afa5c1db 100644 --- a/crypto/rc4/rc4test.c +++ b/crypto/rc4/rc4test.c @@ -70,6 +70,7 @@ int main(int argc, char *argv[]) } #else #include <openssl/rc4.h> +#include <openssl/sha.h> static unsigned char keys[7][30]={ {8,0x01,0x23,0x45,0x67,0x89,0xab,0xcd,0xef}, @@ -117,9 +118,7 @@ int main(int argc, char *argv[]) int j; unsigned char *p; RC4_KEY key; - unsigned char buf[512],obuf[512]; - - for (i=0; i<512; i++) buf[i]=0x01; + unsigned char obuf[512]; for (i=0; i<6; i++) { @@ -197,6 +196,37 @@ int main(int argc, char *argv[]) } } printf("done\n"); + printf("bulk test "); + { unsigned char buf[513]; + SHA_CTX c; + unsigned char md[SHA_DIGEST_LENGTH]; + static unsigned char expected[]={ + 0xa4,0x7b,0xcc,0x00,0x3d,0xd0,0xbd,0xe1,0xac,0x5f, + 0x12,0x1e,0x45,0xbc,0xfb,0x1a,0xa1,0xf2,0x7f,0xc5 }; + + RC4_set_key(&key,keys[0][0],&(keys[3][1])); + memset(buf,'\0',sizeof(buf)); + SHA1_Init(&c); + for (i=0;i<2571;i++) { + RC4(&key,sizeof(buf),buf,buf); + SHA1_Update(&c,buf,sizeof(buf)); + } + SHA1_Final(md,&c); + + if (memcmp(md,expected,sizeof(md))) { + printf("error in RC4 bulk test\n"); + printf("output:"); + for (j=0; j<sizeof(md); j++) + printf(" %02x",md[j]); + printf("\n"); + printf("expect:"); + for (j=0; j<sizeof(md); j++) + printf(" %02x",expected[j]); + printf("\n"); + err++; + } + else printf("ok\n"); + } #ifdef OPENSSL_SYS_NETWARE if (err) printf("ERROR: %d\n", err); #endif |