summaryrefslogtreecommitdiff
path: root/crypto
diff options
context:
space:
mode:
authorAndy Polyakov <appro@openssl.org>2011-04-17 12:46:00 +0000
committerAndy Polyakov <appro@openssl.org>2011-04-17 12:46:00 +0000
commit5fabb88a7816f19090384e45bb8f2a22c7f290fb (patch)
tree0a90cb4d459b2d9dd08b334b50f02d479ebbcc72 /crypto
parent764ef43962327c7252fd12d309bbf1cd86b95545 (diff)
downloadopenssl-new-5fabb88a7816f19090384e45bb8f2a22c7f290fb.tar.gz
Multiple assembler packs: add experimental memory bus instrumentation.
Diffstat (limited to 'crypto')
-rw-r--r--crypto/alphacpuid.pl90
-rw-r--r--crypto/ia64cpuid.S88
-rw-r--r--crypto/pariscid.pl94
-rwxr-xr-xcrypto/ppccpuid.pl87
-rw-r--r--crypto/s390xcpuid.S16
-rw-r--r--crypto/sparccpuid.S96
-rw-r--r--crypto/sparcv9cap.c26
-rw-r--r--crypto/x86_64cpuid.pl95
-rw-r--r--crypto/x86cpuid.pl102
9 files changed, 682 insertions, 12 deletions
diff --git a/crypto/alphacpuid.pl b/crypto/alphacpuid.pl
index c9474ff497..11f2e30ce0 100644
--- a/crypto/alphacpuid.pl
+++ b/crypto/alphacpuid.pl
@@ -126,3 +126,93 @@ OPENSSL_cleanse:
.Ldone: ret ($26)
.end OPENSSL_cleanse
___
+{
+my ($out,$cnt,$max)=("\$16","\$17","\$18");
+my ($tick,$lasttick)=("\$19","\$20");
+my ($diff,$lastdiff)=("\$21","\$22");
+my ($v0,$ra,$sp,$zero)=("\$0","\$26","\$30","\$31");
+
+print <<___;
+.globl OPENSSL_instrument_bus
+.ent OPENSSL_instrument_bus
+OPENSSL_instrument_bus:
+ .frame $sp,0,$ra
+ .prologue 0
+ mov $cnt,$v0
+
+ rpcc $lasttick
+ mov 0,$diff
+
+ ecb ($out)
+ ldl_l $tick,0($out)
+ addl $diff,$tick,$tick
+ mov $tick,$diff
+ stl_c $tick,0($out)
+ stl $diff,0($out)
+
+.Loop: rpcc $tick
+ subq $tick,$lasttick,$diff
+ mov $tick,$lasttick
+
+ ecb ($out)
+ ldl_l $tick,0($out)
+ addl $diff,$tick,$tick
+ mov $tick,$diff
+ stl_c $tick,0($out)
+ stl $diff,0($out)
+
+ subl $cnt,1,$cnt
+ lda $out,4($out)
+ bne $cnt,.Loop
+
+ ret ($ra)
+.end OPENSSL_instrument_bus
+
+.globl OPENSSL_instrument_bus2
+.ent OPENSSL_instrument_bus2
+OPENSSL_instrument_bus2:
+ .frame $sp,0,$ra
+ .prologue 0
+ mov $cnt,$v0
+
+ rpcc $lasttick
+ mov 0,$diff
+
+ ecb ($out)
+ ldl_l $tick,0($out)
+ addl $diff,$tick,$tick
+ mov $tick,$diff
+ stl_c $tick,0($out)
+ stl $diff,0($out)
+
+ rpcc $tick
+ subq $tick,$lasttick,$diff
+ mov $tick,$lasttick
+ mov $diff,$lastdiff
+.Loop2:
+ ecb ($out)
+ ldl_l $tick,0($out)
+ addl $diff,$tick,$tick
+ mov $tick,$diff
+ stl_c $tick,0($out)
+ stl $diff,0($out)
+
+ subl $max,1,$max
+ beq $max,.Ldone2
+
+ rpcc $tick
+ subq $tick,$lasttick,$diff
+ mov $tick,$lasttick
+ subq $lastdiff,$diff,$tick
+ mov $diff,$lastdiff
+ cmovne $tick,1,$tick
+ subl $cnt,$tick,$cnt
+ s4addq $tick,$out,$out
+ bne $cnt,.Loop2
+
+.Ldone2:
+ subl $v0,$cnt,$v0
+ ret ($ra)
+.end OPENSSL_instrument_bus2
+___
+}
diff --git a/crypto/ia64cpuid.S b/crypto/ia64cpuid.S
index d705fff7ee..dd27e163ce 100644
--- a/crypto/ia64cpuid.S
+++ b/crypto/ia64cpuid.S
@@ -26,7 +26,7 @@ OPENSSL_atomic_add:
{ .mii; mov ar.ccv=r2
add r8=r2,r33
mov r3=r2 };;
-{ .mmi; mf
+{ .mmi; mf;;
cmpxchg4.acq r2=[r32],r8,ar.ccv
nop.i 0 };;
{ .mib; cmp.ne p6,p0=r2,r3
@@ -165,3 +165,89 @@ OPENSSL_cleanse:
(p7) br.cond.dpnt .Little
(p6) br.ret.sptk.many b0 };;
.endp OPENSSL_cleanse#
+
+.global OPENSSL_instrument_bus#
+.proc OPENSSL_instrument_bus#
+OPENSSL_instrument_cache:
+{ .mmi; mov r2=r33
+#if defined(_HPUX_SOURCE) && !defined(_LP64)
+ addp4 r32=0,r32
+#endif
+ }
+{ .mmi; mov r8=ar.itc;;
+ mov r10=r0
+ mov r9=r8 };;
+
+{ .mmi; fc r32;;
+ ld4 r8=[r32] };;
+{ .mmi; mf
+ mov ar.ccv=r8
+ add r8=r8,r10 };;
+{ .mmi; cmpxchg4.acq r3=[r32],r8,ar.ccv
+ };;
+.Loop:
+{ .mmi; mov r8=ar.itc;;
+ sub r10=r8,r9 // diff=tick-lasttick
+ mov r9=r8 };; // lasttick=tick
+{ .mmi; fc r32;;
+ ld4 r8=[r32] };;
+{ .mmi; mf
+ mov ar.ccv=r8
+ add r8=r8,r10 };;
+{ .mmi; cmpxchg4.acq r3=[r32],r8,ar.ccv
+ add r33=-1,r33
+ add r32=4,r32 };;
+{ .mib; cmp4.ne p6,p0=0,r33
+(p6) br.cond.dptk .Loop };;
+
+{ .mib; sub r8=r2,r33
+ br.ret.sptk.many b0 };;
+.endp OPENSSL_instrument_bus#
+
+.global OPENSSL_instrument_bus2#
+.proc OPENSSL_instrument_bus2#
+OPENSSL_instrument_cache2:
+{ .mmi; mov r2=r33 // put aside cnt
+#if defined(_HPUX_SOURCE) && !defined(_LP64)
+ addp4 r32=0,r32
+#endif
+ }
+{ .mmi; mov r8=ar.itc;;
+ mov r10=r0
+ mov r9=r8 };;
+
+{ .mmi; fc r32;;
+ ld4 r8=[r32] };;
+{ .mmi; mf
+ mov ar.ccv=r8
+ add r8=r8,r10 };;
+{ .mmi; cmpxchg4.acq r3=[r32],r8,ar.ccv
+ };;
+
+{ .mmi; mov r8=ar.itc;;
+ sub r10=r8,r9
+ mov r9=r8 };;
+.Loop2:
+{ .mmi; mov r11=r10 // lastdiff=diff
+ add r34=-1,r34 };; // --max
+{ .mmi; fc r32;;
+ ld4 r8=[r32]
+ cmp4.eq p6,p0=0,r34 };;
+{ .mmi; mf
+ mov ar.ccv=r8
+ add r8=r8,r10 };;
+{ .mmb; cmpxchg4.acq r3=[r32],r8,ar.ccv
+(p6) br.cond.spnt .Ldone2 };;
+
+{ .mmi; mov r8=ar.itc;;
+ sub r10=r8,r9 // diff=tick-lasttick
+ mov r9=r8 };; // lasttick=tick
+{ .mmi; cmp.ne p6,p0=r10,r11;; // diff!=lastdiff
+(p6) add r33=-1,r33 };; // conditional --cnt
+{ .mib; cmp4.ne p7,p0=0,r33
+(p6) add r32=4,r32 // conditional ++out
+(p7) br.cond.dptk .Loop2 };;
+.Ldone2:
+{ .mib; sub r8=r2,r33
+ br.ret.sptk.many b0 };;
+.endp OPENSSL_instrument_bus2#
diff --git a/crypto/pariscid.pl b/crypto/pariscid.pl
index 1ed5381957..477ec9b87d 100644
--- a/crypto/pariscid.pl
+++ b/crypto/pariscid.pl
@@ -87,8 +87,8 @@ OPENSSL_wipe_cpu
.PROCEND
___
{
-$inp="%r26";
-$len="%r25";
+my $inp="%r26";
+my $len="%r25";
$code.=<<___;
.EXPORT OPENSSL_cleanse,ENTRY,ARGW0=GR,ARGW1=GR
@@ -112,9 +112,9 @@ Lalign
Laligned
andcm $len,%r1,%r28
-Loop
+Lot
$ST %r0,0($inp)
- addib,*<> -$SIZE_T,%r28,Loop
+ addib,*<> -$SIZE_T,%r28,Lot
ldo $SIZE_T($inp),$inp
and,*<> $len,%r1,$len
@@ -130,7 +130,93 @@ Ldone
.PROCEND
___
}
+{
+my ($out,$cnt,$max)=("%r26","%r25","%r24");
+my ($tick,$lasttick)=("%r23","%r22");
+my ($diff,$lastdiff)=("%r21","%r20");
+
+$code.=<<___;
+ .EXPORT OPENSSL_instrument_bus,ENTRY,ARGW0=GR,ARGW1=GR
+ .ALIGN 8
+OPENSSL_instrument_bus
+ .PROC
+ .CALLINFO NO_CALLS
+ .ENTRY
+ copy $cnt,$rv
+ mfctl %cr16,$tick
+ copy $tick,$lasttick
+ ldi 0,$diff
+
+ fdc 0($out)
+ ldw 0($out),$tick
+ add $diff,$tick,$tick
+ stw $tick,0($out)
+Loop
+ mfctl %cr16,$tick
+ sub $tick,$lasttick,$diff
+ copy $tick,$lasttick
+
+ fdc 0($out)
+ ldw 0($out),$tick
+ add $diff,$tick,$tick
+ stw $tick,0($out)
+
+ addib,<> -1,$cnt,Loop
+ addi 4,$out,$out
+
+ bv ($rp)
+ .EXIT
+ sub $rv,$cnt,$rv
+ .PROCEND
+ .EXPORT OPENSSL_instrument_bus2,ENTRY,ARGW0=GR,ARGW1=GR
+ .ALIGN 8
+OPENSSL_instrument_bus2
+ .PROC
+ .CALLINFO NO_CALLS
+ .ENTRY
+ copy $cnt,$rv
+ sub %r0,$cnt,$cnt
+
+ mfctl %cr16,$tick
+ copy $tick,$lasttick
+ ldi 0,$diff
+
+ fdc 0($out)
+ ldw 0($out),$tick
+ add $diff,$tick,$tick
+ stw $tick,0($out)
+
+ mfctl %cr16,$tick
+ sub $tick,$lasttick,$diff
+ copy $tick,$lasttick
+Loop2
+ copy $diff,$lastdiff
+ fdc 0($out)
+ ldw 0($out),$tick
+ add $diff,$tick,$tick
+ stw $tick,0($out)
+
+ addib,= -1,$max,Ldone2
+ nop
+
+ mfctl %cr16,$tick
+ sub $tick,$lasttick,$diff
+ copy $tick,$lasttick
+ cmpclr,<> $lastdiff,$diff,$tick
+ ldi 1,$tick
+
+ ldi 1,%r1
+ xor %r1,$tick,$tick
+ addb,<> $tick,$cnt,Loop2
+ shladd,l $tick,2,$out,$out
+Ldone2
+ bv ($rp)
+ .EXIT
+ add $rv,$cnt,$rv
+ .PROCEND
+___
+}
$code =~ s/cmpib,\*/comib,/gm if ($SIZE_T==4);
$code =~ s/,\*/,/gm if ($SIZE_T==4);
print $code;
diff --git a/crypto/ppccpuid.pl b/crypto/ppccpuid.pl
index 2131d30c1d..d6220e747d 100755
--- a/crypto/ppccpuid.pl
+++ b/crypto/ppccpuid.pl
@@ -69,10 +69,10 @@ $code=<<___;
.globl .OPENSSL_atomic_add
.align 4
.OPENSSL_atomic_add:
-Loop: lwarx r5,0,r3
+Ladd: lwarx r5,0,r3
add r0,r4,r5
stwcx. r0,0,r3
- bne- Loop
+ bne- Ladd
$SIGNX r3,r0
blr
@@ -112,6 +112,89 @@ Laligned:
bne Little
blr
___
+{
+my ($out,$cnt,$max)=("r3","r4","r5");
+my ($tick,$lasttick)=("r6","r7");
+my ($diff,$lastdiff)=("r8","r9");
+
+$code.=<<___;
+.globl .OPENSSL_instrument_bus
+.align 4
+.OPENSSL_instrument_bus:
+ mtctr $cnt
+
+ mftb $lasttick # collect 1st tick
+ li $diff,0
+
+ dcbf 0,$out # flush cache line
+ lwarx $tick,0,$out # load and lock
+ add $tick,$tick,$diff
+ stwcx. $tick,0,$out
+ stwx $tick,0,$out
+
+Loop: mftb $tick
+ sub $diff,$tick,$lasttick
+ mr $lasttick,$tick
+ dcbf 0,$out # flush cache line
+ lwarx $tick,0,$out # load and lock
+ add $tick,$tick,$diff
+ stwcx. $tick,0,$out
+ stwx $tick,0,$out
+ addi $out,$out,4 # ++$out
+ bdnz Loop
+
+ mr r3,$cnt
+ blr
+
+.globl .OPENSSL_instrument_bus2
+.align 4
+.OPENSSL_instrument_bus2:
+ mr r0,$cnt
+ slwi $cnt,$cnt,2
+
+ mftb $lasttick # collect 1st tick
+ li $diff,0
+
+ dcbf 0,$out # flush cache line
+ lwarx $tick,0,$out # load and lock
+ add $tick,$tick,$diff
+ stwcx. $tick,0,$out
+ stwx $tick,0,$out
+
+ mftb $tick # collect 1st diff
+ sub $diff,$tick,$lasttick
+ mr $lasttick,$tick
+ mr $lastdiff,$diff
+Loop2:
+ dcbf 0,$out # flush cache line
+ lwarx $tick,0,$out # load and lock
+ add $tick,$tick,$diff
+ stwcx. $tick,0,$out
+ stwx $tick,0,$out
+
+ addic. $max,$max,-1
+ beq Ldone2
+
+ mftb $tick
+ sub $diff,$tick,$lasttick
+ mr $lasttick,$tick
+ cmplw 7,$diff,$lastdiff
+ mr $lastdiff,$diff
+
+ mfcr $tick # pull cr
+ not $tick,$tick # flip bits
+ rlwinm $tick,$tick,1,29,29 # isolate flipped eq bit and scale
+
+ sub. $cnt,$cnt,$tick # conditional --$cnt
+ add $out,$out,$tick # conditional ++$out
+ bne Loop2
+
+Ldone2:
+ srwi $cnt,$cnt,2
+ sub r3,r0,$cnt
+ blr
+___
+}
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
diff --git a/crypto/s390xcpuid.S b/crypto/s390xcpuid.S
index 06815347e6..3402a2404b 100644
--- a/crypto/s390xcpuid.S
+++ b/crypto/s390xcpuid.S
@@ -93,6 +93,22 @@ OPENSSL_cleanse:
br %r14
.size OPENSSL_cleanse,.-OPENSSL_cleanse
+.globl OPENSSL_instrument_bus
+.type OPENSSL_instrument_bus,@function
+.align 16
+OPENSSL_instrument_bus:
+ lghi %r2,0
+ br %r14
+.size OPENSSL_instrument_bus,.-OPENSSL_instrument_bus
+
+.globl OPENSSL_instrument_bus2
+.type OPENSSL_instrument_bus2,@function
+.align 16
+OPENSSL_instrument_bus2:
+ lghi %r2,0
+ br %r14
+.size OPENSSL_instrument_bus2,.-OPENSSL_instrument_bus2
+
.section .init
brasl %r14,OPENSSL_cpuid_setup
diff --git a/crypto/sparccpuid.S b/crypto/sparccpuid.S
index ae61f7f5ce..329efcdd3d 100644
--- a/crypto/sparccpuid.S
+++ b/crypto/sparccpuid.S
@@ -397,6 +397,102 @@ OPENSSL_cleanse:
.type OPENSSL_cleanse,#function
.size OPENSSL_cleanse,.-OPENSSL_cleanse
+.global _sparcv9_vis1_instrument_bus
+.align 8
+_sparcv9_vis1_instrument_bus:
+ mov %o1,%o3 ! save cnt
+ .word 0x99410000 !rd %tick,%o4 ! tick
+ mov %o4,%o5 ! lasttick = tick
+ set 0,%g4 ! diff
+
+ andn %o0,63,%g1
+ .word 0xc1985e00 !ldda [%g1]0xf0,%f0 ! block load
+ .word 0x8143e040 !membar #Sync
+ .word 0xc1b85c00 !stda %f0,[%g1]0xe0 ! block store and commit
+ .word 0x8143e040 !membar #Sync
+ ld [%o0],%o4
+ add %o4,%g4,%g4
+ .word 0xc9e2100c !cas [%o0],%o4,%g4
+
+.Loop: .word 0x99410000 !rd %tick,%o4
+ sub %o4,%o5,%g4 ! diff=tick-lasttick
+ mov %o4,%o5 ! lasttick=tick
+
+ andn %o0,63,%g1
+ .word 0xc1985e00 !ldda [%g1]0xf0,%f0 ! block load
+ .word 0x8143e040 !membar #Sync
+ .word 0xc1b85c00 !stda %f0,[%g1]0xe0 ! block store and commit
+ .word 0x8143e040 !membar #Sync
+ ld [%o0],%o4
+ add %o4,%g4,%g4
+ .word 0xc9e2100c !cas [%o0],%o4,%g4
+ subcc %o1,1,%o1 ! --$cnt
+ bnz .Loop
+ add %o0,4,%o0 ! ++$out
+
+ retl
+ mov %o3,%o0
+.type _sparcv9_vis1_instrument_bus,#function
+.size _sparcv9_vis1_instrument_bus,.-_sparcv9_vis1_instrument_bus
+
+.global _sparcv9_vis1_instrument_bus2
+.align 8
+_sparcv9_vis1_instrument_bus2:
+ mov %o1,%o3 ! save cnt
+ sll %o1,2,%o1 ! cnt*=4
+
+ .word 0x99410000 !rd %tick,%o4 ! tick
+ mov %o4,%o5 ! lasttick = tick
+ set 0,%g4 ! diff
+
+ andn %o0,63,%g1
+ .word 0xc1985e00 !ldda [%g1]0xf0,%f0 ! block load
+ .word 0x8143e040 !membar #Sync
+ .word 0xc1b85c00 !stda %f0,[%g1]0xe0 ! block store and commit
+ .word 0x8143e040 !membar #Sync
+ ld [%o0],%o4
+ add %o4,%g4,%g4
+ .word 0xc9e2100c !cas [%o0],%o4,%g4
+
+ .word 0x99410000 !rd %tick,%o4 ! tick
+ sub %o4,%o5,%g4 ! diff=tick-lasttick
+ mov %o4,%o5 ! lasttick=tick
+ mov %g4,%g5 ! lastdiff=diff
+.Loop2:
+ andn %o0,63,%g1
+ .word 0xc1985e00 !ldda [%g1]0xf0,%f0 ! block load
+ .word 0x8143e040 !membar #Sync
+ .word 0xc1b85c00 !stda %f0,[%g1]0xe0 ! block store and commit
+ .word 0x8143e040 !membar #Sync
+ ld [%o0],%o4
+ add %o4,%g4,%g4
+ .word 0xc9e2100c !cas [%o0],%o4,%g4
+
+ subcc %o2,1,%o2 ! --max
+ bz .Ldone2
+ nop
+
+ .word 0x99410000 !rd %tick,%o4 ! tick
+ sub %o4,%o5,%g4 ! diff=tick-lasttick
+ mov %o4,%o5 ! lasttick=tick
+ cmp %g4,%g5
+ mov %g4,%g5 ! lastdiff=diff
+
+ .word 0x83408000 !rd %ccr,%g1
+ and %g1,4,%g1 ! isolate zero flag
+ xor %g1,4,%g1 ! flip zero flag
+
+ subcc %o1,%g1,%o1 ! conditional --$cnt
+ bnz .Loop2
+ add %o0,%g1,%o0 ! conditional ++$out
+
+.Ldone2:
+ srl %o1,2,%o1
+ retl
+ sub %o3,%o1,%o0
+.type _sparcv9_vis1_instrument_bus2,#function
+.size _sparcv9_vis1_instrument_bus2,.-_sparcv9_vis1_instrument_bus2
+
.section ".init",#alloc,#execinstr
call OPENSSL_cpuid_setup
nop
diff --git a/crypto/sparcv9cap.c b/crypto/sparcv9cap.c
index ed195ab402..ad4b3be718 100644
--- a/crypto/sparcv9cap.c
+++ b/crypto/sparcv9cap.c
@@ -11,6 +11,7 @@
#define SPARCV9_VIS1 (1<<2)
#define SPARCV9_VIS2 (1<<3) /* reserved */
#define SPARCV9_FMADD (1<<4) /* reserved for SPARC64 V */
+#define SPARCV9_BLK (1<<5) /* VIS1 block copy */
static int OPENSSL_sparcv9cap_P=SPARCV9_TICK_PRIVILEGED;
@@ -31,6 +32,8 @@ void _sparcv9_vis1_probe(void);
unsigned long _sparcv9_vis1_instrument(void);
void _sparcv9_vis2_probe(void);
void _sparcv9_fmadd_probe(void);
+size_t _sparcv9_vis1_instrument_bus(unsigned int *,size_t);
+size_t _sparcv8_vis1_instrument_bus2(unsigned int *,size_t,size_t);
unsigned long OPENSSL_rdtsc(void)
{
@@ -44,6 +47,24 @@ unsigned long OPENSSL_rdtsc(void)
return _sparcv9_rdtick();
}
+size_t OPENSSL_instrument_bus(unsigned int *out,size_t cnt)
+ {
+ if (OPENSSL_sparcv9cap_P&(SPARCV9_TICK_PRIVILEGED|SPARCV9_BLK) ==
+ SPARCV9_BLK)
+ return _sparcv9_vis1_instrument_bus(out,cnt);
+ else
+ return 0;
+ }
+
+size_t OPENSSL_instrument_bus2(unsigned int *out,size_t cnt,size_t max)
+ {
+ if (OPENSSL_sparcv9cap_P&(SPARCV9_TICK_PRIVILEGED|SPARCV9_BLK) ==
+ SPARCV9_BLK)
+ return _sparcv9_vis1_instrument_bus2(out,cnt,max);
+ else
+ return 0;
+ }
+
#if 0 && defined(__sun) && defined(__SVR4)
/* This code path is disabled, because of incompatibility of
* libdevinfo.so.1 and libmalloc.so.1 (see below for details)
@@ -112,7 +133,7 @@ void OPENSSL_cpuid_setup(void)
if (sysinfo(SI_ISALIST,si,sizeof(si))>0)
{
if (strstr(si,"+vis"))
- OPENSSL_sparcv9cap_P |= SPARCV9_VIS1;
+ OPENSSL_sparcv9cap_P |= SPARCV9_VIS1|SPARCV9_BLK;
if (strstr(si,"+vis2"))
{
OPENSSL_sparcv9cap_P |= SPARCV9_VIS2;
@@ -169,7 +190,6 @@ void OPENSSL_cpuid_setup(void)
char *e;
struct sigaction common_act,ill_oact,bus_oact;
sigset_t all_masked,oset;
- int sig;
static int trigger=0;
if (trigger) return;
@@ -211,7 +231,7 @@ void OPENSSL_cpuid_setup(void)
if (sigsetjmp(common_jmp,1) == 0)
{
_sparcv9_vis1_probe();
- OPENSSL_sparcv9cap_P |= SPARCV9_VIS1;
+ OPENSSL_sparcv9cap_P |= SPARCV9_VIS1|SPARCV9_BLK;
/* detect UltraSPARC-Tx, see sparccpud.S for details... */
if (_sparcv9_vis1_instrument() >= 12)
OPENSSL_sparcv9cap_P &= ~(SPARCV9_VIS1|SPARCV9_PREFER_FPU);
diff --git a/crypto/x86_64cpuid.pl b/crypto/x86_64cpuid.pl
index c96821a3c8..ecfcfc763c 100644
--- a/crypto/x86_64cpuid.pl
+++ b/crypto/x86_64cpuid.pl
@@ -9,8 +9,9 @@ $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
open STDOUT,"| $^X ${dir}perlasm/x86_64-xlate.pl $flavour $output";
-if ($win64) { $arg1="%rcx"; $arg2="%rdx"; }
-else { $arg1="%rdi"; $arg2="%rsi"; }
+($arg1,$arg2,$arg3,$arg4)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order
+ ("%rdi","%rsi","%rdx","%rcx"); # Unix order
+
print<<___;
.extern OPENSSL_cpuid_setup
.section .init
@@ -228,5 +229,95 @@ OPENSSL_wipe_cpu:
ret
.size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
___
+{
+my $out="%r10";
+my $cnt="%rcx";
+my $max="%r11";
+my $lasttick="%r8d";
+my $lastdiff="%r9d";
+my $redzone=win64?8:-8;
+
+print<<___;
+.globl OPENSSL_instrument_bus
+.type OPENSSL_instrument_bus,\@abi-omnipotent
+.align 16
+OPENSSL_instrument_bus:
+ mov $arg1,$out # tribute to Win64
+ mov $arg2,$cnt
+ mov $arg2,$max
+
+ rdtsc # collect 1st tick
+ mov %eax,$lasttick # lasttick = tick
+ mov \$0,$lastdiff # lastdiff = 0
+ clflush ($out)
+ lock
+ add $lastdiff,($out)
+ jmp .Loop
+.align 16
+.Loop: rdtsc
+ mov %eax,%edx
+ sub $lasttick,%eax
+ mov %edx,$lasttick
+ mov %eax,$lastdiff
+ clflush ($out)
+ lock
+ add %eax,($out)
+ lea 4($out),$out
+ sub \$1,$cnt
+ jnz .Loop
+
+ mov $max,%rax
+ ret
+.size OPENSSL_instrument_bus,.-OPENSSL_instrument_bus
+
+.globl OPENSSL_instrument_bus2
+.type OPENSSL_instrument_bus2,\@abi-omnipotent
+.align 16
+OPENSSL_instrument_bus2:
+ mov $arg1,$out # tribute to Win64
+ mov $arg2,$cnt
+ mov $arg3,$max
+ mov $cnt,$redzone(%rsp)
+
+ rdtsc # collect 1st tick
+ mov %eax,$lasttick # lasttick = tick
+ mov \$0,$lastdiff # lastdiff = 0
+
+ clflush ($out)
+ lock
+ add $lastdiff,($out)
+
+ rdtsc # collect 1st diff
+ mov %eax,%edx
+ sub $lasttick,%eax # diff
+ mov %edx,$lasttick # lasttick = tick
+ mov %eax,$lastdiff # lastdiff = diff
+.Loop2:
+ clflush ($out)
+ lock
+ add %eax,($out) # accumulate diff
+
+ sub \$1,$max
+ jz .Ldone2
+
+ rdtsc
+ mov %eax,%edx
+ sub $lasttick,%eax # diff
+ mov %edx,$lasttick # lasttick = tick
+ cmp $lastdiff,%eax
+ mov %eax,$lastdiff # lastdiff = diff
+ mov \$0,%edx
+ setne %dl
+ sub %rdx,$cnt # conditional --$cnt
+ lea ($out,%rdx,4),$out # conditional ++$out
+ jnz .Loop2
+
+.Ldone2:
+ mov $redzone(%rsp),%rax
+ sub $cnt,%rax
+ ret
+.size OPENSSL_instrument_bus2,.-OPENSSL_instrument_bus2
+___
+}
close STDOUT; # flush
diff --git a/crypto/x86cpuid.pl b/crypto/x86cpuid.pl
index a7464af19b..0513398739 100644
--- a/crypto/x86cpuid.pl
+++ b/crypto/x86cpuid.pl
@@ -307,6 +307,108 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&ret ();
&function_end_B("OPENSSL_cleanse");
+{
+my $lasttick = "esi";
+my $lastdiff = "ebx";
+my $out = "edi";
+my $cnt = "ecx";
+my $max = "ebp";
+
+&function_begin("OPENSSL_instrument_bus");
+ &mov ("eax",0);
+ if ($sse2) {
+ &picmeup("edx","OPENSSL_ia32cap_P");
+ &bt (&DWP(0,"edx"),4);
+ &jnc (&label("nogo")); # no TSC
+ &bt (&DWP(0,"edx"),19);
+ &jnc (&label("nogo")); # no CLFLUSH
+
+ &mov ($out,&wparam(0)); # load arguments
+ &mov ($cnt,&wparam(1));
+
+ # collect 1st tick
+ &rdtsc ();
+ &mov ($lasttick,"eax"); # lasttick = tick
+ &mov ($lastdiff,0); # lastdiff = 0
+ &clflush(&DWP(0,$out));
+ &lock ();
+ &add (&DWP(0,$out),$lastdiff);
+ &jmp (&label("loop"));
+
+&set_label("loop",16);
+ &rdtsc ();
+ &mov ("edx","eax"); # put aside tick (yes, I neglect edx)
+ &sub ("eax",$lasttick); # diff
+ &mov ($lasttick,"edx"); # lasttick = tick
+ &mov ($lastdiff,"eax"); # lastdiff = diff
+ &clflush(&DWP(0,$out));
+ &lock ();
+ &add (&DWP(0,$out),"eax"); # accumulate diff
+ &lea ($out,&DWP(4,$out)); # ++$out
+ &sub ($cnt,1); # --$cnt
+ &jnz (&label("loop"));
+
+ &mov ("eax",&wparam(1));
+&set_label("nogo");
+ }
+&function_end("OPENSSL_instrument_bus");
+
+&function_begin("OPENSSL_instrument_bus2");
+ &mov ("eax",0);
+ if ($sse2) {
+ &picmeup("edx","OPENSSL_ia32cap_P");
+ &bt (&DWP(0,"edx"),4);
+ &jnc (&label("nogo")); # no TSC
+ &bt (&DWP(0,"edx"),19);
+ &jnc (&label("nogo")); # no CLFLUSH
+
+ &mov ($out,&wparam(0)); # load arguments
+ &mov ($cnt,&wparam(1));
+ &mov ($max,&wparam(2));
+
+ &rdtsc (); # collect 1st tick
+ &mov ($lasttick,"eax"); # lasttick = tick
+ &mov ($lastdiff,0); # lastdiff = 0
+
+ &clflush(&DWP(0,$out));
+ &lock ();
+ &add (&DWP(0,$out),$lastdiff);
+
+ &rdtsc (); # collect 1st diff
+ &mov ("edx","eax"); # put aside tick (yes, I neglect edx)
+ &sub ("eax",$lasttick); # diff
+ &mov ($lasttick,"edx"); # lasttick = tick
+ &mov ($lastdiff,"eax"); # lastdiff = diff
+ &jmp (&label("loop2"));
+
+&set_label("loop2",16);
+ &clflush(&DWP(0,$out));
+ &lock ();
+ &add (&DWP(0,$out),"eax"); # accumulate diff
+
+ &sub ($max,1);
+ &jz (&label("done2"));
+
+ &rdtsc ();
+ &mov ("edx","eax"); # put aside tick (yes, I neglect edx)
+ &sub ("eax",$lasttick); # diff
+ &mov ($lasttick,"edx"); # lasttick = tick
+ &cmp ("eax",$lastdiff);
+ &mov ($lastdiff,"eax"); # lastdiff = diff
+ &mov ("edx",0);
+ &setne ("dl");
+ &sub ($cnt,"edx"); # conditional --$cnt
+ &lea ($out,&DWP(0,$out,"edx",4)); # conditional ++$out
+ &jnz (&label("loop2"));
+
+&set_label("done2");
+ &mov ("eax",&wparam(1));
+ &sub ("eax",$cnt);
+&set_label("nogo");
+ }
+&function_end("OPENSSL_instrument_bus2");
+}
+
&initseg("OPENSSL_cpuid_setup");
&asm_finish();