summaryrefslogtreecommitdiff
path: root/panama.cpp
diff options
context:
space:
mode:
authorJeffrey Walton <noloader@gmail.com>2018-08-23 14:42:29 -0400
committerGitHub <noreply@github.com>2018-08-23 14:42:29 -0400
commitafbd3e60f68ff8d9ae1b90d9c3c4eb576f518dbd (patch)
tree99d43bac5e27abb44453e699ae308407f25fba3e /panama.cpp
parente054d36dc88d004efb16f6afe1234b4ea94f995c (diff)
downloadcryptopp-git-afbd3e60f68ff8d9ae1b90d9c3c4eb576f518dbd.tar.gz
Fix alignment on Win32 and Solaris Sparc (PR #709)
These fixes were interesting in a morbid sort of way. I thought the FixedSizeAllocatorWithCleanup specializations faithfully reproduced semantics but I was wrong on Win32 and Sparc. Also see Commit e054d36dc88d. It seems there was another requirement or dependency that we missed, but it was not readily apparent. If I am parsing results correctly (which I may not be), it appears the bit twiddling using 8 byte alignment had more influence on alignment than I originally thought based on use of CRYPTOPP_BOOL_ALIGN16 and T_Align16. Or maybe the alignment attributes specified by CRYPTOPP_ALIGN_DATA are not being honored like they should for stack allocations. This check-in avoids some uses of x86 movdqa (aligned) in favor of movdqu (unaligned). The uses were concentrated on memory operands which were 8-byte aligned instead of 16-byte aligned. It is not clear to me how the specializations lost 8 bytes of alignment. The check-in also enlists CRYPTOPP_ASSERT to tell us when there's a problem so we don't need to go hunting for bugs.
Diffstat (limited to 'panama.cpp')
-rw-r--r--panama.cpp56
1 file changed, 29 insertions, 27 deletions
diff --git a/panama.cpp b/panama.cpp
index 19b5fcca..351165ab 100644
--- a/panama.cpp
+++ b/panama.cpp
@@ -93,10 +93,10 @@ void CRYPTOPP_NOINLINE Panama_SSE2_Pull(size_t count, word32 *state, word32 *z,
AS_PUSH_IF86( cx)
#endif
- AS2( movdqa xmm0, XMMWORD_PTR [AS_REG_2+0*16])
- AS2( movdqa xmm1, XMMWORD_PTR [AS_REG_2+1*16])
- AS2( movdqa xmm2, XMMWORD_PTR [AS_REG_2+2*16])
- AS2( movdqa xmm3, XMMWORD_PTR [AS_REG_2+3*16])
+ AS2( movdqu xmm0, XMMWORD_PTR [AS_REG_2+0*16])
+ AS2( movdqu xmm1, XMMWORD_PTR [AS_REG_2+1*16])
+ AS2( movdqu xmm2, XMMWORD_PTR [AS_REG_2+2*16])
+ AS2( movdqu xmm3, XMMWORD_PTR [AS_REG_2+3*16])
AS2( mov eax, dword ptr [AS_REG_2+4*16])
ASL(4)
@@ -184,8 +184,8 @@ void CRYPTOPP_NOINLINE Panama_SSE2_Pull(size_t count, word32 *state, word32 *z,
ASL(1)
AS2( test AS_REG_3, 15)
ASJ( jnz, 3, f)
- AS2( movdqa XMMWORD_PTR [AS_REG_3], xmm4)
- AS2( movdqa XMMWORD_PTR [AS_REG_3+16], xmm6)
+ AS2( movdqu XMMWORD_PTR [AS_REG_3], xmm4)
+ AS2( movdqu XMMWORD_PTR [AS_REG_3+16], xmm6)
AS2( add AS_REG_3, 32)
ASJ( jmp, 0, f)
ASL(3)
@@ -200,24 +200,26 @@ void CRYPTOPP_NOINLINE Panama_SSE2_Pull(size_t count, word32 *state, word32 *z,
AS2( lea AS_REG_7, [AS_REG_6 + (32-24)*32])
AS2( and AS_REG_7, 31*32)
- AS2( movdqa xmm0, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+0*8])
+ AS2( movdqu xmm0, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+0*8])
AS2( pxor xmm3, xmm0)
ASS( pshufd xmm0, xmm0, 2, 3, 0, 1)
- AS2( movdqa XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+0*8], xmm3)
- AS2( pxor xmm0, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+2*8])
- AS2( movdqa XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+2*8], xmm0)
+ AS2( movdqu XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+0*8], xmm3)
+ AS2( movdqu xmm5, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+2*8])
+ AS2( pxor xmm0, xmm5)
+ AS2( movdqu XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+2*8], xmm0)
- AS2( movdqa xmm4, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+2*8])
+ AS2( movdqu xmm4, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+2*8])
AS2( pxor xmm1, xmm4)
- AS2( movdqa XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+2*8], xmm1)
- AS2( pxor xmm4, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+0*8])
- AS2( movdqa XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+0*8], xmm4)
+ AS2( movdqu XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+2*8], xmm1)
+ AS2( movdqu xmm5, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+0*8])
+ AS2( pxor xmm4, xmm5)
+ AS2( movdqu XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+0*8], xmm4)
// theta
- AS2( movdqa xmm3, XMMWORD_PTR [AS_REG_2+3*16])
- AS2( movdqa xmm2, XMMWORD_PTR [AS_REG_2+2*16])
- AS2( movdqa xmm1, XMMWORD_PTR [AS_REG_2+1*16])
- AS2( movdqa xmm0, XMMWORD_PTR [AS_REG_2+0*16])
+ AS2( movdqu xmm3, XMMWORD_PTR [AS_REG_2+3*16])
+ AS2( movdqu xmm2, XMMWORD_PTR [AS_REG_2+2*16])
+ AS2( movdqu xmm1, XMMWORD_PTR [AS_REG_2+1*16])
+ AS2( movdqu xmm0, XMMWORD_PTR [AS_REG_2+0*16])
#if CRYPTOPP_SSSE3_ASM_AVAILABLE
AS2( test AS_REG_6, 1)
@@ -271,16 +273,16 @@ void CRYPTOPP_NOINLINE Panama_SSE2_Pull(size_t count, word32 *state, word32 *z,
AS2( lea AS_REG_7, [AS_REG_6 + 16*32])
AS2( and AS_REG_7, 31*32)
- AS2( movdqa xmm4, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+0*16])
- AS2( movdqa xmm5, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+0*16])
+ AS2( movdqu xmm4, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+0*16])
+ AS2( movdqu xmm5, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+0*16])
AS2( movdqa xmm6, xmm4)
AS2( punpcklqdq xmm4, xmm5)
AS2( punpckhqdq xmm6, xmm5)
AS2( pxor xmm3, xmm4)
AS2( pxor xmm2, xmm6)
- AS2( movdqa xmm4, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+1*16])
- AS2( movdqa xmm5, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+1*16])
+ AS2( movdqu xmm4, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+1*16])
+ AS2( movdqu xmm5, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+1*16])
AS2( movdqa xmm6, xmm4)
AS2( punpcklqdq xmm4, xmm5)
AS2( punpckhqdq xmm6, xmm5)
@@ -294,10 +296,10 @@ void CRYPTOPP_NOINLINE Panama_SSE2_Pull(size_t count, word32 *state, word32 *z,
// save state
AS2( mov [AS_REG_2+4*16], eax)
- AS2( movdqa XMMWORD_PTR [AS_REG_2+3*16], xmm3)
- AS2( movdqa XMMWORD_PTR [AS_REG_2+2*16], xmm2)
- AS2( movdqa XMMWORD_PTR [AS_REG_2+1*16], xmm1)
- AS2( movdqa XMMWORD_PTR [AS_REG_2+0*16], xmm0)
+ AS2( movdqu XMMWORD_PTR [AS_REG_2+3*16], xmm3)
+ AS2( movdqu XMMWORD_PTR [AS_REG_2+2*16], xmm2)
+ AS2( movdqu XMMWORD_PTR [AS_REG_2+1*16], xmm1)
+ AS2( movdqu XMMWORD_PTR [AS_REG_2+0*16], xmm0)
#if CRYPTOPP_BOOL_X86
AS2( add esp, 4)
@@ -329,7 +331,7 @@ void CRYPTOPP_NOINLINE Panama_SSE2_Pull(size_t count, word32 *state, word32 *z,
#else
}
#endif
-#endif // #ifdef CRYPTOPP_SSE2_ASM_AVAILABLE
+#endif // CRYPTOPP_SSE2_ASM_AVAILABLE
#ifndef CRYPTOPP_GENERATE_X64_MASM