From 920db1ca0107a2cc02b2f21a8b6ad11ecc8e5e44 Mon Sep 17 00:00:00 2001
From: Jeffrey Walton
Date: Sun, 10 May 2020 00:40:25 -0400
Subject: Use vec_sld in VecAdd64 and VecSub64

Using the proper mask and a shift saves a load. Thanks to Steven Munroe
for the guidance.
---
 ppc_simd.h | 51 +++++++++++++++++++++++----------------------------
 1 file changed, 23 insertions(+), 28 deletions(-)

diff --git a/ppc_simd.h b/ppc_simd.h
index 232f9f21..91e4b2aa 100644
--- a/ppc_simd.h
+++ b/ppc_simd.h
@@ -1969,24 +1969,24 @@ inline uint32x4_p VecAdd64(const uint32x4_p& vec1, const uint32x4_p& vec2)
 #if defined(_ARCH_PWR8) && !defined(CRYPTOPP_DEBUG)
     return (uint32x4_p)vec_add((uint64x2_p)vec1, (uint64x2_p)vec2);
 #else
-    // The carry mask selects carries for elements 1 and 3 and sets
-    // remaining elements to 0. The mask also shifts the carried values
-    // left by 4 bytes so the carries are added to elements 0 and 2.
-
-    // Small optimization... We can avoid a zero vector {0,0,0,0} and the
-    // load by using an element that will always be 0. Bytes 1,2, 5,6, 9,10,
-    // 13,14 are zero because we are using a vector unsigned int. There are
-    // no carries into those bytes using a vector unsigned int. The
-    // code below uses byte 2 for the 0 value.
+    // The carry mask selects carries for elements 1 and 3 and sets
+    // remaining elements to 0. The result is then shifted so the
+    // carried values are added to elements 0 and 2.
 #if defined(CRYPTOPP_BIG_ENDIAN)
-    const uint8x16_p cmask = {4,5,6,7, 2,2,2,2, 12,13,14,15, 2,2,2,2};
+    const uint32x4_p zero = {0, 0, 0, 0};
+    const uint32x4_p mask = {0, 1, 0, 1};
 #else
-    const uint8x16_p cmask = {2,2,2,2, 0,1,2,3, 2,2,2,2, 8,9,10,11};
+    const uint32x4_p zero = {0, 0, 0, 0};
+    const uint32x4_p mask = {1, 0, 1, 0};
 #endif

+    // addc returns the carry, so we only need to mask it and
+    // shift it into position.
     uint32x4_p cy = vec_addc(vec1, vec2);
-    cy = vec_perm(cy, cy, cmask);
-    return vec_add(vec_add(vec1, vec2), cy);
+    uint32x4_p res = vec_add(vec1, vec2);
+    cy = vec_and(mask, cy);
+    cy = vec_sld (cy, zero, 4);
+    return vec_add(res, cy);
 #endif
 }

@@ -2035,28 +2035,23 @@ inline uint32x4_p VecSub64(const uint32x4_p& vec1, const uint32x4_p& vec2)
     return (uint32x4_p)vec_sub((uint64x2_p)vec1, (uint64x2_p)vec2);
 #else
     // The borrow mask selects borrows for elements 1 and 3 and sets
-    // remaining elements to 0. The mask also shifts the borrowed values
-    // left by 4 bytes so the borrows are subtracted from elements 0 and 2.
-
-    // Small optimization... We can avoid a zero vector {0,0,0,0} and the
-    // load by using an element that will always be 0. Bytes 1,2, 5,6, 9,10,
-    // 13,14 are zero because we are using a vector unsigned int. There are
-    // no borrows from those bytes using a vector unsigned int. The
-    // code below uses byte 2 for the 0 value.
+    // remaining elements to 0. The result is then shifted so the
+    // borrowed values are subtracted from elements 0 and 2.
 #if defined(CRYPTOPP_BIG_ENDIAN)
-    const uint8x16_p bmask = {4,5,6,7, 2,2,2,2, 12,13,14,15, 2,2,2,2};
-    const uint32x4_p amask = {1, 1, 1, 1};
+    const uint32x4_p zero = {0, 0, 0, 0};
+    const uint32x4_p mask = {0, 1, 0, 1};
 #else
-    const uint8x16_p bmask = {2,2,2,2, 0,1,2,3, 2,2,2,2, 8,9,10,11};
-    const uint32x4_p amask = {1, 1, 1, 1};
+    const uint32x4_p zero = {0, 0, 0, 0};
+    const uint32x4_p mask = {1, 0, 1, 0};
 #endif

     // subc sets the complement of borrow, so we have to un-complement it
     // using andc.
     uint32x4_p bw = vec_subc(vec1, vec2);
-    bw = vec_andc(amask, bw);
-    bw = vec_perm(bw, bw, bmask);
-    return vec_sub(vec_sub(vec1, vec2), bw);
+    uint32x4_p res = vec_sub(vec1, vec2);
+    bw = vec_andc(mask, bw);
+    bw = vec_sld (bw, zero, 4);
+    return vec_sub(res, bw);
 #endif
 }

-- 
cgit v1.2.1
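
Editor's note: for readers unfamiliar with the multi-precision trick the
patch relies on, here is a minimal scalar sketch. It is not part of
Crypto++; the Lane64 type and the add64 name are illustrative only. It
models what one 64-bit lane of VecAdd64 computes on the pre-POWER8 path:
vec_addc supplies the carry out of each 32-bit word, the mask keeps only
the carries out of the low words, and vec_sld slides them over to the
high words.

    #include <cassert>
    #include <cstdint>

    // One 64-bit lane as two 32-bit words. The vector code holds two such
    // lanes in a uint32x4_p; hi/lo here play the roles of elements 0/1
    // (big-endian element ordering).
    struct Lane64 { uint32_t hi, lo; };

    // Scalar model of the VecAdd64 fallback: add the low words, detect the
    // carry (what vec_addc yields), and fold it into the high words (what
    // the mask plus vec_sld accomplish across the whole vector).
    Lane64 add64(Lane64 a, Lane64 b)
    {
        uint32_t lo = a.lo + b.lo;
        uint32_t cy = (lo < a.lo) ? 1 : 0;  // carry out of the low word
        uint32_t hi = a.hi + b.hi + cy;     // carry added to the high word
        return { hi, lo };
    }

    int main()
    {
        // 0x00000000ffffffff + 1 = 0x0000000100000000
        Lane64 r = add64({0, 0xffffffffu}, {0, 1});
        assert(r.hi == 1 && r.lo == 0);
        return 0;
    }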
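
Editor's note: the "complement of borrow" remark in VecSub64 deserves the
same treatment. A minimal sketch, again with illustrative names only:
vec_subc yields 1 when no borrow occurs and 0 when one does, so
vec_andc(mask, bw) computes mask & ~bw, recovering the true borrow in the
masked lanes before vec_sld slides it under the high words.

    #include <cassert>
    #include <cstdint>

    struct Lane64 { uint32_t hi, lo; };

    // Scalar model of the VecSub64 fallback. Note the inversion: subc
    // produces the COMPLEMENT of the borrow (1 means "no borrow"), which
    // is why the vector code un-complements it with vec_andc before the
    // shift.
    Lane64 sub64(Lane64 a, Lane64 b)
    {
        uint32_t nb = (a.lo >= b.lo) ? 1 : 0;  // complement of borrow, as subc returns
        uint32_t bw = 1u & ~nb;                // un-complement, like vec_andc(mask, bw)
        uint32_t lo = a.lo - b.lo;
        uint32_t hi = a.hi - b.hi - bw;        // borrow subtracted from the high word
        return { hi, lo };
    }

    int main()
    {
        // 0x0000000100000000 - 1 = 0x00000000ffffffff
        Lane64 r = sub64({1, 0}, {0, 1});
        assert(r.hi == 0 && r.lo == 0xffffffffu);
        return 0;
    }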