From f6e04e5f338d2573f182a2daabed3220ce3dda7e Mon Sep 17 00:00:00 2001
From: Jeffrey Walton
Date: Thu, 15 Nov 2018 15:17:49 -0500
Subject: Rename PPC vector functions from VectorFunc to VecFunc

---
 blake2b_simd.cpp | 124 +++++++++++++++++++++++++++----------------------------
 1 file changed, 62 insertions(+), 62 deletions(-)

(limited to 'blake2b_simd.cpp')

diff --git a/blake2b_simd.cpp b/blake2b_simd.cpp
index 853a4cb5..ee701bd4 100644
--- a/blake2b_simd.cpp
+++ b/blake2b_simd.cpp
@@ -742,7 +742,7 @@ void BLAKE2_Compress64_NEON(const byte* input, BLAKE2b_State& state)
 
 #if (CRYPTOPP_POWER8_AVAILABLE)
 
-inline uint64x2_p VectorLoad64(const void* p)
+inline uint64x2_p VecLoad64(const void* p)
 {
 #if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
     return (uint64x2_p)vec_xl(0, (uint8_t*)p);
@@ -751,18 +751,18 @@ inline uint64x2_p VectorLoad64(const void* p)
 #endif
 }
 
-inline uint64x2_p VectorLoad64LE(const void* p)
+inline uint64x2_p VecLoad64LE(const void* p)
 {
 #if __BIG_ENDIAN__
     const uint8x16_p m = {7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8};
-    const uint64x2_p v = VectorLoad64(p);
-    return vec_perm(v, v, m);
+    const uint64x2_p v = VecLoad64(p);
+    return VecPermute(v, v, m);
 #else
-    return VectorLoad64(p);
+    return VecLoad64(p);
 #endif
 }
 
-inline void VectorStore64(void* p, const uint64x2_p x)
+inline void VecStore64(void* p, const uint64x2_p x)
 {
 #if defined(__xlc__) || defined(__xlC__) || defined(__clang__)
     vec_xst((uint8x16_p)x,0,(uint8_t*)p);
@@ -771,18 +771,18 @@ inline void VectorStore64(void* p, const uint64x2_p x)
 #endif
 }
 
-inline void VectorStore64LE(void* p, const uint64x2_p x)
+inline void VecStore64LE(void* p, const uint64x2_p x)
 {
 #if __BIG_ENDIAN__
     const uint8x16_p m = {7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8};
-    VectorStore64(p, vec_perm(x, x, m));
+    VecStore64(p, VecPermute(x, x, m));
 #else
-    VectorStore64(p, x);
+    VecStore64(p, x);
 #endif
 }
 
 template <unsigned int C>
-inline uint64x2_p VectorShiftLeftOctet(const uint64x2_p a, const uint64x2_p b)
+inline uint64x2_p VecShiftLeftOctet(const uint64x2_p a, const uint64x2_p b)
 {
 #if __BIG_ENDIAN__
     return (uint64x2_p)vec_sld((uint8x16_p)a, (uint8x16_p)b, C);
@@ -791,18 +791,18 @@ inline uint64x2_p VectorShiftLeftOctet(const uint64x2_p a, const uint64x2_p b)
 #endif
 }
 
-#define vec_shl_octet(a,b,c) VectorShiftLeftOctet(a, b)
+#define vec_shl_octet(a,b,c) VecShiftLeftOctet(a, b)
 
-// vec_mergeh(a,b) is equivalent to vec_perm(a,b,HH_MASK); and
-// vec_mergel(a,b) is equivalent vec_perm(a,b,LL_MASK). Benchmarks
+// vec_mergeh(a,b) is equivalent to VecPermute(a,b,HH_MASK); and
+// vec_mergel(a,b) is equivalent VecPermute(a,b,LL_MASK). Benchmarks
 // show vec_mergeh and vec_mergel is faster on little-endian
-// machines by 0.4 cpb. Benchmarks show vec_perm is faster on
+// machines by 0.4 cpb. Benchmarks show VecPermute is faster on
 // big-endian machines by 1.5 cpb. The code that uses
 // vec_mergeh and vec_mergel is about 880 bytes shorter.
 
 #if defined(__GNUC__) && (__BIG_ENDIAN__)
-# define vec_merge_hi(a,b) vec_perm(a,b, HH_MASK)
-# define vec_merge_lo(a,b) vec_perm(a,b, LL_MASK)
+# define vec_merge_hi(a,b) VecPermute(a,b, HH_MASK)
+# define vec_merge_lo(a,b) VecPermute(a,b, LL_MASK)
 #else
 # define vec_merge_hi(a,b) vec_mergeh(a,b)
 # define vec_merge_lo(a,b) vec_mergel(a,b)
@@ -878,12 +878,12 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state)
 #define BLAKE2B_LOAD_MSG_2_2(b0, b1) \
 do { \
     b0 = vec_merge_hi(m4, m0); \
-    b1 = vec_perm(m1, m6, HL_MASK); \
+    b1 = VecPermute(m1, m6, HL_MASK); \
 } while(0)
 
 #define BLAKE2B_LOAD_MSG_2_3(b0, b1) \
 do { \
-    b0 = vec_perm(m5, m1, HL_MASK); \
+    b0 = VecPermute(m5, m1, HL_MASK); \
     b1 = vec_merge_lo(m3, m4); \
 } while(0)
 
@@ -907,8 +907,8 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state)
 
 #define BLAKE2B_LOAD_MSG_3_3(b0, b1) \
 do { \
-    b0 = vec_perm(m1, m2, HL_MASK); \
-    b1 = vec_perm(m2, m7, HL_MASK); \
+    b0 = VecPermute(m1, m2, HL_MASK); \
+    b1 = VecPermute(m2, m7, HL_MASK); \
 } while(0)
 
 #define BLAKE2B_LOAD_MSG_3_4(b0, b1) \
@@ -925,20 +925,20 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state)
 
 #define BLAKE2B_LOAD_MSG_4_2(b0, b1) \
 do { \
-    b0 = vec_perm(m0, m3, HL_MASK); \
-    b1 = vec_perm(m2, m7, HL_MASK); \
+    b0 = VecPermute(m0, m3, HL_MASK); \
+    b1 = VecPermute(m2, m7, HL_MASK); \
 } while(0)
 
 #define BLAKE2B_LOAD_MSG_4_3(b0, b1) \
 do { \
-    b0 = vec_perm(m7, m5, HL_MASK); \
-    b1 = vec_perm(m3, m1, HL_MASK); \
+    b0 = VecPermute(m7, m5, HL_MASK); \
+    b1 = VecPermute(m3, m1, HL_MASK); \
 } while(0)
 
 #define BLAKE2B_LOAD_MSG_4_4(b0, b1) \
 do { \
     b0 = vec_shl_octet(m0, m6, 1); \
-    b1 = vec_perm(m4, m6, HL_MASK); \
+    b1 = VecPermute(m4, m6, HL_MASK); \
 } while(0)
 
 #define BLAKE2B_LOAD_MSG_5_1(b0, b1) \
@@ -955,19 +955,19 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state)
 
 #define BLAKE2B_LOAD_MSG_5_3(b0, b1) \
 do { \
-    b0 = vec_perm(m2, m3, HL_MASK); \
+    b0 = VecPermute(m2, m3, HL_MASK); \
     b1 = vec_merge_lo(m7, m0); \
 } while(0)
 
 #define BLAKE2B_LOAD_MSG_5_4(b0, b1) \
 do { \
     b0 = vec_merge_lo(m6, m2); \
-    b1 = vec_perm(m7, m4, HL_MASK); \
+    b1 = VecPermute(m7, m4, HL_MASK); \
 } while(0)
 
 #define BLAKE2B_LOAD_MSG_6_1(b0, b1) \
 do { \
-    b0 = vec_perm(m6, m0, HL_MASK); \
+    b0 = VecPermute(m6, m0, HL_MASK); \
     b1 = vec_merge_hi(m7, m2); \
 } while(0)
 
@@ -986,13 +986,13 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state)
 #define BLAKE2B_LOAD_MSG_6_4(b0, b1) \
 do { \
     b0 = vec_merge_lo(m3, m1); \
-    b1 = vec_perm(m1, m5, HL_MASK); \
+    b1 = VecPermute(m1, m5, HL_MASK); \
 } while(0)
 
 #define BLAKE2B_LOAD_MSG_7_1(b0, b1) \
 do { \
     b0 = vec_merge_lo(m6, m3); \
-    b1 = vec_perm(m6, m1, HL_MASK); \
+    b1 = VecPermute(m6, m1, HL_MASK); \
 } while(0)
 
 #define BLAKE2B_LOAD_MSG_7_2(b0, b1) \
@@ -1033,7 +1033,7 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state)
 
 #define BLAKE2B_LOAD_MSG_8_4(b0, b1) \
 do { \
-    b0 = vec_perm(m1, m3, HL_MASK); \
+    b0 = VecPermute(m1, m3, HL_MASK); \
     b1 = m2; \
 } while(0)
 
@@ -1046,7 +1046,7 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state)
 #define BLAKE2B_LOAD_MSG_9_2(b0, b1) \
 do { \
     b0 = vec_merge_hi(m1, m2); \
-    b1 = vec_perm(m3, m2, HL_MASK); \
+    b1 = VecPermute(m3, m2, HL_MASK); \
 } while(0)
 
 #define BLAKE2B_LOAD_MSG_9_3(b0, b1) \
@@ -1122,23 +1122,23 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state)
 
 #define BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
 do { \
-    row1l = vec_add(vec_add(row1l, b0), row2l); \
-    row1h = vec_add(vec_add(row1h, b1), row2h); \
-    row4l = vec_xor(row4l, row1l); row4h = vec_xor(row4h, row1h); \
+    row1l = VecAdd(VecAdd(row1l, b0), row2l); \
+    row1h = VecAdd(VecAdd(row1h, b1), row2h); \
+    row4l = VecXor(row4l, row1l); row4h = VecXor(row4h, row1h); \
     row4l = vec_ror_32(row4l); row4h = vec_ror_32(row4h); \
-    row3l = vec_add(row3l, row4l); row3h = vec_add(row3h, row4h); \
-    row2l = vec_xor(row2l, row3l); row2h = vec_xor(row2h, row3h); \
+    row3l = VecAdd(row3l, row4l); row3h = VecAdd(row3h, row4h); \
+    row2l = VecXor(row2l, row3l); row2h = VecXor(row2h, row3h); \
    row2l = vec_ror_24(row2l); row2h = vec_ror_24(row2h); \
 } while(0)
 
 #define BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
 do { \
-    row1l = vec_add(vec_add(row1l, b0), row2l); \
-    row1h = vec_add(vec_add(row1h, b1), row2h); \
-    row4l = vec_xor(row4l, row1l); row4h = vec_xor(row4h, row1h); \
+    row1l = VecAdd(VecAdd(row1l, b0), row2l); \
+    row1h = VecAdd(VecAdd(row1h, b1), row2h); \
+    row4l = VecXor(row4l, row1l); row4h = VecXor(row4h, row1h); \
     row4l = vec_ror_16(row4l); row4h = vec_ror_16(row4h); \
-    row3l = vec_add(row3l, row4l); row3h = vec_add(row3h, row4h); \
-    row2l = vec_xor(row2l, row3l); row2h = vec_xor(row2h, row3h); \
+    row3l = VecAdd(row3l, row4l); row3h = VecAdd(row3h, row4h); \
+    row2l = VecXor(row2l, row3l); row2h = VecXor(row2h, row3h); \
     row2l = vec_ror_63(row2l); row2h = vec_ror_63(row2h); \
 } while(0)
 
@@ -1175,27 +1175,27 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state)
     BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
 } while(0)
 
-    const uint64x2_p m0 = VectorLoad64LE(input + 00);
-    const uint64x2_p m1 = VectorLoad64LE(input + 16);
-    const uint64x2_p m2 = VectorLoad64LE(input + 32);
-    const uint64x2_p m3 = VectorLoad64LE(input + 48);
-    const uint64x2_p m4 = VectorLoad64LE(input + 64);
-    const uint64x2_p m5 = VectorLoad64LE(input + 80);
-    const uint64x2_p m6 = VectorLoad64LE(input + 96);
-    const uint64x2_p m7 = VectorLoad64LE(input + 112);
+    const uint64x2_p m0 = VecLoad64LE(input + 00);
+    const uint64x2_p m1 = VecLoad64LE(input + 16);
+    const uint64x2_p m2 = VecLoad64LE(input + 32);
+    const uint64x2_p m3 = VecLoad64LE(input + 48);
+    const uint64x2_p m4 = VecLoad64LE(input + 64);
+    const uint64x2_p m5 = VecLoad64LE(input + 80);
+    const uint64x2_p m6 = VecLoad64LE(input + 96);
+    const uint64x2_p m7 = VecLoad64LE(input + 112);
 
     uint64x2_p row1l, row1h, row2l, row2h;
     uint64x2_p row3l, row3h, row4l, row4h;
 
-    const uint64x2_p h0 = row1l = VectorLoad64LE(&state.h[0]);
-    const uint64x2_p h1 = row1h = VectorLoad64LE(&state.h[2]);
-    const uint64x2_p h2 = row2l = VectorLoad64LE(&state.h[4]);
-    const uint64x2_p h3 = row2h = VectorLoad64LE(&state.h[6]);
+    const uint64x2_p h0 = row1l = VecLoad64LE(&state.h[0]);
+    const uint64x2_p h1 = row1h = VecLoad64LE(&state.h[2]);
+    const uint64x2_p h2 = row2l = VecLoad64LE(&state.h[4]);
+    const uint64x2_p h3 = row2h = VecLoad64LE(&state.h[6]);
 
-    row3l = VectorLoad64(&BLAKE2B_IV[0]);
-    row3h = VectorLoad64(&BLAKE2B_IV[2]);
-    row4l = vec_xor(VectorLoad64(&BLAKE2B_IV[4]), VectorLoad64(&state.tf[0]));
-    row4h = vec_xor(VectorLoad64(&BLAKE2B_IV[6]), VectorLoad64(&state.tf[2]));
+    row3l = VecLoad64(&BLAKE2B_IV[0]);
+    row3h = VecLoad64(&BLAKE2B_IV[2]);
+    row4l = VecXor(VecLoad64(&BLAKE2B_IV[4]), VecLoad64(&state.tf[0]));
+    row4h = VecXor(VecLoad64(&BLAKE2B_IV[6]), VecLoad64(&state.tf[2]));
 
     BLAKE2B_ROUND(0);
     BLAKE2B_ROUND(1);
@@ -1210,10 +1210,10 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state)
     BLAKE2B_ROUND(10);
     BLAKE2B_ROUND(11);
 
-    VectorStore64LE(&state.h[0], vec_xor(h0, vec_xor(row1l, row3l)));
-    VectorStore64LE(&state.h[2], vec_xor(h1, vec_xor(row1h, row3h)));
-    VectorStore64LE(&state.h[4], vec_xor(h2, vec_xor(row2l, row4l)));
-    VectorStore64LE(&state.h[6], vec_xor(h3, vec_xor(row2h, row4h)));
+    VecStore64LE(&state.h[0], VecXor(h0, VecXor(row1l, row3l)));
+    VecStore64LE(&state.h[2], VecXor(h1, VecXor(row1h, row3h)));
+    VecStore64LE(&state.h[4], VecXor(h2, VecXor(row2l, row4l)));
+    VecStore64LE(&state.h[6], VecXor(h3, VecXor(row2h, row4h)));
 }
 
 #endif // CRYPTOPP_POWER8_AVAILABLE
--
cgit v1.2.1
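
A note for readers unfamiliar with the helpers being renamed above: the sketch below shows the idea behind VecLoad64LE/VecStore64LE, an unaligned 16-byte load or store plus a per-lane byte reverse on big-endian targets, so the two 64-bit words are always consumed in little-endian order. It is an illustration only and not Crypto++ code; the Sketch* names and the sample values are invented here, and it assumes a GCC or Clang toolchain with AltiVec/VSX enabled (-maltivec -mvsx), using only the vec_vsx_ld, vec_vsx_st and vec_perm built-ins from <altivec.h>.

// Illustrative sketch only -- not part of the patch or of Crypto++.
// Assumes GCC/Clang on a VSX-capable PowerPC with -maltivec -mvsx.
#include <altivec.h>
#include <stdint.h>
#include <stdio.h>

typedef __vector unsigned long long uint64x2_p;   // two 64-bit lanes
typedef __vector unsigned char      uint8x16_p;   // sixteen bytes

// Unaligned 16-byte load/store, roughly what the plain load/store wrappers do.
static inline uint64x2_p SketchLoad64(const void* p)
{
    return (uint64x2_p)vec_vsx_ld(0, (const unsigned char*)p);
}

static inline void SketchStore64(void* p, uint64x2_p x)
{
    vec_vsx_st((uint8x16_p)x, 0, (unsigned char*)p);
}

// Little-endian views: on a big-endian target each 64-bit lane is
// byte-reversed with a permute, mirroring the LE variants in the patch.
static inline uint64x2_p SketchLoad64LE(const void* p)
{
#if defined(__BIG_ENDIAN__)
    const uint8x16_p m = {7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8};
    const uint64x2_p v = SketchLoad64(p);
    return (uint64x2_p)vec_perm((uint8x16_p)v, (uint8x16_p)v, m);
#else
    return SketchLoad64(p);
#endif
}

static inline void SketchStore64LE(void* p, uint64x2_p x)
{
#if defined(__BIG_ENDIAN__)
    const uint8x16_p m = {7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8};
    SketchStore64(p, (uint64x2_p)vec_perm((uint8x16_p)x, (uint8x16_p)x, m));
#else
    SketchStore64(p, x);
#endif
}

int main()
{
    uint64_t in[2]  = {0x0102030405060708ULL, 0x1112131415161718ULL};
    uint64_t out[2] = {0, 0};

    // Load-then-store round-trips on either endianness.
    SketchStore64LE(out, SketchLoad64LE(in));
    printf("%016llx %016llx\n",
           (unsigned long long)out[0], (unsigned long long)out[1]);
    return 0;
}

On a little-endian build the #else branches are taken and no permute is issued; the split into a plain variant and an LE variant exists so big-endian targets read the message words in the same order.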