// arm_simd.h - written and placed in public domain by Jeffrey Walton /// \file arm_simd.h /// \brief Support functions for ARM and vector operations #ifndef CRYPTOPP_ARM_SIMD_H #define CRYPTOPP_ARM_SIMD_H #include "config.h" #if (CRYPTOPP_ARM_NEON_HEADER) # include #endif #if (CRYPTOPP_ARM_ACLE_HEADER) # include # include #endif #if (CRYPTOPP_ARM_PMULL_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING) /// \brief Polynomial multiplication /// \param a the first term /// \param b the second term /// \return vector product /// \details PMULL_00() performs polynomial multiplication and presents /// the result like Intel's c = _mm_clmulepi64_si128(a, b, 0x00). /// The 0x00 indicates the low 64-bits of a and b /// are multiplied. /// \note An Intel XMM register is composed of 128-bits. The leftmost bit /// is MSB and numbered 127, while the the rightmost bit is LSB and /// numbered 0. /// \since Crypto++ 8.0 inline uint64x2_t PMULL_00(const uint64x2_t a, const uint64x2_t b) { #if defined(_MSC_VER) const __n64 x = { vgetq_lane_u64(a, 0) }; const __n64 y = { vgetq_lane_u64(b, 0) }; return vmull_p64(x, y); #elif defined(__GNUC__) uint64x2_t r; __asm __volatile("pmull %0.1q, %1.1d, %2.1d \n\t" :"=w" (r) : "w" (a), "w" (b) ); return r; #else return (uint64x2_t)(vmull_p64( vgetq_lane_u64(vreinterpretq_u64_u8(a),0), vgetq_lane_u64(vreinterpretq_u64_u8(b),0))); #endif } /// \brief Polynomial multiplication /// \param a the first term /// \param b the second term /// \return vector product /// \details PMULL_01 performs() polynomial multiplication and presents /// the result like Intel's c = _mm_clmulepi64_si128(a, b, 0x01). /// The 0x01 indicates the low 64-bits of a and high /// 64-bits of b are multiplied. /// \note An Intel XMM register is composed of 128-bits. The leftmost bit /// is MSB and numbered 127, while the the rightmost bit is LSB and /// numbered 0. /// \since Crypto++ 8.0 inline uint64x2_t PMULL_01(const uint64x2_t a, const uint64x2_t b) { #if defined(_MSC_VER) const __n64 x = { vgetq_lane_u64(a, 0) }; const __n64 y = { vgetq_lane_u64(b, 1) }; return vmull_p64(x, y); #elif defined(__GNUC__) uint64x2_t r; __asm __volatile("pmull %0.1q, %1.1d, %2.1d \n\t" :"=w" (r) : "w" (a), "w" (vget_high_u64(b)) ); return r; #else return (uint64x2_t)(vmull_p64( vgetq_lane_u64(vreinterpretq_u64_u8(a),0), vgetq_lane_u64(vreinterpretq_u64_u8(b),1))); #endif } /// \brief Polynomial multiplication /// \param a the first term /// \param b the second term /// \return vector product /// \details PMULL_10() performs polynomial multiplication and presents /// the result like Intel's c = _mm_clmulepi64_si128(a, b, 0x10). /// The 0x10 indicates the high 64-bits of a and low /// 64-bits of b are multiplied. /// \note An Intel XMM register is composed of 128-bits. The leftmost bit /// is MSB and numbered 127, while the the rightmost bit is LSB and /// numbered 0. /// \since Crypto++ 8.0 inline uint64x2_t PMULL_10(const uint64x2_t a, const uint64x2_t b) { #if defined(_MSC_VER) const __n64 x = { vgetq_lane_u64(a, 1) }; const __n64 y = { vgetq_lane_u64(b, 0) }; return vmull_p64(x, y); #elif defined(__GNUC__) uint64x2_t r; __asm __volatile("pmull %0.1q, %1.1d, %2.1d \n\t" :"=w" (r) : "w" (vget_high_u64(a)), "w" (b) ); return r; #else return (uint64x2_t)(vmull_p64( vgetq_lane_u64(vreinterpretq_u64_u8(a),1), vgetq_lane_u64(vreinterpretq_u64_u8(b),0))); #endif } /// \brief Polynomial multiplication /// \param a the first term /// \param b the second term /// \return vector product /// \details PMULL_11() performs polynomial multiplication and presents /// the result like Intel's c = _mm_clmulepi64_si128(a, b, 0x11). /// The 0x11 indicates the high 64-bits of a and b /// are multiplied. /// \note An Intel XMM register is composed of 128-bits. The leftmost bit /// is MSB and numbered 127, while the the rightmost bit is LSB and /// numbered 0. /// \since Crypto++ 8.0 inline uint64x2_t PMULL_11(const uint64x2_t a, const uint64x2_t b) { #if defined(_MSC_VER) const __n64 x = { vgetq_lane_u64(a, 1) }; const __n64 y = { vgetq_lane_u64(b, 1) }; return vmull_p64(x, y); #elif defined(__GNUC__) uint64x2_t r; __asm __volatile("pmull2 %0.1q, %1.2d, %2.2d \n\t" :"=w" (r) : "w" (a), "w" (b) ); return r; #else return (uint64x2_t)(vmull_p64( vgetq_lane_u64(vreinterpretq_u64_u8(a),1), vgetq_lane_u64(vreinterpretq_u64_u8(b),1))); #endif } /// \brief Vector extraction /// \param a the first term /// \param b the second term /// \param c the byte count /// \return vector /// \details VEXT_U8() extracts the first c bytes of vector /// a and the remaining bytes in b. /// \since Crypto++ 8.0 inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b, unsigned int c) { #if defined(_MSC_VER) return (uint64x2_t)vextq_u8( vreinterpretq_u8_u64(a), vreinterpretq_u8_u64(b), c); #else uint64x2_t r; __asm __volatile("ext %0.16b, %1.16b, %2.16b, %3 \n\t" :"=w" (r) : "w" (a), "w" (b), "I" (c) ); return r; #endif } /// \brief Vector extraction /// \tparam C the byte count /// \param a the first term /// \param b the second term /// \return vector /// \details VEXT_U8() extracts the first C bytes of vector /// a and the remaining bytes in b. /// \since Crypto++ 8.0 template inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b) { // https://github.com/weidai11/cryptopp/issues/366 #if defined(_MSC_VER) return (uint64x2_t)vextq_u8( vreinterpretq_u8_u64(a), vreinterpretq_u8_u64(b), C); #else uint64x2_t r; __asm __volatile("ext %0.16b, %1.16b, %2.16b, %3 \n\t" :"=w" (r) : "w" (a), "w" (b), "I" (C) ); return r; #endif } #endif // CRYPTOPP_ARM_PMULL_AVAILABLE #endif // CRYPTOPP_ARM_SIMD_H