From 0e5c920fb69f2d962db1df045d1b71b9b012b902 Mon Sep 17 00:00:00 2001 From: Bassam Tabbara Date: Tue, 13 Sep 2016 10:19:24 -0700 Subject: gf_multby_one now checks runtime SIMD support --- src/gf.c | 128 ++++++++++++++++++++++++++++++++------------------------------- 1 file changed, 65 insertions(+), 63 deletions(-) diff --git a/src/gf.c b/src/gf.c index feeafdc..84d6996 100644 --- a/src/gf.c +++ b/src/gf.c @@ -912,9 +912,6 @@ static void gf_unaligned_xor(void *src, void *dest, int bytes); void gf_multby_one(void *src, void *dest, int bytes, int xor) { -#ifdef INTEL_SSE2 - __m128i ms, md; -#endif unsigned long uls, uld; uint8_t *s8, *d8; uint64_t *s64, *d64, *dtop64; @@ -929,84 +926,89 @@ void gf_multby_one(void *src, void *dest, int bytes, int xor) uld = (unsigned long) dest; #ifdef INTEL_SSE2 - int abytes; - s8 = (uint8_t *) src; - d8 = (uint8_t *) dest; - if (uls % 16 == uld % 16) { - gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16); - while (s8 != rd.s_start) { - *d8 ^= *s8; - d8++; - s8++; + if (gf_cpu_supports_intel_sse2) { + __m128i ms, md; + int abytes; + s8 = (uint8_t *) src; + d8 = (uint8_t *) dest; + if (uls % 16 == uld % 16) { + gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16); + while (s8 != rd.s_start) { + *d8 ^= *s8; + d8++; + s8++; + } + while (s8 < (uint8_t *) rd.s_top) { + ms = _mm_load_si128 ((__m128i *)(s8)); + md = _mm_load_si128 ((__m128i *)(d8)); + md = _mm_xor_si128(md, ms); + _mm_store_si128((__m128i *)(d8), md); + s8 += 16; + d8 += 16; + } + while (s8 != (uint8_t *) src + bytes) { + *d8 ^= *s8; + d8++; + s8++; + } + return; } - while (s8 < (uint8_t *) rd.s_top) { - ms = _mm_load_si128 ((__m128i *)(s8)); - md = _mm_load_si128 ((__m128i *)(d8)); + + abytes = (bytes & 0xfffffff0); + + while (d8 < (uint8_t *) dest + abytes) { + ms = _mm_loadu_si128 ((__m128i *)(s8)); + md = _mm_loadu_si128 ((__m128i *)(d8)); md = _mm_xor_si128(md, ms); - _mm_store_si128((__m128i *)(d8), md); + _mm_storeu_si128((__m128i *)(d8), md); s8 += 16; d8 += 16; } - while (s8 != (uint8_t *) src + bytes) { + while (d8 != (uint8_t *) dest+bytes) { *d8 ^= *s8; d8++; s8++; } return; } - - abytes = (bytes & 0xfffffff0); - - while (d8 < (uint8_t *) dest + abytes) { - ms = _mm_loadu_si128 ((__m128i *)(s8)); - md = _mm_loadu_si128 ((__m128i *)(d8)); - md = _mm_xor_si128(md, ms); - _mm_storeu_si128((__m128i *)(d8), md); - s8 += 16; - d8 += 16; - } - while (d8 != (uint8_t *) dest+bytes) { - *d8 ^= *s8; - d8++; - s8++; - } - return; #endif #if defined(ARM_NEON) - s8 = (uint8_t *) src; - d8 = (uint8_t *) dest; - - if (uls % 16 == uld % 16) { - gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16); - while (s8 != rd.s_start) { + if (gf_cpu_supports_arm_neon) { + s8 = (uint8_t *) src; + d8 = (uint8_t *) dest; + + if (uls % 16 == uld % 16) { + gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16); + while (s8 != rd.s_start) { + *d8 ^= *s8; + s8++; + d8++; + } + while (s8 < (uint8_t *) rd.s_top) { + uint8x16_t vs = vld1q_u8 (s8); + uint8x16_t vd = vld1q_u8 (d8); + uint8x16_t vr = veorq_u8 (vs, vd); + vst1q_u8 (d8, vr); + s8 += 16; + d8 += 16; + } + } else { + while (s8 + 15 < (uint8_t *) src + bytes) { + uint8x16_t vs = vld1q_u8 (s8); + uint8x16_t vd = vld1q_u8 (d8); + uint8x16_t vr = veorq_u8 (vs, vd); + vst1q_u8 (d8, vr); + s8 += 16; + d8 += 16; + } + } + while (s8 < (uint8_t *) src + bytes) { *d8 ^= *s8; s8++; d8++; } - while (s8 < (uint8_t *) rd.s_top) { - uint8x16_t vs = vld1q_u8 (s8); - uint8x16_t vd = vld1q_u8 (d8); - uint8x16_t vr = veorq_u8 (vs, vd); - vst1q_u8 (d8, vr); - s8 += 16; - d8 += 16; - } - } else { - while (s8 + 15 < (uint8_t *) src + bytes) { - uint8x16_t vs = vld1q_u8 (s8); - uint8x16_t vd = vld1q_u8 (d8); - uint8x16_t vr = veorq_u8 (vs, vd); - vst1q_u8 (d8, vr); - s8 += 16; - d8 += 16; - } - } - while (s8 < (uint8_t *) src + bytes) { - *d8 ^= *s8; - s8++; - d8++; + return; } - return; #endif if (uls % 8 != uld % 8) { gf_unaligned_xor(src, dest, bytes); -- cgit v1.2.1