summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBassam Tabbara <bassam@symform.com>2016-09-13 10:19:24 -0700
committerBassam Tabbara <bassam@symform.com>2016-09-13 12:25:00 -0700
commit0e5c920fb69f2d962db1df045d1b71b9b012b902 (patch)
tree795e16e1eb193b877dbd8b81c33040216f8af5c8
parentad11042132c7db78e8ae57a364c37df74572e8b6 (diff)
downloadgf-complete-0e5c920fb69f2d962db1df045d1b71b9b012b902.tar.gz
gf_multby_one now checks runtime SIMD support
-rw-r--r--src/gf.c128
1 files changed, 65 insertions, 63 deletions
diff --git a/src/gf.c b/src/gf.c
index feeafdc..84d6996 100644
--- a/src/gf.c
+++ b/src/gf.c
@@ -912,9 +912,6 @@ static void gf_unaligned_xor(void *src, void *dest, int bytes);
void gf_multby_one(void *src, void *dest, int bytes, int xor)
{
-#ifdef INTEL_SSE2
- __m128i ms, md;
-#endif
unsigned long uls, uld;
uint8_t *s8, *d8;
uint64_t *s64, *d64, *dtop64;
@@ -929,84 +926,89 @@ void gf_multby_one(void *src, void *dest, int bytes, int xor)
uld = (unsigned long) dest;
#ifdef INTEL_SSE2
- int abytes;
- s8 = (uint8_t *) src;
- d8 = (uint8_t *) dest;
- if (uls % 16 == uld % 16) {
- gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16);
- while (s8 != rd.s_start) {
- *d8 ^= *s8;
- d8++;
- s8++;
+ if (gf_cpu_supports_intel_sse2) {
+ __m128i ms, md;
+ int abytes;
+ s8 = (uint8_t *) src;
+ d8 = (uint8_t *) dest;
+ if (uls % 16 == uld % 16) {
+ gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16);
+ while (s8 != rd.s_start) {
+ *d8 ^= *s8;
+ d8++;
+ s8++;
+ }
+ while (s8 < (uint8_t *) rd.s_top) {
+ ms = _mm_load_si128 ((__m128i *)(s8));
+ md = _mm_load_si128 ((__m128i *)(d8));
+ md = _mm_xor_si128(md, ms);
+ _mm_store_si128((__m128i *)(d8), md);
+ s8 += 16;
+ d8 += 16;
+ }
+ while (s8 != (uint8_t *) src + bytes) {
+ *d8 ^= *s8;
+ d8++;
+ s8++;
+ }
+ return;
}
- while (s8 < (uint8_t *) rd.s_top) {
- ms = _mm_load_si128 ((__m128i *)(s8));
- md = _mm_load_si128 ((__m128i *)(d8));
+
+ abytes = (bytes & 0xfffffff0);
+
+ while (d8 < (uint8_t *) dest + abytes) {
+ ms = _mm_loadu_si128 ((__m128i *)(s8));
+ md = _mm_loadu_si128 ((__m128i *)(d8));
md = _mm_xor_si128(md, ms);
- _mm_store_si128((__m128i *)(d8), md);
+ _mm_storeu_si128((__m128i *)(d8), md);
s8 += 16;
d8 += 16;
}
- while (s8 != (uint8_t *) src + bytes) {
+ while (d8 != (uint8_t *) dest+bytes) {
*d8 ^= *s8;
d8++;
s8++;
}
return;
}
-
- abytes = (bytes & 0xfffffff0);
-
- while (d8 < (uint8_t *) dest + abytes) {
- ms = _mm_loadu_si128 ((__m128i *)(s8));
- md = _mm_loadu_si128 ((__m128i *)(d8));
- md = _mm_xor_si128(md, ms);
- _mm_storeu_si128((__m128i *)(d8), md);
- s8 += 16;
- d8 += 16;
- }
- while (d8 != (uint8_t *) dest+bytes) {
- *d8 ^= *s8;
- d8++;
- s8++;
- }
- return;
#endif
#if defined(ARM_NEON)
- s8 = (uint8_t *) src;
- d8 = (uint8_t *) dest;
-
- if (uls % 16 == uld % 16) {
- gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16);
- while (s8 != rd.s_start) {
+ if (gf_cpu_supports_arm_neon) {
+ s8 = (uint8_t *) src;
+ d8 = (uint8_t *) dest;
+
+ if (uls % 16 == uld % 16) {
+ gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16);
+ while (s8 != rd.s_start) {
+ *d8 ^= *s8;
+ s8++;
+ d8++;
+ }
+ while (s8 < (uint8_t *) rd.s_top) {
+ uint8x16_t vs = vld1q_u8 (s8);
+ uint8x16_t vd = vld1q_u8 (d8);
+ uint8x16_t vr = veorq_u8 (vs, vd);
+ vst1q_u8 (d8, vr);
+ s8 += 16;
+ d8 += 16;
+ }
+ } else {
+ while (s8 + 15 < (uint8_t *) src + bytes) {
+ uint8x16_t vs = vld1q_u8 (s8);
+ uint8x16_t vd = vld1q_u8 (d8);
+ uint8x16_t vr = veorq_u8 (vs, vd);
+ vst1q_u8 (d8, vr);
+ s8 += 16;
+ d8 += 16;
+ }
+ }
+ while (s8 < (uint8_t *) src + bytes) {
*d8 ^= *s8;
s8++;
d8++;
}
- while (s8 < (uint8_t *) rd.s_top) {
- uint8x16_t vs = vld1q_u8 (s8);
- uint8x16_t vd = vld1q_u8 (d8);
- uint8x16_t vr = veorq_u8 (vs, vd);
- vst1q_u8 (d8, vr);
- s8 += 16;
- d8 += 16;
- }
- } else {
- while (s8 + 15 < (uint8_t *) src + bytes) {
- uint8x16_t vs = vld1q_u8 (s8);
- uint8x16_t vd = vld1q_u8 (d8);
- uint8x16_t vr = veorq_u8 (vs, vd);
- vst1q_u8 (d8, vr);
- s8 += 16;
- d8 += 16;
- }
- }
- while (s8 < (uint8_t *) src + bytes) {
- *d8 ^= *s8;
- s8++;
- d8++;
+ return;
}
- return;
#endif
if (uls % 8 != uld % 8) {
gf_unaligned_xor(src, dest, bytes);