-rw-r--r--   .gitignore                 2
-rw-r--r--   include/gf_cpu.h          20
-rw-r--r--   src/Makefile.am           16
-rw-r--r--   src/gf.c                  21
-rw-r--r--   src/gf_cpu.c             153
-rw-r--r--   src/gf_w128.c             48
-rw-r--r--   src/gf_w16.c             127
-rw-r--r--   src/gf_w32.c             181
-rw-r--r--   src/gf_w4.c               92
-rw-r--r--   src/gf_w64.c             164
-rw-r--r--   src/gf_w8.c              130
-rwxr-xr-x   tools/test_simd.sh       231
-rwxr-xr-x   tools/test_simd_qemu.sh    2
13 files changed, 810 insertions, 377 deletions
diff --git a/.gitignore b/.gitignore
index 22e6fbe..bfc1dfc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -75,4 +75,4 @@ tools/gf_time
tools/gf_unit_w*
tools/test-suite.log
tools/.qemu/
-tools/test_simd*.results
+tools/test_simd*.results*
diff --git a/include/gf_cpu.h b/include/gf_cpu.h
new file mode 100644
index 0000000..71c7227
--- /dev/null
+++ b/include/gf_cpu.h
@@ -0,0 +1,20 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_cpu.h
+ *
+ * Identifies whether the CPU supports SIMD instructions at runtime.
+ */
+
+#pragma once
+
+extern int gf_cpu_supports_intel_pclmul;
+extern int gf_cpu_supports_intel_sse4;
+extern int gf_cpu_supports_intel_ssse3;
+extern int gf_cpu_supports_intel_sse3;
+extern int gf_cpu_supports_intel_sse2;
+extern int gf_cpu_supports_arm_neon;
+
+void gf_cpu_identify(void);
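
Note: the new header exposes one init call plus a set of global flags. A minimal consumer sketch (illustrative only, not part of the patch; it assumes the library is built and linked as usual):

    #include <stdio.h>
    #include "gf_cpu.h"

    int main(void)
    {
      /* populate the gf_cpu_supports_* globals exactly once */
      gf_cpu_identify();

      if (gf_cpu_supports_intel_sse2)
        printf("runtime SSE2 available\n");
      if (gf_cpu_supports_arm_neon)
        printf("runtime NEON available\n");
      return 0;
    }
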
diff --git a/src/Makefile.am b/src/Makefile.am
index a3bd37a..cfc2a50 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -4,11 +4,21 @@
AUTOMAKE_OPTIONS = subdir-objects
AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include
-AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC -Wsign-compare
+# Avoid using SIMD_FLAGS for code that calls strcmp, as newer gcc
+# versions will use SIMD for the strcmp implementation. Instead
+# we create a static library just for gf_method that is not compiled
+# with SIMD_FLAGS; this static library then gets linked into gf_complete.so
+noinst_LTLIBRARIES = libgf_util.la
+libgf_util_la_SOURCES = gf_method.c
+libgf_util_la_CFLAGS = -O3 -fPIC -Wsign-compare
+
+# we narrowly use SIMD_FLAGS for code that needs it
lib_LTLIBRARIES = libgf_complete.la
-libgf_complete_la_SOURCES = gf.c gf_method.c gf_wgen.c gf_w4.c gf_w8.c gf_w16.c gf_w32.c \
- gf_w64.c gf_w128.c gf_rand.c gf_general.c
+libgf_complete_la_SOURCES = gf.c gf_wgen.c gf_w4.c gf_w8.c gf_w16.c gf_w32.c \
+ gf_w64.c gf_w128.c gf_rand.c gf_general.c gf_cpu.c
+libgf_complete_la_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC -Wsign-compare
+libgf_complete_la_LIBADD = libgf_util.la
if HAVE_NEON
libgf_complete_la_SOURCES += neon/gf_w4_neon.c \
diff --git a/src/gf.c b/src/gf.c
index b7a5c01..feeafdc 100644
--- a/src/gf.c
+++ b/src/gf.c
@@ -12,6 +12,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
+#include "gf_cpu.h"
int _gf_errno = GF_E_DEFAULT;
@@ -207,20 +208,28 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
if (region_type & (~tmp)) { _gf_errno = GF_E_UNK_REG; return 0; }
#ifdef INTEL_SSE2
- sse2 = 1;
+ if (gf_cpu_supports_intel_sse2) {
+ sse2 = 1;
+ }
#endif
#ifdef INTEL_SSSE3
- sse3 = 1;
+ if (gf_cpu_supports_intel_ssse3) {
+ sse3 = 1;
+ }
#endif
#ifdef INTEL_SSE4_PCLMUL
- pclmul = 1;
+ if (gf_cpu_supports_intel_pclmul) {
+ pclmul = 1;
+ }
#endif
#ifdef ARM_NEON
- pclmul = (w == 4 || w == 8);
- sse3 = 1;
+ if (gf_cpu_supports_arm_neon) {
+ pclmul = (w == 4 || w == 8);
+ sse3 = 1;
+ }
#endif
@@ -473,6 +482,8 @@ int gf_init_hard(gf_t *gf, int w, int mult_type,
int sz;
gf_internal_t *h;
+ gf_cpu_identify();
+
if (gf_error_check(w, mult_type, region_type, divide_type,
arg1, arg2, prim_poly, base_gf) == 0) return 0;
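
The gf.c hunks above show the idiom the rest of the patch repeats: the compile-time #ifdef guards stay, and a runtime check on the new flags is nested inside them. An illustrative sketch of that dispatch pattern (pick_kernel is a made-up name, not a library function):

    #include <stdio.h>
    #include "gf_cpu.h"

    static void pick_kernel(void)
    {
    #ifdef INTEL_SSE2
      /* fast path is compiled in only when the binary was built with SSE2 ... */
      if (gf_cpu_supports_intel_sse2) {   /* ... and the running CPU reports it */
        printf("using SSE2 kernel\n");
        return;
      }
    #endif
      printf("using generic kernel\n");
    }

    int main(void)
    {
      gf_cpu_identify();
      pick_kernel();
      return 0;
    }
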
diff --git a/src/gf_cpu.c b/src/gf_cpu.c
new file mode 100644
index 0000000..ee2f847
--- /dev/null
+++ b/src/gf_cpu.c
@@ -0,0 +1,153 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_cpu.c
+ *
+ * Identifies whether the CPU supports SIMD instructions at runtime.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+int gf_cpu_identified = 0;
+
+int gf_cpu_supports_intel_pclmul = 0;
+int gf_cpu_supports_intel_sse4 = 0;
+int gf_cpu_supports_intel_ssse3 = 0;
+int gf_cpu_supports_intel_sse3 = 0;
+int gf_cpu_supports_intel_sse2 = 0;
+int gf_cpu_supports_arm_neon = 0;
+
+#if defined(__x86_64__)
+
+void gf_cpu_identify(void)
+{
+ if (gf_cpu_identified) {
+ return;
+ }
+
+ int op = 1, eax, ebx, ecx, edx;
+
+ __asm__("cpuid"
+ : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
+ : "a" (op));
+
+#if defined(INTEL_SSE4_PCLMUL)
+ if ((ecx & 1) != 0 && !getenv("GF_COMPLETE_DISABLE_SSE4_PCLMUL")) {
+ gf_cpu_supports_intel_pclmul = 1;
+#ifdef DEBUG_CPU_DETECTION
+ printf("#gf_cpu_supports_intel_pclmul\n");
+#endif
+ }
+#endif
+
+#if defined(INTEL_SSE4)
+ if (((ecx & (1<<20)) != 0 || (ecx & (1<<19)) != 0) && !getenv("GF_COMPLETE_DISABLE_SSE4")) {
+ gf_cpu_supports_intel_sse4 = 1;
+#ifdef DEBUG_CPU_DETECTION
+ printf("#gf_cpu_supports_intel_sse4\n");
+#endif
+ }
+#endif
+
+#if defined(INTEL_SSSE3)
+ if ((ecx & (1<<9)) != 0 && !getenv("GF_COMPLETE_DISABLE_SSSE3")) {
+ gf_cpu_supports_intel_ssse3 = 1;
+#ifdef DEBUG_CPU_DETECTION
+ printf("#gf_cpu_supports_intel_ssse3\n");
+#endif
+ }
+#endif
+
+#if defined(INTEL_SSE3)
+ if ((ecx & 1) != 0 && !getenv("GF_COMPLETE_DISABLE_SSE3")) {
+ gf_cpu_supports_intel_sse3 = 1;
+#ifdef DEBUG_CPU_DETECTION
+ printf("#gf_cpu_supports_intel_sse3\n");
+#endif
+ }
+#endif
+
+#if defined(INTEL_SSE2)
+ if ((edx & (1<<26)) != 0 && !getenv("GF_COMPLETE_DISABLE_SSE2")) {
+ gf_cpu_supports_intel_sse2 = 1;
+#ifdef DEBUG_CPU_DETECTION
+ printf("#gf_cpu_supports_intel_sse2\n");
+#endif
+ }
+#endif
+
+ gf_cpu_identified = 1;
+}
+
+#elif defined(__arm__) || defined(__aarch64__)
+
+#ifdef __linux__
+
+#include <stdio.h>
+#include <unistd.h>
+#include <elf.h>
+#include <linux/auxvec.h>
+#include <asm/hwcap.h>
+#include <fcntl.h>
+
+unsigned long get_hwcap(unsigned long type) {
+ unsigned long hwcap = 0;
+ int fd = open("/proc/self/auxv", O_RDONLY);
+ if (fd > 0) {
+ Elf32_auxv_t auxv;
+ while (read(fd, &auxv, sizeof(Elf32_auxv_t))) {
+ if (auxv.a_type == type) {
+ hwcap = auxv.a_un.a_val;
+ break;
+ }
+ }
+ close(fd);
+ }
+
+ return hwcap;
+}
+
+#endif // linux
+
+void gf_cpu_identify(void)
+{
+ if (gf_cpu_identified) {
+ return;
+ }
+
+#if defined(ARM_NEON)
+ if (!getenv("GF_COMPLETE_DISABLE_NEON")) {
+#if __linux__ && __arm__
+ gf_cpu_supports_arm_neon = (get_hwcap(AT_HWCAP) & HWCAP_NEON) > 0;
+#elif __aarch64__
+ // ASIMD is supported on all aarch64 architectures
+ gf_cpu_supports_arm_neon = 1;
+#else
+ // we assume that NEON is supported if the compiler supports
+ // NEON and we don't have a reliable way to detect runtime support.
+ gf_cpu_supports_arm_neon = 1;
+#endif
+
+#ifdef DEBUG_CPU_DETECTION
+ if (gf_cpu_supports_arm_neon) {
+ printf("#gf_cpu_supports_arm_neon\n");
+ }
+#endif
+ }
+#endif // defined(ARM_NEON)
+
+ gf_cpu_identified = 1;
+}
+
+#else // defined(__arm__) || defined(__aarch64__)
+
+void gf_cpu_identify(void)
+{
+ gf_cpu_identified = 1;
+}
+
+#endif
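
Because detection goes through getenv(), each feature can also be masked at runtime without rebuilding. A small sketch (illustrative only; note that gf_cpu_identify() caches its result, so the variable must be set before the first call):

    #include <stdio.h>
    #include <stdlib.h>
    #include "gf_cpu.h"

    int main(void)
    {
      /* honoured by the detection code above */
      setenv("GF_COMPLETE_DISABLE_SSE2", "1", 1);
      gf_cpu_identify();
      printf("sse2 enabled at runtime: %d\n", gf_cpu_supports_intel_sse2);
      return 0;
    }
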
diff --git a/src/gf_w128.c b/src/gf_w128.c
index 5f650b3..74f72e8 100644
--- a/src/gf_w128.c
+++ b/src/gf_w128.c
@@ -11,6 +11,7 @@
#include "gf_int.h"
#include <stdio.h>
#include <stdlib.h>
+#include "gf_cpu.h"
#define GF_FIELD_WIDTH (128)
@@ -290,11 +291,11 @@ gf_w128_shift_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_12
return;
}
+#if defined(INTEL_SSE4_PCLMUL)
+
void
gf_w128_clm_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
{
-#if defined(INTEL_SSE4_PCLMUL)
-
__m128i a,b;
__m128i result0,result1;
__m128i prim_poly;
@@ -338,9 +339,8 @@ gf_w128_clm_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_
c128[0] = (uint64_t)_mm_extract_epi64(result1,1);
c128[1] = (uint64_t)_mm_extract_epi64(result1,0);
-#endif
-return;
}
+#endif
void
gf_w128_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
@@ -376,10 +376,10 @@ gf_w128_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_
return;
}
+#if defined(INTEL_SSE4)
void
gf_w128_sse_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
{
-#if defined(INTEL_SSE4)
int i;
__m128i a, b, pp, prod, amask, u_middle_one;
/*John: pmask is always the highest bit set, and the rest zeros. amask changes, it's a countdown.*/
@@ -427,16 +427,16 @@ gf_w128_sse_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_
}
c128[0] = (uint64_t)_mm_extract_epi64(prod, 1);
c128[1] = (uint64_t)_mm_extract_epi64(prod, 0);
-#endif
return;
}
+#endif
/* Ben: This slow function implements sse instrutions for bytwo_b because why not */
+#if defined(INTEL_SSE4)
void
gf_w128_sse_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
{
-#if defined(INTEL_SSE4)
__m128i a, b, lmask, hmask, pp, c, middle_one;
gf_internal_t *h;
uint64_t topbit, middlebit;
@@ -471,8 +471,8 @@ gf_w128_sse_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_
if (middlebit) b = _mm_xor_si128(b, middle_one);
if (topbit) b = _mm_xor_si128(b, pp);
}
-#endif
}
+#endif
void
gf_w128_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
@@ -1146,7 +1146,7 @@ gf_w128_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val,
}
/* a^-1 -> b */
- void
+void
gf_w128_euclid(GFP gf, gf_val_128_t a128, gf_val_128_t b128)
{
uint64_t e_i[2], e_im1[2], e_ip1[2];
@@ -1239,7 +1239,7 @@ gf_w128_euclid(GFP gf, gf_val_128_t a128, gf_val_128_t b128)
return;
}
- void
+void
gf_w128_divide_from_inverse(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
{
uint64_t d[2];
@@ -1248,7 +1248,7 @@ gf_w128_divide_from_inverse(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val
return;
}
- void
+void
gf_w128_inverse_from_divide(GFP gf, gf_val_128_t a128, gf_val_128_t b128)
{
uint64_t one128[2];
@@ -1260,7 +1260,7 @@ gf_w128_inverse_from_divide(GFP gf, gf_val_128_t a128, gf_val_128_t b128)
static
- void
+void
gf_w128_composite_inverse(gf_t *gf, gf_val_128_t a, gf_val_128_t inv)
{
gf_internal_t *h = (gf_internal_t *) gf->scratch;
@@ -1421,10 +1421,12 @@ static
int gf_w128_cfm_init(gf_t *gf)
{
#if defined(INTEL_SSE4_PCLMUL)
- SET_FUNCTION(gf,inverse,w128,gf_w128_euclid)
- SET_FUNCTION(gf,multiply,w128,gf_w128_clm_multiply)
- SET_FUNCTION(gf,multiply_region,w128,gf_w128_clm_multiply_region_from_single)
- return 1;
+ if (gf_cpu_supports_intel_pclmul) {
+ SET_FUNCTION(gf,inverse,w128,gf_w128_euclid)
+ SET_FUNCTION(gf,multiply,w128,gf_w128_clm_multiply)
+ SET_FUNCTION(gf,multiply_region,w128,gf_w128_clm_multiply_region_from_single)
+ return 1;
+ }
#endif
return 0;
@@ -1527,7 +1529,7 @@ int gf_w128_split_init(gf_t *gf)
SET_FUNCTION(gf,multiply,w128,gf_w128_bytwo_p_multiply)
#if defined(INTEL_SSE4_PCLMUL)
- if (!(h->region_type & GF_REGION_NOSIMD)){
+ if (gf_cpu_supports_intel_pclmul && !(h->region_type & GF_REGION_NOSIMD)){
SET_FUNCTION(gf,multiply,w128,gf_w128_clm_multiply)
}
#endif
@@ -1546,23 +1548,19 @@ int gf_w128_split_init(gf_t *gf)
if((h->region_type & GF_REGION_ALTMAP))
{
#ifdef INTEL_SSE4
- if(!(h->region_type & GF_REGION_NOSIMD))
+ if(gf_cpu_supports_intel_sse4 && !(h->region_type & GF_REGION_NOSIMD))
SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_4_128_sse_altmap_multiply_region)
else
- return 0;
- #else
- return 0;
#endif
+ return 0;
}
else {
#ifdef INTEL_SSE4
- if(!(h->region_type & GF_REGION_NOSIMD))
+ if(gf_cpu_supports_intel_sse4 && !(h->region_type & GF_REGION_NOSIMD))
SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_4_128_sse_multiply_region)
else
- SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_4_128_multiply_region)
- #else
- SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_4_128_multiply_region)
#endif
+ SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_4_128_multiply_region)
}
}
return 1;
diff --git a/src/gf_w16.c b/src/gf_w16.c
index a62ea51..8316892 100644
--- a/src/gf_w16.c
+++ b/src/gf_w16.c
@@ -12,6 +12,7 @@
#include <stdio.h>
#include <stdlib.h>
#include "gf_w16.h"
+#include "gf_cpu.h"
#define AB2(ip, am1 ,am2, b, t1, t2) {\
t1 = (b << 1) & am1;\
@@ -391,6 +392,7 @@ gf_val_32_t gf_w16_matrix (gf_t *gf, gf_val_32_t b)
extra memory.
*/
+#if defined(INTEL_SSE4_PCLMUL)
static
inline
gf_val_32_t
@@ -398,8 +400,6 @@ gf_w16_clm_multiply_2 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16)
{
gf_val_32_t rv = 0;
-#if defined(INTEL_SSE4_PCLMUL)
-
__m128i a, b;
__m128i result;
__m128i prim_poly;
@@ -433,11 +433,11 @@ gf_w16_clm_multiply_2 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16)
rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
-
-#endif
return rv;
}
+#endif
+#if defined(INTEL_SSE4_PCLMUL)
static
inline
gf_val_32_t
@@ -445,8 +445,6 @@ gf_w16_clm_multiply_3 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16)
{
gf_val_32_t rv = 0;
-#if defined(INTEL_SSE4_PCLMUL)
-
__m128i a, b;
__m128i result;
__m128i prim_poly;
@@ -473,11 +471,11 @@ gf_w16_clm_multiply_3 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16)
rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
-
-#endif
return rv;
}
+#endif
+#if defined(INTEL_SSE4_PCLMUL)
static
inline
gf_val_32_t
@@ -485,8 +483,6 @@ gf_w16_clm_multiply_4 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16)
{
gf_val_32_t rv = 0;
-#if defined(INTEL_SSE4_PCLMUL)
-
__m128i a, b;
__m128i result;
__m128i prim_poly;
@@ -515,10 +511,9 @@ gf_w16_clm_multiply_4 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16)
rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
-
-#endif
return rv;
}
+#endif
static
@@ -556,25 +551,27 @@ static
int gf_w16_cfm_init(gf_t *gf)
{
#if defined(INTEL_SSE4_PCLMUL)
- gf_internal_t *h;
+ if (gf_cpu_supports_intel_pclmul) {
+ gf_internal_t *h;
- h = (gf_internal_t *) gf->scratch;
-
- /*Ben: Determining how many reductions to do */
-
- if ((0xfe00 & h->prim_poly) == 0) {
- SET_FUNCTION(gf,multiply,w32,gf_w16_clm_multiply_2)
- SET_FUNCTION(gf,multiply_region,w32,gf_w16_clm_multiply_region_from_single_2)
- } else if((0xf000 & h->prim_poly) == 0) {
- SET_FUNCTION(gf,multiply,w32,gf_w16_clm_multiply_3)
- SET_FUNCTION(gf,multiply_region,w32,gf_w16_clm_multiply_region_from_single_3)
- } else if ((0xe000 & h->prim_poly) == 0) {
- SET_FUNCTION(gf,multiply,w32,gf_w16_clm_multiply_4)
- SET_FUNCTION(gf,multiply_region,w32,gf_w16_clm_multiply_region_from_single_4)
- } else {
- return 0;
- }
- return 1;
+ h = (gf_internal_t *) gf->scratch;
+
+ /*Ben: Determining how many reductions to do */
+
+ if ((0xfe00 & h->prim_poly) == 0) {
+ SET_FUNCTION(gf,multiply,w32,gf_w16_clm_multiply_2)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w16_clm_multiply_region_from_single_2)
+ } else if((0xf000 & h->prim_poly) == 0) {
+ SET_FUNCTION(gf,multiply,w32,gf_w16_clm_multiply_3)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w16_clm_multiply_region_from_single_3)
+ } else if ((0xe000 & h->prim_poly) == 0) {
+ SET_FUNCTION(gf,multiply,w32,gf_w16_clm_multiply_4)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w16_clm_multiply_region_from_single_4)
+ } else {
+ return 0;
+ }
+ return 1;
+ }
#endif
return 0;
@@ -688,10 +685,9 @@ int gf_w16_log_init(gf_t *gf)
if (check) {
if (h->mult_type != GF_MULT_LOG_TABLE) {
-
-#if defined(INTEL_SSE4_PCLMUL)
- return gf_w16_cfm_init(gf);
-#endif
+ if (gf_cpu_supports_intel_pclmul) {
+ return gf_w16_cfm_init(gf);
+ }
return gf_w16_shift_init(gf);
} else {
_gf_errno = GF_E_LOGPOLY;
@@ -948,11 +944,11 @@ gf_w16_table_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v
gf_do_final_region_alignment(&rd);
}
+#ifdef INTEL_SSSE3
static
void
gf_w16_split_4_16_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
-#ifdef INTEL_SSSE3
uint64_t i, j, *s64, *d64, *top64;;
uint64_t c, prod;
uint8_t low[4][16];
@@ -1078,14 +1074,14 @@ gf_w16_split_4_16_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_v
}
gf_do_final_region_alignment(&rd);
-#endif
}
+#endif
+#ifdef INTEL_SSSE3
static
void
gf_w16_split_4_16_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
-#ifdef INTEL_SSSE3
uint64_t i, j, *s64, *d64, *top64;;
uint64_t c, prod;
uint8_t low[4][16];
@@ -1187,8 +1183,8 @@ gf_w16_split_4_16_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *des
}
gf_do_final_region_alignment(&rd);
-#endif
}
+#endif
uint32_t
gf_w16_split_8_8_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
@@ -1216,21 +1212,11 @@ int gf_w16_split_init(gf_t *gf)
{
gf_internal_t *h;
struct gf_w16_split_8_8_data *d8;
- int i, j, exp, issse3;
- int isneon = 0;
+ int i, j, exp;
uint32_t p, basep, tmp;
h = (gf_internal_t *) gf->scratch;
-#ifdef INTEL_SSSE3
- issse3 = 1;
-#else
- issse3 = 0;
-#endif
-#ifdef ARM_NEON
- isneon = 1;
-#endif
-
if (h->arg1 == 8 && h->arg2 == 8) {
d8 = (struct gf_w16_split_8_8_data *) h->private;
basep = 1;
@@ -1273,36 +1259,45 @@ int gf_w16_split_init(gf_t *gf)
/* Defaults */
- if (issse3) {
+#ifdef INTEL_SSSE3
+ if (gf_cpu_supports_intel_ssse3) {
SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_sse_multiply_region)
- } else if (isneon) {
-#ifdef ARM_NEON
+ } else {
+#elif ARM_NEON
+ if (gf_cpu_supports_arm_neon) {
gf_w16_neon_split_init(gf);
-#endif
} else {
+#endif
SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_8_16_lazy_multiply_region)
+#if defined(INTEL_SSSE3) || defined(ARM_NEON)
}
-
+#endif
if ((h->arg1 == 8 && h->arg2 == 16) || (h->arg2 == 8 && h->arg1 == 16)) {
SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_8_16_lazy_multiply_region)
} else if ((h->arg1 == 4 && h->arg2 == 16) || (h->arg2 == 4 && h->arg1 == 16)) {
- if (issse3 || isneon) {
+#if defined(INTEL_SSSE3) || defined(ARM_NEON)
+ if (gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon) {
if(h->region_type & GF_REGION_ALTMAP && h->region_type & GF_REGION_NOSIMD)
SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_nosse_altmap_multiply_region)
else if(h->region_type & GF_REGION_NOSIMD)
SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_multiply_region)
- else if(h->region_type & GF_REGION_ALTMAP && issse3)
+#if defined(INTEL_SSSE3)
+ else if(h->region_type & GF_REGION_ALTMAP && gf_cpu_supports_intel_ssse3)
SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_sse_altmap_multiply_region)
+#endif
} else {
+#endif
if(h->region_type & GF_REGION_SIMD)
return 0;
else if(h->region_type & GF_REGION_ALTMAP)
SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_nosse_altmap_multiply_region)
else
SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_multiply_region)
+#if defined(INTEL_SSSE3) || defined(ARM_NEON)
}
+#endif
}
return 1;
@@ -1846,26 +1841,28 @@ int gf_w16_bytwo_init(gf_t *gf)
if (h->mult_type == GF_MULT_BYTWO_p) {
SET_FUNCTION(gf,multiply,w32,gf_w16_bytwo_p_multiply)
#ifdef INTEL_SSE2
- if (h->region_type & GF_REGION_NOSIMD)
- SET_FUNCTION(gf,multiply_region,w32,gf_w16_bytwo_p_nosse_multiply_region)
- else
- SET_FUNCTION(gf,multiply_region,w32,gf_w16_bytwo_p_sse_multiply_region)
- #else
+ if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
+ SET_FUNCTION(gf,multiply_region,w32,gf_w16_bytwo_p_sse_multiply_region)
+ } else {
+ #endif
SET_FUNCTION(gf,multiply_region,w32,gf_w16_bytwo_p_nosse_multiply_region)
if(h->region_type & GF_REGION_SIMD)
return 0;
+ #ifdef INTEL_SSE2
+ }
#endif
} else {
SET_FUNCTION(gf,multiply,w32,gf_w16_bytwo_b_multiply)
#ifdef INTEL_SSE2
- if (h->region_type & GF_REGION_NOSIMD)
- SET_FUNCTION(gf,multiply_region,w32,gf_w16_bytwo_b_nosse_multiply_region)
- else
+ if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
SET_FUNCTION(gf,multiply_region,w32,gf_w16_bytwo_b_sse_multiply_region)
- #else
+ } else {
+ #endif
SET_FUNCTION(gf,multiply_region,w32,gf_w16_bytwo_b_nosse_multiply_region)
if(h->region_type & GF_REGION_SIMD)
return 0;
+ #ifdef INTEL_SSE2
+ }
#endif
}
diff --git a/src/gf_w32.c b/src/gf_w32.c
index d496c3a..bb22894 100644
--- a/src/gf_w32.c
+++ b/src/gf_w32.c
@@ -13,6 +13,7 @@
#include <stdio.h>
#include <stdlib.h>
#include "gf_w32.h"
+#include "gf_cpu.h"
#define MM_PRINT32(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 4) printf(" %02x%02x%02x%02x", blah[15-ii], blah[14-ii], blah[13-ii], blah[12-ii]); printf("\n"); }
@@ -347,6 +348,8 @@ uint32_t gf_w32_matrix (gf_t *gf, uint32_t b)
extra memory.
*/
+#if defined(INTEL_SSE4_PCLMUL)
+
static
inline
gf_val_32_t
@@ -354,8 +357,6 @@ gf_w32_cfmgk_multiply (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
{
gf_val_32_t rv = 0;
-#if defined(INTEL_SSE4_PCLMUL)
-
__m128i a, b;
__m128i result;
__m128i w;
@@ -378,9 +379,9 @@ gf_w32_cfmgk_multiply (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
/* Extracts 32 bit value from result. */
rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
-#endif
return rv;
}
+#endif
#if defined(INTEL_SSE4_PCLMUL)
@@ -435,6 +436,8 @@ gf_w32_cfmgk_multiply_region_from_single(gf_t *gf, void *src, void *dest, uint32
#endif
+#if defined(INTEL_SSE4_PCLMUL)
+
static
inline
gf_val_32_t
@@ -442,8 +445,6 @@ gf_w32_clm_multiply_2 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
{
gf_val_32_t rv = 0;
-#if defined(INTEL_SSE4_PCLMUL)
-
__m128i a, b;
__m128i result;
__m128i prim_poly;
@@ -476,9 +477,11 @@ gf_w32_clm_multiply_2 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
/* Extracts 32 bit value from result. */
rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
-#endif
return rv;
}
+#endif
+
+#if defined(INTEL_SSE4_PCLMUL)
static
inline
@@ -487,8 +490,6 @@ gf_w32_clm_multiply_3 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
{
gf_val_32_t rv = 0;
-#if defined(INTEL_SSE4_PCLMUL)
-
__m128i a, b;
__m128i result;
__m128i prim_poly;
@@ -515,9 +516,11 @@ gf_w32_clm_multiply_3 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
/* Extracts 32 bit value from result. */
rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
-#endif
return rv;
}
+#endif
+
+#if defined(INTEL_SSE4_PCLMUL)
static
inline
@@ -526,8 +529,6 @@ gf_w32_clm_multiply_4 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
{
gf_val_32_t rv = 0;
-#if defined(INTEL_SSE4_PCLMUL)
-
__m128i a, b;
__m128i result;
__m128i prim_poly;
@@ -556,9 +557,9 @@ gf_w32_clm_multiply_4 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
/* Extracts 32 bit value from result. */
rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
-#endif
return rv;
}
+#endif
static
@@ -593,29 +594,31 @@ int gf_w32_cfmgk_init(gf_t *gf)
SET_FUNCTION(gf,multiply_region,w32,gf_w32_multiply_region_from_single)
#if defined(INTEL_SSE4_PCLMUL)
- gf_internal_t *h;
+ if (gf_cpu_supports_intel_pclmul) {
+ gf_internal_t *h;
- h = (gf_internal_t *) gf->scratch;
- SET_FUNCTION(gf,multiply,w32,gf_w32_cfmgk_multiply)
- SET_FUNCTION(gf,multiply_region,w32,gf_w32_cfmgk_multiply_region_from_single)
+ h = (gf_internal_t *) gf->scratch;
+ SET_FUNCTION(gf,multiply,w32,gf_w32_cfmgk_multiply)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w32_cfmgk_multiply_region_from_single)
- uint64_t *q_plus = (uint64_t *) h->private;
- uint64_t *g_star = (uint64_t *) h->private + 1;
+ uint64_t *q_plus = (uint64_t *) h->private;
+ uint64_t *g_star = (uint64_t *) h->private + 1;
- uint64_t tmp = h->prim_poly << 32;
- *q_plus = 1ULL << 32;
+ uint64_t tmp = h->prim_poly << 32;
+ *q_plus = 1ULL << 32;
- int i;
- for(i = 63; i >= 32; i--)
- if((1ULL << i) & tmp)
- {
- *q_plus |= 1ULL << (i-32);
- tmp ^= h->prim_poly << (i-32);
- }
+ int i;
+ for(i = 63; i >= 32; i--)
+ if((1ULL << i) & tmp)
+ {
+ *q_plus |= 1ULL << (i-32);
+ tmp ^= h->prim_poly << (i-32);
+ }
- *g_star = h->prim_poly & ((1ULL << 32) - 1);
+ *g_star = h->prim_poly & ((1ULL << 32) - 1);
- return 1;
+ return 1;
+ }
#endif
return 0;
@@ -631,23 +634,25 @@ int gf_w32_cfm_init(gf_t *gf)
/*Ben: Check to see how many reduction steps it will take*/
#if defined(INTEL_SSE4_PCLMUL)
- gf_internal_t *h;
+ if (gf_cpu_supports_intel_pclmul) {
+ gf_internal_t *h;
- h = (gf_internal_t *) gf->scratch;
+ h = (gf_internal_t *) gf->scratch;
- if ((0xfffe0000 & h->prim_poly) == 0){
- SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_2)
- SET_FUNCTION(gf,multiply_region,w32,gf_w32_clm_multiply_region_from_single_2)
- }else if ((0xffc00000 & h->prim_poly) == 0){
- SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_3)
- SET_FUNCTION(gf,multiply_region,w32,gf_w32_clm_multiply_region_from_single_3)
- }else if ((0xfe000000 & h->prim_poly) == 0){
- SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_4)
- SET_FUNCTION(gf,multiply_region,w32,gf_w32_clm_multiply_region_from_single_4)
- } else {
- return 0;
+ if ((0xfffe0000 & h->prim_poly) == 0){
+ SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_2)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w32_clm_multiply_region_from_single_2)
+ }else if ((0xffc00000 & h->prim_poly) == 0){
+ SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_3)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w32_clm_multiply_region_from_single_3)
+ }else if ((0xfe000000 & h->prim_poly) == 0){
+ SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_4)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w32_clm_multiply_region_from_single_4)
+ } else {
+ return 0;
+ }
+ return 1;
}
- return 1;
#endif
return 0;
@@ -1382,26 +1387,28 @@ int gf_w32_bytwo_init(gf_t *gf)
if (h->mult_type == GF_MULT_BYTWO_p) {
SET_FUNCTION(gf,multiply,w32,gf_w32_bytwo_p_multiply)
#ifdef INTEL_SSE2
- if (h->region_type & GF_REGION_NOSIMD)
- SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_p_nosse_multiply_region)
- else
+ if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_p_sse_multiply_region)
- #else
- SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_p_nosse_multiply_region)
- if(h->region_type & GF_REGION_SIMD)
- return 0;
+ } else {
+ #endif
+ SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_p_nosse_multiply_region)
+ if(h->region_type & GF_REGION_SIMD)
+ return 0;
+ #ifdef INTEL_SSE2
+ }
#endif
} else {
SET_FUNCTION(gf,multiply,w32,gf_w32_bytwo_b_multiply)
#ifdef INTEL_SSE2
- if (h->region_type & GF_REGION_NOSIMD)
- SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_b_nosse_multiply_region)
- else
+ if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_b_sse_multiply_region)
- #else
+ } else {
+ #endif
SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_b_nosse_multiply_region)
if(h->region_type & GF_REGION_SIMD)
return 0;
+ #ifdef INTEL_SSE2
+ }
#endif
}
@@ -1755,11 +1762,11 @@ gf_w32_split_4_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t
gf_do_final_region_alignment(&rd);
}
+#ifdef INTEL_SSSE3
static
void
gf_w32_split_4_32_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
{
-#ifdef INTEL_SSSE3
gf_internal_t *h;
int i, j, k;
uint32_t pp, v, *s32, *d32, *top;
@@ -1942,16 +1949,15 @@ gf_w32_split_4_32_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *des
}
gf_do_final_region_alignment(&rd);
-
-#endif
}
+#endif
+#ifdef INTEL_SSSE3
static
void
gf_w32_split_4_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
{
-#ifdef INTEL_SSSE3
gf_internal_t *h;
int i, j, k;
uint32_t pp, v, *s32, *d32, *top, tmp_table[16];
@@ -2216,9 +2222,8 @@ gf_w32_split_4_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint
}
}
gf_do_final_region_alignment(&rd);
-
-#endif
}
+#endif
static
int gf_w32_split_init(gf_t *gf)
@@ -2230,23 +2235,7 @@ int gf_w32_split_init(gf_t *gf)
struct gf_split_8_32_lazy_data *d32;
struct gf_split_16_32_lazy_data *d16;
uint32_t p, basep;
- int i, j, exp, ispclmul, issse3;
- int isneon = 0;
-
-#if defined(INTEL_SSE4_PCLMUL)
- ispclmul = 1;
-#else
- ispclmul = 0;
-#endif
-
-#ifdef INTEL_SSSE3
- issse3 = 1;
-#else
- issse3 = 0;
-#endif
-#ifdef ARM_NEON
- isneon = 1;
-#endif
+ int i, j, exp;
h = (gf_internal_t *) gf->scratch;
@@ -2262,7 +2251,8 @@ int gf_w32_split_init(gf_t *gf)
if (h->arg1 == 8 && h->arg2 == 8) {
SET_FUNCTION(gf,multiply,w32,gf_w32_split_8_8_multiply)
- } else if (ispclmul) {
+#if defined(INTEL_SSE4_PCLMUL)
+ } else if (gf_cpu_supports_intel_pclmul) {
if ((0xfffe0000 & h->prim_poly) == 0){
SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_2)
} else if ((0xffc00000 & h->prim_poly) == 0){
@@ -2270,6 +2260,7 @@ int gf_w32_split_init(gf_t *gf)
} else if ((0xfe000000 & h->prim_poly) == 0){
SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_4)
}
+#endif
} else {
SET_FUNCTION(gf,multiply,w32,gf_w32_bytwo_p_multiply)
}
@@ -2287,33 +2278,39 @@ int gf_w32_split_init(gf_t *gf)
ld2 = (struct gf_split_2_32_lazy_data *) h->private;
ld2->last_value = 0;
#ifdef INTEL_SSSE3
- if (!(h->region_type & GF_REGION_NOSIMD))
+ if (gf_cpu_supports_intel_ssse3 && !(h->region_type & GF_REGION_NOSIMD)) {
SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_2_32_lazy_sse_multiply_region)
- else
+ } else {
+ #endif
SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_2_32_lazy_multiply_region)
- #else
- SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_2_32_lazy_multiply_region)
- if(h->region_type & GF_REGION_SIMD) return 0;
+ if(h->region_type & GF_REGION_SIMD) return 0;
+ #ifdef INTEL_SSSE3
+ }
#endif
return 1;
}
/* 4/32 or Default + SSE - There is no ALTMAP/NOSSE. */
+
if ((h->arg1 == 4 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 4) ||
- ((issse3 || isneon) && h->mult_type == GF_REGION_DEFAULT)) {
+ ((gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon) && h->mult_type == GF_REGION_DEFAULT)) {
ld4 = (struct gf_split_4_32_lazy_data *) h->private;
ld4->last_value = 0;
- if ((h->region_type & GF_REGION_NOSIMD) || !(issse3 || isneon)) {
+ if ((h->region_type & GF_REGION_NOSIMD) || !(gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon)) {
SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_4_32_lazy_multiply_region)
- } else if (isneon) {
+ } else if (gf_cpu_supports_arm_neon) {
#ifdef ARM_NEON
gf_w32_neon_split_init(gf);
#endif
} else if (h->region_type & GF_REGION_ALTMAP) {
+#ifdef INTEL_SSSE3
SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_4_32_lazy_sse_altmap_multiply_region)
+#endif
} else {
+#ifdef INTEL_SSSE3
SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_4_32_lazy_sse_multiply_region)
+#endif
}
return 1;
}
@@ -2686,16 +2683,6 @@ int gf_w32_composite_init(gf_t *gf)
int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
{
- int issse3 = 0;
- int isneon = 0;
-
-#ifdef INTEL_SSSE3
- issse3 = 1;
-#endif
-#ifdef ARM_NEON
- isneon = 1;
-#endif
-
switch(mult_type)
{
case GF_MULT_BYTWO_p:
@@ -2720,7 +2707,7 @@ int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg
return sizeof(gf_internal_t) + sizeof(struct gf_split_2_32_lazy_data) + 64;
}
if ((arg1 == 8 && arg2 == 32) || (arg2 == 8 && arg1 == 32) ||
- (mult_type == GF_MULT_DEFAULT && !(issse3 || isneon))) {
+ (mult_type == GF_MULT_DEFAULT && !(gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon))) {
return sizeof(gf_internal_t) + sizeof(struct gf_split_8_32_lazy_data) + 64;
}
if ((arg1 == 4 && arg2 == 32) ||
diff --git a/src/gf_w4.c b/src/gf_w4.c
index 814b0f5..3a7b953 100644
--- a/src/gf_w4.c
+++ b/src/gf_w4.c
@@ -12,6 +12,7 @@
#include <stdio.h>
#include <stdlib.h>
#include "gf_w4.h"
+#include "gf_cpu.h"
#define AB2(ip, am1 ,am2, b, t1, t2) {\
t1 = (b << 1) & am1;\
@@ -134,6 +135,7 @@ gf_w4_shift_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
/* Ben: This function works, but it is 33% slower than the normal shift mult */
+#if defined(INTEL_SSE4_PCLMUL)
static
inline
gf_val_32_t
@@ -141,8 +143,6 @@ gf_w4_clm_multiply (gf_t *gf, gf_val_32_t a4, gf_val_32_t b4)
{
gf_val_32_t rv = 0;
-#if defined(INTEL_SSE4_PCLMUL)
-
__m128i a, b;
__m128i result;
__m128i prim_poly;
@@ -173,9 +173,9 @@ gf_w4_clm_multiply (gf_t *gf, gf_val_32_t a4, gf_val_32_t b4)
/* Extracts 32 bit value from result. */
rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
-#endif
return rv;
}
+#endif
static
void
@@ -447,18 +447,19 @@ int gf_w4_single_table_init(gf_t *gf)
SET_FUNCTION(gf,inverse,w32,NULL)
SET_FUNCTION(gf,divide,w32,gf_w4_single_table_divide)
SET_FUNCTION(gf,multiply,w32,gf_w4_single_table_multiply)
- #if defined(INTEL_SSSE3) || defined(ARM_NEON)
- if(h->region_type & (GF_REGION_NOSIMD | GF_REGION_CAUCHY))
- SET_FUNCTION(gf,multiply_region,w32,gf_w4_single_table_multiply_region)
- else
- #if defined(INTEL_SSSE3)
+ #if defined(INTEL_SSSE3)
+ if (gf_cpu_supports_intel_ssse3 && !(h->region_type & (GF_REGION_NOSIMD | GF_REGION_CAUCHY))) {
SET_FUNCTION(gf,multiply_region,w32,gf_w4_single_table_sse_multiply_region)
- #elif defined(ARM_NEON)
+ } else {
+ #elif defined(ARM_NEON)
+ if (gf_cpu_supports_arm_neon && !(h->region_type & (GF_REGION_NOSIMD | GF_REGION_CAUCHY))) {
gf_w4_neon_single_table_init(gf);
- #endif
- #else
- SET_FUNCTION(gf,multiply_region,w32,gf_w4_single_table_multiply_region)
- if (h->region_type & GF_REGION_SIMD) return 0;
+ } else {
+ #endif
+ SET_FUNCTION(gf,multiply_region,w32,gf_w4_single_table_multiply_region)
+ if (h->region_type & GF_REGION_SIMD) return 0;
+ #if defined(INTEL_SSSE3) || defined(ARM_NEON)
+ }
#endif
return 1;
@@ -736,16 +737,13 @@ int gf_w4_table_init(gf_t *gf)
{
int rt;
gf_internal_t *h;
- int simd = 0;
-
-#if defined(INTEL_SSSE3) || defined(ARM_NEON)
- simd = 1;
-#endif
h = (gf_internal_t *) gf->scratch;
rt = (h->region_type);
- if (h->mult_type == GF_MULT_DEFAULT && !simd) rt |= GF_REGION_DOUBLE_TABLE;
+ if (h->mult_type == GF_MULT_DEFAULT &&
+ !(gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon))
+ rt |= GF_REGION_DOUBLE_TABLE;
if (rt & GF_REGION_DOUBLE_TABLE) {
return gf_w4_double_table_init(gf);
@@ -929,11 +927,11 @@ gf_w4_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v
#endif
/*
+#ifdef INTEL_SSE2
static
void
gf_w4_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
-#ifdef INTEL_SSE2
uint8_t *d8, *s8, tb;
__m128i pp, m1, m2, t1, t2, va, vb;
struct gf_bytwo_data *btd;
@@ -990,8 +988,8 @@ gf_w4_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v
}
}
gf_do_final_region_alignment(&rd);
-#endif
}
+#endif
*/
#ifdef INTEL_SSE2
@@ -1867,26 +1865,28 @@ int gf_w4_bytwo_init(gf_t *gf)
if (h->mult_type == GF_MULT_BYTWO_p) {
SET_FUNCTION(gf,multiply,w32,gf_w4_bytwo_p_multiply)
#ifdef INTEL_SSE2
- if (h->region_type & GF_REGION_NOSIMD)
- SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_p_nosse_multiply_region)
- else
+ if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_p_sse_multiply_region)
- #else
- SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_p_nosse_multiply_region)
- if (h->region_type & GF_REGION_SIMD)
- return 0;
+ } else {
+ #endif
+ SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_p_nosse_multiply_region)
+ if (h->region_type & GF_REGION_SIMD)
+ return 0;
+ #ifdef INTEL_SSE2
+ }
#endif
} else {
SET_FUNCTION(gf,multiply,w32,gf_w4_bytwo_b_multiply)
#ifdef INTEL_SSE2
- if (h->region_type & GF_REGION_NOSIMD)
- SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_b_nosse_multiply_region)
- else
+ if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_b_sse_multiply_region)
- #else
- SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_b_nosse_multiply_region)
- if (h->region_type & GF_REGION_SIMD)
- return 0;
+ } else {
+ #endif
+ SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_b_nosse_multiply_region)
+ if (h->region_type & GF_REGION_SIMD)
+ return 0;
+ #ifdef INTEL_SSE2
+ }
#endif
}
return 1;
@@ -1897,10 +1897,14 @@ static
int gf_w4_cfm_init(gf_t *gf)
{
#if defined(INTEL_SSE4_PCLMUL)
- SET_FUNCTION(gf,multiply,w32,gf_w4_clm_multiply)
- return 1;
+ if (gf_cpu_supports_intel_pclmul) {
+ SET_FUNCTION(gf,multiply,w32,gf_w4_clm_multiply)
+ return 1;
+ }
#elif defined(ARM_NEON)
- return gf_w4_neon_cfm_init(gf);
+ if (gf_cpu_supports_arm_neon) {
+ return gf_w4_neon_cfm_init(gf);
+ }
#endif
return 0;
}
@@ -1917,15 +1921,6 @@ int gf_w4_shift_init(gf_t *gf)
int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
{
- int issse3 = 0, isneon = 0;
-
-#ifdef INTEL_SSSE3
- issse3 = 1;
-#endif
-#ifdef ARM_NEON
- isneon = 1;
-#endif
-
switch(mult_type)
{
case GF_MULT_BYTWO_p:
@@ -1938,7 +1933,8 @@ int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1
return sizeof(gf_internal_t) + sizeof(struct gf_single_table_data) + 64;
}
- if (mult_type == GF_MULT_DEFAULT && !(issse3 || isneon))
+ if (mult_type == GF_MULT_DEFAULT &&
+ !(gf_cpu_supports_arm_neon || gf_cpu_supports_intel_ssse3))
region_type = GF_REGION_DOUBLE_TABLE;
if (region_type & GF_REGION_DOUBLE_TABLE) {
diff --git a/src/gf_w64.c b/src/gf_w64.c
index a096161..69e55db 100644
--- a/src/gf_w64.c
+++ b/src/gf_w64.c
@@ -12,6 +12,7 @@
#include <stdio.h>
#include <stdlib.h>
#include "gf_w64.h"
+#include "gf_cpu.h"
static
inline
@@ -338,6 +339,8 @@ gf_w64_shift_multiply (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
* ELM: Use the Intel carryless multiply instruction to do very fast 64x64 multiply.
*/
+#if defined(INTEL_SSE4_PCLMUL)
+
static
inline
gf_val_64_t
@@ -345,8 +348,6 @@ gf_w64_clm_multiply_2 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
{
gf_val_64_t rv = 0;
-#if defined(INTEL_SSE4_PCLMUL)
-
__m128i a, b;
__m128i result;
__m128i prim_poly;
@@ -376,10 +377,12 @@ gf_w64_clm_multiply_2 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
result = _mm_xor_si128 (result, w);
rv = ((gf_val_64_t)_mm_extract_epi64(result, 0));
-#endif
return rv;
}
+#endif
+#if defined(INTEL_SSE4_PCLMUL)
+
static
inline
gf_val_64_t
@@ -387,8 +390,6 @@ gf_w64_clm_multiply_4 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
{
gf_val_64_t rv = 0;
-#if defined(INTEL_SSE4_PCLMUL)
-
__m128i a, b;
__m128i result;
__m128i prim_poly;
@@ -418,15 +419,15 @@ gf_w64_clm_multiply_4 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
result = _mm_xor_si128 (result, w);
rv = ((gf_val_64_t)_mm_extract_epi64(result, 0));
-#endif
return rv;
}
+#endif
+#if defined(INTEL_SSE4_PCLMUL)
void
gf_w64_clm_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor)
{
-#if defined(INTEL_SSE4_PCLMUL)
gf_internal_t *h;
uint8_t *s8, *d8, *dtop;
gf_region_data rd;
@@ -504,8 +505,8 @@ gf_w64_clm_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int by
}
}
gf_do_final_region_alignment(&rd);
-#endif
}
+#endif
void
gf_w64_split_4_64_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor)
@@ -709,21 +710,23 @@ int gf_w64_cfm_init(gf_t *gf)
SET_FUNCTION(gf,inverse,w64,gf_w64_euclid)
SET_FUNCTION(gf,multiply_region,w64,gf_w64_multiply_region_from_single)
-#if defined(INTEL_SSE4_PCLMUL)
- gf_internal_t *h;
+#if defined(INTEL_SSE4_PCLMUL)
+ if (gf_cpu_supports_intel_pclmul) {
+ gf_internal_t *h;
- h = (gf_internal_t *) gf->scratch;
+ h = (gf_internal_t *) gf->scratch;
- if ((0xfffffffe00000000ULL & h->prim_poly) == 0){
- SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_2)
- SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_2)
- }else if((0xfffe000000000000ULL & h->prim_poly) == 0){
- SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_4)
- SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_4)
- } else {
- return 0;
+ if ((0xfffffffe00000000ULL & h->prim_poly) == 0){
+ SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_2)
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_2)
+ }else if((0xfffe000000000000ULL & h->prim_poly) == 0){
+ SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_4)
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_4)
+ } else {
+ return 0;
+ }
+ return 1;
}
- return 1;
#endif
return 0;
@@ -1261,9 +1264,9 @@ gf_w64_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_
v = _mm_srli_epi64(v, 1); }
+#ifdef INTEL_SSE2
void gf_w64_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
{
-#ifdef INTEL_SSE2
int i;
uint8_t *s8, *d8;
uint64_t vrev, one64;
@@ -1322,8 +1325,8 @@ void gf_w64_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_
s8 += 16;
}
gf_do_final_region_alignment(&rd);
-#endif
}
+#endif
#ifdef INTEL_SSE2
static
@@ -1457,26 +1460,28 @@ int gf_w64_bytwo_init(gf_t *gf)
if (h->mult_type == GF_MULT_BYTWO_p) {
SET_FUNCTION(gf,multiply,w64,gf_w64_bytwo_p_multiply)
#ifdef INTEL_SSE2
- if (h->region_type & GF_REGION_NOSIMD)
- SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_p_nosse_multiply_region)
- else
+ if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_p_sse_multiply_region)
- #else
- SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_p_nosse_multiply_region)
- if(h->region_type & GF_REGION_SIMD)
- return 0;
+ } else {
+ #endif
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_p_nosse_multiply_region)
+ if(h->region_type & GF_REGION_SIMD)
+ return 0;
+ #ifdef INTEL_SSE2
+ }
#endif
} else {
SET_FUNCTION(gf,multiply,w64,gf_w64_bytwo_b_multiply)
#ifdef INTEL_SSE2
- if (h->region_type & GF_REGION_NOSIMD)
- SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_b_nosse_multiply_region)
- else
+ if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_b_sse_multiply_region)
- #else
+ } else {
+ #endif
SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_b_nosse_multiply_region)
if(h->region_type & GF_REGION_SIMD)
return 0;
+ #ifdef INTEL_SSE2
+ }
#endif
}
SET_FUNCTION(gf,inverse,w64,gf_w64_euclid)
@@ -1975,18 +1980,20 @@ int gf_w64_split_init(gf_t *gf)
SET_FUNCTION(gf,multiply,w64,gf_w64_bytwo_p_multiply)
#if defined(INTEL_SSE4_PCLMUL)
- if ((!(h->region_type & GF_REGION_NOSIMD) &&
- (h->arg1 == 64 || h->arg2 == 64)) ||
- h->mult_type == GF_MULT_DEFAULT){
-
- if ((0xfffffffe00000000ULL & h->prim_poly) == 0){
- SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_2)
- SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_2)
- }else if((0xfffe000000000000ULL & h->prim_poly) == 0){
- SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_4)
- SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_4)
- }else{
- return 0;
+ if (gf_cpu_supports_intel_pclmul) {
+ if ((!(h->region_type & GF_REGION_NOSIMD) &&
+ (h->arg1 == 64 || h->arg2 == 64)) ||
+ h->mult_type == GF_MULT_DEFAULT){
+
+ if ((0xfffffffe00000000ULL & h->prim_poly) == 0){
+ SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_2)
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_2)
+ }else if((0xfffe000000000000ULL & h->prim_poly) == 0){
+ SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_4)
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_4)
+ }else{
+ return 0;
+ }
}
}
#endif
@@ -1996,23 +2003,27 @@ int gf_w64_split_init(gf_t *gf)
/* Allen: set region pointers for default mult type. Single pointers are
* taken care of above (explicitly for sse, implicitly for no sse). */
-#if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
if (h->mult_type == GF_MULT_DEFAULT) {
- d4 = (struct gf_split_4_64_lazy_data *) h->private;
- d4->last_value = 0;
+#if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
+ if (gf_cpu_supports_intel_sse4 || gf_cpu_supports_arm_neon) {
+ d4 = (struct gf_split_4_64_lazy_data *) h->private;
+ d4->last_value = 0;
#if defined(INTEL_SSE4)
- SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_sse_multiply_region)
+ if (gf_cpu_supports_intel_sse4)
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_sse_multiply_region)
#elif defined(ARCH_AARCH64)
- gf_w64_neon_split_init(gf);
+ if (gf_cpu_supports_arm_neon)
+ gf_w64_neon_split_init(gf);
#endif
- }
-#else
- if (h->mult_type == GF_MULT_DEFAULT) {
- d8 = (struct gf_split_8_64_lazy_data *) h->private;
- d8->last_value = 0;
- SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_8_64_lazy_multiply_region)
- }
+ } else {
+#endif
+ d8 = (struct gf_split_8_64_lazy_data *) h->private;
+ d8->last_value = 0;
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_8_64_lazy_multiply_region)
+#if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
+ }
#endif
+ }
if ((h->arg1 == 4 && h->arg2 == 64) || (h->arg1 == 64 && h->arg2 == 4)) {
d4 = (struct gf_split_4_64_lazy_data *) h->private;
@@ -2022,28 +2033,35 @@ int gf_w64_split_init(gf_t *gf)
if(h->region_type & GF_REGION_ALTMAP)
{
#ifdef INTEL_SSSE3
- SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_sse_altmap_multiply_region)
+ if (gf_cpu_supports_intel_ssse3) {
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_sse_altmap_multiply_region)
+ } else
#elif defined(ARCH_AARCH64)
- gf_w64_neon_split_init(gf);
- #else
- return 0;
+ if (gf_cpu_supports_arm_neon) {
+ gf_w64_neon_split_init(gf);
+ } else
#endif
+ return 0;
}
else //no altmap
{
#if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
- if(h->region_type & GF_REGION_NOSIMD)
- SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_multiply_region)
- else
- #if defined(INTEL_SSE4)
- SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_sse_multiply_region)
- #elif defined(ARCH_AARCH64)
- gf_w64_neon_split_init(gf);
- #endif
- #else
+ if(gf_cpu_supports_intel_sse4 || gf_cpu_supports_arm_neon) {
+ if (h->region_type & GF_REGION_NOSIMD) {
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_multiply_region)
+ } else
+ #if defined(INTEL_SSE4)
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_sse_multiply_region)
+ #elif defined(ARCH_AARCH64)
+ gf_w64_neon_split_init(gf);
+ #endif
+ } else {
+ #endif
SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_multiply_region)
if(h->region_type & GF_REGION_SIMD)
return 0;
+ #if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
+ }
#endif
}
}
@@ -2114,11 +2132,15 @@ int gf_w64_scratch_size(int mult_type, int region_type, int divide_type, int arg
* then fall through to split table scratch size code. */
#if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
+ if (gf_cpu_supports_intel_sse4 || gf_cpu_supports_arm_neon) {
arg1 = 64;
arg2 = 4;
-#else
+ } else {
+#endif
arg1 = 64;
arg2 = 8;
+#if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
+ }
#endif
case GF_MULT_SPLIT_TABLE:
diff --git a/src/gf_w8.c b/src/gf_w8.c
index 81a0eba..f647a31 100644
--- a/src/gf_w8.c
+++ b/src/gf_w8.c
@@ -13,6 +13,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
+#include "gf_cpu.h"
#define AB2(ip, am1 ,am2, b, t1, t2) {\
t1 = (b << 1) & am1;\
@@ -127,6 +128,7 @@ uint32_t gf_w8_matrix (gf_t *gf, uint32_t b)
}
+#if defined(INTEL_SSE4_PCLMUL)
static
inline
gf_val_32_t
@@ -134,8 +136,6 @@ gf_w8_clm_multiply_2 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8)
{
gf_val_32_t rv = 0;
-#if defined(INTEL_SSE4_PCLMUL)
-
__m128i a, b;
__m128i result;
__m128i prim_poly;
@@ -169,10 +169,11 @@ gf_w8_clm_multiply_2 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8)
rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
-#endif
return rv;
}
+#endif
+#if defined(INTEL_SSE4_PCLMUL)
static
inline
gf_val_32_t
@@ -180,8 +181,6 @@ gf_w8_clm_multiply_3 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8)
{
gf_val_32_t rv = 0;
-#if defined(INTEL_SSE4_PCLMUL)
-
__m128i a, b;
__m128i result;
__m128i prim_poly;
@@ -208,10 +207,11 @@ gf_w8_clm_multiply_3 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8)
rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
-#endif
return rv;
}
+#endif
+#if defined(INTEL_SSE4_PCLMUL)
static
inline
gf_val_32_t
@@ -219,8 +219,6 @@ gf_w8_clm_multiply_4 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8)
{
gf_val_32_t rv = 0;
-#if defined(INTEL_SSE4_PCLMUL)
-
__m128i a, b;
__m128i result;
__m128i prim_poly;
@@ -248,9 +246,9 @@ gf_w8_clm_multiply_4 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8)
/* Extracts 32 bit value from result. */
rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
-#endif
return rv;
}
+#endif
static
@@ -509,25 +507,29 @@ static
int gf_w8_cfm_init(gf_t *gf)
{
#if defined(INTEL_SSE4_PCLMUL)
- gf_internal_t *h;
-
- h = (gf_internal_t *) gf->scratch;
-
- if ((0xe0 & h->prim_poly) == 0){
- SET_FUNCTION(gf,multiply,w32,gf_w8_clm_multiply_2)
- SET_FUNCTION(gf,multiply_region,w32,gf_w8_clm_multiply_region_from_single_2)
- }else if ((0xc0 & h->prim_poly) == 0){
- SET_FUNCTION(gf,multiply,w32,gf_w8_clm_multiply_3)
- SET_FUNCTION(gf,multiply_region,w32,gf_w8_clm_multiply_region_from_single_3)
- }else if ((0x80 & h->prim_poly) == 0){
- SET_FUNCTION(gf,multiply,w32,gf_w8_clm_multiply_4)
- SET_FUNCTION(gf,multiply_region,w32,gf_w8_clm_multiply_region_from_single_4)
- }else{
- return 0;
- }
- return 1;
+ if (gf_cpu_supports_intel_pclmul) {
+ gf_internal_t *h;
+
+ h = (gf_internal_t *) gf->scratch;
+
+ if ((0xe0 & h->prim_poly) == 0){
+ SET_FUNCTION(gf,multiply,w32,gf_w8_clm_multiply_2)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w8_clm_multiply_region_from_single_2)
+ }else if ((0xc0 & h->prim_poly) == 0){
+ SET_FUNCTION(gf,multiply,w32,gf_w8_clm_multiply_3)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w8_clm_multiply_region_from_single_3)
+ }else if ((0x80 & h->prim_poly) == 0){
+ SET_FUNCTION(gf,multiply,w32,gf_w8_clm_multiply_4)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w8_clm_multiply_region_from_single_4)
+ }else{
+ return 0;
+ }
+ return 1;
+ }
#elif defined(ARM_NEON)
- return gf_w8_neon_cfm_init(gf);
+ if (gf_cpu_supports_arm_neon) {
+ return gf_w8_neon_cfm_init(gf);
+ }
#endif
return 0;
@@ -1103,20 +1105,21 @@ int gf_w8_split_init(gf_t *gf)
}
SET_FUNCTION(gf,multiply,w32,gf_w8_split_multiply)
-
- #if defined(INTEL_SSSE3) || defined(ARM_NEON)
- if (h->region_type & GF_REGION_NOSIMD)
- SET_FUNCTION(gf,multiply_region,w32,gf_w8_split_multiply_region)
- else
- #if defined(INTEL_SSSE3)
+
+ #if defined(INTEL_SSSE3)
+ if (gf_cpu_supports_intel_ssse3 && !(h->region_type & GF_REGION_NOSIMD)) {
SET_FUNCTION(gf,multiply_region,w32,gf_w8_split_multiply_region_sse)
- #elif defined(ARM_NEON)
+ } else {
+ #elif defined(ARM_NEON)
+ if (gf_cpu_supports_arm_neon && !(h->region_type & GF_REGION_NOSIMD)) {
gf_w8_neon_split_init(gf);
- #endif
- #else
+ } else {
+ #endif
SET_FUNCTION(gf,multiply_region,w32,gf_w8_split_multiply_region)
if(h->region_type & GF_REGION_SIMD)
return 0;
+ #if defined(INTEL_SSSE3) || defined(ARM_NEON)
+ }
#endif
return 1;
@@ -1134,17 +1137,12 @@ int gf_w8_table_init(gf_t *gf)
struct gf_w8_double_table_data *dtd = NULL;
struct gf_w8_double_table_lazy_data *ltd = NULL;
struct gf_w8_default_data *dd = NULL;
- int a, b, c, prod, scase, use_simd;
+ int a, b, c, prod, scase;
h = (gf_internal_t *) gf->scratch;
-#if defined(INTEL_SSSE3) || defined(ARM_NEON)
- use_simd = 1;
-#else
- use_simd = 0;
-#endif
-
- if (h->mult_type == GF_MULT_DEFAULT && use_simd) {
+ if (h->mult_type == GF_MULT_DEFAULT &&
+ (gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon)) {
dd = (struct gf_w8_default_data *)h->private;
scase = 3;
bzero(dd->high, sizeof(uint8_t) * GF_FIELD_SIZE * GF_HALF_SIZE);
@@ -1220,13 +1218,19 @@ int gf_w8_table_init(gf_t *gf)
break;
case 3:
#if defined(INTEL_SSSE3) || defined(ARM_NEON)
- SET_FUNCTION(gf,divide,w32,gf_w8_default_divide)
- SET_FUNCTION(gf,multiply,w32,gf_w8_default_multiply)
+ if (gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon) {
+ SET_FUNCTION(gf,divide,w32,gf_w8_default_divide)
+ SET_FUNCTION(gf,multiply,w32,gf_w8_default_multiply)
#if defined(INTEL_SSSE3)
- SET_FUNCTION(gf,multiply_region,w32,gf_w8_split_multiply_region_sse)
+ if (gf_cpu_supports_intel_ssse3) {
+ SET_FUNCTION(gf,multiply_region,w32,gf_w8_split_multiply_region_sse)
+ }
#elif defined(ARM_NEON)
- gf_w8_neon_split_init(gf);
+ if (gf_cpu_supports_arm_neon) {
+ gf_w8_neon_split_init(gf);
+ }
#endif
+ }
#endif
break;
}
@@ -2192,26 +2196,28 @@ int gf_w8_bytwo_init(gf_t *gf)
if (h->mult_type == GF_MULT_BYTWO_p) {
SET_FUNCTION(gf,multiply,w32,gf_w8_bytwo_p_multiply)
#ifdef INTEL_SSE2
- if (h->region_type & GF_REGION_NOSIMD)
- SET_FUNCTION(gf,multiply_region,w32,gf_w8_bytwo_p_nosse_multiply_region)
- else
+ if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
SET_FUNCTION(gf,multiply_region,w32,gf_w8_bytwo_p_sse_multiply_region)
-#else
- SET_FUNCTION(gf,multiply_region,w32,gf_w8_bytwo_p_nosse_multiply_region)
- if(h->region_type & GF_REGION_SIMD)
- return 0;
+ } else {
+#endif
+ SET_FUNCTION(gf,multiply_region,w32,gf_w8_bytwo_p_nosse_multiply_region)
+ if(h->region_type & GF_REGION_SIMD)
+ return 0;
+#ifdef INTEL_SSE2
+ }
#endif
} else {
SET_FUNCTION(gf,multiply,w32,gf_w8_bytwo_b_multiply)
#ifdef INTEL_SSE2
- if (h->region_type & GF_REGION_NOSIMD)
- SET_FUNCTION(gf,multiply_region,w32,gf_w8_bytwo_b_nosse_multiply_region)
- else
+ if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
SET_FUNCTION(gf,multiply_region,w32,gf_w8_bytwo_b_sse_multiply_region)
-#else
+ } else {
+#endif
SET_FUNCTION(gf,multiply_region,w32,gf_w8_bytwo_b_nosse_multiply_region)
if(h->region_type & GF_REGION_SIMD)
return 0;
+#ifdef INTEL_SSE2
+ }
#endif
}
return 1;
@@ -2229,9 +2235,9 @@ int gf_w8_scratch_size(int mult_type, int region_type, int divide_type, int arg1
switch(mult_type)
{
case GF_MULT_DEFAULT:
-#if defined(INTEL_SSSE3) || defined(ARM_NEON)
- return sizeof(gf_internal_t) + sizeof(struct gf_w8_default_data) + 64;
-#endif
+ if (gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon) {
+ return sizeof(gf_internal_t) + sizeof(struct gf_w8_default_data) + 64;
+ }
return sizeof(gf_internal_t) + sizeof(struct gf_w8_single_table_data) + 64;
case GF_MULT_TABLE:
if (region_type == GF_REGION_CAUCHY) {
diff --git a/tools/test_simd.sh b/tools/test_simd.sh
index 1268f87..1b0e319 100755
--- a/tools/test_simd.sh
+++ b/tools/test_simd.sh
@@ -118,6 +118,237 @@ test_compile() {
esac
}
+# disable through build flags
+runtime_arm_flags() {
+ failed=0
+
+ echo "====NO SIMD support..." >> ${1}
+ { ./configure --disable-neon && make clean && make CFLAGS="-DDEBUG_FUNCTIONS"; } || { echo "Compile FAILED" >> ${1}; return 1; }
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ echo "====FULL SIMD support..." >> ${1}
+ { ./configure && make clean && make CFLAGS="-DDEBUG_FUNCTIONS"; } || { echo "Compile FAILED" >> ${1}; return 1; }
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ return ${failed}
+}
+
+# build once with FULL SIMD and disable at runtime through environment
+runtime_arm_env() {
+ failed=0
+
+ { ./configure && make clean && make CFLAGS="-DDEBUG_FUNCTIONS"; } || { echo "Compile FAILED" >> ${1}; return 1; }
+
+ echo "====NO SIMD support..." >> ${1}
+ export GF_COMPLETE_DISABLE_NEON=1
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ echo "====FULL SIMD support..." >> ${1}
+ unset GF_COMPLETE_DISABLE_NEON
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ return ${failed}
+}
+
+runtime_intel_flags() {
+ failed=0
+
+ echo "====NO SIMD support..." >> ${1}
+ { ./configure --disable-sse && make clean && make CFLAGS="-DDEBUG_FUNCTIONS"; } || { echo "FAIL" >> ${1}; ((++failed)); }
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ echo "====SSE2 support..." >> ${1}
+ export ax_cv_have_sse_ext=no
+ export ax_cv_have_sse2_ext=yes
+ export ax_cv_have_sse3_ext=no
+ export ax_cv_have_ssse3_ext=no
+ export ax_cv_have_sse41_ext=no
+ export ax_cv_have_sse42_ext=no
+ export ax_cv_have_pclmuldq_ext=no
+ { ./configure && make clean && make CFLAGS="-DDEBUG_FUNCTIONS"; } || { echo "FAIL" >> ${1}; ((++failed)); }
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ echo "====SSE2,SSE3 support..." >> ${1}
+ export ax_cv_have_sse_ext=no
+ export ax_cv_have_sse2_ext=yes
+ export ax_cv_have_sse3_ext=yes
+ export ax_cv_have_ssse3_ext=no
+ export ax_cv_have_sse41_ext=no
+ export ax_cv_have_sse42_ext=no
+ export ax_cv_have_pclmuldq_ext=no
+ { ./configure && make clean && make CFLAGS="-DDEBUG_FUNCTIONS"; } || { echo "FAIL" >> ${1}; ((++failed)); }
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ echo "====SSE2,SSE3,SSSE3 support..." >> ${1}
+ export ax_cv_have_sse_ext=no
+ export ax_cv_have_sse2_ext=yes
+ export ax_cv_have_sse3_ext=yes
+ export ax_cv_have_ssse3_ext=yes
+ export ax_cv_have_sse41_ext=no
+ export ax_cv_have_sse42_ext=no
+ export ax_cv_have_pclmuldq_ext=no
+ { ./configure && make clean && make CFLAGS="-DDEBUG_FUNCTIONS"; } || { echo "FAIL" >> ${1}; ((++failed)); }
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ echo "====SSE2,SSE3,SSSE3,SSE4_1 support..." >> ${1}
+ export ax_cv_have_sse_ext=no
+ export ax_cv_have_sse2_ext=yes
+ export ax_cv_have_sse3_ext=yes
+ export ax_cv_have_ssse3_ext=yes
+ export ax_cv_have_sse41_ext=yes
+ export ax_cv_have_sse42_ext=no
+ export ax_cv_have_pclmuldq_ext=no
+ { ./configure && make clean && make CFLAGS="-DDEBUG_FUNCTIONS"; } || { echo "FAIL" >> ${1}; ((++failed)); }
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ echo "====SSE2,SSE3,SSSE3,SSE4_2 support..." >> ${1}
+ export ax_cv_have_sse_ext=no
+ export ax_cv_have_sse2_ext=yes
+ export ax_cv_have_sse3_ext=yes
+ export ax_cv_have_ssse3_ext=yes
+ export ax_cv_have_sse41_ext=no
+ export ax_cv_have_sse42_ext=yes
+ export ax_cv_have_pclmuldq_ext=no
+ { ./configure && make clean && make CFLAGS="-DDEBUG_FUNCTIONS"; } || { echo "FAIL" >> ${1}; ((++failed)); }
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ echo "====FULL SIMD support..." >> ${1}
+ { ./configure && make clean && make CFLAGS="-DDEBUG_FUNCTIONS"; } || { echo "FAIL" >> ${1}; ((++failed)); }
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ return ${failed}
+}
+
+runtime_intel_env() {
+ failed=0
+
+ # compile a build with full SIMD support
+ { ./configure && make clean && make CFLAGS="-DDEBUG_FUNCTIONS"; } || { echo "Compile FAILED" >> ${1}; return 1; }
+
+ echo "====NO SIMD support..." >> ${1}
+ export GF_COMPLETE_DISABLE_SSE2=1
+ export GF_COMPLETE_DISABLE_SSE3=1
+ export GF_COMPLETE_DISABLE_SSSE3=1
+ export GF_COMPLETE_DISABLE_SSE4=1
+ export GF_COMPLETE_DISABLE_SSE4_PCLMUL=1
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ echo "====SSE2 support..." >> ${1}
+ unset GF_COMPLETE_DISABLE_SSE2
+ export GF_COMPLETE_DISABLE_SSE3=1
+ export GF_COMPLETE_DISABLE_SSSE3=1
+ export GF_COMPLETE_DISABLE_SSE4=1
+ export GF_COMPLETE_DISABLE_SSE4_PCLMUL=1
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ echo "====SSE2,SSE3 support..." >> ${1}
+ unset GF_COMPLETE_DISABLE_SSE2
+ unset GF_COMPLETE_DISABLE_SSE3
+ export GF_COMPLETE_DISABLE_SSSE3=1
+ export GF_COMPLETE_DISABLE_SSE4=1
+ export GF_COMPLETE_DISABLE_SSE4_PCLMUL=1
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ echo "====SSE2,SSE3,SSSE3 support..." >> ${1}
+ unset GF_COMPLETE_DISABLE_SSE2
+ unset GF_COMPLETE_DISABLE_SSE3
+ unset GF_COMPLETE_DISABLE_SSSE3
+ export GF_COMPLETE_DISABLE_SSE4=1
+ export GF_COMPLETE_DISABLE_SSE4_PCLMUL=1
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ echo "====SSE2,SSE3,SSSE3,SSE4_1 support..." >> ${1}
+ unset GF_COMPLETE_DISABLE_SSE2
+ unset GF_COMPLETE_DISABLE_SSE3
+ unset GF_COMPLETE_DISABLE_SSSE3
+ unset GF_COMPLETE_DISABLE_SSE4
+ export GF_COMPLETE_DISABLE_SSE4_PCLMUL=1
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ echo "====SSE2,SSE3,SSSE3,SSE4_2 support..." >> ${1}
+ unset GF_COMPLETE_DISABLE_SSE2
+ unset GF_COMPLETE_DISABLE_SSE3
+ unset GF_COMPLETE_DISABLE_SSSE3
+ unset GF_COMPLETE_DISABLE_SSE4
+ export GF_COMPLETE_DISABLE_SSE4_PCLMUL=1
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ echo "====FULL SIMD support..." >> ${1}
+ unset GF_COMPLETE_DISABLE_SSE2
+ unset GF_COMPLETE_DISABLE_SSE3
+ unset GF_COMPLETE_DISABLE_SSSE3
+ unset GF_COMPLETE_DISABLE_SSE4
+ unset GF_COMPLETE_DISABLE_SSE4_PCLMUL
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ return ${failed}
+}
+
+test_runtime() {
+ rm -f ${results}.left
+ rm -f ${results}.right
+
+ case $host_cpu in
+ aarch64*|arm*)
+ runtime_arm_flags ${results}.left
+ runtime_arm_env ${results}.right
+ ;;
+ i[3456]86*|x86_64*|amd64*)
+ runtime_intel_flags ${results}.left
+ runtime_intel_env ${results}.right
+ ;;
+ esac
+
+ echo "======LEFT======" > ${results}
+ cat ${results}.left >> ${results}
+ echo "======RIGHT======" >> ${results}
+ cat ${results}.right >> ${results}
+ echo "======RESULT======" >> ${results}
+ if diff "${results}.left" "${results}.right"; then
+ echo SUCCESS >> ${results}
+ return 0
+ else
+ echo FAILURE >> ${results}
+ return 1
+ fi
+}
+
cd ${script_dir}/..
rm -f ${results}
diff --git a/tools/test_simd_qemu.sh b/tools/test_simd_qemu.sh
index a270e20..7b2cb1c 100755
--- a/tools/test_simd_qemu.sh
+++ b/tools/test_simd_qemu.sh
@@ -224,6 +224,8 @@ run_test_simd_basic() {
{ run_test $arch $cpu "unit" && echo "SUCCESS"; } || { echo "FAILED"; ((++failed)); }
echo "=====running functions test"
{ run_test $arch $cpu "functions" && echo "SUCCESS"; } || { echo "FAILED"; ((++failed)); }
+ echo "=====running runtime test"
+ { run_test $arch $cpu "runtime" && echo "SUCCESS"; } || { echo "FAILED"; ((++failed)); }
stop_qemu
return ${failed}