-rw-r--r--   .gitignore                 2
-rw-r--r--   include/gf_cpu.h          20
-rw-r--r--   src/Makefile.am           16
-rw-r--r--   src/gf.c                  21
-rw-r--r--   src/gf_cpu.c             153
-rw-r--r--   src/gf_w128.c             48
-rw-r--r--   src/gf_w16.c             127
-rw-r--r--   src/gf_w32.c             181
-rw-r--r--   src/gf_w4.c               92
-rw-r--r--   src/gf_w64.c             164
-rw-r--r--   src/gf_w8.c              130
-rwxr-xr-x   tools/test_simd.sh       231
-rwxr-xr-x   tools/test_simd_qemu.sh    2
13 files changed, 810 insertions, 377 deletions
diff --git a/.gitignore b/.gitignore
index 22e6fbe..bfc1dfc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -75,4 +75,4 @@ tools/gf_time
tools/gf_unit_w*
tools/test-suite.log
tools/.qemu/
-tools/test_simd*.results
+tools/test_simd*.results*
diff --git a/include/gf_cpu.h b/include/gf_cpu.h
new file mode 100644
index 0000000..71c7227
--- /dev/null
+++ b/include/gf_cpu.h
@@ -0,0 +1,20 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_cpu.h
+ *
+ * Identifies whether the CPU supports SIMD instructions at runtime.
+ */
+
+#pragma once
+
+extern int gf_cpu_supports_intel_pclmul;
+extern int gf_cpu_supports_intel_sse4;
+extern int gf_cpu_supports_intel_ssse3;
+extern int gf_cpu_supports_intel_sse3;
+extern int gf_cpu_supports_intel_sse2;
+extern int gf_cpu_supports_arm_neon;
+
+void gf_cpu_identify(void);
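
Note: the new header exposes one init call plus a set of global flags. A minimal consumer sketch (illustrative only, not part of the patch; it assumes the library is built and linked as usual):

    #include <stdio.h>
    #include "gf_cpu.h"

    int main(void)
    {
      /* populate the gf_cpu_supports_* globals exactly once */
      gf_cpu_identify();

      if (gf_cpu_supports_intel_sse2)
        printf("runtime SSE2 available\n");
      if (gf_cpu_supports_arm_neon)
        printf("runtime NEON available\n");
      return 0;
    }
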
diff --git a/src/Makefile.am b/src/Makefile.am
index a3bd37a..cfc2a50 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -4,11 +4,21 @@
AUTOMAKE_OPTIONS = subdir-objects
AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include
-AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC -Wsign-compare
+# Avoid using SIMD_FLAGS for code that calls strcmp, as newer gcc
+# versions will use SIMD for the strcmp implementation. Instead
+# we create a static library just for gf_method that is not compiled
+# with SIMD_FLAGS; this static library then gets linked into gf_complete.so
+noinst_LTLIBRARIES = libgf_util.la
+libgf_util_la_SOURCES = gf_method.c
+libgf_util_la_CFLAGS = -O3 -fPIC -Wsign-compare
+
+# we narrowly use SIMD_FLAGS for code that needs it
lib_LTLIBRARIES = libgf_complete.la
-libgf_complete_la_SOURCES = gf.c gf_method.c gf_wgen.c gf_w4.c gf_w8.c gf_w16.c gf_w32.c \
- gf_w64.c gf_w128.c gf_rand.c gf_general.c
+libgf_complete_la_SOURCES = gf.c gf_wgen.c gf_w4.c gf_w8.c gf_w16.c gf_w32.c \
+ gf_w64.c gf_w128.c gf_rand.c gf_general.c gf_cpu.c
+libgf_complete_la_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC -Wsign-compare
+libgf_complete_la_LIBADD = libgf_util.la
if HAVE_NEON
libgf_complete_la_SOURCES += neon/gf_w4_neon.c \
diff --git a/src/gf.c b/src/gf.c
index b7a5c01..feeafdc 100644
--- a/src/gf.c
+++ b/src/gf.c
@@ -12,6 +12,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
+#include "gf_cpu.h"
int _gf_errno = GF_E_DEFAULT;
@@ -207,20 +208,28 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
if (region_type & (~tmp)) { _gf_errno = GF_E_UNK_REG; return 0; }
#ifdef INTEL_SSE2
- sse2 = 1;
+ if (gf_cpu_supports_intel_sse2) {
+ sse2 = 1;
+ }
#endif
#ifdef INTEL_SSSE3
- sse3 = 1;
+ if (gf_cpu_supports_intel_ssse3) {
+ sse3 = 1;
+ }
#endif
#ifdef INTEL_SSE4_PCLMUL
- pclmul = 1;
+ if (gf_cpu_supports_intel_pclmul) {
+ pclmul = 1;
+ }
#endif
#ifdef ARM_NEON
- pclmul = (w == 4 || w == 8);
- sse3 = 1;
+ if (gf_cpu_supports_arm_neon) {
+ pclmul = (w == 4 || w == 8);
+ sse3 = 1;
+ }
#endif
@@ -473,6 +482,8 @@ int gf_init_hard(gf_t *gf, int w, int mult_type,
int sz;
gf_internal_t *h;
+ gf_cpu_identify();
+
if (gf_error_check(w, mult_type, region_type, divide_type,
arg1, arg2, prim_poly, base_gf) == 0) return 0;
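
The gf.c hunks above show the idiom the rest of the patch repeats: the compile-time #ifdef guards stay, and a runtime check on the new flags is nested inside them. An illustrative sketch of that dispatch pattern (pick_kernel is a made-up name, not a library function):

    #include <stdio.h>
    #include "gf_cpu.h"

    static void pick_kernel(void)
    {
    #ifdef INTEL_SSE2
      /* fast path is compiled in only when the binary was built with SSE2 ... */
      if (gf_cpu_supports_intel_sse2) {   /* ... and the running CPU reports it */
        printf("using SSE2 kernel\n");
        return;
      }
    #endif
      printf("using generic kernel\n");
    }

    int main(void)
    {
      gf_cpu_identify();
      pick_kernel();
      return 0;
    }
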
diff --git a/src/gf_cpu.c b/src/gf_cpu.c
new file mode 100644
index 0000000..ee2f847
--- /dev/null
+++ b/src/gf_cpu.c
@@ -0,0 +1,153 @@
+/*
+ * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
+ * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
+ * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
+ *
+ * gf_cpu.c
+ *
+ * Identifies whether the CPU supports SIMD instructions at runtime.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+int gf_cpu_identified = 0;
+
+int gf_cpu_supports_intel_pclmul = 0;
+int gf_cpu_supports_intel_sse4 = 0;
+int gf_cpu_supports_intel_ssse3 = 0;
+int gf_cpu_supports_intel_sse3 = 0;
+int gf_cpu_supports_intel_sse2 = 0;
+int gf_cpu_supports_arm_neon = 0;
+
+#if defined(__x86_64__)
+
+void gf_cpu_identify(void)
+{
+ if (gf_cpu_identified) {
+ return;
+ }
+
+ int op = 1, eax, ebx, ecx, edx;
+
+ __asm__("cpuid"
+ : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
+ : "a" (op));
+
+#if defined(INTEL_SSE4_PCLMUL)
+ if ((ecx & 1) != 0 && !getenv("GF_COMPLETE_DISABLE_SSE4_PCLMUL")) {
+ gf_cpu_supports_intel_pclmul = 1;
+#ifdef DEBUG_CPU_DETECTION
+ printf("#gf_cpu_supports_intel_pclmul\n");
+#endif
+ }
+#endif
+
+#if defined(INTEL_SSE4)
+ if (((ecx & (1<<20)) != 0 || (ecx & (1<<19)) != 0) && !getenv("GF_COMPLETE_DISABLE_SSE4")) {
+ gf_cpu_supports_intel_sse4 = 1;
+#ifdef DEBUG_CPU_DETECTION
+ printf("#gf_cpu_supports_intel_sse4\n");
+#endif
+ }
+#endif
+
+#if defined(INTEL_SSSE3)
+ if ((ecx & (1<<9)) != 0 && !getenv("GF_COMPLETE_DISABLE_SSSE3")) {
+ gf_cpu_supports_intel_ssse3 = 1;
+#ifdef DEBUG_CPU_DETECTION
+ printf("#gf_cpu_supports_intel_ssse3\n");
+#endif
+ }
+#endif
+
+#if defined(INTEL_SSE3)
+ if ((ecx & 1) != 0 && !getenv("GF_COMPLETE_DISABLE_SSE3")) {
+ gf_cpu_supports_intel_sse3 = 1;
+#ifdef DEBUG_CPU_DETECTION
+ printf("#gf_cpu_supports_intel_sse3\n");
+#endif
+ }
+#endif
+
+#if defined(INTEL_SSE2)
+ if ((edx & (1<<26)) != 0 && !getenv("GF_COMPLETE_DISABLE_SSE2")) {
+ gf_cpu_supports_intel_sse2 = 1;
+#ifdef DEBUG_CPU_DETECTION
+ printf("#gf_cpu_supports_intel_sse2\n");
+#endif
+ }
+#endif
+
+ gf_cpu_identified = 1;
+}
+
+#elif defined(__arm__) || defined(__aarch64__)
+
+#ifdef __linux__
+
+#include <stdio.h>
+#include <unistd.h>
+#include <elf.h>
+#include <linux/auxvec.h>
+#include <asm/hwcap.h>
+#include <fcntl.h>
+
+unsigned long get_hwcap(unsigned long type) {
+ unsigned long hwcap = 0;
+ int fd = open("/proc/self/auxv", O_RDONLY);
+ if (fd > 0) {
+ Elf32_auxv_t auxv;
+ while (read(fd, &auxv, sizeof(Elf32_auxv_t))) {
+ if (auxv.a_type == type) {
+ hwcap = auxv.a_un.a_val;
+ break;
+ }
+ }
+ close(fd);
+ }
+
+ return hwcap;
+}
+
+#endif // linux
+
+void gf_cpu_identify(void)
+{
+ if (gf_cpu_identified) {
+ return;
+ }
+
+#if defined(ARM_NEON)
+ if (!getenv("GF_COMPLETE_DISABLE_NEON")) {
+#if __linux__ && __arm__
+ gf_cpu_supports_arm_neon = (get_hwcap(AT_HWCAP) & HWCAP_NEON) > 0;
+#elif __aarch64__
+ // ASIMD is supported on all aarch64 architectures
+ gf_cpu_supports_arm_neon = 1;
+#else
+ // we assume that NEON is supported if the compiler supports
+ // NEON and we don't have a reliable way to detect runtime support.
+ gf_cpu_supports_arm_neon = 1;
+#endif
+
+#ifdef DEBUG_CPU_DETECTION
+ if (gf_cpu_supports_arm_neon) {
+ printf("#gf_cpu_supports_arm_neon\n");
+ }
+#endif
+ }
+#endif // defined(ARM_NEON)
+
+ gf_cpu_identified = 1;
+}
+
+#else // defined(__arm__) || defined(__aarch64__)
+
+void gf_cpu_identify(void)
+{
+ gf_cpu_identified = 1;
+}
+
+#endif
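
Because detection goes through getenv(), each feature can also be masked at runtime without rebuilding. A small sketch (illustrative only; note that gf_cpu_identify() caches its result, so the variable must be set before the first call):

    #include <stdio.h>
    #include <stdlib.h>
    #include "gf_cpu.h"

    int main(void)
    {
      /* honoured by the detection code above */
      setenv("GF_COMPLETE_DISABLE_SSE2", "1", 1);
      gf_cpu_identify();
      printf("sse2 enabled at runtime: %d\n", gf_cpu_supports_intel_sse2);
      return 0;
    }
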
diff --git a/src/gf_w128.c b/src/gf_w128.c
index 5f650b3..74f72e8 100644
--- a/src/gf_w128.c
+++ b/src/gf_w128.c
@@ -11,6 +11,7 @@
#include "gf_int.h"
#include <stdio.h>
#include <stdlib.h>
+#include "gf_cpu.h"
#define GF_FIELD_WIDTH (128)
@@ -290,11 +291,11 @@ gf_w128_shift_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_12
return;
}
+#if defined(INTEL_SSE4_PCLMUL)
+
void
gf_w128_clm_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
{
-#if defined(INTEL_SSE4_PCLMUL)
-
__m128i a,b;
__m128i result0,result1;
__m128i prim_poly;
@@ -338,9 +339,8 @@ gf_w128_clm_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_
c128[0] = (uint64_t)_mm_extract_epi64(result1,1);
c128[1] = (uint64_t)_mm_extract_epi64(result1,0);
-#endif
-return;
}
+#endif
void
gf_w128_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
@@ -376,10 +376,10 @@ gf_w128_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_
return;
}
+#if defined(INTEL_SSE4)
void
gf_w128_sse_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
{
-#if defined(INTEL_SSE4)
int i;
__m128i a, b, pp, prod, amask, u_middle_one;
/*John: pmask is always the highest bit set, and the rest zeros. amask changes, it's a countdown.*/
@@ -427,16 +427,16 @@ gf_w128_sse_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_
}
c128[0] = (uint64_t)_mm_extract_epi64(prod, 1);
c128[1] = (uint64_t)_mm_extract_epi64(prod, 0);
-#endif
return;
}
+#endif
/* Ben: This slow function implements sse instrutions for bytwo_b because why not */
+#if defined(INTEL_SSE4)
void
gf_w128_sse_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
{
-#if defined(INTEL_SSE4)
__m128i a, b, lmask, hmask, pp, c, middle_one;
gf_internal_t *h;
uint64_t topbit, middlebit;
@@ -471,8 +471,8 @@ gf_w128_sse_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_
if (middlebit) b = _mm_xor_si128(b, middle_one);
if (topbit) b = _mm_xor_si128(b, pp);
}
-#endif
}
+#endif
void
gf_w128_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
@@ -1146,7 +1146,7 @@ gf_w128_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val,
}
/* a^-1 -> b */
- void
+void
gf_w128_euclid(GFP gf, gf_val_128_t a128, gf_val_128_t b128)
{
uint64_t e_i[2], e_im1[2], e_ip1[2];
@@ -1239,7 +1239,7 @@ gf_w128_euclid(GFP gf, gf_val_128_t a128, gf_val_128_t b128)
return;
}
- void
+void
gf_w128_divide_from_inverse(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
{
uint64_t d[2];
@@ -1248,7 +1248,7 @@ gf_w128_divide_from_inverse(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val
return;
}
- void
+void
gf_w128_inverse_from_divide(GFP gf, gf_val_128_t a128, gf_val_128_t b128)
{
uint64_t one128[2];
@@ -1260,7 +1260,7 @@ gf_w128_inverse_from_divide(GFP gf, gf_val_128_t a128, gf_val_128_t b128)
static
- void
+void
gf_w128_composite_inverse(gf_t *gf, gf_val_128_t a, gf_val_128_t inv)
{
gf_internal_t *h = (gf_internal_t *) gf->scratch;
@@ -1421,10 +1421,12 @@ static
int gf_w128_cfm_init(gf_t *gf)
{
#if defined(INTEL_SSE4_PCLMUL)
- SET_FUNCTION(gf,inverse,w128,gf_w128_euclid)
- SET_FUNCTION(gf,multiply,w128,gf_w128_clm_multiply)
- SET_FUNCTION(gf,multiply_region,w128,gf_w128_clm_multiply_region_from_single)
- return 1;
+ if (gf_cpu_supports_intel_pclmul) {
+ SET_FUNCTION(gf,inverse,w128,gf_w128_euclid)
+ SET_FUNCTION(gf,multiply,w128,gf_w128_clm_multiply)
+ SET_FUNCTION(gf,multiply_region,w128,gf_w128_clm_multiply_region_from_single)
+ return 1;
+ }
#endif
return 0;
@@ -1527,7 +1529,7 @@ int gf_w128_split_init(gf_t *gf)
SET_FUNCTION(gf,multiply,w128,gf_w128_bytwo_p_multiply)
#if defined(INTEL_SSE4_PCLMUL)
- if (!(h->region_type & GF_REGION_NOSIMD)){
+ if (gf_cpu_supports_intel_pclmul && !(h->region_type & GF_REGION_NOSIMD)){
SET_FUNCTION(gf,multiply,w128,gf_w128_clm_multiply)
}
#endif
@@ -1546,23 +1548,19 @@ int gf_w128_split_init(gf_t *gf)
if((h->region_type & GF_REGION_ALTMAP))
{
#ifdef INTEL_SSE4
- if(!(h->region_type & GF_REGION_NOSIMD))
+ if(gf_cpu_supports_intel_sse4 && !(h->region_type & GF_REGION_NOSIMD))
SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_4_128_sse_altmap_multiply_region)
else
- return 0;
- #else
- return 0;
#endif
+ return 0;
}
else {
#ifdef INTEL_SSE4
- if(!(h->region_type & GF_REGION_NOSIMD))
+ if(gf_cpu_supports_intel_sse4 && !(h->region_type & GF_REGION_NOSIMD))
SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_4_128_sse_multiply_region)
else
- SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_4_128_multiply_region)
- #else
- SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_4_128_multiply_region)
#endif
+ SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_4_128_multiply_region)
}
}
return 1;
diff --git a/src/gf_w16.c b/src/gf_w16.c
index a62ea51..8316892 100644
--- a/src/gf_w16.c
+++ b/src/gf_w16.c
@@ -12,6 +12,7 @@
#include <stdio.h>
#include <stdlib.h>
#include "gf_w16.h"
+#include "gf_cpu.h"
#define AB2(ip, am1 ,am2, b, t1, t2) {\
t1 = (b << 1) & am1;\
@@ -391,6 +392,7 @@ gf_val_32_t gf_w16_matrix (gf_t *gf, gf_val_32_t b)
extra memory.
*/
+#if defined(INTEL_SSE4_PCLMUL)
static
inline
gf_val_32_t
@@ -398,8 +400,6 @@ gf_w16_clm_multiply_2 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16)
{
gf_val_32_t rv = 0;
-#if defined(INTEL_SSE4_PCLMUL)
-
__m128i a, b;
__m128i result;
__m128i prim_poly;
@@ -433,11 +433,11 @@ gf_w16_clm_multiply_2 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16)
rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
-
-#endif
return rv;
}
+#endif
+#if defined(INTEL_SSE4_PCLMUL)
static
inline
gf_val_32_t
@@ -445,8 +445,6 @@ gf_w16_clm_multiply_3 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16)
{
gf_val_32_t rv = 0;
-#if defined(INTEL_SSE4_PCLMUL)
-
__m128i a, b;
__m128i result;
__m128i prim_poly;
@@ -473,11 +471,11 @@ gf_w16_clm_multiply_3 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16)
rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
-
-#endif
return rv;
}
+#endif
+#if defined(INTEL_SSE4_PCLMUL)
static
inline
gf_val_32_t
@@ -485,8 +483,6 @@ gf_w16_clm_multiply_4 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16)
{
gf_val_32_t rv = 0;
-#if defined(INTEL_SSE4_PCLMUL)
-
__m128i a, b;
__m128i result;
__m128i prim_poly;
@@ -515,10 +511,9 @@ gf_w16_clm_multiply_4 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16)
rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
-
-#endif
return rv;
}
+#endif
static
@@ -556,25 +551,27 @@ static
int gf_w16_cfm_init(gf_t *gf)
{
#if defined(INTEL_SSE4_PCLMUL)
- gf_internal_t *h;
+ if (gf_cpu_supports_intel_pclmul) {
+ gf_internal_t *h;
- h = (gf_internal_t *) gf->scratch;
-
- /*Ben: Determining how many reductions to do */
-
- if ((0xfe00 & h->prim_poly) == 0) {
- SET_FUNCTION(gf,multiply,w32,gf_w16_clm_multiply_2)
- SET_FUNCTION(gf,multiply_region,w32,gf_w16_clm_multiply_region_from_single_2)
- } else if((0xf000 & h->prim_poly) == 0) {
- SET_FUNCTION(gf,multiply,w32,gf_w16_clm_multiply_3)
- SET_FUNCTION(gf,multiply_region,w32,gf_w16_clm_multiply_region_from_single_3)
- } else if ((0xe000 & h->prim_poly) == 0) {
- SET_FUNCTION(gf,multiply,w32,gf_w16_clm_multiply_4)
- SET_FUNCTION(gf,multiply_region,w32,gf_w16_clm_multiply_region_from_single_4)
- } else {
- return 0;
- }
- return 1;
+ h = (gf_internal_t *) gf->scratch;
+
+ /*Ben: Determining how many reductions to do */
+
+ if ((0xfe00 & h->prim_poly) == 0) {
+ SET_FUNCTION(gf,multiply,w32,gf_w16_clm_multiply_2)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w16_clm_multiply_region_from_single_2)
+ } else if((0xf000 & h->prim_poly) == 0) {
+ SET_FUNCTION(gf,multiply,w32,gf_w16_clm_multiply_3)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w16_clm_multiply_region_from_single_3)
+ } else if ((0xe000 & h->prim_poly) == 0) {
+ SET_FUNCTION(gf,multiply,w32,gf_w16_clm_multiply_4)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w16_clm_multiply_region_from_single_4)
+ } else {
+ return 0;
+ }
+ return 1;
+ }
#endif
return 0;
@@ -688,10 +685,9 @@ int gf_w16_log_init(gf_t *gf)
if (check) {
if (h->mult_type != GF_MULT_LOG_TABLE) {
-
-#if defined(INTEL_SSE4_PCLMUL)
- return gf_w16_cfm_init(gf);
-#endif
+ if (gf_cpu_supports_intel_pclmul) {
+ return gf_w16_cfm_init(gf);
+ }
return gf_w16_shift_init(gf);
} else {
_gf_errno = GF_E_LOGPOLY;
@@ -948,11 +944,11 @@ gf_w16_table_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v
gf_do_final_region_alignment(&rd);
}
+#ifdef INTEL_SSSE3
static
void
gf_w16_split_4_16_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
-#ifdef INTEL_SSSE3
uint64_t i, j, *s64, *d64, *top64;;
uint64_t c, prod;
uint8_t low[4][16];
@@ -1078,14 +1074,14 @@ gf_w16_split_4_16_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_v
}
gf_do_final_region_alignment(&rd);
-#endif
}
+#endif
+#ifdef INTEL_SSSE3
static
void
gf_w16_split_4_16_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
-#ifdef INTEL_SSSE3
uint64_t i, j, *s64, *d64, *top64;;
uint64_t c, prod;
uint8_t low[4][16];
@@ -1187,8 +1183,8 @@ gf_w16_split_4_16_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *des
}
gf_do_final_region_alignment(&rd);
-#endif
}
+#endif
uint32_t
gf_w16_split_8_8_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
@@ -1216,21 +1212,11 @@ int gf_w16_split_init(gf_t *gf)
{
gf_internal_t *h;
struct gf_w16_split_8_8_data *d8;
- int i, j, exp, issse3;
- int isneon = 0;
+ int i, j, exp;
uint32_t p, basep, tmp;
h = (gf_internal_t *) gf->scratch;
-#ifdef INTEL_SSSE3
- issse3 = 1;
-#else
- issse3 = 0;
-#endif
-#ifdef ARM_NEON
- isneon = 1;
-#endif
-
if (h->arg1 == 8 && h->arg2 == 8) {
d8 = (struct gf_w16_split_8_8_data *) h->private;
basep = 1;
@@ -1273,36 +1259,45 @@ int gf_w16_split_init(gf_t *gf)
/* Defaults */
- if (issse3) {
+#ifdef INTEL_SSSE3
+ if (gf_cpu_supports_intel_ssse3) {
SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_sse_multiply_region)
- } else if (isneon) {
-#ifdef ARM_NEON
+ } else {
+#elif ARM_NEON
+ if (gf_cpu_supports_arm_neon) {
gf_w16_neon_split_init(gf);
-#endif
} else {
+#endif
SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_8_16_lazy_multiply_region)
+#if defined(INTEL_SSSE3) || defined(ARM_NEON)
}
-
+#endif
if ((h->arg1 == 8 && h->arg2 == 16) || (h->arg2 == 8 && h->arg1 == 16)) {
SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_8_16_lazy_multiply_region)
} else if ((h->arg1 == 4 && h->arg2 == 16) || (h->arg2 == 4 && h->arg1 == 16)) {
- if (issse3 || isneon) {
+#if defined(INTEL_SSSE3) || defined(ARM_NEON)
+ if (gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon) {
if(h->region_type & GF_REGION_ALTMAP && h->region_type & GF_REGION_NOSIMD)
SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_nosse_altmap_multiply_region)
else if(h->region_type & GF_REGION_NOSIMD)
SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_multiply_region)
- else if(h->region_type & GF_REGION_ALTMAP && issse3)
+#if defined(INTEL_SSSE3)
+ else if(h->region_type & GF_REGION_ALTMAP && gf_cpu_supports_intel_ssse3)
SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_sse_altmap_multiply_region)
+#endif
} else {
+#endif
if(h->region_type & GF_REGION_SIMD)
return 0;
else if(h->region_type & GF_REGION_ALTMAP)
SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_nosse_altmap_multiply_region)
else
SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_multiply_region)
+#if defined(INTEL_SSSE3) || defined(ARM_NEON)
}
+#endif
}
return 1;
@@ -1846,26 +1841,28 @@ int gf_w16_bytwo_init(gf_t *gf)
if (h->mult_type == GF_MULT_BYTWO_p) {
SET_FUNCTION(gf,multiply,w32,gf_w16_bytwo_p_multiply)
#ifdef INTEL_SSE2
- if (h->region_type & GF_REGION_NOSIMD)
- SET_FUNCTION(gf,multiply_region,w32,gf_w16_bytwo_p_nosse_multiply_region)
- else
- SET_FUNCTION(gf,multiply_region,w32,gf_w16_bytwo_p_sse_multiply_region)
- #else
+ if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
+ SET_FUNCTION(gf,multiply_region,w32,gf_w16_bytwo_p_sse_multiply_region)
+ } else {
+ #endif
SET_FUNCTION(gf,multiply_region,w32,gf_w16_bytwo_p_nosse_multiply_region)
if(h->region_type & GF_REGION_SIMD)
return 0;
+ #ifdef INTEL_SSE2
+ }
#endif
} else {
SET_FUNCTION(gf,multiply,w32,gf_w16_bytwo_b_multiply)
#ifdef INTEL_SSE2
- if (h->region_type & GF_REGION_NOSIMD)
- SET_FUNCTION(gf,multiply_region,w32,gf_w16_bytwo_b_nosse_multiply_region)
- else
+ if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
SET_FUNCTION(gf,multiply_region,w32,gf_w16_bytwo_b_sse_multiply_region)
- #else
+ } else {
+ #endif
SET_FUNCTION(gf,multiply_region,w32,gf_w16_bytwo_b_nosse_multiply_region)
if(h->region_type & GF_REGION_SIMD)
return 0;
+ #ifdef INTEL_SSE2
+ }
#endif
}
diff --git a/src/gf_w32.c b/src/gf_w32.c
index d496c3a..bb22894 100644
--- a/src/gf_w32.c
+++ b/src/gf_w32.c
@@ -13,6 +13,7 @@
#include <stdio.h>
#include <stdlib.h>
#include "gf_w32.h"
+#include "gf_cpu.h"
#define MM_PRINT32(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 4) printf(" %02x%02x%02x%02x", blah[15-ii], blah[14-ii], blah[13-ii], blah[12-ii]); printf("\n"); }
@@ -347,6 +348,8 @@ uint32_t gf_w32_matrix (gf_t *gf, uint32_t b)
extra memory.
*/
+#if defined(INTEL_SSE4_PCLMUL)
+
static
inline
gf_val_32_t
@@ -354,8 +357,6 @@ gf_w32_cfmgk_multiply (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
{
gf_val_32_t rv = 0;
-#if defined(INTEL_SSE4_PCLMUL)
-
__m128i a, b;
__m128i result;
__m128i w;
@@ -378,9 +379,9 @@ gf_w32_cfmgk_multiply (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
/* Extracts 32 bit value from result. */
rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
-#endif
return rv;
}
+#endif
#if defined(INTEL_SSE4_PCLMUL)
@@ -435,6 +436,8 @@ gf_w32_cfmgk_multiply_region_from_single(gf_t *gf, void *src, void *dest, uint32
#endif
+#if defined(INTEL_SSE4_PCLMUL)
+
static
inline
gf_val_32_t
@@ -442,8 +445,6 @@ gf_w32_clm_multiply_2 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
{
gf_val_32_t rv = 0;
-#if defined(INTEL_SSE4_PCLMUL)
-
__m128i a, b;
__m128i result;
__m128i prim_poly;
@@ -476,9 +477,11 @@ gf_w32_clm_multiply_2 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
/* Extracts 32 bit value from result. */
rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
-#endif
return rv;
}
+#endif
+
+#if defined(INTEL_SSE4_PCLMUL)
static
inline
@@ -487,8 +490,6 @@ gf_w32_clm_multiply_3 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
{
gf_val_32_t rv = 0;
-#if defined(INTEL_SSE4_PCLMUL)
-
__m128i a, b;
__m128i result;
__m128i prim_poly;
@@ -515,9 +516,11 @@ gf_w32_clm_multiply_3 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
/* Extracts 32 bit value from result. */
rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
-#endif
return rv;
}
+#endif
+
+#if defined(INTEL_SSE4_PCLMUL)
static
inline
@@ -526,8 +529,6 @@ gf_w32_clm_multiply_4 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
{
gf_val_32_t rv = 0;
-#if defined(INTEL_SSE4_PCLMUL)
-
__m128i a, b;
__m128i result;
__m128i prim_poly;
@@ -556,9 +557,9 @@ gf_w32_clm_multiply_4 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
/* Extracts 32 bit value from result. */
rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
-#endif
return rv;
}
+#endif
static
@@ -593,29 +594,31 @@ int gf_w32_cfmgk_init(gf_t *gf)
SET_FUNCTION(gf,multiply_region,w32,gf_w32_multiply_region_from_single)
#if defined(INTEL_SSE4_PCLMUL)
- gf_internal_t *h;
+ if (gf_cpu_supports_intel_pclmul) {
+ gf_internal_t *h;
- h = (gf_internal_t *) gf->scratch;
- SET_FUNCTION(gf,multiply,w32,gf_w32_cfmgk_multiply)
- SET_FUNCTION(gf,multiply_region,w32,gf_w32_cfmgk_multiply_region_from_single)
+ h = (gf_internal_t *) gf->scratch;
+ SET_FUNCTION(gf,multiply,w32,gf_w32_cfmgk_multiply)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w32_cfmgk_multiply_region_from_single)
- uint64_t *q_plus = (uint64_t *) h->private;
- uint64_t *g_star = (uint64_t *) h->private + 1;
+ uint64_t *q_plus = (uint64_t *) h->private;
+ uint64_t *g_star = (uint64_t *) h->private + 1;
- uint64_t tmp = h->prim_poly << 32;
- *q_plus = 1ULL << 32;
+ uint64_t tmp = h->prim_poly << 32;
+ *q_plus = 1ULL << 32;
- int i;
- for(i = 63; i >= 32; i--)
- if((1ULL << i) & tmp)
- {
- *q_plus |= 1ULL << (i-32);
- tmp ^= h->prim_poly << (i-32);
- }
+ int i;
+ for(i = 63; i >= 32; i--)
+ if((1ULL << i) & tmp)
+ {
+ *q_plus |= 1ULL << (i-32);
+ tmp ^= h->prim_poly << (i-32);
+ }
- *g_star = h->prim_poly & ((1ULL << 32) - 1);
+ *g_star = h->prim_poly & ((1ULL << 32) - 1);
- return 1;
+ return 1;
+ }
#endif
return 0;
@@ -631,23 +634,25 @@ int gf_w32_cfm_init(gf_t *gf)
/*Ben: Check to see how many reduction steps it will take*/
#if defined(INTEL_SSE4_PCLMUL)
- gf_internal_t *h;
+ if (gf_cpu_supports_intel_pclmul) {
+ gf_internal_t *h;
- h = (gf_internal_t *) gf->scratch;
+ h = (gf_internal_t *) gf->scratch;
- if ((0xfffe0000 & h->prim_poly) == 0){
- SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_2)
- SET_FUNCTION(gf,multiply_region,w32,gf_w32_clm_multiply_region_from_single_2)
- }else if ((0xffc00000 & h->prim_poly) == 0){
- SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_3)
- SET_FUNCTION(gf,multiply_region,w32,gf_w32_clm_multiply_region_from_single_3)
- }else if ((0xfe000000 & h->prim_poly) == 0){
- SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_4)
- SET_FUNCTION(gf,multiply_region,w32,gf_w32_clm_multiply_region_from_single_4)
- } else {
- return 0;
+ if ((0xfffe0000 & h->prim_poly) == 0){
+ SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_2)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w32_clm_multiply_region_from_single_2)
+ }else if ((0xffc00000 & h->prim_poly) == 0){
+ SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_3)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w32_clm_multiply_region_from_single_3)
+ }else if ((0xfe000000 & h->prim_poly) == 0){
+ SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_4)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w32_clm_multiply_region_from_single_4)
+ } else {
+ return 0;
+ }
+ return 1;
}
- return 1;
#endif
return 0;
@@ -1382,26 +1387,28 @@ int gf_w32_bytwo_init(gf_t *gf)
if (h->mult_type == GF_MULT_BYTWO_p) {
SET_FUNCTION(gf,multiply,w32,gf_w32_bytwo_p_multiply)
#ifdef INTEL_SSE2
- if (h->region_type & GF_REGION_NOSIMD)
- SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_p_nosse_multiply_region)
- else
+ if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_p_sse_multiply_region)
- #else
- SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_p_nosse_multiply_region)
- if(h->region_type & GF_REGION_SIMD)
- return 0;
+ } else {
+ #endif
+ SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_p_nosse_multiply_region)
+ if(h->region_type & GF_REGION_SIMD)
+ return 0;
+ #ifdef INTEL_SSE2
+ }
#endif
} else {
SET_FUNCTION(gf,multiply,w32,gf_w32_bytwo_b_multiply)
#ifdef INTEL_SSE2
- if (h->region_type & GF_REGION_NOSIMD)
- SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_b_nosse_multiply_region)
- else
+ if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_b_sse_multiply_region)
- #else
+ } else {
+ #endif
SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_b_nosse_multiply_region)
if(h->region_type & GF_REGION_SIMD)
return 0;
+ #ifdef INTEL_SSE2
+ }
#endif
}
@@ -1755,11 +1762,11 @@ gf_w32_split_4_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t
gf_do_final_region_alignment(&rd);
}
+#ifdef INTEL_SSSE3
static
void
gf_w32_split_4_32_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
{
-#ifdef INTEL_SSSE3
gf_internal_t *h;
int i, j, k;
uint32_t pp, v, *s32, *d32, *top;
@@ -1942,16 +1949,15 @@ gf_w32_split_4_32_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *des
}
gf_do_final_region_alignment(&rd);
-
-#endif
}
+#endif
+#ifdef INTEL_SSSE3
static
void
gf_w32_split_4_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
{
-#ifdef INTEL_SSSE3
gf_internal_t *h;
int i, j, k;
uint32_t pp, v, *s32, *d32, *top, tmp_table[16];
@@ -2216,9 +2222,8 @@ gf_w32_split_4_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint
}
}
gf_do_final_region_alignment(&rd);
-
-#endif
}
+#endif
static
int gf_w32_split_init(gf_t *gf)
@@ -2230,23 +2235,7 @@ int gf_w32_split_init(gf_t *gf)
struct gf_split_8_32_lazy_data *d32;
struct gf_split_16_32_lazy_data *d16;
uint32_t p, basep;
- int i, j, exp, ispclmul, issse3;
- int isneon = 0;
-
-#if defined(INTEL_SSE4_PCLMUL)
- ispclmul = 1;
-#else
- ispclmul = 0;
-#endif
-
-#ifdef INTEL_SSSE3
- issse3 = 1;
-#else
- issse3 = 0;
-#endif
-#ifdef ARM_NEON
- isneon = 1;
-#endif
+ int i, j, exp;
h = (gf_internal_t *) gf->scratch;
@@ -2262,7 +2251,8 @@ int gf_w32_split_init(gf_t *gf)
if (h->arg1 == 8 && h->arg2 == 8) {
SET_FUNCTION(gf,multiply,w32,gf_w32_split_8_8_multiply)
- } else if (ispclmul) {
+#if defined(INTEL_SSE4_PCLMUL)
+ } else if (gf_cpu_supports_intel_pclmul) {
if ((0xfffe0000 & h->prim_poly) == 0){
SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_2)
} else if ((0xffc00000 & h->prim_poly) == 0){
@@ -2270,6 +2260,7 @@ int gf_w32_split_init(gf_t *gf)
} else if ((0xfe000000 & h->prim_poly) == 0){
SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_4)
}
+#endif
} else {
SET_FUNCTION(gf,multiply,w32,gf_w32_bytwo_p_multiply)
}
@@ -2287,33 +2278,39 @@ int gf_w32_split_init(gf_t *gf)
ld2 = (struct gf_split_2_32_lazy_data *) h->private;
ld2->last_value = 0;
#ifdef INTEL_SSSE3
- if (!(h->region_type & GF_REGION_NOSIMD))
+ if (gf_cpu_supports_intel_ssse3 && !(h->region_type & GF_REGION_NOSIMD)) {
SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_2_32_lazy_sse_multiply_region)
- else
+ } else {
+ #endif
SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_2_32_lazy_multiply_region)
- #else
- SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_2_32_lazy_multiply_region)
- if(h->region_type & GF_REGION_SIMD) return 0;
+ if(h->region_type & GF_REGION_SIMD) return 0;
+ #ifdef INTEL_SSSE3
+ }
#endif
return 1;
}
/* 4/32 or Default + SSE - There is no ALTMAP/NOSSE. */
+
if ((h->arg1 == 4 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 4) ||
- ((issse3 || isneon) && h->mult_type == GF_REGION_DEFAULT)) {
+ ((gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon) && h->mult_type == GF_REGION_DEFAULT)) {
ld4 = (struct gf_split_4_32_lazy_data *) h->private;
ld4->last_value = 0;
- if ((h->region_type & GF_REGION_NOSIMD) || !(issse3 || isneon)) {
+ if ((h->region_type & GF_REGION_NOSIMD) || !(gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon)) {
SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_4_32_lazy_multiply_region)
- } else if (isneon) {
+ } else if (gf_cpu_supports_arm_neon) {
#ifdef ARM_NEON
gf_w32_neon_split_init(gf);
#endif
} else if (h->region_type & GF_REGION_ALTMAP) {
+#ifdef INTEL_SSSE3
SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_4_32_lazy_sse_altmap_multiply_region)
+#endif
} else {
+#ifdef INTEL_SSSE3
SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_4_32_lazy_sse_multiply_region)
+#endif
}
return 1;
}
@@ -2686,16 +2683,6 @@ int gf_w32_composite_init(gf_t *gf)
int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
{
- int issse3 = 0;
- int isneon = 0;
-
-#ifdef INTEL_SSSE3
- issse3 = 1;
-#endif
-#ifdef ARM_NEON
- isneon = 1;
-#endif
-
switch(mult_type)
{
case GF_MULT_BYTWO_p:
@@ -2720,7 +2707,7 @@ int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg
return sizeof(gf_internal_t) + sizeof(struct gf_split_2_32_lazy_data) + 64;
}
if ((arg1 == 8 && arg2 == 32) || (arg2 == 8 && arg1 == 32) ||
- (mult_type == GF_MULT_DEFAULT && !(issse3 || isneon))) {
+ (mult_type == GF_MULT_DEFAULT && !(gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon))) {
return sizeof(gf_internal_t) + sizeof(struct gf_split_8_32_lazy_data) + 64;
}
if ((arg1 == 4 && arg2 == 32) ||
diff --git a/src/gf_w4.c b/src/gf_w4.c
index 814b0f5..3a7b953 100644
--- a/src/gf_w4.c
+++ b/src/gf_w4.c
@@ -12,6 +12,7 @@
#include <stdio.h>
#include <stdlib.h>
#include "gf_w4.h"
+#include "gf_cpu.h"
#define AB2(ip, am1 ,am2, b, t1, t2) {\
t1 = (b << 1) & am1;\
@@ -134,6 +135,7 @@ gf_w4_shift_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
/* Ben: This function works, but it is 33% slower than the normal shift mult */
+#if defined(INTEL_SSE4_PCLMUL)
static
inline
gf_val_32_t
@@ -141,8 +143,6 @@ gf_w4_clm_multiply (gf_t *gf, gf_val_32_t a4, gf_val_32_t b4)
{
gf_val_32_t rv = 0;
-#if defined(INTEL_SSE4_PCLMUL)
-
__m128i a, b;
__m128i result;
__m128i prim_poly;
@@ -173,9 +173,9 @@ gf_w4_clm_multiply (gf_t *gf, gf_val_32_t a4, gf_val_32_t b4)
/* Extracts 32 bit value from result. */
rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
-#endif
return rv;
}
+#endif
static
void
@@ -447,18 +447,19 @@ int gf_w4_single_table_init(gf_t *gf)
SET_FUNCTION(gf,inverse,w32,NULL)
SET_FUNCTION(gf,divide,w32,gf_w4_single_table_divide)
SET_FUNCTION(gf,multiply,w32,gf_w4_single_table_multiply)
- #if defined(INTEL_SSSE3) || defined(ARM_NEON)
- if(h->region_type & (GF_REGION_NOSIMD | GF_REGION_CAUCHY))
- SET_FUNCTION(gf,multiply_region,w32,gf_w4_single_table_multiply_region)
- else
- #if defined(INTEL_SSSE3)
+ #if defined(INTEL_SSSE3)
+ if (gf_cpu_supports_intel_ssse3 && !(h->region_type & (GF_REGION_NOSIMD | GF_REGION_CAUCHY))) {
SET_FUNCTION(gf,multiply_region,w32,gf_w4_single_table_sse_multiply_region)
- #elif defined(ARM_NEON)
+ } else {
+ #elif defined(ARM_NEON)
+ if (gf_cpu_supports_arm_neon && !(h->region_type & (GF_REGION_NOSIMD | GF_REGION_CAUCHY))) {
gf_w4_neon_single_table_init(gf);
- #endif
- #else
- SET_FUNCTION(gf,multiply_region,w32,gf_w4_single_table_multiply_region)
- if (h->region_type & GF_REGION_SIMD) return 0;
+ } else {
+ #endif
+ SET_FUNCTION(gf,multiply_region,w32,gf_w4_single_table_multiply_region)
+ if (h->region_type & GF_REGION_SIMD) return 0;
+ #if defined(INTEL_SSSE3) || defined(ARM_NEON)
+ }
#endif
return 1;
@@ -736,16 +737,13 @@ int gf_w4_table_init(gf_t *gf)
{
int rt;
gf_internal_t *h;
- int simd = 0;
-
-#if defined(INTEL_SSSE3) || defined(ARM_NEON)
- simd = 1;
-#endif
h = (gf_internal_t *) gf->scratch;
rt = (h->region_type);
- if (h->mult_type == GF_MULT_DEFAULT && !simd) rt |= GF_REGION_DOUBLE_TABLE;
+ if (h->mult_type == GF_MULT_DEFAULT &&
+ !(gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon))
+ rt |= GF_REGION_DOUBLE_TABLE;
if (rt & GF_REGION_DOUBLE_TABLE) {
return gf_w4_double_table_init(gf);
@@ -929,11 +927,11 @@ gf_w4_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v
#endif
/*
+#ifdef INTEL_SSE2
static
void
gf_w4_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
-#ifdef INTEL_SSE2
uint8_t *d8, *s8, tb;
__m128i pp, m1, m2, t1, t2, va, vb;
struct gf_bytwo_data *btd;
@@ -990,8 +988,8 @@ gf_w4_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v
}
}
gf_do_final_region_alignment(&rd);
-#endif
}
+#endif
*/
#ifdef INTEL_SSE2
@@ -1867,26 +1865,28 @@ int gf_w4_bytwo_init(gf_t *gf)
if (h->mult_type == GF_MULT_BYTWO_p) {
SET_FUNCTION(gf,multiply,w32,gf_w4_bytwo_p_multiply)
#ifdef INTEL_SSE2
- if (h->region_type & GF_REGION_NOSIMD)
- SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_p_nosse_multiply_region)
- else
+ if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_p_sse_multiply_region)
- #else
- SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_p_nosse_multiply_region)
- if (h->region_type & GF_REGION_SIMD)
- return 0;
+ } else {
+ #endif
+ SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_p_nosse_multiply_region)
+ if (h->region_type & GF_REGION_SIMD)
+ return 0;
+ #ifdef INTEL_SSE2
+ }
#endif
} else {
SET_FUNCTION(gf,multiply,w32,gf_w4_bytwo_b_multiply)
#ifdef INTEL_SSE2
- if (h->region_type & GF_REGION_NOSIMD)
- SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_b_nosse_multiply_region)
- else
+ if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_b_sse_multiply_region)
- #else
- SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_b_nosse_multiply_region)
- if (h->region_type & GF_REGION_SIMD)
- return 0;
+ } else {
+ #endif
+ SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_b_nosse_multiply_region)
+ if (h->region_type & GF_REGION_SIMD)
+ return 0;
+ #ifdef INTEL_SSE2
+ }
#endif
}
return 1;
@@ -1897,10 +1897,14 @@ static
int gf_w4_cfm_init(gf_t *gf)
{
#if defined(INTEL_SSE4_PCLMUL)
- SET_FUNCTION(gf,multiply,w32,gf_w4_clm_multiply)
- return 1;
+ if (gf_cpu_supports_intel_pclmul) {
+ SET_FUNCTION(gf,multiply,w32,gf_w4_clm_multiply)
+ return 1;
+ }
#elif defined(ARM_NEON)
- return gf_w4_neon_cfm_init(gf);
+ if (gf_cpu_supports_arm_neon) {
+ return gf_w4_neon_cfm_init(gf);
+ }
#endif
return 0;
}
@@ -1917,15 +1921,6 @@ int gf_w4_shift_init(gf_t *gf)
int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
{
- int issse3 = 0, isneon = 0;
-
-#ifdef INTEL_SSSE3
- issse3 = 1;
-#endif
-#ifdef ARM_NEON
- isneon = 1;
-#endif
-
switch(mult_type)
{
case GF_MULT_BYTWO_p:
@@ -1938,7 +1933,8 @@ int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1
return sizeof(gf_internal_t) + sizeof(struct gf_single_table_data) + 64;
}
- if (mult_type == GF_MULT_DEFAULT && !(issse3 || isneon))
+ if (mult_type == GF_MULT_DEFAULT &&
+ !(gf_cpu_supports_arm_neon || gf_cpu_supports_intel_ssse3))
region_type = GF_REGION_DOUBLE_TABLE;
if (region_type & GF_REGION_DOUBLE_TABLE) {
diff --git a/src/gf_w64.c b/src/gf_w64.c
index a096161..69e55db 100644
--- a/src/gf_w64.c
+++ b/src/gf_w64.c
@@ -12,6 +12,7 @@
#include <stdio.h>
#include <stdlib.h>
#include "gf_w64.h"
+#include "gf_cpu.h"
static
inline
@@ -338,6 +339,8 @@ gf_w64_shift_multiply (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
* ELM: Use the Intel carryless multiply instruction to do very fast 64x64 multiply.
*/
+#if defined(INTEL_SSE4_PCLMUL)
+
static
inline
gf_val_64_t
@@ -345,8 +348,6 @@ gf_w64_clm_multiply_2 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
{
gf_val_64_t rv = 0;
-#if defined(INTEL_SSE4_PCLMUL)
-
__m128i a, b;
__m128i result;
__m128i prim_poly;
@@ -376,10 +377,12 @@ gf_w64_clm_multiply_2 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
result = _mm_xor_si128 (result, w);
rv = ((gf_val_64_t)_mm_extract_epi64(result, 0));
-#endif
return rv;
}
+#endif
+#if defined(INTEL_SSE4_PCLMUL)
+
static
inline
gf_val_64_t
@@ -387,8 +390,6 @@ gf_w64_clm_multiply_4 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
{
gf_val_64_t rv = 0;
-#if defined(INTEL_SSE4_PCLMUL)
-
__m128i a, b;
__m128i result;
__m128i prim_poly;
@@ -418,15 +419,15 @@ gf_w64_clm_multiply_4 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
result = _mm_xor_si128 (result, w);
rv = ((gf_val_64_t)_mm_extract_epi64(result, 0));
-#endif
return rv;
}
+#endif
+#if defined(INTEL_SSE4_PCLMUL)
void
gf_w64_clm_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor)
{
-#if defined(INTEL_SSE4_PCLMUL)
gf_internal_t *h;
uint8_t *s8, *d8, *dtop;
gf_region_data rd;
@@ -504,8 +505,8 @@ gf_w64_clm_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int by
}
}
gf_do_final_region_alignment(&rd);
-#endif
}
+#endif
void
gf_w64_split_4_64_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor)
@@ -709,21 +710,23 @@ int gf_w64_cfm_init(gf_t *gf)
SET_FUNCTION(gf,inverse,w64,gf_w64_euclid)
SET_FUNCTION(gf,multiply_region,w64,gf_w64_multiply_region_from_single)
-#if defined(INTEL_SSE4_PCLMUL)
- gf_internal_t *h;
+#if defined(INTEL_SSE4_PCLMUL)
+ if (gf_cpu_supports_intel_pclmul) {
+ gf_internal_t *h;
- h = (gf_internal_t *) gf->scratch;
+ h = (gf_internal_t *) gf->scratch;
- if ((0xfffffffe00000000ULL & h->prim_poly) == 0){
- SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_2)
- SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_2)
- }else if((0xfffe000000000000ULL & h->prim_poly) == 0){
- SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_4)
- SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_4)
- } else {
- return 0;
+ if ((0xfffffffe00000000ULL & h->prim_poly) == 0){
+ SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_2)
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_2)
+ }else if((0xfffe000000000000ULL & h->prim_poly) == 0){
+ SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_4)
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_4)
+ } else {
+ return 0;
+ }
+ return 1;
}
- return 1;
#endif
return 0;
@@ -1261,9 +1264,9 @@ gf_w64_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_
v = _mm_srli_epi64(v, 1); }
+#ifdef INTEL_SSE2
void gf_w64_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
{
-#ifdef INTEL_SSE2
int i;
uint8_t *s8, *d8;
uint64_t vrev, one64;
@@ -1322,8 +1325,8 @@ void gf_w64_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_
s8 += 16;
}
gf_do_final_region_alignment(&rd);
-#endif
}
+#endif
#ifdef INTEL_SSE2
static
@@ -1457,26 +1460,28 @@ int gf_w64_bytwo_init(gf_t *gf)
if (h->mult_type == GF_MULT_BYTWO_p) {
SET_FUNCTION(gf,multiply,w64,gf_w64_bytwo_p_multiply)
#ifdef INTEL_SSE2
- if (h->region_type & GF_REGION_NOSIMD)
- SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_p_nosse_multiply_region)
- else
+ if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_p_sse_multiply_region)
- #else
- SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_p_nosse_multiply_region)
- if(h->region_type & GF_REGION_SIMD)
- return 0;
+ } else {
+ #endif
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_p_nosse_multiply_region)
+ if(h->region_type & GF_REGION_SIMD)
+ return 0;
+ #ifdef INTEL_SSE2
+ }
#endif
} else {
SET_FUNCTION(gf,multiply,w64,gf_w64_bytwo_b_multiply)
#ifdef INTEL_SSE2
- if (h->region_type & GF_REGION_NOSIMD)
- SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_b_nosse_multiply_region)
- else
+ if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_b_sse_multiply_region)
- #else
+ } else {
+ #endif
SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_b_nosse_multiply_region)
if(h->region_type & GF_REGION_SIMD)
return 0;
+ #ifdef INTEL_SSE2
+ }
#endif
}
SET_FUNCTION(gf,inverse,w64,gf_w64_euclid)
@@ -1975,18 +1980,20 @@ int gf_w64_split_init(gf_t *gf)
SET_FUNCTION(gf,multiply,w64,gf_w64_bytwo_p_multiply)
#if defined(INTEL_SSE4_PCLMUL)
- if ((!(h->region_type & GF_REGION_NOSIMD) &&
- (h->arg1 == 64 || h->arg2 == 64)) ||
- h->mult_type == GF_MULT_DEFAULT){
-
- if ((0xfffffffe00000000ULL & h->prim_poly) == 0){
- SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_2)
- SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_2)
- }else if((0xfffe000000000000ULL & h->prim_poly) == 0){
- SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_4)
- SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_4)
- }else{
- return 0;
+ if (gf_cpu_supports_intel_pclmul) {
+ if ((!(h->region_type & GF_REGION_NOSIMD) &&
+ (h->arg1 == 64 || h->arg2 == 64)) ||
+ h->mult_type == GF_MULT_DEFAULT){
+
+ if ((0xfffffffe00000000ULL & h->prim_poly) == 0){
+ SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_2)
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_2)
+ }else if((0xfffe000000000000ULL & h->prim_poly) == 0){
+ SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_4)
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_4)
+ }else{
+ return 0;
+ }
}
}
#endif
@@ -1996,23 +2003,27 @@ int gf_w64_split_init(gf_t *gf)
/* Allen: set region pointers for default mult type. Single pointers are
* taken care of above (explicitly for sse, implicitly for no sse). */
-#if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
if (h->mult_type == GF_MULT_DEFAULT) {
- d4 = (struct gf_split_4_64_lazy_data *) h->private;
- d4->last_value = 0;
+#if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
+ if (gf_cpu_supports_intel_sse4 || gf_cpu_supports_arm_neon) {
+ d4 = (struct gf_split_4_64_lazy_data *) h->private;
+ d4->last_value = 0;
#if defined(INTEL_SSE4)
- SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_sse_multiply_region)
+ if (gf_cpu_supports_intel_sse4)
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_sse_multiply_region)
#elif defined(ARCH_AARCH64)
- gf_w64_neon_split_init(gf);
+ if (gf_cpu_supports_arm_neon)
+ gf_w64_neon_split_init(gf);
#endif
- }
-#else
- if (h->mult_type == GF_MULT_DEFAULT) {
- d8 = (struct gf_split_8_64_lazy_data *) h->private;
- d8->last_value = 0;
- SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_8_64_lazy_multiply_region)
- }
+ } else {
+#endif
+ d8 = (struct gf_split_8_64_lazy_data *) h->private;
+ d8->last_value = 0;
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_8_64_lazy_multiply_region)
+#if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
+ }
#endif
+ }
if ((h->arg1 == 4 && h->arg2 == 64) || (h->arg1 == 64 && h->arg2 == 4)) {
d4 = (struct gf_split_4_64_lazy_data *) h->private;
@@ -2022,28 +2033,35 @@ int gf_w64_split_init(gf_t *gf)
if(h->region_type & GF_REGION_ALTMAP)
{
#ifdef INTEL_SSSE3
- SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_sse_altmap_multiply_region)
+ if (gf_cpu_supports_intel_ssse3) {
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_sse_altmap_multiply_region)
+ } else
#elif defined(ARCH_AARCH64)
- gf_w64_neon_split_init(gf);
- #else
- return 0;
+ if (gf_cpu_supports_arm_neon) {
+ gf_w64_neon_split_init(gf);
+ } else
#endif
+ return 0;
}
else //no altmap
{
#if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
- if(h->region_type & GF_REGION_NOSIMD)
- SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_multiply_region)
- else
- #if defined(INTEL_SSE4)
- SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_sse_multiply_region)
- #elif defined(ARCH_AARCH64)
- gf_w64_neon_split_init(gf);
- #endif
- #else
+ if(gf_cpu_supports_intel_sse4 || gf_cpu_supports_arm_neon) {
+ if (h->region_type & GF_REGION_NOSIMD) {
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_multiply_region)
+ } else
+ #if defined(INTEL_SSE4)
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_sse_multiply_region)
+ #elif defined(ARCH_AARCH64)
+ gf_w64_neon_split_init(gf);
+ #endif
+ } else {
+ #endif
SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_multiply_region)
if(h->region_type & GF_REGION_SIMD)
return 0;
+ #if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
+ }
#endif
}
}
@@ -2114,11 +2132,15 @@ int gf_w64_scratch_size(int mult_type, int region_type, int divide_type, int arg
* then fall through to split table scratch size code. */
#if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
+ if (gf_cpu_supports_intel_sse4 || gf_cpu_supports_arm_neon) {
arg1 = 64;
arg2 = 4;
-#else
+ } else {
+#endif
arg1 = 64;
arg2 = 8;
+#if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
+ }
#endif
case GF_MULT_SPLIT_TABLE:
diff --git a/src/gf_w8.c b/src/gf_w8.c
index 81a0eba..f647a31 100644
--- a/src/gf_w8.c
+++ b/src/gf_w8.c
@@ -13,6 +13,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
+#include "gf_cpu.h"
#define AB2(ip, am1 ,am2, b, t1, t2) {\
t1 = (b << 1) & am1;\
@@ -127,6 +128,7 @@ uint32_t gf_w8_matrix (gf_t *gf, uint32_t b)
}
+#if defined(INTEL_SSE4_PCLMUL)
static
inline
gf_val_32_t
@@ -134,8 +136,6 @@ gf_w8_clm_multiply_2 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8)
{
gf_val_32_t rv = 0;
-#if defined(INTEL_SSE4_PCLMUL)
-
__m128i a, b;
__m128i result;
__m128i prim_poly;
@@ -169,10 +169,11 @@ gf_w8_clm_multiply_2 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8)
rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
-#endif
return rv;
}
+#endif
+#if defined(INTEL_SSE4_PCLMUL)
static
inline
gf_val_32_t
@@ -180,8 +181,6 @@ gf_w8_clm_multiply_3 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8)
{
gf_val_32_t rv = 0;
-#if defined(INTEL_SSE4_PCLMUL)
-
__m128i a, b;
__m128i result;
__m128i prim_poly;
@@ -208,10 +207,11 @@ gf_w8_clm_multiply_3 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8)
rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
-#endif
return rv;
}
+#endif
+#if defined(INTEL_SSE4_PCLMUL)
static
inline
gf_val_32_t
@@ -219,8 +219,6 @@ gf_w8_clm_multiply_4 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8)
{
gf_val_32_t rv = 0;
-#if defined(INTEL_SSE4_PCLMUL)
-
__m128i a, b;
__m128i result;
__m128i prim_poly;
@@ -248,9 +246,9 @@ gf_w8_clm_multiply_4 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8)
/* Extracts 32 bit value from result. */
rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
-#endif
return rv;
}
+#endif
static
@@ -509,25 +507,29 @@ static
int gf_w8_cfm_init(gf_t *gf)
{
#if defined(INTEL_SSE4_PCLMUL)
- gf_internal_t *h;
-
- h = (gf_internal_t *) gf->scratch;
-
- if ((0xe0 & h->prim_poly) == 0){
- SET_FUNCTION(gf,multiply,w32,gf_w8_clm_multiply_2)
- SET_FUNCTION(gf,multiply_region,w32,gf_w8_clm_multiply_region_from_single_2)
- }else if ((0xc0 & h->prim_poly) == 0){
- SET_FUNCTION(gf,multiply,w32,gf_w8_clm_multiply_3)
- SET_FUNCTION(gf,multiply_region,w32,gf_w8_clm_multiply_region_from_single_3)
- }else if ((0x80 & h->prim_poly) == 0){
- SET_FUNCTION(gf,multiply,w32,gf_w8_clm_multiply_4)
- SET_FUNCTION(gf,multiply_region,w32,gf_w8_clm_multiply_region_from_single_4)
- }else{
- return 0;
- }
- return 1;
+ if (gf_cpu_supports_intel_pclmul) {
+ gf_internal_t *h;
+
+ h = (gf_internal_t *) gf->scratch;
+
+ if ((0xe0 & h->prim_poly) == 0){
+ SET_FUNCTION(gf,multiply,w32,gf_w8_clm_multiply_2)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w8_clm_multiply_region_from_single_2)
+ }else if ((0xc0 & h->prim_poly) == 0){
+ SET_FUNCTION(gf,multiply,w32,gf_w8_clm_multiply_3)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w8_clm_multiply_region_from_single_3)
+ }else if ((0x80 & h->prim_poly) == 0){
+ SET_FUNCTION(gf,multiply,w32,gf_w8_clm_multiply_4)
+ SET_FUNCTION(gf,multiply_region,w32,gf_w8_clm_multiply_region_from_single_4)
+ }else{
+ return 0;
+ }
+ return 1;
+ }
#elif defined(ARM_NEON)
- return gf_w8_neon_cfm_init(gf);
+ if (gf_cpu_supports_arm_neon) {
+ return gf_w8_neon_cfm_init(gf);
+ }
#endif
return 0;
@@ -1103,20 +1105,21 @@ int gf_w8_split_init(gf_t *gf)
}
SET_FUNCTION(gf,multiply,w32,gf_w8_split_multiply)
-
- #if defined(INTEL_SSSE3) || defined(ARM_NEON)
- if (h->region_type & GF_REGION_NOSIMD)
- SET_FUNCTION(gf,multiply_region,w32,gf_w8_split_multiply_region)
- else
- #if defined(INTEL_SSSE3)
+
+ #if defined(INTEL_SSSE3)
+ if (gf_cpu_supports_intel_ssse3 && !(h->region_type & GF_REGION_NOSIMD)) {
SET_FUNCTION(gf,multiply_region,w32,gf_w8_split_multiply_region_sse)
- #elif defined(ARM_NEON)
+ } else {
+ #elif defined(ARM_NEON)
+ if (gf_cpu_supports_arm_neon && !(h->region_type & GF_REGION_NOSIMD)) {
gf_w8_neon_split_init(gf);
- #endif
- #else
+ } else {
+ #endif
SET_FUNCTION(gf,multiply_region,w32,gf_w8_split_multiply_region)
if(h->region_type & GF_REGION_SIMD)
return 0;
+ #if defined(INTEL_SSSE3) || defined(ARM_NEON)
+ }
#endif
return 1;
@@ -1134,17 +1137,12 @@ int gf_w8_table_init(gf_t *gf)
struct gf_w8_double_table_data *dtd = NULL;
struct gf_w8_double_table_lazy_data *ltd = NULL;
struct gf_w8_default_data *dd = NULL;
- int a, b, c, prod, scase, use_simd;
+ int a, b, c, prod, scase;
h = (gf_internal_t *) gf->scratch;
-#if defined(INTEL_SSSE3) || defined(ARM_NEON)
- use_simd = 1;
-#else
- use_simd = 0;
-#endif
-
- if (h->mult_type == GF_MULT_DEFAULT && use_simd) {
+ if (h->mult_type == GF_MULT_DEFAULT &&
+ (gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon)) {
dd = (struct gf_w8_default_data *)h->private;
scase = 3;
bzero(dd->high, sizeof(uint8_t) * GF_FIELD_SIZE * GF_HALF_SIZE);
@@ -1220,13 +1218,19 @@ int gf_w8_table_init(gf_t *gf)
break;
case 3:
#if defined(INTEL_SSSE3) || defined(ARM_NEON)
- SET_FUNCTION(gf,divide,w32,gf_w8_default_divide)
- SET_FUNCTION(gf,multiply,w32,gf_w8_default_multiply)
+ if (gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon) {
+ SET_FUNCTION(gf,divide,w32,gf_w8_default_divide)
+ SET_FUNCTION(gf,multiply,w32,gf_w8_default_multiply)
#if defined(INTEL_SSSE3)
- SET_FUNCTION(gf,multiply_region,w32,gf_w8_split_multiply_region_sse)
+ if (gf_cpu_supports_intel_ssse3) {
+ SET_FUNCTION(gf,multiply_region,w32,gf_w8_split_multiply_region_sse)
+ }
#elif defined(ARM_NEON)
- gf_w8_neon_split_init(gf);
+ if (gf_cpu_supports_arm_neon) {
+ gf_w8_neon_split_init(gf);
+ }
#endif
+ }
#endif
break;
}
@@ -2192,26 +2196,28 @@ int gf_w8_bytwo_init(gf_t *gf)
if (h->mult_type == GF_MULT_BYTWO_p) {
SET_FUNCTION(gf,multiply,w32,gf_w8_bytwo_p_multiply)
#ifdef INTEL_SSE2
- if (h->region_type & GF_REGION_NOSIMD)
- SET_FUNCTION(gf,multiply_region,w32,gf_w8_bytwo_p_nosse_multiply_region)
- else
+ if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
SET_FUNCTION(gf,multiply_region,w32,gf_w8_bytwo_p_sse_multiply_region)
-#else
- SET_FUNCTION(gf,multiply_region,w32,gf_w8_bytwo_p_nosse_multiply_region)
- if(h->region_type & GF_REGION_SIMD)
- return 0;
+ } else {
+#endif
+ SET_FUNCTION(gf,multiply_region,w32,gf_w8_bytwo_p_nosse_multiply_region)
+ if(h->region_type & GF_REGION_SIMD)
+ return 0;
+#ifdef INTEL_SSE2
+ }
#endif
} else {
SET_FUNCTION(gf,multiply,w32,gf_w8_bytwo_b_multiply)
#ifdef INTEL_SSE2
- if (h->region_type & GF_REGION_NOSIMD)
- SET_FUNCTION(gf,multiply_region,w32,gf_w8_bytwo_b_nosse_multiply_region)
- else
+ if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
SET_FUNCTION(gf,multiply_region,w32,gf_w8_bytwo_b_sse_multiply_region)
-#else
+ } else {
+#endif
SET_FUNCTION(gf,multiply_region,w32,gf_w8_bytwo_b_nosse_multiply_region)
if(h->region_type & GF_REGION_SIMD)
return 0;
+#ifdef INTEL_SSE2
+ }
#endif
}
return 1;
@@ -2229,9 +2235,9 @@ int gf_w8_scratch_size(int mult_type, int region_type, int divide_type, int arg1
switch(mult_type)
{
case GF_MULT_DEFAULT:
-#if defined(INTEL_SSSE3) || defined(ARM_NEON)
- return sizeof(gf_internal_t) + sizeof(struct gf_w8_default_data) + 64;
-#endif
+ if (gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon) {
+ return sizeof(gf_internal_t) + sizeof(struct gf_w8_default_data) + 64;
+ }
return sizeof(gf_internal_t) + sizeof(struct gf_w8_single_table_data) + 64;
case GF_MULT_TABLE:
if (region_type == GF_REGION_CAUCHY) {
diff --git a/tools/test_simd.sh b/tools/test_simd.sh
index 1268f87..1b0e319 100755
--- a/tools/test_simd.sh
+++ b/tools/test_simd.sh
@@ -118,6 +118,237 @@ test_compile() {
esac
}
+# disable through build flags
+runtime_arm_flags() {
+ failed=0
+
+ echo "====NO SIMD support..." >> ${1}
+ { ./configure --disable-neon && make clean && make CFLAGS="-DDEBUG_FUNCTIONS"; } || { echo "Compile FAILED" >> ${1}; return 1; }
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ echo "====FULL SIMD support..." >> ${1}
+ { ./configure && make clean && make CFLAGS="-DDEBUG_FUNCTIONS"; } || { echo "Compile FAILED" >> ${1}; return 1; }
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ return ${failed}
+}
+
+# build once with FULL SIMD and disable at runtime through environment
+runtime_arm_env() {
+ failed=0
+
+ { ./configure && make clean && make CFLAGS="-DDEBUG_FUNCTIONS"; } || { echo "Compile FAILED" >> ${1}; return 1; }
+
+ echo "====NO SIMD support..." >> ${1}
+ export GF_COMPLETE_DISABLE_NEON=1
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ echo "====FULL SIMD support..." >> ${1}
+ unset GF_COMPLETE_DISABLE_NEON
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ return ${failed}
+}
+
+runtime_intel_flags() {
+ failed=0
+
+ echo "====NO SIMD support..." >> ${1}
+ { ./configure --disable-sse && make clean && make CFLAGS="-DDEBUG_FUNCTIONS"; } || { echo "FAIL" >> ${1}; ((++failed)); }
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ echo "====SSE2 support..." >> ${1}
+ export ax_cv_have_sse_ext=no
+ export ax_cv_have_sse2_ext=yes
+ export ax_cv_have_sse3_ext=no
+ export ax_cv_have_ssse3_ext=no
+ export ax_cv_have_sse41_ext=no
+ export ax_cv_have_sse42_ext=no
+ export ax_cv_have_pclmuldq_ext=no
+ { ./configure && make clean && make CFLAGS="-DDEBUG_FUNCTIONS"; } || { echo "FAIL" >> ${1}; ((++failed)); }
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ echo "====SSE2,SSE3 support..." >> ${1}
+ export ax_cv_have_sse_ext=no
+ export ax_cv_have_sse2_ext=yes
+ export ax_cv_have_sse3_ext=yes
+ export ax_cv_have_ssse3_ext=no
+ export ax_cv_have_sse41_ext=no
+ export ax_cv_have_sse42_ext=no
+ export ax_cv_have_pclmuldq_ext=no
+ { ./configure && make clean && make CFLAGS="-DDEBUG_FUNCTIONS"; } || { echo "FAIL" >> ${1}; ((++failed)); }
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ echo "====SSE2,SSE3,SSSE3 support..." >> ${1}
+ export ax_cv_have_sse_ext=no
+ export ax_cv_have_sse2_ext=yes
+ export ax_cv_have_sse3_ext=yes
+ export ax_cv_have_ssse3_ext=yes
+ export ax_cv_have_sse41_ext=no
+ export ax_cv_have_sse42_ext=no
+ export ax_cv_have_pclmuldq_ext=no
+ { ./configure && make clean && make CFLAGS="-DDEBUG_FUNCTIONS"; } || { echo "FAIL" >> ${1}; ((++failed)); }
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ echo "====SSE2,SSE3,SSSE3,SSE4_1 support..." >> ${1}
+ export ax_cv_have_sse_ext=no
+ export ax_cv_have_sse2_ext=yes
+ export ax_cv_have_sse3_ext=yes
+ export ax_cv_have_ssse3_ext=yes
+ export ax_cv_have_sse41_ext=yes
+ export ax_cv_have_sse42_ext=no
+ export ax_cv_have_pclmuldq_ext=no
+ { ./configure && make clean && make CFLAGS="-DDEBUG_FUNCTIONS"; } || { echo "FAIL" >> ${1}; ((++failed)); }
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ echo "====SSE2,SSE3,SSSE3,SSE4_2 support..." >> ${1}
+ export ax_cv_have_sse_ext=no
+ export ax_cv_have_sse2_ext=yes
+ export ax_cv_have_sse3_ext=yes
+ export ax_cv_have_ssse3_ext=yes
+ export ax_cv_have_sse41_ext=no
+ export ax_cv_have_sse42_ext=yes
+ export ax_cv_have_pclmuldq_ext=no
+ { ./configure && make clean && make CFLAGS="-DDEBUG_FUNCTIONS"; } || { echo "FAIL" >> ${1}; ((++failed)); }
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ echo "====FULL SIMD support..." >> ${1}
+ { ./configure && make clean && make CFLAGS="-DDEBUG_FUNCTIONS"; } || { echo "FAIL" >> ${1}; ((++failed)); }
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ return ${failed}
+}
+
+runtime_intel_env() {
+ failed=0
+
+ # compile a build with full SIMD support
+ { ./configure && make clean && make CFLAGS="-DDEBUG_FUNCTIONS"; } || { echo "Compile FAILED" >> ${1}; return 1; }
+
+ echo "====NO SIMD support..." >> ${1}
+ export GF_COMPLETE_DISABLE_SSE2=1
+ export GF_COMPLETE_DISABLE_SSE3=1
+ export GF_COMPLETE_DISABLE_SSSE3=1
+ export GF_COMPLETE_DISABLE_SSE4=1
+ export GF_COMPLETE_DISABLE_SSE4_PCLMUL=1
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ echo "====SSE2 support..." >> ${1}
+ unset GF_COMPLETE_DISABLE_SSE2
+ export GF_COMPLETE_DISABLE_SSE3=1
+ export GF_COMPLETE_DISABLE_SSSE3=1
+ export GF_COMPLETE_DISABLE_SSE4=1
+ export GF_COMPLETE_DISABLE_SSE4_PCLMUL=1
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ echo "====SSE2,SSE3 support..." >> ${1}
+ unset GF_COMPLETE_DISABLE_SSE2
+ unset GF_COMPLETE_DISABLE_SSE3
+ export GF_COMPLETE_DISABLE_SSSE3=1
+ export GF_COMPLETE_DISABLE_SSE4=1
+ export GF_COMPLETE_DISABLE_SSE4_PCLMUL=1
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ echo "====SSE2,SSE3,SSSE3 support..." >> ${1}
+ unset GF_COMPLETE_DISABLE_SSE2
+ unset GF_COMPLETE_DISABLE_SSE3
+ unset GF_COMPLETE_DISABLE_SSSE3
+ export GF_COMPLETE_DISABLE_SSE4=1
+ export GF_COMPLETE_DISABLE_SSE4_PCLMUL=1
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ echo "====SSE2,SSE3,SSSE3,SSE4_1 support..." >> ${1}
+ unset GF_COMPLETE_DISABLE_SSE2
+ unset GF_COMPLETE_DISABLE_SSE3
+ unset GF_COMPLETE_DISABLE_SSSE3
+ unset GF_COMPLETE_DISABLE_SSE4
+ export GF_COMPLETE_DISABLE_SSE4_PCLMUL=1
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ echo "====SSE2,SSE3,SSSE3,SSE4_2 support..." >> ${1}
+ unset GF_COMPLETE_DISABLE_SSE2
+ unset GF_COMPLETE_DISABLE_SSE3
+ unset GF_COMPLETE_DISABLE_SSSE3
+ unset GF_COMPLETE_DISABLE_SSE4
+ export GF_COMPLETE_DISABLE_SSE4_PCLMUL=1
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ echo "====FULL SIMD support..." >> ${1}
+ unset GF_COMPLETE_DISABLE_SSE2
+ unset GF_COMPLETE_DISABLE_SSE3
+ unset GF_COMPLETE_DISABLE_SSSE3
+ unset GF_COMPLETE_DISABLE_SSE4
+ unset GF_COMPLETE_DISABLE_SSE4_PCLMUL
+ for i in 128 64 32 16 8 4; do
+ { ${script_dir}/gf_methods $i -ACD -X >> ${1}; } || { echo "gf_methods $i FAILED" >> ${1}; ((++failed)); }
+ done
+
+ return ${failed}
+}
+
+test_runtime() {
+ rm -f ${results}.left
+ rm -f ${results}.right
+
+ case $host_cpu in
+ aarch64*|arm*)
+ runtime_arm_flags ${results}.left
+ runtime_arm_env ${results}.right
+ ;;
+ i[3456]86*|x86_64*|amd64*)
+ runtime_intel_flags ${results}.left
+ runtime_intel_env ${results}.right
+ ;;
+ esac
+
+ echo "======LEFT======" > ${results}
+ cat ${results}.left >> ${results}
+ echo "======RIGHT======" >> ${results}
+ cat ${results}.right >> ${results}
+ echo "======RESULT======" >> ${results}
+ if diff "${results}.left" "${results}.right"; then
+ echo SUCCESS >> ${results}
+ return 0
+ else
+ echo FAILURE >> ${results}
+ return 1
+ fi
+}
+
cd ${script_dir}/..
rm -f ${results}
diff --git a/tools/test_simd_qemu.sh b/tools/test_simd_qemu.sh
index a270e20..7b2cb1c 100755
--- a/tools/test_simd_qemu.sh
+++ b/tools/test_simd_qemu.sh
@@ -224,6 +224,8 @@ run_test_simd_basic() {
{ run_test $arch $cpu "unit" && echo "SUCCESS"; } || { echo "FAILED"; ((++failed)); }
echo "=====running functions test"
{ run_test $arch $cpu "functions" && echo "SUCCESS"; } || { echo "FAILED"; ((++failed)); }
+ echo "=====running runtime test"
+ { run_test $arch $cpu "runtime" && echo "SUCCESS"; } || { echo "FAILED"; ((++failed)); }
stop_qemu
return ${failed}