summaryrefslogtreecommitdiff
path: root/src/gf_w64.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/gf_w64.c')
-rw-r--r--src/gf_w64.c164
1 files changed, 93 insertions, 71 deletions
diff --git a/src/gf_w64.c b/src/gf_w64.c
index a096161..69e55db 100644
--- a/src/gf_w64.c
+++ b/src/gf_w64.c
@@ -12,6 +12,7 @@
#include <stdio.h>
#include <stdlib.h>
#include "gf_w64.h"
+#include "gf_cpu.h"
static
inline
@@ -338,6 +339,8 @@ gf_w64_shift_multiply (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
* ELM: Use the Intel carryless multiply instruction to do very fast 64x64 multiply.
*/
+#if defined(INTEL_SSE4_PCLMUL)
+
static
inline
gf_val_64_t
@@ -345,8 +348,6 @@ gf_w64_clm_multiply_2 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
{
gf_val_64_t rv = 0;
-#if defined(INTEL_SSE4_PCLMUL)
-
__m128i a, b;
__m128i result;
__m128i prim_poly;
@@ -376,10 +377,12 @@ gf_w64_clm_multiply_2 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
result = _mm_xor_si128 (result, w);
rv = ((gf_val_64_t)_mm_extract_epi64(result, 0));
-#endif
return rv;
}
+#endif
+#if defined(INTEL_SSE4_PCLMUL)
+
static
inline
gf_val_64_t
@@ -387,8 +390,6 @@ gf_w64_clm_multiply_4 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
{
gf_val_64_t rv = 0;
-#if defined(INTEL_SSE4_PCLMUL)
-
__m128i a, b;
__m128i result;
__m128i prim_poly;
@@ -418,15 +419,15 @@ gf_w64_clm_multiply_4 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
result = _mm_xor_si128 (result, w);
rv = ((gf_val_64_t)_mm_extract_epi64(result, 0));
-#endif
return rv;
}
+#endif
+#if defined(INTEL_SSE4_PCLMUL)
void
gf_w64_clm_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor)
{
-#if defined(INTEL_SSE4_PCLMUL)
gf_internal_t *h;
uint8_t *s8, *d8, *dtop;
gf_region_data rd;
@@ -504,8 +505,8 @@ gf_w64_clm_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int by
}
}
gf_do_final_region_alignment(&rd);
-#endif
}
+#endif
void
gf_w64_split_4_64_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor)
@@ -709,21 +710,23 @@ int gf_w64_cfm_init(gf_t *gf)
SET_FUNCTION(gf,inverse,w64,gf_w64_euclid)
SET_FUNCTION(gf,multiply_region,w64,gf_w64_multiply_region_from_single)
-#if defined(INTEL_SSE4_PCLMUL)
- gf_internal_t *h;
+#if defined(INTEL_SSE4_PCLMUL)
+ if (gf_cpu_supports_intel_pclmul) {
+ gf_internal_t *h;
- h = (gf_internal_t *) gf->scratch;
+ h = (gf_internal_t *) gf->scratch;
- if ((0xfffffffe00000000ULL & h->prim_poly) == 0){
- SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_2)
- SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_2)
- }else if((0xfffe000000000000ULL & h->prim_poly) == 0){
- SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_4)
- SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_4)
- } else {
- return 0;
+ if ((0xfffffffe00000000ULL & h->prim_poly) == 0){
+ SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_2)
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_2)
+ }else if((0xfffe000000000000ULL & h->prim_poly) == 0){
+ SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_4)
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_4)
+ } else {
+ return 0;
+ }
+ return 1;
}
- return 1;
#endif
return 0;
@@ -1261,9 +1264,9 @@ gf_w64_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_
v = _mm_srli_epi64(v, 1); }
+#ifdef INTEL_SSE2
void gf_w64_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
{
-#ifdef INTEL_SSE2
int i;
uint8_t *s8, *d8;
uint64_t vrev, one64;
@@ -1322,8 +1325,8 @@ void gf_w64_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_
s8 += 16;
}
gf_do_final_region_alignment(&rd);
-#endif
}
+#endif
#ifdef INTEL_SSE2
static
@@ -1457,26 +1460,28 @@ int gf_w64_bytwo_init(gf_t *gf)
if (h->mult_type == GF_MULT_BYTWO_p) {
SET_FUNCTION(gf,multiply,w64,gf_w64_bytwo_p_multiply)
#ifdef INTEL_SSE2
- if (h->region_type & GF_REGION_NOSIMD)
- SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_p_nosse_multiply_region)
- else
+ if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_p_sse_multiply_region)
- #else
- SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_p_nosse_multiply_region)
- if(h->region_type & GF_REGION_SIMD)
- return 0;
+ } else {
+ #endif
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_p_nosse_multiply_region)
+ if(h->region_type & GF_REGION_SIMD)
+ return 0;
+ #ifdef INTEL_SSE2
+ }
#endif
} else {
SET_FUNCTION(gf,multiply,w64,gf_w64_bytwo_b_multiply)
#ifdef INTEL_SSE2
- if (h->region_type & GF_REGION_NOSIMD)
- SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_b_nosse_multiply_region)
- else
+ if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_b_sse_multiply_region)
- #else
+ } else {
+ #endif
SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_b_nosse_multiply_region)
if(h->region_type & GF_REGION_SIMD)
return 0;
+ #ifdef INTEL_SSE2
+ }
#endif
}
SET_FUNCTION(gf,inverse,w64,gf_w64_euclid)
@@ -1975,18 +1980,20 @@ int gf_w64_split_init(gf_t *gf)
SET_FUNCTION(gf,multiply,w64,gf_w64_bytwo_p_multiply)
#if defined(INTEL_SSE4_PCLMUL)
- if ((!(h->region_type & GF_REGION_NOSIMD) &&
- (h->arg1 == 64 || h->arg2 == 64)) ||
- h->mult_type == GF_MULT_DEFAULT){
-
- if ((0xfffffffe00000000ULL & h->prim_poly) == 0){
- SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_2)
- SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_2)
- }else if((0xfffe000000000000ULL & h->prim_poly) == 0){
- SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_4)
- SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_4)
- }else{
- return 0;
+ if (gf_cpu_supports_intel_pclmul) {
+ if ((!(h->region_type & GF_REGION_NOSIMD) &&
+ (h->arg1 == 64 || h->arg2 == 64)) ||
+ h->mult_type == GF_MULT_DEFAULT){
+
+ if ((0xfffffffe00000000ULL & h->prim_poly) == 0){
+ SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_2)
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_2)
+ }else if((0xfffe000000000000ULL & h->prim_poly) == 0){
+ SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_4)
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_4)
+ }else{
+ return 0;
+ }
}
}
#endif
@@ -1996,23 +2003,27 @@ int gf_w64_split_init(gf_t *gf)
/* Allen: set region pointers for default mult type. Single pointers are
* taken care of above (explicitly for sse, implicitly for no sse). */
-#if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
if (h->mult_type == GF_MULT_DEFAULT) {
- d4 = (struct gf_split_4_64_lazy_data *) h->private;
- d4->last_value = 0;
+#if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
+ if (gf_cpu_supports_intel_sse4 || gf_cpu_supports_arm_neon) {
+ d4 = (struct gf_split_4_64_lazy_data *) h->private;
+ d4->last_value = 0;
#if defined(INTEL_SSE4)
- SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_sse_multiply_region)
+ if (gf_cpu_supports_intel_sse4)
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_sse_multiply_region)
#elif defined(ARCH_AARCH64)
- gf_w64_neon_split_init(gf);
+ if (gf_cpu_supports_arm_neon)
+ gf_w64_neon_split_init(gf);
#endif
- }
-#else
- if (h->mult_type == GF_MULT_DEFAULT) {
- d8 = (struct gf_split_8_64_lazy_data *) h->private;
- d8->last_value = 0;
- SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_8_64_lazy_multiply_region)
- }
+ } else {
+#endif
+ d8 = (struct gf_split_8_64_lazy_data *) h->private;
+ d8->last_value = 0;
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_8_64_lazy_multiply_region)
+#if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
+ }
#endif
+ }
if ((h->arg1 == 4 && h->arg2 == 64) || (h->arg1 == 64 && h->arg2 == 4)) {
d4 = (struct gf_split_4_64_lazy_data *) h->private;
@@ -2022,28 +2033,35 @@ int gf_w64_split_init(gf_t *gf)
if(h->region_type & GF_REGION_ALTMAP)
{
#ifdef INTEL_SSSE3
- SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_sse_altmap_multiply_region)
+ if (gf_cpu_supports_intel_ssse3) {
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_sse_altmap_multiply_region)
+ } else
#elif defined(ARCH_AARCH64)
- gf_w64_neon_split_init(gf);
- #else
- return 0;
+ if (gf_cpu_supports_arm_neon) {
+ gf_w64_neon_split_init(gf);
+ } else
#endif
+ return 0;
}
else //no altmap
{
#if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
- if(h->region_type & GF_REGION_NOSIMD)
- SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_multiply_region)
- else
- #if defined(INTEL_SSE4)
- SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_sse_multiply_region)
- #elif defined(ARCH_AARCH64)
- gf_w64_neon_split_init(gf);
- #endif
- #else
+ if(gf_cpu_supports_intel_sse4 || gf_cpu_supports_arm_neon) {
+ if (h->region_type & GF_REGION_NOSIMD) {
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_multiply_region)
+ } else
+ #if defined(INTEL_SSE4)
+ SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_sse_multiply_region)
+ #elif defined(ARCH_AARCH64)
+ gf_w64_neon_split_init(gf);
+ #endif
+ } else {
+ #endif
SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_multiply_region)
if(h->region_type & GF_REGION_SIMD)
return 0;
+ #if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
+ }
#endif
}
}
@@ -2114,11 +2132,15 @@ int gf_w64_scratch_size(int mult_type, int region_type, int divide_type, int arg
* then fall through to split table scratch size code. */
#if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
+ if (gf_cpu_supports_intel_sse4 || gf_cpu_supports_arm_neon) {
arg1 = 64;
arg2 = 4;
-#else
+ } else {
+#endif
arg1 = 64;
arg2 = 8;
+#if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
+ }
#endif
case GF_MULT_SPLIT_TABLE: