diff options
Diffstat (limited to 'src/gf_w64.c')
-rw-r--r-- | src/gf_w64.c | 164 |
1 files changed, 93 insertions, 71 deletions
diff --git a/src/gf_w64.c b/src/gf_w64.c index a096161..69e55db 100644 --- a/src/gf_w64.c +++ b/src/gf_w64.c @@ -12,6 +12,7 @@ #include <stdio.h> #include <stdlib.h> #include "gf_w64.h" +#include "gf_cpu.h" static inline @@ -338,6 +339,8 @@ gf_w64_shift_multiply (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64) * ELM: Use the Intel carryless multiply instruction to do very fast 64x64 multiply. */ +#if defined(INTEL_SSE4_PCLMUL) + static inline gf_val_64_t @@ -345,8 +348,6 @@ gf_w64_clm_multiply_2 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64) { gf_val_64_t rv = 0; -#if defined(INTEL_SSE4_PCLMUL) - __m128i a, b; __m128i result; __m128i prim_poly; @@ -376,10 +377,12 @@ gf_w64_clm_multiply_2 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64) result = _mm_xor_si128 (result, w); rv = ((gf_val_64_t)_mm_extract_epi64(result, 0)); -#endif return rv; } +#endif +#if defined(INTEL_SSE4_PCLMUL) + static inline gf_val_64_t @@ -387,8 +390,6 @@ gf_w64_clm_multiply_4 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64) { gf_val_64_t rv = 0; -#if defined(INTEL_SSE4_PCLMUL) - __m128i a, b; __m128i result; __m128i prim_poly; @@ -418,15 +419,15 @@ gf_w64_clm_multiply_4 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64) result = _mm_xor_si128 (result, w); rv = ((gf_val_64_t)_mm_extract_epi64(result, 0)); -#endif return rv; } +#endif +#if defined(INTEL_SSE4_PCLMUL) void gf_w64_clm_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor) { -#if defined(INTEL_SSE4_PCLMUL) gf_internal_t *h; uint8_t *s8, *d8, *dtop; gf_region_data rd; @@ -504,8 +505,8 @@ gf_w64_clm_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int by } } gf_do_final_region_alignment(&rd); -#endif } +#endif void gf_w64_split_4_64_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor) @@ -709,21 +710,23 @@ int gf_w64_cfm_init(gf_t *gf) SET_FUNCTION(gf,inverse,w64,gf_w64_euclid) SET_FUNCTION(gf,multiply_region,w64,gf_w64_multiply_region_from_single) -#if defined(INTEL_SSE4_PCLMUL) - gf_internal_t *h; +#if defined(INTEL_SSE4_PCLMUL) + if (gf_cpu_supports_intel_pclmul) { + gf_internal_t *h; - h = (gf_internal_t *) gf->scratch; + h = (gf_internal_t *) gf->scratch; - if ((0xfffffffe00000000ULL & h->prim_poly) == 0){ - SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_2) - SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_2) - }else if((0xfffe000000000000ULL & h->prim_poly) == 0){ - SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_4) - SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_4) - } else { - return 0; + if ((0xfffffffe00000000ULL & h->prim_poly) == 0){ + SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_2) + SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_2) + }else if((0xfffe000000000000ULL & h->prim_poly) == 0){ + SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_4) + SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_4) + } else { + return 0; + } + return 1; } - return 1; #endif return 0; @@ -1261,9 +1264,9 @@ gf_w64_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_ v = _mm_srli_epi64(v, 1); } +#ifdef INTEL_SSE2 void gf_w64_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) { -#ifdef INTEL_SSE2 int i; uint8_t *s8, *d8; uint64_t vrev, one64; @@ -1322,8 +1325,8 @@ void gf_w64_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_ s8 += 16; } gf_do_final_region_alignment(&rd); -#endif } +#endif #ifdef INTEL_SSE2 static @@ -1457,26 +1460,28 @@ int gf_w64_bytwo_init(gf_t *gf) if (h->mult_type == GF_MULT_BYTWO_p) { SET_FUNCTION(gf,multiply,w64,gf_w64_bytwo_p_multiply) #ifdef INTEL_SSE2 - if (h->region_type & GF_REGION_NOSIMD) - SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_p_nosse_multiply_region) - else + if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) { SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_p_sse_multiply_region) - #else - SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_p_nosse_multiply_region) - if(h->region_type & GF_REGION_SIMD) - return 0; + } else { + #endif + SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_p_nosse_multiply_region) + if(h->region_type & GF_REGION_SIMD) + return 0; + #ifdef INTEL_SSE2 + } #endif } else { SET_FUNCTION(gf,multiply,w64,gf_w64_bytwo_b_multiply) #ifdef INTEL_SSE2 - if (h->region_type & GF_REGION_NOSIMD) - SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_b_nosse_multiply_region) - else + if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) { SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_b_sse_multiply_region) - #else + } else { + #endif SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_b_nosse_multiply_region) if(h->region_type & GF_REGION_SIMD) return 0; + #ifdef INTEL_SSE2 + } #endif } SET_FUNCTION(gf,inverse,w64,gf_w64_euclid) @@ -1975,18 +1980,20 @@ int gf_w64_split_init(gf_t *gf) SET_FUNCTION(gf,multiply,w64,gf_w64_bytwo_p_multiply) #if defined(INTEL_SSE4_PCLMUL) - if ((!(h->region_type & GF_REGION_NOSIMD) && - (h->arg1 == 64 || h->arg2 == 64)) || - h->mult_type == GF_MULT_DEFAULT){ - - if ((0xfffffffe00000000ULL & h->prim_poly) == 0){ - SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_2) - SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_2) - }else if((0xfffe000000000000ULL & h->prim_poly) == 0){ - SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_4) - SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_4) - }else{ - return 0; + if (gf_cpu_supports_intel_pclmul) { + if ((!(h->region_type & GF_REGION_NOSIMD) && + (h->arg1 == 64 || h->arg2 == 64)) || + h->mult_type == GF_MULT_DEFAULT){ + + if ((0xfffffffe00000000ULL & h->prim_poly) == 0){ + SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_2) + SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_2) + }else if((0xfffe000000000000ULL & h->prim_poly) == 0){ + SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_4) + SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_4) + }else{ + return 0; + } } } #endif @@ -1996,23 +2003,27 @@ int gf_w64_split_init(gf_t *gf) /* Allen: set region pointers for default mult type. Single pointers are * taken care of above (explicitly for sse, implicitly for no sse). */ -#if defined(INTEL_SSE4) || defined(ARCH_AARCH64) if (h->mult_type == GF_MULT_DEFAULT) { - d4 = (struct gf_split_4_64_lazy_data *) h->private; - d4->last_value = 0; +#if defined(INTEL_SSE4) || defined(ARCH_AARCH64) + if (gf_cpu_supports_intel_sse4 || gf_cpu_supports_arm_neon) { + d4 = (struct gf_split_4_64_lazy_data *) h->private; + d4->last_value = 0; #if defined(INTEL_SSE4) - SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_sse_multiply_region) + if (gf_cpu_supports_intel_sse4) + SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_sse_multiply_region) #elif defined(ARCH_AARCH64) - gf_w64_neon_split_init(gf); + if (gf_cpu_supports_arm_neon) + gf_w64_neon_split_init(gf); #endif - } -#else - if (h->mult_type == GF_MULT_DEFAULT) { - d8 = (struct gf_split_8_64_lazy_data *) h->private; - d8->last_value = 0; - SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_8_64_lazy_multiply_region) - } + } else { +#endif + d8 = (struct gf_split_8_64_lazy_data *) h->private; + d8->last_value = 0; + SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_8_64_lazy_multiply_region) +#if defined(INTEL_SSE4) || defined(ARCH_AARCH64) + } #endif + } if ((h->arg1 == 4 && h->arg2 == 64) || (h->arg1 == 64 && h->arg2 == 4)) { d4 = (struct gf_split_4_64_lazy_data *) h->private; @@ -2022,28 +2033,35 @@ int gf_w64_split_init(gf_t *gf) if(h->region_type & GF_REGION_ALTMAP) { #ifdef INTEL_SSSE3 - SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_sse_altmap_multiply_region) + if (gf_cpu_supports_intel_ssse3) { + SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_sse_altmap_multiply_region) + } else #elif defined(ARCH_AARCH64) - gf_w64_neon_split_init(gf); - #else - return 0; + if (gf_cpu_supports_arm_neon) { + gf_w64_neon_split_init(gf); + } else #endif + return 0; } else //no altmap { #if defined(INTEL_SSE4) || defined(ARCH_AARCH64) - if(h->region_type & GF_REGION_NOSIMD) - SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_multiply_region) - else - #if defined(INTEL_SSE4) - SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_sse_multiply_region) - #elif defined(ARCH_AARCH64) - gf_w64_neon_split_init(gf); - #endif - #else + if(gf_cpu_supports_intel_sse4 || gf_cpu_supports_arm_neon) { + if (h->region_type & GF_REGION_NOSIMD) { + SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_multiply_region) + } else + #if defined(INTEL_SSE4) + SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_sse_multiply_region) + #elif defined(ARCH_AARCH64) + gf_w64_neon_split_init(gf); + #endif + } else { + #endif SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_multiply_region) if(h->region_type & GF_REGION_SIMD) return 0; + #if defined(INTEL_SSE4) || defined(ARCH_AARCH64) + } #endif } } @@ -2114,11 +2132,15 @@ int gf_w64_scratch_size(int mult_type, int region_type, int divide_type, int arg * then fall through to split table scratch size code. */ #if defined(INTEL_SSE4) || defined(ARCH_AARCH64) + if (gf_cpu_supports_intel_sse4 || gf_cpu_supports_arm_neon) { arg1 = 64; arg2 = 4; -#else + } else { +#endif arg1 = 64; arg2 = 8; +#if defined(INTEL_SSE4) || defined(ARCH_AARCH64) + } #endif case GF_MULT_SPLIT_TABLE: |