diff options
Diffstat (limited to 'src/gf_w32.c')
-rw-r--r-- | src/gf_w32.c | 84 |
1 files changed, 22 insertions, 62 deletions
diff --git a/src/gf_w32.c b/src/gf_w32.c index 5ec2aa7..2e187fd 100644 --- a/src/gf_w32.c +++ b/src/gf_w32.c @@ -12,59 +12,7 @@ #include "gf_int.h" #include <stdio.h> #include <stdlib.h> - -#define GF_FIELD_WIDTH (32) -#define GF_FIRST_BIT (1 << 31) - -#define GF_BASE_FIELD_WIDTH (16) -#define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH) -#define GF_BASE_FIELD_GROUP_SIZE GF_BASE_FIELD_SIZE-1 -#define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? (((p) << 1) ^ h->prim_poly) : (p) << 1) - -struct gf_split_2_32_lazy_data { - uint32_t tables[16][4]; - uint32_t last_value; -}; - -struct gf_w32_split_8_8_data { - uint32_t tables[7][256][256]; - uint32_t region_tables[4][256]; - uint32_t last_value; -}; - -struct gf_w32_group_data { - uint32_t *reduce; - uint32_t *shift; - int tshift; - uint64_t rmask; - uint32_t *memory; -}; - -struct gf_split_16_32_lazy_data { - uint32_t tables[2][(1<<16)]; - uint32_t last_value; -}; - -struct gf_split_8_32_lazy_data { - uint32_t tables[4][256]; - uint32_t last_value; -}; - -struct gf_split_4_32_lazy_data { - uint32_t tables[8][16]; - uint32_t last_value; -}; - -struct gf_w32_bytwo_data { - uint64_t prim_poly; - uint64_t mask1; - uint64_t mask2; -}; - -struct gf_w32_composite_data { - uint16_t *log; - uint16_t *alog; -}; +#include "gf_w32.h" #define MM_PRINT32(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 4) printf(" %02x%02x%02x%02x", blah[15-ii], blah[14-ii], blah[13-ii], blah[12-ii]); printf("\n"); } @@ -1434,25 +1382,25 @@ int gf_w32_bytwo_init(gf_t *gf) if (h->mult_type == GF_MULT_BYTWO_p) { gf->multiply.w32 = gf_w32_bytwo_p_multiply; #ifdef INTEL_SSE2 - if (h->region_type & GF_REGION_NOSSE) + if (h->region_type & GF_REGION_NOSIMD) gf->multiply_region.w32 = gf_w32_bytwo_p_nosse_multiply_region; else gf->multiply_region.w32 = gf_w32_bytwo_p_sse_multiply_region; #else gf->multiply_region.w32 = gf_w32_bytwo_p_nosse_multiply_region; - if(h->region_type & GF_REGION_SSE) + if(h->region_type & GF_REGION_SIMD) return 0; #endif } else { gf->multiply.w32 = gf_w32_bytwo_b_multiply; #ifdef INTEL_SSE2 - if (h->region_type & GF_REGION_NOSSE) + if (h->region_type & GF_REGION_NOSIMD) gf->multiply_region.w32 = gf_w32_bytwo_b_nosse_multiply_region; else gf->multiply_region.w32 = gf_w32_bytwo_b_sse_multiply_region; #else gf->multiply_region.w32 = gf_w32_bytwo_b_nosse_multiply_region; - if(h->region_type & GF_REGION_SSE) + if(h->region_type & GF_REGION_SIMD) return 0; #endif } @@ -2283,6 +2231,7 @@ int gf_w32_split_init(gf_t *gf) struct gf_split_16_32_lazy_data *d16; uint32_t p, basep; int i, j, exp, ispclmul, issse3; + int isneon = 0; #if defined(INTEL_SSE4_PCLMUL) ispclmul = 1; @@ -2295,6 +2244,9 @@ int gf_w32_split_init(gf_t *gf) #else issse3 = 0; #endif +#ifdef ARM_NEON + isneon = 1; +#endif h = (gf_internal_t *) gf->scratch; @@ -2335,13 +2287,13 @@ int gf_w32_split_init(gf_t *gf) ld2 = (struct gf_split_2_32_lazy_data *) h->private; ld2->last_value = 0; #ifdef INTEL_SSSE3 - if (!(h->region_type & GF_REGION_NOSSE)) + if (!(h->region_type & GF_REGION_NOSIMD)) gf->multiply_region.w32 = gf_w32_split_2_32_lazy_sse_multiply_region; else gf->multiply_region.w32 = gf_w32_split_2_32_lazy_multiply_region; #else gf->multiply_region.w32 = gf_w32_split_2_32_lazy_multiply_region; - if(h->region_type & GF_REGION_SSE) return 0; + if(h->region_type & GF_REGION_SIMD) return 0; #endif return 1; } @@ -2349,11 +2301,15 @@ int gf_w32_split_init(gf_t *gf) /* 4/32 or Default + SSE - There is no ALTMAP/NOSSE. */ if ((h->arg1 == 4 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 4) || - (issse3 && h->mult_type == GF_REGION_DEFAULT)) { + ((issse3 || isneon) && h->mult_type == GF_REGION_DEFAULT)) { ld4 = (struct gf_split_4_32_lazy_data *) h->private; ld4->last_value = 0; - if ((h->region_type & GF_REGION_NOSSE) || !issse3) { + if ((h->region_type & GF_REGION_NOSIMD) || !(issse3 || isneon)) { gf->multiply_region.w32 = gf_w32_split_4_32_lazy_multiply_region; + } else if (isneon) { +#ifdef ARM_NEON + gf_w32_neon_split_init(gf); +#endif } else if (h->region_type & GF_REGION_ALTMAP) { gf->multiply_region.w32 = gf_w32_split_4_32_lazy_sse_altmap_multiply_region; } else { @@ -2731,10 +2687,14 @@ int gf_w32_composite_init(gf_t *gf) int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) { int issse3 = 0; + int isneon = 0; #ifdef INTEL_SSSE3 issse3 = 1; #endif +#ifdef ARM_NEON + isneon = 1; +#endif switch(mult_type) { @@ -2760,7 +2720,7 @@ int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg return sizeof(gf_internal_t) + sizeof(struct gf_split_2_32_lazy_data) + 64; } if ((arg1 == 8 && arg2 == 32) || (arg2 == 8 && arg1 == 32) || - (mult_type == GF_MULT_DEFAULT && !issse3)) { + (mult_type == GF_MULT_DEFAULT && !(issse3 || isneon))) { return sizeof(gf_internal_t) + sizeof(struct gf_split_8_32_lazy_data) + 64; } if ((arg1 == 4 && arg2 == 32) || |