diff options
author | Janne Grunau <j@jannau.net> | 2014-09-17 16:13:02 +0200 |
---|---|---|
committer | Janne Grunau <j@jannau.net> | 2014-10-24 14:54:27 +0200 |
commit | 370c88b9015cbe874aca81442a5d8f6f99bfb654 (patch) | |
tree | 925455a6e70fcbea9b121bbabcf868402d5e779d | |
parent | 474010a91d35fef5ca7dea77205b6a5c7e68c3e9 (diff) | |
download | gf-complete-370c88b9015cbe874aca81442a5d8f6f99bfb654.tar.gz |
arm: NEON optimisations for gf_w32
Optimisations for 4,32 split table multiplications.
Selected time_tool.sh results on a 1.7 GHz cortex-a9:
Region Best (MB/s): 346.67 W-Method: 32 -m SPLIT 32 4 -r SIMD -
Region Best (MB/s): 92.89 W-Method: 32 -m SPLIT 32 4 -r NOSIMD -
Region Best (MB/s): 258.17 W-Method: 32 -m SPLIT 32 4 -r SIMD -r ALTMAP -
Region Best (MB/s): 162.00 W-Method: 32 -m SPLIT 32 8 -
Region Best (MB/s): 160.53 W-Method: 32 -m SPLIT 8 8 -
Region Best (MB/s): 32.74 W-Method: 32 -m COMPOSITE 2 - -
Region Best (MB/s): 199.79 W-Method: 32 -m COMPOSITE 2 - -r ALTMAP -
-rw-r--r-- | include/gf_w32.h | 71 | ||||
-rw-r--r-- | src/Makefile.am | 3 | ||||
-rw-r--r-- | src/gf_w32.c | 72 | ||||
-rw-r--r-- | src/neon/gf_w32_neon.c | 269 |
4 files changed, 358 insertions, 57 deletions
diff --git a/include/gf_w32.h b/include/gf_w32.h new file mode 100644 index 0000000..3396402 --- /dev/null +++ b/include/gf_w32.h @@ -0,0 +1,71 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * gf_w32.h + * + * Defines and data structures for 32-bit Galois fields + */ + +#ifndef GF_COMPLETE_GF_W32_H +#define GF_COMPLETE_GF_W32_H + +#include <stdint.h> + +#define GF_FIELD_WIDTH (32) +#define GF_FIRST_BIT (1 << 31) + +#define GF_BASE_FIELD_WIDTH (16) +#define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH) +#define GF_BASE_FIELD_GROUP_SIZE GF_BASE_FIELD_SIZE-1 +#define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? (((p) << 1) ^ h->prim_poly) : (p) << 1) + +struct gf_split_2_32_lazy_data { + uint32_t tables[16][4]; + uint32_t last_value; +}; + +struct gf_w32_split_8_8_data { + uint32_t tables[7][256][256]; + uint32_t region_tables[4][256]; + uint32_t last_value; +}; + +struct gf_w32_group_data { + uint32_t *reduce; + uint32_t *shift; + int tshift; + uint64_t rmask; + uint32_t *memory; +}; + +struct gf_split_16_32_lazy_data { + uint32_t tables[2][(1<<16)]; + uint32_t last_value; +}; + +struct gf_split_8_32_lazy_data { + uint32_t tables[4][256]; + uint32_t last_value; +}; + +struct gf_split_4_32_lazy_data { + uint32_t tables[8][16]; + uint32_t last_value; +}; + +struct gf_w32_bytwo_data { + uint64_t prim_poly; + uint64_t mask1; + uint64_t mask2; +}; + +struct gf_w32_composite_data { + uint16_t *log; + uint16_t *alog; +}; + +void gf_w32_neon_split_init(gf_t *gf); + +#endif /* GF_COMPLETE_GF_W32_H */ diff --git a/src/Makefile.am b/src/Makefile.am index f04042b..a7f7ced 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -13,7 +13,8 @@ libgf_complete_la_SOURCES = gf.c gf_method.c gf_wgen.c gf_w4.c gf_w8.c gf_w16.c if HAVE_NEON libgf_complete_la_SOURCES += neon/gf_w4_neon.c \ neon/gf_w8_neon.c \ - neon/gf_w16_neon.c + neon/gf_w16_neon.c \ + neon/gf_w32_neon.c endif libgf_complete_la_LDFLAGS = -version-info 1:0:0 diff --git a/src/gf_w32.c b/src/gf_w32.c index 8e7c741..2e187fd 100644 --- a/src/gf_w32.c +++ b/src/gf_w32.c @@ -12,59 +12,7 @@ #include "gf_int.h" #include <stdio.h> #include <stdlib.h> - -#define GF_FIELD_WIDTH (32) -#define GF_FIRST_BIT (1 << 31) - -#define GF_BASE_FIELD_WIDTH (16) -#define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH) -#define GF_BASE_FIELD_GROUP_SIZE GF_BASE_FIELD_SIZE-1 -#define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? (((p) << 1) ^ h->prim_poly) : (p) << 1) - -struct gf_split_2_32_lazy_data { - uint32_t tables[16][4]; - uint32_t last_value; -}; - -struct gf_w32_split_8_8_data { - uint32_t tables[7][256][256]; - uint32_t region_tables[4][256]; - uint32_t last_value; -}; - -struct gf_w32_group_data { - uint32_t *reduce; - uint32_t *shift; - int tshift; - uint64_t rmask; - uint32_t *memory; -}; - -struct gf_split_16_32_lazy_data { - uint32_t tables[2][(1<<16)]; - uint32_t last_value; -}; - -struct gf_split_8_32_lazy_data { - uint32_t tables[4][256]; - uint32_t last_value; -}; - -struct gf_split_4_32_lazy_data { - uint32_t tables[8][16]; - uint32_t last_value; -}; - -struct gf_w32_bytwo_data { - uint64_t prim_poly; - uint64_t mask1; - uint64_t mask2; -}; - -struct gf_w32_composite_data { - uint16_t *log; - uint16_t *alog; -}; +#include "gf_w32.h" #define MM_PRINT32(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 4) printf(" %02x%02x%02x%02x", blah[15-ii], blah[14-ii], blah[13-ii], blah[12-ii]); printf("\n"); } @@ -2283,6 +2231,7 @@ int gf_w32_split_init(gf_t *gf) struct gf_split_16_32_lazy_data *d16; uint32_t p, basep; int i, j, exp, ispclmul, issse3; + int isneon = 0; #if defined(INTEL_SSE4_PCLMUL) ispclmul = 1; @@ -2295,6 +2244,9 @@ int gf_w32_split_init(gf_t *gf) #else issse3 = 0; #endif +#ifdef ARM_NEON + isneon = 1; +#endif h = (gf_internal_t *) gf->scratch; @@ -2349,11 +2301,15 @@ int gf_w32_split_init(gf_t *gf) /* 4/32 or Default + SSE - There is no ALTMAP/NOSSE. */ if ((h->arg1 == 4 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 4) || - (issse3 && h->mult_type == GF_REGION_DEFAULT)) { + ((issse3 || isneon) && h->mult_type == GF_REGION_DEFAULT)) { ld4 = (struct gf_split_4_32_lazy_data *) h->private; ld4->last_value = 0; - if ((h->region_type & GF_REGION_NOSIMD) || !issse3) { + if ((h->region_type & GF_REGION_NOSIMD) || !(issse3 || isneon)) { gf->multiply_region.w32 = gf_w32_split_4_32_lazy_multiply_region; + } else if (isneon) { +#ifdef ARM_NEON + gf_w32_neon_split_init(gf); +#endif } else if (h->region_type & GF_REGION_ALTMAP) { gf->multiply_region.w32 = gf_w32_split_4_32_lazy_sse_altmap_multiply_region; } else { @@ -2731,10 +2687,14 @@ int gf_w32_composite_init(gf_t *gf) int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) { int issse3 = 0; + int isneon = 0; #ifdef INTEL_SSSE3 issse3 = 1; #endif +#ifdef ARM_NEON + isneon = 1; +#endif switch(mult_type) { @@ -2760,7 +2720,7 @@ int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg return sizeof(gf_internal_t) + sizeof(struct gf_split_2_32_lazy_data) + 64; } if ((arg1 == 8 && arg2 == 32) || (arg2 == 8 && arg1 == 32) || - (mult_type == GF_MULT_DEFAULT && !issse3)) { + (mult_type == GF_MULT_DEFAULT && !(issse3 || isneon))) { return sizeof(gf_internal_t) + sizeof(struct gf_split_8_32_lazy_data) + 64; } if ((arg1 == 4 && arg2 == 32) || diff --git a/src/neon/gf_w32_neon.c b/src/neon/gf_w32_neon.c new file mode 100644 index 0000000..8231eb3 --- /dev/null +++ b/src/neon/gf_w32_neon.c @@ -0,0 +1,269 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * Copyright (c) 2014: Janne Grunau <j@jannau.net> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * - Neither the name of the University of Tennessee nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * gf_w32_neon.c + * + * Neon routines for 32-bit Galois fields + * + */ + + +#include "gf_int.h" +#include <stdio.h> +#include <stdlib.h> +#include "gf_w32.h" + +#ifndef ARCH_AARCH64 +#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)), \ + vtbl2_u8(tbl, vget_high_u8(v))) +#endif + +static +void +neon_w32_split_4_32_multiply_region(gf_t *gf, uint32_t *src, uint32_t *dst, + uint32_t *d_end, uint8_t btable[8][4][16], + uint32_t val, int xor, int altmap) +{ + int i, j; +#ifdef ARCH_AARCH64 + uint8x16_t tables[8][4]; +#else + uint8x8x2_t tables[8][4]; +#endif + uint32x4_t v0, v1, v2, v3, s0, s1, s2, s3; + uint8x16_t p0, p1, p2, p3, si, mask1; + uint16x8x2_t r0, r1; + uint8x16x2_t q0, q1; + + for (i = 0; i < 8; i++) { + for (j = 0; j < 4; j++) { +#ifdef ARCH_AARCH64 + tables[i][j] = vld1q_u8(btable[i][j]); +#else + tables[i][j].val[0] = vld1_u8(btable[i][j]); + tables[i][j].val[1] = vld1_u8(btable[i][j] + 8); +#endif + } + } + + mask1 = vdupq_n_u8(0xf); + + while (dst < d_end) { + + v0 = vld1q_u32(src); src += 4; + v1 = vld1q_u32(src); src += 4; + v2 = vld1q_u32(src); src += 4; + v3 = vld1q_u32(src); src += 4; + + if (altmap) { + q0.val[0] = vreinterpretq_u8_u32(v0); + q0.val[1] = vreinterpretq_u8_u32(v1); + q1.val[0] = vreinterpretq_u8_u32(v2); + q1.val[1] = vreinterpretq_u8_u32(v3); + } else { + r0 = vtrnq_u16(vreinterpretq_u16_u32(v0), vreinterpretq_u16_u32(v2)); + r1 = vtrnq_u16(vreinterpretq_u16_u32(v1), vreinterpretq_u16_u32(v3)); + + q0 = vtrnq_u8(vreinterpretq_u8_u16(r0.val[0]), + vreinterpretq_u8_u16(r1.val[0])); + q1 = vtrnq_u8(vreinterpretq_u8_u16(r0.val[1]), + vreinterpretq_u8_u16(r1.val[1])); + } + + si = vandq_u8(q0.val[0], mask1); + p0 = vqtbl1q_u8(tables[0][0], si); + p1 = vqtbl1q_u8(tables[0][1], si); + p2 = vqtbl1q_u8(tables[0][2], si); + p3 = vqtbl1q_u8(tables[0][3], si); + + si = vshrq_n_u8(q0.val[0], 4); + p0 = veorq_u8(p0, vqtbl1q_u8(tables[1][0], si)); + p1 = veorq_u8(p1, vqtbl1q_u8(tables[1][1], si)); + p2 = veorq_u8(p2, vqtbl1q_u8(tables[1][2], si)); + p3 = veorq_u8(p3, vqtbl1q_u8(tables[1][3], si)); + + si = vandq_u8(q0.val[1], mask1); + p0 = veorq_u8(p0, vqtbl1q_u8(tables[2][0], si)); + p1 = veorq_u8(p1, vqtbl1q_u8(tables[2][1], si)); + p2 = veorq_u8(p2, vqtbl1q_u8(tables[2][2], si)); + p3 = veorq_u8(p3, vqtbl1q_u8(tables[2][3], si)); + + si = vshrq_n_u8(q0.val[1], 4); + p0 = veorq_u8(p0, vqtbl1q_u8(tables[3][0], si)); + p1 = veorq_u8(p1, vqtbl1q_u8(tables[3][1], si)); + p2 = veorq_u8(p2, vqtbl1q_u8(tables[3][2], si)); + p3 = veorq_u8(p3, vqtbl1q_u8(tables[3][3], si)); + + si = vandq_u8(q1.val[0], mask1); + p0 = veorq_u8(p0, vqtbl1q_u8(tables[4][0], si)); + p1 = veorq_u8(p1, vqtbl1q_u8(tables[4][1], si)); + p2 = veorq_u8(p2, vqtbl1q_u8(tables[4][2], si)); + p3 = veorq_u8(p3, vqtbl1q_u8(tables[4][3], si)); + + si = vshrq_n_u8(q1.val[0], 4); + p0 = veorq_u8(p0, vqtbl1q_u8(tables[5][0], si)); + p1 = veorq_u8(p1, vqtbl1q_u8(tables[5][1], si)); + p2 = veorq_u8(p2, vqtbl1q_u8(tables[5][2], si)); + p3 = veorq_u8(p3, vqtbl1q_u8(tables[5][3], si)); + + si = vandq_u8(q1.val[1], mask1); + p0 = veorq_u8(p0, vqtbl1q_u8(tables[6][0], si)); + p1 = veorq_u8(p1, vqtbl1q_u8(tables[6][1], si)); + p2 = veorq_u8(p2, vqtbl1q_u8(tables[6][2], si)); + p3 = veorq_u8(p3, vqtbl1q_u8(tables[6][3], si)); + + si = vshrq_n_u8(q1.val[1], 4); + p0 = veorq_u8(p0, vqtbl1q_u8(tables[7][0], si)); + p1 = veorq_u8(p1, vqtbl1q_u8(tables[7][1], si)); + p2 = veorq_u8(p2, vqtbl1q_u8(tables[7][2], si)); + p3 = veorq_u8(p3, vqtbl1q_u8(tables[7][3], si)); + + if (altmap) { + s0 = vreinterpretq_u32_u8(p0); + s1 = vreinterpretq_u32_u8(p1); + s2 = vreinterpretq_u32_u8(p2); + s3 = vreinterpretq_u32_u8(p3); + } else { + q0 = vtrnq_u8(p0, p1); + q1 = vtrnq_u8(p2, p3); + + r0 = vtrnq_u16(vreinterpretq_u16_u8(q0.val[0]), + vreinterpretq_u16_u8(q1.val[0])); + r1 = vtrnq_u16(vreinterpretq_u16_u8(q0.val[1]), + vreinterpretq_u16_u8(q1.val[1])); + + s0 = vreinterpretq_u32_u16(r0.val[0]); + s1 = vreinterpretq_u32_u16(r1.val[0]); + s2 = vreinterpretq_u32_u16(r0.val[1]); + s3 = vreinterpretq_u32_u16(r1.val[1]); + } + + if (xor) { + v0 = vld1q_u32(dst); + v1 = vld1q_u32(dst + 4); + v2 = vld1q_u32(dst + 8); + v3 = vld1q_u32(dst + 12); + s0 = veorq_u32(s0, v0); + s1 = veorq_u32(s1, v1); + s2 = veorq_u32(s2, v2); + s3 = veorq_u32(s3, v3); + } + + vst1q_u32(dst, s0); + vst1q_u32(dst + 4, s1); + vst1q_u32(dst + 8, s2); + vst1q_u32(dst + 12, s3); + + dst += 16; + } +} + +static +inline +void +neon_w32_split_4_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor, int altmap) +{ + gf_internal_t *h; + int i, j, k; + uint32_t pp, v, *s32, *d32, *top, tmp_table[16]; + uint8_t btable[8][4][16]; + gf_region_data rd; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + h = (gf_internal_t *) gf->scratch; + pp = h->prim_poly; + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 64); + gf_do_initial_region_alignment(&rd); + + s32 = (uint32_t *) rd.s_start; + d32 = (uint32_t *) rd.d_start; + top = (uint32_t *) rd.d_top; + + v = val; + for (i = 0; i < 8; i++) { + tmp_table[0] = 0; + for (j = 1; j < 16; j <<= 1) { + for (k = 0; k < j; k++) { + tmp_table[k^j] = (v ^ tmp_table[k]); + } + v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1); + } + for (j = 0; j < 4; j++) { + for (k = 0; k < 16; k++) { + btable[i][j][k] = (uint8_t) tmp_table[k]; + tmp_table[k] >>= 8; + } + } + } + + if (xor) + neon_w32_split_4_32_multiply_region(gf, s32, d32, top, btable, val, 1, altmap); + else + neon_w32_split_4_32_multiply_region(gf, s32, d32, top, btable, val, 0, altmap); + + gf_do_final_region_alignment(&rd); +} + +static +void +gf_w32_split_4_32_lazy_multiply_region_neon(gf_t *gf, void *src, void *dest, + gf_val_32_t val, int bytes, int xor) +{ + neon_w32_split_4_32_lazy_multiply_region(gf, src, dest, val, bytes, xor, 0); +} + +static +void +gf_w32_split_4_32_lazy_altmap_multiply_region_neon(gf_t *gf, void *src, + void *dest, gf_val_32_t val, + int bytes, int xor) +{ + neon_w32_split_4_32_lazy_multiply_region(gf, src, dest, val, bytes, xor, 1); +} + +void gf_w32_neon_split_init(gf_t *gf) +{ + gf_internal_t *h = (gf_internal_t *) gf->scratch; + + if (h->region_type & GF_REGION_ALTMAP) + gf->multiply_region.w32 = gf_w32_split_4_32_lazy_altmap_multiply_region_neon; + else + gf->multiply_region.w32 = gf_w32_split_4_32_lazy_multiply_region_neon; + +} |