diff options
author | Loic Dachary <loic-test4@dachary.org> | 2016-09-13 10:34:23 +0000 |
---|---|---|
committer | Loic Dachary <loic-test4@dachary.org> | 2016-09-13 10:34:23 +0000 |
commit | 51a1abb9185ec6ea35817620d13322047f4fde4d (patch) | |
tree | d2fbe709772cbfe56f05aec396e42b9dab6f459c | |
parent | 8fe7382e2a1f7763be8b12db283cc1570eb64518 (diff) | |
parent | 643743d0482ca09a9dfa57beed196f172a22a78e (diff) | |
download | gf-complete-51a1abb9185ec6ea35817620d13322047f4fde4d.tar.gz |
Merge branch 'neon_fixes' into 'master'
NEON fixes/tweaks
This merge request fixes some issues and adds some tweaks to NEON code:
* SPLIT(16,4) ALTMAP implementation was broken as it only processed half the amount of data. Because the broken version did only half the work, this fixed implementation is significantly slower than the old code (which is to be expected). Fixes #2
* SPLIT(16,4) implementations now merge the ARMv8 and older code path, similar to SPLIT(32,4). This fixes the ALTMAP variant, and also enables the non-ALTMAP version to have consistent sizing
* Unnecessary VTRN removed in non-ALTMAP SPLIT(16,4) as NEON allows (de)interleaving during load/store; because of this, ALTMAP isn't so useful in NEON
* This can also be done for SPLIT(32,4), but I have not implemented it
* I also hoisted the `if(xor)` conditional out of the loop in non-ALTMAP SPLIT(16,4). It seems to improve performance a bit on my Cortex A7
* It probably should be implemented everywhere else, but I have not done this
* CARRY_FREE was incorrectly enabled for all sizes of w, when it's only available for w=4 and w=8
See merge request !16
-rw-r--r-- | src/gf.c | 2 | ||||
-rw-r--r-- | src/neon/gf_w16_neon.c | 196 |
2 files changed, 59 insertions, 139 deletions
@@ -219,7 +219,7 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type, #endif #ifdef ARM_NEON - pclmul = 1; + pclmul = (w == 4 || w == 8); sse3 = 1; #endif diff --git a/src/neon/gf_w16_neon.c b/src/neon/gf_w16_neon.c index 95bfd80..2bd3f30 100644 --- a/src/neon/gf_w16_neon.c +++ b/src/neon/gf_w16_neon.c @@ -46,7 +46,11 @@ #include <stdlib.h> #include "gf_w16.h" -#ifdef ARCH_AARCH64 +#ifndef ARCH_AARCH64 +#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)), \ + vtbl2_u8(tbl, vget_high_u8(v))) +#endif + static inline void @@ -56,23 +60,32 @@ neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst, { unsigned i; uint8_t *high = tbl + 4 * 16; - uint16x8_t va0, va1, r0, r1; uint8x16_t loset, rl, rh; uint8x16x2_t va; +#ifdef ARCH_AARCH64 uint8x16_t tbl_h[4], tbl_l[4]; for (i = 0; i < 4; i++) { tbl_l[i] = vld1q_u8(tbl + i*16); tbl_h[i] = vld1q_u8(high + i*16); } +#else + uint8x8x2_t tbl_h[4], tbl_l[4]; + for (i = 0; i < 4; i++) { + tbl_l[i].val[0] = vld1_u8(tbl + i*16); + tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8); + tbl_h[i].val[0] = vld1_u8(high + i*16); + tbl_h[i].val[1] = vld1_u8(high + i*16 + 8); + } +#endif loset = vdupq_n_u8(0xf); - while (dst < d_end) { - va0 = vld1q_u16(src); - va1 = vld1q_u16(src + 8); - - va = vtrnq_u8(vreinterpretq_u8_u16(va0), vreinterpretq_u8_u16(va1)); + if (xor) { + uint8x16x2_t vb; + while (dst < d_end) { + va = vld2q_u8((uint8_t*)src); + vb = vld2q_u8((uint8_t*)dst); rl = vqtbl1q_u8(tbl_l[0], vandq_u8(va.val[0], loset)); rh = vqtbl1q_u8(tbl_h[0], vandq_u8(va.val[0], loset)); @@ -84,24 +97,38 @@ neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst, rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[1], va.val[0])); rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[1], va.val[0])); - rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], va.val[1])); - rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], va.val[1])); + va.val[0] = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], va.val[1])); + va.val[1] = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], 
va.val[1])); - va = vtrnq_u8(rl, rh); - r0 = vreinterpretq_u16_u8(va.val[0]); - r1 = vreinterpretq_u16_u8(va.val[1]); + va.val[0] = veorq_u8(va.val[0], vb.val[0]); + va.val[1] = veorq_u8(va.val[1], vb.val[1]); + vst2q_u8((uint8_t*)dst, va); - if (xor) { - va0 = vld1q_u16(dst); - va1 = vld1q_u16(dst + 8); - r0 = veorq_u16(r0, va0); - r1 = veorq_u16(r1, va1); - } - vst1q_u16(dst, r0); - vst1q_u16(dst + 8, r1); + src += 16; + dst += 16; + } + } else { + while (dst < d_end) { + va = vld2q_u8((uint8_t*)src); + + rl = vqtbl1q_u8(tbl_l[0], vandq_u8(va.val[0], loset)); + rh = vqtbl1q_u8(tbl_h[0], vandq_u8(va.val[0], loset)); + rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[2], vandq_u8(va.val[1], loset))); + rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[2], vandq_u8(va.val[1], loset))); + + va.val[0] = vshrq_n_u8(va.val[0], 4); + va.val[1] = vshrq_n_u8(va.val[1], 4); + + rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[1], va.val[0])); + rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[1], va.val[0])); + va.val[0] = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], va.val[1])); + va.val[1] = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], va.val[1])); + + vst2q_u8((uint8_t*)dst, va); src += 16; dst += 16; + } } } @@ -118,10 +145,21 @@ neon_w16_split_4_altmap_multiply_region(gf_t *gf, uint8_t *src, uint8x16_t vh, vl, rh, rl; uint8x16_t loset; +#ifdef ARCH_AARCH64 uint8x16_t tbl_h[4], tbl_l[4]; +#else + uint8x8x2_t tbl_h[4], tbl_l[4]; +#endif for (i = 0; i < 4; i++) { +#ifdef ARCH_AARCH64 tbl_l[i] = vld1q_u8(tbl + i*16); tbl_h[i] = vld1q_u8(high + i*16); +#else + tbl_l[i].val[0] = vld1_u8(tbl + i*16); + tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8); + tbl_h[i].val[0] = vld1_u8(high + i*16); + tbl_h[i].val[1] = vld1_u8(high + i*16 + 8); +#endif } loset = vdupq_n_u8(0xf); @@ -157,125 +195,7 @@ neon_w16_split_4_altmap_multiply_region(gf_t *gf, uint8_t *src, } } -#else /* ARCH_AARCH64 */ - -static -inline -void -neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst, - uint16_t *d_end, uint8_t *tbl, - gf_val_32_t val, int xor) -{ - unsigned i; - 
uint8_t *high = tbl + 4 * 16; - uint16x8_t va, r; - uint8x8_t loset, vb, vc, rl, rh; - uint8x8x2_t tbl_h[4], tbl_l[4]; - for (i = 0; i < 4; i++) { - tbl_l[i].val[0] = vld1_u8(tbl + i*16); - tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8); - tbl_h[i].val[0] = vld1_u8(high + i*16); - tbl_h[i].val[1] = vld1_u8(high + i*16 + 8); - } - - loset = vdup_n_u8(0xf); - - while (dst < d_end) { - va = vld1q_u16(src); - - vb = vmovn_u16(va); - vc = vshrn_n_u16(va, 8); - - rl = vtbl2_u8(tbl_l[0], vand_u8(vb, loset)); - rh = vtbl2_u8(tbl_h[0], vand_u8(vb, loset)); - vb = vshr_n_u8(vb, 4); - rl = veor_u8(rl, vtbl2_u8(tbl_l[2], vand_u8(vc, loset))); - rh = veor_u8(rh, vtbl2_u8(tbl_h[2], vand_u8(vc, loset))); - vc = vshr_n_u8(vc, 4); - rl = veor_u8(rl, vtbl2_u8(tbl_l[1], vb)); - rh = veor_u8(rh, vtbl2_u8(tbl_h[1], vb)); - rl = veor_u8(rl, vtbl2_u8(tbl_l[3], vc)); - rh = veor_u8(rh, vtbl2_u8(tbl_h[3], vc)); - - r = vmovl_u8(rl); - r = vorrq_u16(r, vshll_n_u8(rh, 8)); - - if (xor) { - va = vld1q_u16(dst); - r = veorq_u16(r, va); - } - vst1q_u16(dst, r); - - src += 8; - dst += 8; - } -} - -static -inline -void -neon_w16_split_4_altmap_multiply_region(gf_t *gf, uint8_t *src, - uint8_t *dst, uint8_t *d_end, - uint8_t *tbl, gf_val_32_t val, - int xor) -{ - unsigned i; - uint8_t *high = tbl + 4 * 16; - uint8x8_t vh0, vh1, vl0, vl1, r0, r1, r2, r3; - uint8x8_t loset; - - uint8x8x2_t tbl_h[4], tbl_l[4]; - for (i = 0; i < 4; i++) { - tbl_l[i].val[0] = vld1_u8(tbl + i*16); - tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8); - tbl_h[i].val[0] = vld1_u8(high + i*16); - tbl_h[i].val[1] = vld1_u8(high + i*16 + 8); - } - - loset = vdup_n_u8(0xf); - - while (dst < d_end) { - vh0 = vld1_u8(src); - vh1 = vld1_u8(src + 8); - vl0 = vld1_u8(src + 16); - vl1 = vld1_u8(src + 24); - - r0 = vtbl2_u8(tbl_l[0], vand_u8(vh0, loset)); - r1 = vtbl2_u8(tbl_h[0], vand_u8(vh1, loset)); - r2 = vtbl2_u8(tbl_l[2], vand_u8(vl0, loset)); - r3 = vtbl2_u8(tbl_h[2], vand_u8(vl1, loset)); - - vh0 = vshr_n_u8(vh0, 4); - vh1 = 
vshr_n_u8(vh1, 4); - vl0 = vshr_n_u8(vl0, 4); - vl1 = vshr_n_u8(vl1, 4); - - r0 = veor_u8(r0, vtbl2_u8(tbl_l[1], vh0)); - r1 = veor_u8(r1, vtbl2_u8(tbl_h[1], vh1)); - r2 = veor_u8(r2, vtbl2_u8(tbl_l[3], vl0)); - r3 = veor_u8(r3, vtbl2_u8(tbl_h[3], vl1)); - - if (xor) { - vh0 = vld1_u8(dst); - vh1 = vld1_u8(dst + 8); - vl0 = vld1_u8(dst + 16); - vl1 = vld1_u8(dst + 24); - r0 = veor_u8(r0, vh0); - r1 = veor_u8(r1, vh1); - r2 = veor_u8(r2, vl0); - r3 = veor_u8(r3, vl1); - } - vst1_u8(dst, r0); - vst1_u8(dst + 8, r1); - vst1_u8(dst + 16, r2); - vst1_u8(dst + 24, r3); - - src += 32; - dst += 32; - } -} -#endif /* ARCH_AARCH64 */ static inline |