author     Loic Dachary <loic-test4@dachary.org>  2016-09-13 10:34:23 +0000
committer  Loic Dachary <loic-test4@dachary.org>  2016-09-13 10:34:23 +0000
commit     51a1abb9185ec6ea35817620d13322047f4fde4d (patch)
tree       d2fbe709772cbfe56f05aec396e42b9dab6f459c
parent     8fe7382e2a1f7763be8b12db283cc1570eb64518 (diff)
parent     643743d0482ca09a9dfa57beed196f172a22a78e (diff)
download   gf-complete-51a1abb9185ec6ea35817620d13322047f4fde4d.tar.gz
Merge branch 'neon_fixes' into 'master'
NEON fixes/tweaks

This merge request fixes some issues and adds some tweaks to the NEON code:

* The SPLIT(16,4) ALTMAP implementation was broken, as it only processed half the amount of data. As such, the fixed implementation is significantly slower than the old code (which is to be expected). Fixes #2
* The SPLIT(16,4) implementations now merge the ARMv8 and older code paths, similar to SPLIT(32,4). This fixes the ALTMAP variant and also lets the non-ALTMAP version use consistent region sizing.
* Removed an unnecessary VTRN in non-ALTMAP SPLIT(16,4), since NEON can (de)interleave during load/store; because of this, ALTMAP isn't so useful on NEON (see the sketch after this description).
  * The same could be done for SPLIT(32,4), but I have not implemented it.
* I also pulled the `if (xor)` conditional of non-ALTMAP SPLIT(16,4) outside the loop. It seems to improve performance a bit on my Cortex-A7.
  * It should probably be done everywhere else, but I have not done this.
* CARRY_FREE was incorrectly enabled for all sizes of w, when it is only available for w=4 and w=8.

See merge request !16
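For context on the two performance points above, here is a minimal, illustrative sketch (not part of the patch; `region_multiply_sketch` and `multiply_bytes` are hypothetical stand-ins for the real table-lookup kernel in `gf_w16_neon.c`). It shows how `vld2q_u8`/`vst2q_u8` (de)interleave the low and high bytes of each 16-bit word during the memory access itself, so the explicit `vtrnq_u8` transpose can be dropped, and how the `if (xor)` test can be hoisted out of the inner loop:

```c
#include <arm_neon.h>
#include <stdint.h>

/* Stand-in for the per-nibble vqtbl1q_u8/veorq_u8 table lookups in
 * gf_w16_neon.c; the real kernel multiplies each 16-bit word by val. */
static inline uint8x16x2_t multiply_bytes(uint8x16x2_t va)
{
    return va; /* placeholder */
}

static void region_multiply_sketch(uint16_t *src, uint16_t *dst,
                                   uint16_t *d_end, int xor)
{
    if (xor) {            /* xor test hoisted out of the hot loop */
        while (dst < d_end) {
            /* vld2q_u8 de-interleaves on load: on a little-endian target,
             * val[0] receives the low bytes and val[1] the high bytes of 16
             * consecutive 16-bit words, so no vtrnq_u8 transpose is needed. */
            uint8x16x2_t va = vld2q_u8((uint8_t *)src);
            uint8x16x2_t vb = vld2q_u8((uint8_t *)dst);
            va = multiply_bytes(va);
            va.val[0] = veorq_u8(va.val[0], vb.val[0]);
            va.val[1] = veorq_u8(va.val[1], vb.val[1]);
            vst2q_u8((uint8_t *)dst, va); /* re-interleaves on store */
            src += 16;
            dst += 16;
        }
    } else {
        while (dst < d_end) {
            uint8x16x2_t va = vld2q_u8((uint8_t *)src);
            va = multiply_bytes(va);
            vst2q_u8((uint8_t *)dst, va);
            src += 16;
            dst += 16;
        }
    }
}
```

Splitting the loop this way trades a little code size for removing a branch from the hot path, which is what the patch below does for the non-ALTMAP SPLIT(16,4) region function.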
-rw-r--r--  src/gf.c                  2
-rw-r--r--  src/neon/gf_w16_neon.c  196
2 files changed, 59 insertions, 139 deletions
diff --git a/src/gf.c b/src/gf.c
index 835fb12..b9caa26 100644
--- a/src/gf.c
+++ b/src/gf.c
@@ -219,7 +219,7 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
#endif
#ifdef ARM_NEON
- pclmul = 1;
+ pclmul = (w == 4 || w == 8);
sse3 = 1;
#endif
diff --git a/src/neon/gf_w16_neon.c b/src/neon/gf_w16_neon.c
index 95bfd80..2bd3f30 100644
--- a/src/neon/gf_w16_neon.c
+++ b/src/neon/gf_w16_neon.c
@@ -46,7 +46,11 @@
#include <stdlib.h>
#include "gf_w16.h"
-#ifdef ARCH_AARCH64
+#ifndef ARCH_AARCH64
+#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)), \
+ vtbl2_u8(tbl, vget_high_u8(v)))
+#endif
+
static
inline
void
@@ -56,23 +60,32 @@ neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst,
{
unsigned i;
uint8_t *high = tbl + 4 * 16;
- uint16x8_t va0, va1, r0, r1;
uint8x16_t loset, rl, rh;
uint8x16x2_t va;
+#ifdef ARCH_AARCH64
uint8x16_t tbl_h[4], tbl_l[4];
for (i = 0; i < 4; i++) {
tbl_l[i] = vld1q_u8(tbl + i*16);
tbl_h[i] = vld1q_u8(high + i*16);
}
+#else
+ uint8x8x2_t tbl_h[4], tbl_l[4];
+ for (i = 0; i < 4; i++) {
+ tbl_l[i].val[0] = vld1_u8(tbl + i*16);
+ tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8);
+ tbl_h[i].val[0] = vld1_u8(high + i*16);
+ tbl_h[i].val[1] = vld1_u8(high + i*16 + 8);
+ }
+#endif
loset = vdupq_n_u8(0xf);
- while (dst < d_end) {
- va0 = vld1q_u16(src);
- va1 = vld1q_u16(src + 8);
-
- va = vtrnq_u8(vreinterpretq_u8_u16(va0), vreinterpretq_u8_u16(va1));
+ if (xor) {
+ uint8x16x2_t vb;
+ while (dst < d_end) {
+ va = vld2q_u8((uint8_t*)src);
+ vb = vld2q_u8((uint8_t*)dst);
rl = vqtbl1q_u8(tbl_l[0], vandq_u8(va.val[0], loset));
rh = vqtbl1q_u8(tbl_h[0], vandq_u8(va.val[0], loset));
@@ -84,24 +97,38 @@ neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst,
rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[1], va.val[0]));
rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[1], va.val[0]));
- rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], va.val[1]));
- rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], va.val[1]));
+ va.val[0] = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], va.val[1]));
+ va.val[1] = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], va.val[1]));
- va = vtrnq_u8(rl, rh);
- r0 = vreinterpretq_u16_u8(va.val[0]);
- r1 = vreinterpretq_u16_u8(va.val[1]);
+ va.val[0] = veorq_u8(va.val[0], vb.val[0]);
+ va.val[1] = veorq_u8(va.val[1], vb.val[1]);
+ vst2q_u8((uint8_t*)dst, va);
- if (xor) {
- va0 = vld1q_u16(dst);
- va1 = vld1q_u16(dst + 8);
- r0 = veorq_u16(r0, va0);
- r1 = veorq_u16(r1, va1);
- }
- vst1q_u16(dst, r0);
- vst1q_u16(dst + 8, r1);
+ src += 16;
+ dst += 16;
+ }
+ } else {
+ while (dst < d_end) {
+ va = vld2q_u8((uint8_t*)src);
+
+ rl = vqtbl1q_u8(tbl_l[0], vandq_u8(va.val[0], loset));
+ rh = vqtbl1q_u8(tbl_h[0], vandq_u8(va.val[0], loset));
+ rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[2], vandq_u8(va.val[1], loset)));
+ rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[2], vandq_u8(va.val[1], loset)));
+
+ va.val[0] = vshrq_n_u8(va.val[0], 4);
+ va.val[1] = vshrq_n_u8(va.val[1], 4);
+
+ rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[1], va.val[0]));
+ rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[1], va.val[0]));
+ va.val[0] = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], va.val[1]));
+ va.val[1] = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], va.val[1]));
+
+ vst2q_u8((uint8_t*)dst, va);
src += 16;
dst += 16;
+ }
}
}
@@ -118,10 +145,21 @@ neon_w16_split_4_altmap_multiply_region(gf_t *gf, uint8_t *src,
uint8x16_t vh, vl, rh, rl;
uint8x16_t loset;
+#ifdef ARCH_AARCH64
uint8x16_t tbl_h[4], tbl_l[4];
+#else
+ uint8x8x2_t tbl_h[4], tbl_l[4];
+#endif
for (i = 0; i < 4; i++) {
+#ifdef ARCH_AARCH64
tbl_l[i] = vld1q_u8(tbl + i*16);
tbl_h[i] = vld1q_u8(high + i*16);
+#else
+ tbl_l[i].val[0] = vld1_u8(tbl + i*16);
+ tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8);
+ tbl_h[i].val[0] = vld1_u8(high + i*16);
+ tbl_h[i].val[1] = vld1_u8(high + i*16 + 8);
+#endif
}
loset = vdupq_n_u8(0xf);
@@ -157,125 +195,7 @@ neon_w16_split_4_altmap_multiply_region(gf_t *gf, uint8_t *src,
}
}
-#else /* ARCH_AARCH64 */
-
-static
-inline
-void
-neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst,
- uint16_t *d_end, uint8_t *tbl,
- gf_val_32_t val, int xor)
-{
- unsigned i;
- uint8_t *high = tbl + 4 * 16;
- uint16x8_t va, r;
- uint8x8_t loset, vb, vc, rl, rh;
- uint8x8x2_t tbl_h[4], tbl_l[4];
- for (i = 0; i < 4; i++) {
- tbl_l[i].val[0] = vld1_u8(tbl + i*16);
- tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8);
- tbl_h[i].val[0] = vld1_u8(high + i*16);
- tbl_h[i].val[1] = vld1_u8(high + i*16 + 8);
- }
-
- loset = vdup_n_u8(0xf);
-
- while (dst < d_end) {
- va = vld1q_u16(src);
-
- vb = vmovn_u16(va);
- vc = vshrn_n_u16(va, 8);
-
- rl = vtbl2_u8(tbl_l[0], vand_u8(vb, loset));
- rh = vtbl2_u8(tbl_h[0], vand_u8(vb, loset));
- vb = vshr_n_u8(vb, 4);
- rl = veor_u8(rl, vtbl2_u8(tbl_l[2], vand_u8(vc, loset)));
- rh = veor_u8(rh, vtbl2_u8(tbl_h[2], vand_u8(vc, loset)));
- vc = vshr_n_u8(vc, 4);
- rl = veor_u8(rl, vtbl2_u8(tbl_l[1], vb));
- rh = veor_u8(rh, vtbl2_u8(tbl_h[1], vb));
- rl = veor_u8(rl, vtbl2_u8(tbl_l[3], vc));
- rh = veor_u8(rh, vtbl2_u8(tbl_h[3], vc));
-
- r = vmovl_u8(rl);
- r = vorrq_u16(r, vshll_n_u8(rh, 8));
-
- if (xor) {
- va = vld1q_u16(dst);
- r = veorq_u16(r, va);
- }
- vst1q_u16(dst, r);
-
- src += 8;
- dst += 8;
- }
-}
-
-static
-inline
-void
-neon_w16_split_4_altmap_multiply_region(gf_t *gf, uint8_t *src,
- uint8_t *dst, uint8_t *d_end,
- uint8_t *tbl, gf_val_32_t val,
- int xor)
-{
- unsigned i;
- uint8_t *high = tbl + 4 * 16;
- uint8x8_t vh0, vh1, vl0, vl1, r0, r1, r2, r3;
- uint8x8_t loset;
-
- uint8x8x2_t tbl_h[4], tbl_l[4];
- for (i = 0; i < 4; i++) {
- tbl_l[i].val[0] = vld1_u8(tbl + i*16);
- tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8);
- tbl_h[i].val[0] = vld1_u8(high + i*16);
- tbl_h[i].val[1] = vld1_u8(high + i*16 + 8);
- }
-
- loset = vdup_n_u8(0xf);
-
- while (dst < d_end) {
- vh0 = vld1_u8(src);
- vh1 = vld1_u8(src + 8);
- vl0 = vld1_u8(src + 16);
- vl1 = vld1_u8(src + 24);
-
- r0 = vtbl2_u8(tbl_l[0], vand_u8(vh0, loset));
- r1 = vtbl2_u8(tbl_h[0], vand_u8(vh1, loset));
- r2 = vtbl2_u8(tbl_l[2], vand_u8(vl0, loset));
- r3 = vtbl2_u8(tbl_h[2], vand_u8(vl1, loset));
-
- vh0 = vshr_n_u8(vh0, 4);
- vh1 = vshr_n_u8(vh1, 4);
- vl0 = vshr_n_u8(vl0, 4);
- vl1 = vshr_n_u8(vl1, 4);
-
- r0 = veor_u8(r0, vtbl2_u8(tbl_l[1], vh0));
- r1 = veor_u8(r1, vtbl2_u8(tbl_h[1], vh1));
- r2 = veor_u8(r2, vtbl2_u8(tbl_l[3], vl0));
- r3 = veor_u8(r3, vtbl2_u8(tbl_h[3], vl1));
-
- if (xor) {
- vh0 = vld1_u8(dst);
- vh1 = vld1_u8(dst + 8);
- vl0 = vld1_u8(dst + 16);
- vl1 = vld1_u8(dst + 24);
- r0 = veor_u8(r0, vh0);
- r1 = veor_u8(r1, vh1);
- r2 = veor_u8(r2, vl0);
- r3 = veor_u8(r3, vl1);
- }
- vst1_u8(dst, r0);
- vst1_u8(dst + 8, r1);
- vst1_u8(dst + 16, r2);
- vst1_u8(dst + 24, r3);
-
- src += 32;
- dst += 32;
- }
-}
-#endif /* ARCH_AARCH64 */
static
inline