summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authoranimetosho <animetosho@users.noreply.github.com>2015-11-12 22:17:53 +1000
committeranimetosho <animetosho@users.noreply.github.com>2015-11-12 22:17:53 +1000
commit05057e5635e5ef7fb5be3156f477866cce98fbdb (patch)
tree4b5ded8c32a40cb75484d8b93d8bca7e59c3637b
parent438283c12d8378c449b78e011cb4f9c0ff33dcc3 (diff)
downloadgf-complete-05057e5635e5ef7fb5be3156f477866cce98fbdb.tar.gz
Eliminate unnecessary VTRNs in SPLIT(16,4) NEON implementation
Also makes the ARMv8 version consistent with the older one, in terms of processing width
-rw-r--r--src/neon/gf_w16_neon.c101
1 file changed, 22 insertions, 79 deletions
diff --git a/src/neon/gf_w16_neon.c b/src/neon/gf_w16_neon.c
index 346d993..fb637ca 100644
--- a/src/neon/gf_w16_neon.c
+++ b/src/neon/gf_w16_neon.c
@@ -46,7 +46,11 @@
#include <stdlib.h>
#include "gf_w16.h"
-#ifdef ARCH_AARCH64
+#ifndef ARCH_AARCH64
+#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)), \
+ vtbl2_u8(tbl, vget_high_u8(v)))
+#endif
+
static
inline
void
@@ -56,23 +60,29 @@ neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst,
{
unsigned i;
uint8_t *high = tbl + 4 * 16;
- uint16x8_t va0, va1, r0, r1;
uint8x16_t loset, rl, rh;
uint8x16x2_t va;
+#ifdef ARCH_AARCH64
uint8x16_t tbl_h[4], tbl_l[4];
for (i = 0; i < 4; i++) {
tbl_l[i] = vld1q_u8(tbl + i*16);
tbl_h[i] = vld1q_u8(high + i*16);
}
+#else
+ uint8x8x2_t tbl_h[4], tbl_l[4];
+ for (i = 0; i < 4; i++) {
+ tbl_l[i].val[0] = vld1_u8(tbl + i*16);
+ tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8);
+ tbl_h[i].val[0] = vld1_u8(high + i*16);
+ tbl_h[i].val[1] = vld1_u8(high + i*16 + 8);
+ }
+#endif
loset = vdupq_n_u8(0xf);
while (dst < d_end) {
- va0 = vld1q_u16(src);
- va1 = vld1q_u16(src + 8);
-
- va = vtrnq_u8(vreinterpretq_u8_u16(va0), vreinterpretq_u8_u16(va1));
+ va = vld2q_u8((uint8_t*)src);
rl = vqtbl1q_u8(tbl_l[0], vandq_u8(va.val[0], loset));
rh = vqtbl1q_u8(tbl_h[0], vandq_u8(va.val[0], loset));
@@ -84,88 +94,21 @@ neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst,
rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[1], va.val[0]));
rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[1], va.val[0]));
- rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], va.val[1]));
- rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], va.val[1]));
-
- va = vtrnq_u8(rl, rh);
- r0 = vreinterpretq_u16_u8(va.val[0]);
- r1 = vreinterpretq_u16_u8(va.val[1]);
+ va.val[0] = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], va.val[1]));
+ va.val[1] = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], va.val[1]));
if (xor) {
- va0 = vld1q_u16(dst);
- va1 = vld1q_u16(dst + 8);
- r0 = veorq_u16(r0, va0);
- r1 = veorq_u16(r1, va1);
+ uint8x16x2_t vb = vld2q_u8((uint8_t*)dst);
+ va.val[0] = veorq_u8(va.val[0], vb.val[0]);
+ va.val[1] = veorq_u8(va.val[1], vb.val[1]);
}
- vst1q_u16(dst, r0);
- vst1q_u16(dst + 8, r1);
+ vst2q_u8((uint8_t*)dst, va);
src += 16;
dst += 16;
}
}
-#else /* ARCH_AARCH64 */
-
-static
-inline
-void
-neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst,
- uint16_t *d_end, uint8_t *tbl,
- gf_val_32_t val, int xor)
-{
- unsigned i;
- uint8_t *high = tbl + 4 * 16;
- uint16x8_t va, r;
- uint8x8_t loset, vb, vc, rl, rh;
-
- uint8x8x2_t tbl_h[4], tbl_l[4];
- for (i = 0; i < 4; i++) {
- tbl_l[i].val[0] = vld1_u8(tbl + i*16);
- tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8);
- tbl_h[i].val[0] = vld1_u8(high + i*16);
- tbl_h[i].val[1] = vld1_u8(high + i*16 + 8);
- }
-
- loset = vdup_n_u8(0xf);
-
- while (dst < d_end) {
- va = vld1q_u16(src);
-
- vb = vmovn_u16(va);
- vc = vshrn_n_u16(va, 8);
-
- rl = vtbl2_u8(tbl_l[0], vand_u8(vb, loset));
- rh = vtbl2_u8(tbl_h[0], vand_u8(vb, loset));
- vb = vshr_n_u8(vb, 4);
- rl = veor_u8(rl, vtbl2_u8(tbl_l[2], vand_u8(vc, loset)));
- rh = veor_u8(rh, vtbl2_u8(tbl_h[2], vand_u8(vc, loset)));
- vc = vshr_n_u8(vc, 4);
- rl = veor_u8(rl, vtbl2_u8(tbl_l[1], vb));
- rh = veor_u8(rh, vtbl2_u8(tbl_h[1], vb));
- rl = veor_u8(rl, vtbl2_u8(tbl_l[3], vc));
- rh = veor_u8(rh, vtbl2_u8(tbl_h[3], vc));
-
- r = vmovl_u8(rl);
- r = vorrq_u16(r, vshll_n_u8(rh, 8));
-
- if (xor) {
- va = vld1q_u16(dst);
- r = veorq_u16(r, va);
- }
- vst1q_u16(dst, r);
-
- src += 8;
- dst += 8;
- }
-}
-#endif /* ARCH_AARCH64 */
-
-#ifndef ARCH_AARCH64
-#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)), \
- vtbl2_u8(tbl, vget_high_u8(v)))
-#endif
-
static
inline
void