diff options
author | animetosho <animetosho@users.noreply.github.com> | 2015-11-12 22:17:53 +1000 |
---|---|---|
committer | animetosho <animetosho@users.noreply.github.com> | 2015-11-12 22:17:53 +1000 |
commit | 05057e5635e5ef7fb5be3156f477866cce98fbdb (patch) | |
tree | 4b5ded8c32a40cb75484d8b93d8bca7e59c3637b | |
parent | 438283c12d8378c449b78e011cb4f9c0ff33dcc3 (diff) | |
download | gf-complete-05057e5635e5ef7fb5be3156f477866cce98fbdb.tar.gz |
Eliminate unnecessary VTRNs in SPLIT(16,4) NEON implementation
Also makes the ARMv8 version consistent with the older one, in terms of processing width
-rw-r--r-- | src/neon/gf_w16_neon.c | 101 |
1 file changed, 22 insertions, 79 deletions
diff --git a/src/neon/gf_w16_neon.c b/src/neon/gf_w16_neon.c index 346d993..fb637ca 100644 --- a/src/neon/gf_w16_neon.c +++ b/src/neon/gf_w16_neon.c @@ -46,7 +46,11 @@ #include <stdlib.h> #include "gf_w16.h" -#ifdef ARCH_AARCH64 +#ifndef ARCH_AARCH64 +#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)), \ + vtbl2_u8(tbl, vget_high_u8(v))) +#endif + static inline void @@ -56,23 +60,29 @@ neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst, { unsigned i; uint8_t *high = tbl + 4 * 16; - uint16x8_t va0, va1, r0, r1; uint8x16_t loset, rl, rh; uint8x16x2_t va; +#ifdef ARCH_AARCH64 uint8x16_t tbl_h[4], tbl_l[4]; for (i = 0; i < 4; i++) { tbl_l[i] = vld1q_u8(tbl + i*16); tbl_h[i] = vld1q_u8(high + i*16); } +#else + uint8x8x2_t tbl_h[4], tbl_l[4]; + for (i = 0; i < 4; i++) { + tbl_l[i].val[0] = vld1_u8(tbl + i*16); + tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8); + tbl_h[i].val[0] = vld1_u8(high + i*16); + tbl_h[i].val[1] = vld1_u8(high + i*16 + 8); + } +#endif loset = vdupq_n_u8(0xf); while (dst < d_end) { - va0 = vld1q_u16(src); - va1 = vld1q_u16(src + 8); - - va = vtrnq_u8(vreinterpretq_u8_u16(va0), vreinterpretq_u8_u16(va1)); + va = vld2q_u8((uint8_t*)src); rl = vqtbl1q_u8(tbl_l[0], vandq_u8(va.val[0], loset)); rh = vqtbl1q_u8(tbl_h[0], vandq_u8(va.val[0], loset)); @@ -84,88 +94,21 @@ neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst, rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[1], va.val[0])); rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[1], va.val[0])); - rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], va.val[1])); - rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], va.val[1])); - - va = vtrnq_u8(rl, rh); - r0 = vreinterpretq_u16_u8(va.val[0]); - r1 = vreinterpretq_u16_u8(va.val[1]); + va.val[0] = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], va.val[1])); + va.val[1] = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], va.val[1])); if (xor) { - va0 = vld1q_u16(dst); - va1 = vld1q_u16(dst + 8); - r0 = veorq_u16(r0, va0); - r1 = veorq_u16(r1, va1); + uint8x16x2_t vb = vld2q_u8((uint8_t*)dst); + va.val[0] = veorq_u8(va.val[0], vb.val[0]); + va.val[1] = veorq_u8(va.val[1], vb.val[1]); } - vst1q_u16(dst, r0); - vst1q_u16(dst + 8, r1); + vst2q_u8((uint8_t*)dst, va); src += 16; dst += 16; } } -#else /* ARCH_AARCH64 */ - -static -inline -void -neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst, - uint16_t *d_end, uint8_t *tbl, - gf_val_32_t val, int xor) -{ - unsigned i; - uint8_t *high = tbl + 4 * 16; - uint16x8_t va, r; - uint8x8_t loset, vb, vc, rl, rh; - - uint8x8x2_t tbl_h[4], tbl_l[4]; - for (i = 0; i < 4; i++) { - tbl_l[i].val[0] = vld1_u8(tbl + i*16); - tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8); - tbl_h[i].val[0] = vld1_u8(high + i*16); - tbl_h[i].val[1] = vld1_u8(high + i*16 + 8); - } - - loset = vdup_n_u8(0xf); - - while (dst < d_end) { - va = vld1q_u16(src); - - vb = vmovn_u16(va); - vc = vshrn_n_u16(va, 8); - - rl = vtbl2_u8(tbl_l[0], vand_u8(vb, loset)); - rh = vtbl2_u8(tbl_h[0], vand_u8(vb, loset)); - vb = vshr_n_u8(vb, 4); - rl = veor_u8(rl, vtbl2_u8(tbl_l[2], vand_u8(vc, loset))); - rh = veor_u8(rh, vtbl2_u8(tbl_h[2], vand_u8(vc, loset))); - vc = vshr_n_u8(vc, 4); - rl = veor_u8(rl, vtbl2_u8(tbl_l[1], vb)); - rh = veor_u8(rh, vtbl2_u8(tbl_h[1], vb)); - rl = veor_u8(rl, vtbl2_u8(tbl_l[3], vc)); - rh = veor_u8(rh, vtbl2_u8(tbl_h[3], vc)); - - r = vmovl_u8(rl); - r = vorrq_u16(r, vshll_n_u8(rh, 8)); - - if (xor) { - va = vld1q_u16(dst); - r = veorq_u16(r, va); - } - vst1q_u16(dst, r); - - src += 8; - dst += 8; - } -} -#endif /* ARCH_AARCH64 */ - -#ifndef ARCH_AARCH64 -#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)), \ - vtbl2_u8(tbl, vget_high_u8(v))) -#endif - static inline void |