path: root/src/neon/gf_w16_neon.c
Diffstat (limited to 'src/neon/gf_w16_neon.c')
-rw-r--r--  src/neon/gf_w16_neon.c  196
1 file changed, 58 insertions(+), 138 deletions(-)
diff --git a/src/neon/gf_w16_neon.c b/src/neon/gf_w16_neon.c
index 95bfd80..2bd3f30 100644
--- a/src/neon/gf_w16_neon.c
+++ b/src/neon/gf_w16_neon.c
@@ -46,7 +46,11 @@
#include <stdlib.h>
#include "gf_w16.h"
-#ifdef ARCH_AARCH64
+#ifndef ARCH_AARCH64
+#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)), \
+ vtbl2_u8(tbl, vget_high_u8(v)))
+#endif
+
static
inline
void
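On AArch64, vqtbl1q_u8 looks up 16 byte-indices in a 16-byte table in a single instruction; 32-bit NEON only has the 8-byte vtbl family, so the macro added above emulates it with two vtbl2_u8 lookups over the halves of the index vector. A minimal standalone sketch of the same idea (the helper name is hypothetical):

    #include <arm_neon.h>

    /* ARMv7 fallback: look up 16 indices in a 16-byte table carried as
     * a uint8x8x2_t pair, half the index vector at a time. */
    static inline uint8x16_t lookup16(uint8x8x2_t tbl, uint8x16_t idx)
    {
        return vcombine_u8(vtbl2_u8(tbl, vget_low_u8(idx)),
                           vtbl2_u8(tbl, vget_high_u8(idx)));
    }

Note that the macro only type-checks because the tables are declared as uint8x8x2_t on the 32-bit path and uint8x16_t on AArch64, which the #ifdef blocks in the hunks below arrange.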
@@ -56,23 +60,32 @@ neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst,
{
unsigned i;
uint8_t *high = tbl + 4 * 16;
- uint16x8_t va0, va1, r0, r1;
uint8x16_t loset, rl, rh;
uint8x16x2_t va;
+#ifdef ARCH_AARCH64
uint8x16_t tbl_h[4], tbl_l[4];
for (i = 0; i < 4; i++) {
tbl_l[i] = vld1q_u8(tbl + i*16);
tbl_h[i] = vld1q_u8(high + i*16);
}
+#else
+ uint8x8x2_t tbl_h[4], tbl_l[4];
+ for (i = 0; i < 4; i++) {
+ tbl_l[i].val[0] = vld1_u8(tbl + i*16);
+ tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8);
+ tbl_h[i].val[0] = vld1_u8(high + i*16);
+ tbl_h[i].val[1] = vld1_u8(high + i*16 + 8);
+ }
+#endif
loset = vdupq_n_u8(0xf);
- while (dst < d_end) {
- va0 = vld1q_u16(src);
- va1 = vld1q_u16(src + 8);
-
- va = vtrnq_u8(vreinterpretq_u8_u16(va0), vreinterpretq_u8_u16(va1));
+ if (xor) {
+ uint8x16x2_t vb;
+ while (dst < d_end) {
+ va = vld2q_u8((uint8_t*)src);
+ vb = vld2q_u8((uint8_t*)dst);
rl = vqtbl1q_u8(tbl_l[0], vandq_u8(va.val[0], loset));
rh = vqtbl1q_u8(tbl_h[0], vandq_u8(va.val[0], loset));
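The old loop loaded two uint16x8_t vectors and separated low from high bytes with a vtrnq_u8 transpose; vld2q_u8 performs that split as part of the load, leaving the 16 low bytes of 16 little-endian words in val[0] and the 16 high bytes in val[1], with vst2q_u8 re-interleaving on the way out. A minimal sketch of the trick, independent of this file (helper name hypothetical):

    #include <arm_neon.h>

    /* De-interleave 16 little-endian uint16_t words into their low and
     * high bytes with one structured load. */
    static void split_words(const uint16_t *src, uint8x16_t *lo, uint8x16_t *hi)
    {
        uint8x16x2_t v = vld2q_u8((const uint8_t *)src);
        *lo = v.val[0];   /* bytes 0, 2, 4, ... : low halves  */
        *hi = v.val[1];   /* bytes 1, 3, 5, ... : high halves */
    }

Hoisting the xor test out of the loop, as the change above does, also lets each path keep its loads and stores unconditional.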
@@ -84,24 +97,38 @@ neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst,
rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[1], va.val[0]));
rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[1], va.val[0]));
- rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], va.val[1]));
- rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], va.val[1]));
+ va.val[0] = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], va.val[1]));
+ va.val[1] = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], va.val[1]));
- va = vtrnq_u8(rl, rh);
- r0 = vreinterpretq_u16_u8(va.val[0]);
- r1 = vreinterpretq_u16_u8(va.val[1]);
+ va.val[0] = veorq_u8(va.val[0], vb.val[0]);
+ va.val[1] = veorq_u8(va.val[1], vb.val[1]);
+ vst2q_u8((uint8_t*)dst, va);
- if (xor) {
- va0 = vld1q_u16(dst);
- va1 = vld1q_u16(dst + 8);
- r0 = veorq_u16(r0, va0);
- r1 = veorq_u16(r1, va1);
- }
- vst1q_u16(dst, r0);
- vst1q_u16(dst + 8, r1);
+ src += 16;
+ dst += 16;
+ }
+ } else {
+ while (dst < d_end) {
+ va = vld2q_u8((uint8_t*)src);
+
+ rl = vqtbl1q_u8(tbl_l[0], vandq_u8(va.val[0], loset));
+ rh = vqtbl1q_u8(tbl_h[0], vandq_u8(va.val[0], loset));
+ rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[2], vandq_u8(va.val[1], loset)));
+ rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[2], vandq_u8(va.val[1], loset)));
+
+ va.val[0] = vshrq_n_u8(va.val[0], 4);
+ va.val[1] = vshrq_n_u8(va.val[1], 4);
+
+ rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[1], va.val[0]));
+ rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[1], va.val[0]));
+ va.val[0] = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], va.val[1]));
+ va.val[1] = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], va.val[1]));
+
+ vst2q_u8((uint8_t*)dst, va);
src += 16;
dst += 16;
+ }
}
}
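For reference, a scalar model of the nibble-split multiplication that the loop above vectorises; the table layout (four 16-entry low-byte tables at tbl, the matching high-byte tables at tbl + 64) is assumed from the surrounding code, and the helper is hypothetical. Each 16-bit word contributes four 4-bit indices, and the low and high bytes of the product are each the XOR of four table entries:

    #include <stdint.h>

    static uint16_t split4_mul(const uint8_t *tbl, uint16_t a)
    {
        const uint8_t *high = tbl + 4 * 16;
        uint8_t lo = 0, hi = 0;
        for (int n = 0; n < 4; n++) {            /* one table per nibble */
            unsigned idx = (a >> (4 * n)) & 0xf;
            lo ^= tbl[n * 16 + idx];
            hi ^= high[n * 16 + idx];
        }
        return (uint16_t)(lo | (hi << 8));
    }

In the NEON version, the xor variant additionally folds the previous destination contents in with veorq_u8 before the interleaving store.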
@@ -118,10 +145,21 @@ neon_w16_split_4_altmap_multiply_region(gf_t *gf, uint8_t *src,
uint8x16_t vh, vl, rh, rl;
uint8x16_t loset;
+#ifdef ARCH_AARCH64
uint8x16_t tbl_h[4], tbl_l[4];
+#else
+ uint8x8x2_t tbl_h[4], tbl_l[4];
+#endif
for (i = 0; i < 4; i++) {
+#ifdef ARCH_AARCH64
tbl_l[i] = vld1q_u8(tbl + i*16);
tbl_h[i] = vld1q_u8(high + i*16);
+#else
+ tbl_l[i].val[0] = vld1_u8(tbl + i*16);
+ tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8);
+ tbl_h[i].val[0] = vld1_u8(high + i*16);
+ tbl_h[i].val[1] = vld1_u8(high + i*16 + 8);
+#endif
}
loset = vdupq_n_u8(0xf);
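The altmap variant assumes the region has already been converted so that each 32-byte block stores the 16 low bytes of 16 words followed by their 16 high bytes, which is why its inner loop needs no (de)interleaving. A hedged sketch of that conversion for one block, with the layout inferred from the table indexing in the removed 32-bit code below and a hypothetical helper name:

    #include <arm_neon.h>
    #include <stdint.h>

    static void to_altmap_block(const uint16_t *src, uint8_t *dst)
    {
        uint8x16x2_t v = vld2q_u8((const uint8_t *)src); /* split lo/hi */
        vst1q_u8(dst,      v.val[0]);   /* 16 low bytes, contiguous  */
        vst1q_u8(dst + 16, v.val[1]);   /* 16 high bytes, contiguous */
    }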
@@ -157,125 +195,7 @@ neon_w16_split_4_altmap_multiply_region(gf_t *gf, uint8_t *src,
}
}
-#else /* ARCH_AARCH64 */
-
-static
-inline
-void
-neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst,
- uint16_t *d_end, uint8_t *tbl,
- gf_val_32_t val, int xor)
-{
- unsigned i;
- uint8_t *high = tbl + 4 * 16;
- uint16x8_t va, r;
- uint8x8_t loset, vb, vc, rl, rh;
- uint8x8x2_t tbl_h[4], tbl_l[4];
- for (i = 0; i < 4; i++) {
- tbl_l[i].val[0] = vld1_u8(tbl + i*16);
- tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8);
- tbl_h[i].val[0] = vld1_u8(high + i*16);
- tbl_h[i].val[1] = vld1_u8(high + i*16 + 8);
- }
-
- loset = vdup_n_u8(0xf);
-
- while (dst < d_end) {
- va = vld1q_u16(src);
-
- vb = vmovn_u16(va);
- vc = vshrn_n_u16(va, 8);
-
- rl = vtbl2_u8(tbl_l[0], vand_u8(vb, loset));
- rh = vtbl2_u8(tbl_h[0], vand_u8(vb, loset));
- vb = vshr_n_u8(vb, 4);
- rl = veor_u8(rl, vtbl2_u8(tbl_l[2], vand_u8(vc, loset)));
- rh = veor_u8(rh, vtbl2_u8(tbl_h[2], vand_u8(vc, loset)));
- vc = vshr_n_u8(vc, 4);
- rl = veor_u8(rl, vtbl2_u8(tbl_l[1], vb));
- rh = veor_u8(rh, vtbl2_u8(tbl_h[1], vb));
- rl = veor_u8(rl, vtbl2_u8(tbl_l[3], vc));
- rh = veor_u8(rh, vtbl2_u8(tbl_h[3], vc));
-
- r = vmovl_u8(rl);
- r = vorrq_u16(r, vshll_n_u8(rh, 8));
-
- if (xor) {
- va = vld1q_u16(dst);
- r = veorq_u16(r, va);
- }
- vst1q_u16(dst, r);
-
- src += 8;
- dst += 8;
- }
-}
-
-static
-inline
-void
-neon_w16_split_4_altmap_multiply_region(gf_t *gf, uint8_t *src,
- uint8_t *dst, uint8_t *d_end,
- uint8_t *tbl, gf_val_32_t val,
- int xor)
-{
- unsigned i;
- uint8_t *high = tbl + 4 * 16;
- uint8x8_t vh0, vh1, vl0, vl1, r0, r1, r2, r3;
- uint8x8_t loset;
-
- uint8x8x2_t tbl_h[4], tbl_l[4];
- for (i = 0; i < 4; i++) {
- tbl_l[i].val[0] = vld1_u8(tbl + i*16);
- tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8);
- tbl_h[i].val[0] = vld1_u8(high + i*16);
- tbl_h[i].val[1] = vld1_u8(high + i*16 + 8);
- }
-
- loset = vdup_n_u8(0xf);
-
- while (dst < d_end) {
- vh0 = vld1_u8(src);
- vh1 = vld1_u8(src + 8);
- vl0 = vld1_u8(src + 16);
- vl1 = vld1_u8(src + 24);
-
- r0 = vtbl2_u8(tbl_l[0], vand_u8(vh0, loset));
- r1 = vtbl2_u8(tbl_h[0], vand_u8(vh1, loset));
- r2 = vtbl2_u8(tbl_l[2], vand_u8(vl0, loset));
- r3 = vtbl2_u8(tbl_h[2], vand_u8(vl1, loset));
-
- vh0 = vshr_n_u8(vh0, 4);
- vh1 = vshr_n_u8(vh1, 4);
- vl0 = vshr_n_u8(vl0, 4);
- vl1 = vshr_n_u8(vl1, 4);
-
- r0 = veor_u8(r0, vtbl2_u8(tbl_l[1], vh0));
- r1 = veor_u8(r1, vtbl2_u8(tbl_h[1], vh1));
- r2 = veor_u8(r2, vtbl2_u8(tbl_l[3], vl0));
- r3 = veor_u8(r3, vtbl2_u8(tbl_h[3], vl1));
-
- if (xor) {
- vh0 = vld1_u8(dst);
- vh1 = vld1_u8(dst + 8);
- vl0 = vld1_u8(dst + 16);
- vl1 = vld1_u8(dst + 24);
- r0 = veor_u8(r0, vh0);
- r1 = veor_u8(r1, vh1);
- r2 = veor_u8(r2, vl0);
- r3 = veor_u8(r3, vl1);
- }
- vst1_u8(dst, r0);
- vst1_u8(dst + 8, r1);
- vst1_u8(dst + 16, r2);
- vst1_u8(dst + 24, r3);
-
- src += 32;
- dst += 32;
- }
-}
-#endif /* ARCH_AARCH64 */
static
inline