author     animetosho <animetosho@users.noreply.github.com>  2015-11-12 21:17:13 +1000
committer  animetosho <animetosho@users.noreply.github.com>  2015-11-12 21:17:13 +1000
commit     438283c12d8378c449b78e011cb4f9c0ff33dcc3 (patch)
tree       ef8f16c1c4632b3d53cbc84cf20ac67ab08dbc48
parent     f373b138aae6ee052ca711e90837ca11bbedd156 (diff)
download   gf-complete-438283c12d8378c449b78e011cb4f9c0ff33dcc3.tar.gz
Use a similar strategy for the SPLIT(16,4) ALTMAP NEON implementation as for SPLIT(32,4)
-rw-r--r--  src/neon/gf_w16_neon.c | 136
1 file changed, 41 insertions(+), 95 deletions(-)
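
This change collapses the two SPLIT(16,4) ALTMAP kernels into one, mirroring what was already done for SPLIT(32,4): the AArch64-only 128-bit version (first hunk below) is deleted, and the shared function is rewritten in the same 128-bit shape, with vqtbl1q_u8 mapped onto paired vtbl2_u8 calls when building for 32-bit ARM. As a reference for what the vector kernel computes, here is a minimal scalar sketch of the split-4 table multiply, assuming the layout implied by the vld1q_u8 calls (tbl holds four 16-entry low-byte tables, tbl + 64 the matching high-byte tables); the helper name is hypothetical:

#include <stdint.h>

/* One GF(2^16) product val * x, built from four 4-bit table lookups.
 * Assumed layout: tbl[i*16 + n]      = low  byte of val * (n << 4*i)
 *                 tbl[64 + i*16 + n] = high byte of the same product */
static uint16_t split4_mul_scalar(const uint8_t *tbl, uint16_t x)
{
    uint8_t lo = 0, hi = 0;
    for (int i = 0; i < 4; i++) {
        unsigned nib = (x >> (4 * i)) & 0xf;  /* i-th nibble of x */
        lo ^= tbl[i * 16 + nib];              /* XOR partial products */
        hi ^= tbl[64 + i * 16 + nib];
    }
    return (uint16_t)((hi << 8) | lo);
}

The NEON kernel does the same for 16 words at once; ALTMAP means each 32-byte block stores the 16 high bytes first and the 16 low bytes second, which is why rh and rl are loaded and stored as separate vectors.
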
diff --git a/src/neon/gf_w16_neon.c b/src/neon/gf_w16_neon.c
index a9fc8c4..346d993 100644
--- a/src/neon/gf_w16_neon.c
+++ b/src/neon/gf_w16_neon.c
@@ -105,58 +105,6 @@ neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst,
}
}
-static
-inline
-void
-neon_w16_split_4_altmap_multiply_region(gf_t *gf, uint8_t *src,
- uint8_t *dst, uint8_t *d_end,
- uint8_t *tbl, gf_val_32_t val,
- int xor)
-{
- unsigned i;
- uint8_t *high = tbl + 4 * 16;
- uint8x16_t vh, vl, rh, rl;
- uint8x16_t loset;
-
- uint8x16_t tbl_h[4], tbl_l[4];
- for (i = 0; i < 4; i++) {
- tbl_l[i] = vld1q_u8(tbl + i*16);
- tbl_h[i] = vld1q_u8(high + i*16);
- }
-
- loset = vdupq_n_u8(0xf);
-
- while (dst < d_end) {
- vh = vld1q_u8(src);
- vl = vld1q_u8(src + 16);
-
- rl = vqtbl1q_u8(tbl_l[0], vandq_u8(vl, loset));
- rh = vqtbl1q_u8(tbl_h[0], vandq_u8(vl, loset));
- rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[2], vandq_u8(vh, loset)));
- rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[2], vandq_u8(vh, loset)));
-
- vl = vshrq_n_u8(vl, 4);
- vh = vshrq_n_u8(vh, 4);
-
- rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[1], vl));
- rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[1], vl));
- rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], vh));
- rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], vh));
-
- if (xor) {
- vh = vld1q_u8(dst);
- vl = vld1q_u8(dst + 16);
- rh = veorq_u8(rh, vh);
- rl = veorq_u8(rl, vl);
- }
- vst1q_u8(dst, rh);
- vst1q_u8(dst + 16, rl);
-
- src += 32;
- dst += 32;
- }
-}
-
#else /* ARCH_AARCH64 */
static
@@ -211,6 +159,12 @@ neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst,
dst += 8;
}
}
+#endif /* ARCH_AARCH64 */
+
+#ifndef ARCH_AARCH64
+#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)), \
+ vtbl2_u8(tbl, vget_high_u8(v)))
+#endif
static
inline
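
The fallback macro above is what lets one body of code serve both builds: AArch64's vqtbl1q_u8 looks up all 16 lanes of a 128-bit index vector in a 16-byte table, while 32-bit NEON's vtbl2_u8 performs the same lookup eight lanes at a time from a pair of 64-bit table registers, so splitting the indices with vget_low_u8/vget_high_u8 and rejoining the results with vcombine_u8 reproduces it exactly. A scalar model of the shared semantics (the kernel masks every index to 0..15, so the out-of-range case never fires here):

#include <stdint.h>

/* Per-lane byte-table lookup; out-of-range indices yield zero on
 * both vqtbl1q_u8 and vtbl2_u8, so the emulation is exact. */
static void tbl16_model(uint8_t out[16], const uint8_t table[16],
                        const uint8_t idx[16])
{
    for (int i = 0; i < 16; i++)
        out[i] = (idx[i] < 16) ? table[idx[i]] : 0;
}
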
@@ -222,68 +176,60 @@ neon_w16_split_4_altmap_multiply_region(gf_t *gf, uint8_t *src,
{
unsigned i;
uint8_t *high = tbl + 4 * 16;
- uint8x8_t vh0, vh1, vl0, vl1, rh0, rh1, rl0, rl1;
- uint8x8_t loset;
+ uint8x16_t vh, vl, rh, rl;
+ uint8x16_t loset;
+#ifdef ARCH_AARCH64
+ uint8x16_t tbl_h[4], tbl_l[4];
+#else
uint8x8x2_t tbl_h[4], tbl_l[4];
+#endif
for (i = 0; i < 4; i++) {
+#ifdef ARCH_AARCH64
+ tbl_l[i] = vld1q_u8(tbl + i*16);
+ tbl_h[i] = vld1q_u8(high + i*16);
+#else
tbl_l[i].val[0] = vld1_u8(tbl + i*16);
tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8);
tbl_h[i].val[0] = vld1_u8(high + i*16);
tbl_h[i].val[1] = vld1_u8(high + i*16 + 8);
+#endif
}
- loset = vdup_n_u8(0xf);
+ loset = vdupq_n_u8(0xf);
while (dst < d_end) {
- vh0 = vld1_u8(src);
- vh1 = vld1_u8(src + 8);
- vl0 = vld1_u8(src + 16);
- vl1 = vld1_u8(src + 24);
-
- rl0 = vtbl2_u8(tbl_l[0], vand_u8(vl0, loset));
- rl1 = vtbl2_u8(tbl_l[0], vand_u8(vl1, loset));
- rh0 = vtbl2_u8(tbl_h[0], vand_u8(vl0, loset));
- rh1 = vtbl2_u8(tbl_h[0], vand_u8(vl1, loset));
- rl0 = veor_u8(rl0, vtbl2_u8(tbl_l[2], vand_u8(vh0, loset)));
- rl1 = veor_u8(rl1, vtbl2_u8(tbl_l[2], vand_u8(vh1, loset)));
- rh0 = veor_u8(rh0, vtbl2_u8(tbl_h[2], vand_u8(vh0, loset)));
- rh1 = veor_u8(rh1, vtbl2_u8(tbl_h[2], vand_u8(vh1, loset)));
-
- vh0 = vshr_n_u8(vh0, 4);
- vh1 = vshr_n_u8(vh1, 4);
- vl0 = vshr_n_u8(vl0, 4);
- vl1 = vshr_n_u8(vl1, 4);
-
- rl0 = veor_u8(rl0, vtbl2_u8(tbl_l[1], vl0));
- rl1 = veor_u8(rl1, vtbl2_u8(tbl_l[1], vl1));
- rh0 = veor_u8(rh0, vtbl2_u8(tbl_h[1], vl0));
- rh1 = veor_u8(rh1, vtbl2_u8(tbl_h[1], vl1));
- rl0 = veor_u8(rl0, vtbl2_u8(tbl_l[3], vh0));
- rl1 = veor_u8(rl1, vtbl2_u8(tbl_l[3], vh1));
- rh0 = veor_u8(rh0, vtbl2_u8(tbl_h[3], vh0));
- rh1 = veor_u8(rh1, vtbl2_u8(tbl_h[3], vh1));
+ vh = vld1q_u8(src);
+ vl = vld1q_u8(src + 16);
+
+ rl = vqtbl1q_u8(tbl_l[0], vandq_u8(vl, loset));
+ rh = vqtbl1q_u8(tbl_h[0], vandq_u8(vl, loset));
+ rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[2], vandq_u8(vh, loset)));
+ rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[2], vandq_u8(vh, loset)));
+
+ vl = vshrq_n_u8(vl, 4);
+ vh = vshrq_n_u8(vh, 4);
+
+ rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[1], vl));
+ rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[1], vl));
+ rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], vh));
+ rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], vh));
if (xor) {
- vh0 = vld1_u8(dst);
- vh1 = vld1_u8(dst + 8);
- vl0 = vld1_u8(dst + 16);
- vl1 = vld1_u8(dst + 24);
- rh0 = veor_u8(rh0, vh0);
- rh1 = veor_u8(rh1, vh1);
- rl0 = veor_u8(rl0, vl0);
- rl1 = veor_u8(rl1, vl1);
+ vh = vld1q_u8(dst);
+ vl = vld1q_u8(dst + 16);
+ rh = veorq_u8(rh, vh);
+ rl = veorq_u8(rl, vl);
}
- vst1_u8(dst, rh0);
- vst1_u8(dst + 8, rh1);
- vst1_u8(dst + 16, rl0);
- vst1_u8(dst + 24, rl1);
+ vst1q_u8(dst, rh);
+ vst1q_u8(dst + 16, rl);
src += 32;
dst += 32;
}
}
-#endif /* ARCH_AARCH64 */
+
+
static
inline
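
For context, a hedged sketch of how this kernel is reached through the public gf-complete interface, following the gf_init_hard signature in the library's manual; the flag spellings and the arg1/arg2 order for the 16/4 split are assumptions and may differ between versions:

#include <stdio.h>
#include <stdint.h>
#include "gf_complete.h"

int main(void)
{
    gf_t gf;
    /* Real code should 16-byte-align these buffers; the size must be a
     * multiple of the 32-byte blocks the ALTMAP kernel processes. */
    uint8_t src[64], dst[64] = {0};

    /* SPLIT(16,4) with the alternate mapping (assumed arguments;
     * prim_poly 0 selects the library default). */
    if (!gf_init_hard(&gf, 16, GF_MULT_SPLIT_TABLE, GF_REGION_ALTMAP,
                      GF_DIVIDE_DEFAULT, 0, 16, 4, NULL, NULL))
        return 1;

    for (unsigned i = 0; i < sizeof src; i++) src[i] = (uint8_t)i;

    /* Multiply the region by 0x1234; the final argument 0 overwrites
     * dst, 1 would XOR into it (the kernel's xor parameter). */
    gf.multiply_region.w32(&gf, src, dst, 0x1234, sizeof src, 0);

    printf("%02x %02x\n", dst[0], dst[16]);
    gf_free(&gf, 0);
    return 0;
}
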