From 643743d0482ca09a9dfa57beed196f172a22a78e Mon Sep 17 00:00:00 2001 From: animetosho Date: Sat, 14 Nov 2015 16:32:25 +1000 Subject: Move conditional outside loop for NEON SPLIT4 implementation Seems to improve performance a fair bit --- src/neon/gf_w16_neon.c | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/src/neon/gf_w16_neon.c b/src/neon/gf_w16_neon.c index fb637ca..2bd3f30 100644 --- a/src/neon/gf_w16_neon.c +++ b/src/neon/gf_w16_neon.c @@ -81,8 +81,11 @@ neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst, loset = vdupq_n_u8(0xf); - while (dst < d_end) { + if (xor) { + uint8x16x2_t vb; + while (dst < d_end) { va = vld2q_u8((uint8_t*)src); + vb = vld2q_u8((uint8_t*)dst); rl = vqtbl1q_u8(tbl_l[0], vandq_u8(va.val[0], loset)); rh = vqtbl1q_u8(tbl_h[0], vandq_u8(va.val[0], loset)); @@ -97,15 +100,35 @@ neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst, va.val[0] = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], va.val[1])); va.val[1] = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], va.val[1])); - if (xor) { - uint8x16x2_t vb = vld2q_u8((uint8_t*)dst); - va.val[0] = veorq_u8(va.val[0], vb.val[0]); - va.val[1] = veorq_u8(va.val[1], vb.val[1]); - } + va.val[0] = veorq_u8(va.val[0], vb.val[0]); + va.val[1] = veorq_u8(va.val[1], vb.val[1]); vst2q_u8((uint8_t*)dst, va); src += 16; dst += 16; + } + } else { + while (dst < d_end) { + va = vld2q_u8((uint8_t*)src); + + rl = vqtbl1q_u8(tbl_l[0], vandq_u8(va.val[0], loset)); + rh = vqtbl1q_u8(tbl_h[0], vandq_u8(va.val[0], loset)); + rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[2], vandq_u8(va.val[1], loset))); + rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[2], vandq_u8(va.val[1], loset))); + + va.val[0] = vshrq_n_u8(va.val[0], 4); + va.val[1] = vshrq_n_u8(va.val[1], 4); + + rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[1], va.val[0])); + rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[1], va.val[0])); + va.val[0] = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], va.val[1])); + va.val[1] = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], va.val[1])); + + vst2q_u8((uint8_t*)dst, va); + + src += 16; + dst += 16; + } } } -- cgit v1.2.1