From f373b138aae6ee052ca711e90837ca11bbedd156 Mon Sep 17 00:00:00 2001 From: animetosho Date: Thu, 12 Nov 2015 21:09:44 +1000 Subject: Initial fix for SPLIT(16,4) ALTMAP NEON (non ARMv8) --- src/neon/gf_w16_neon.c | 42 +++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/src/neon/gf_w16_neon.c b/src/neon/gf_w16_neon.c index 95bfd80..a9fc8c4 100644 --- a/src/neon/gf_w16_neon.c +++ b/src/neon/gf_w16_neon.c @@ -222,7 +222,7 @@ neon_w16_split_4_altmap_multiply_region(gf_t *gf, uint8_t *src, { unsigned i; uint8_t *high = tbl + 4 * 16; - uint8x8_t vh0, vh1, vl0, vl1, r0, r1, r2, r3; + uint8x8_t vh0, vh1, vl0, vl1, rh0, rh1, rl0, rl1; uint8x8_t loset; uint8x8x2_t tbl_h[4], tbl_l[4]; @@ -241,35 +241,43 @@ neon_w16_split_4_altmap_multiply_region(gf_t *gf, uint8_t *src, vl0 = vld1_u8(src + 16); vl1 = vld1_u8(src + 24); - r0 = vtbl2_u8(tbl_l[0], vand_u8(vh0, loset)); - r1 = vtbl2_u8(tbl_h[0], vand_u8(vh1, loset)); - r2 = vtbl2_u8(tbl_l[2], vand_u8(vl0, loset)); - r3 = vtbl2_u8(tbl_h[2], vand_u8(vl1, loset)); + rl0 = vtbl2_u8(tbl_l[0], vand_u8(vl0, loset)); + rl1 = vtbl2_u8(tbl_l[0], vand_u8(vl1, loset)); + rh0 = vtbl2_u8(tbl_h[0], vand_u8(vl0, loset)); + rh1 = vtbl2_u8(tbl_h[0], vand_u8(vl1, loset)); + rl0 = veor_u8(rl0, vtbl2_u8(tbl_l[2], vand_u8(vh0, loset))); + rl1 = veor_u8(rl1, vtbl2_u8(tbl_l[2], vand_u8(vh1, loset))); + rh0 = veor_u8(rh0, vtbl2_u8(tbl_h[2], vand_u8(vh0, loset))); + rh1 = veor_u8(rh1, vtbl2_u8(tbl_h[2], vand_u8(vh1, loset))); vh0 = vshr_n_u8(vh0, 4); vh1 = vshr_n_u8(vh1, 4); vl0 = vshr_n_u8(vl0, 4); vl1 = vshr_n_u8(vl1, 4); - r0 = veor_u8(r0, vtbl2_u8(tbl_l[1], vh0)); - r1 = veor_u8(r1, vtbl2_u8(tbl_h[1], vh1)); - r2 = veor_u8(r2, vtbl2_u8(tbl_l[3], vl0)); - r3 = veor_u8(r3, vtbl2_u8(tbl_h[3], vl1)); + rl0 = veor_u8(rl0, vtbl2_u8(tbl_l[1], vl0)); + rl1 = veor_u8(rl1, vtbl2_u8(tbl_l[1], vl1)); + rh0 = veor_u8(rh0, vtbl2_u8(tbl_h[1], vl0)); + rh1 = veor_u8(rh1, vtbl2_u8(tbl_h[1], vl1)); + rl0 = veor_u8(rl0, vtbl2_u8(tbl_l[3], vh0)); + rl1 = veor_u8(rl1, vtbl2_u8(tbl_l[3], vh1)); + rh0 = veor_u8(rh0, vtbl2_u8(tbl_h[3], vh0)); + rh1 = veor_u8(rh1, vtbl2_u8(tbl_h[3], vh1)); if (xor) { vh0 = vld1_u8(dst); vh1 = vld1_u8(dst + 8); vl0 = vld1_u8(dst + 16); vl1 = vld1_u8(dst + 24); - r0 = veor_u8(r0, vh0); - r1 = veor_u8(r1, vh1); - r2 = veor_u8(r2, vl0); - r3 = veor_u8(r3, vl1); + rh0 = veor_u8(rh0, vh0); + rh1 = veor_u8(rh1, vh1); + rl0 = veor_u8(rl0, vl0); + rl1 = veor_u8(rl1, vl1); } - vst1_u8(dst, r0); - vst1_u8(dst + 8, r1); - vst1_u8(dst + 16, r2); - vst1_u8(dst + 24, r3); + vst1_u8(dst, rh0); + vst1_u8(dst + 8, rh1); + vst1_u8(dst + 16, rl0); + vst1_u8(dst + 24, rl1); src += 32; dst += 32; -- cgit v1.2.1