author     Kyrylo Tkachov <kyrylo.tkachov@arm.com>     2013-12-20 16:10:43 +0000
committer  Kyrylo Tkachov <ktkachov@gcc.gnu.org>       2013-12-20 16:10:43 +0000
commit     392356899b2de257a3b13c7a8aacc5140de9b4ee (patch)
tree       a03ddbcfaa36d1cceef338ca603fefd12b90b99d
parent     1fc017b632c79110bfd8db534e76b81318cb7530 (diff)
download   gcc-392356899b2de257a3b13c7a8aacc5140de9b4ee.tar.gz
neon.ml (crypto_intrinsics): Add vceq_p64 and vtst_p64.
[gcc/]
2013-12-20  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>

	* config/arm/neon.ml (crypto_intrinsics): Add vceq_p64 and vtst_p64.
	* config/arm/arm_neon.h: Regenerate.
	* config/arm/neon-docgen.ml: Add vceq_p64 and vtst_p64.
	* doc/arm-neon-intrinsics.texi: Regenerate.

[gcc/testsuite/]
2013-12-20  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>

	* gcc.target/arm/neon-vceq_p64.c: New test.
	* gcc.target/arm/neon-vtst_p64.c: Likewise.

From-SVN: r206151
-rw-r--r--   gcc/ChangeLog                                     7
-rw-r--r--   gcc/config/arm/arm_neon.h                        35
-rw-r--r--   gcc/config/arm/neon-docgen.ml                     8
-rw-r--r--   gcc/config/arm/neon.ml                           35
-rw-r--r--   gcc/doc/arm-neon-intrinsics.texi                  8
-rw-r--r--   gcc/testsuite/ChangeLog                           5
-rw-r--r--   gcc/testsuite/gcc.target/arm/neon-vceq_p64.c     38
-rw-r--r--   gcc/testsuite/gcc.target/arm/neon-vtst_p64.c     38
8 files changed, 174 insertions, 0 deletions
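A minimal standalone usage sketch of the two new intrinsics (not part of the commit; it simply mirrors the style of the tests added below). The build options are an assumption, roughly what the tests request via arm_crypto, e.g. something like -march=armv8-a -mfpu=crypto-neon-fp-armv8 -mfloat-abi=softfp:

/* Hypothetical example: vceq_p64 yields all-ones when the two 64-bit
   values are equal; vtst_p64 yields all-ones when they share at least
   one set bit.  */
#include <arm_neon.h>
#include <stdio.h>

int
main (void)
{
  uint64_t a = 0xdeadbeefULL;
  uint64_t b = 0xdeadbeefULL;

  uint64_t eq = vceq_p64 (vreinterpret_p64_u64 (a),
			  vreinterpret_p64_u64 (b));
  uint64_t ts = vtst_p64 (vreinterpret_p64_u64 (a),
			  vreinterpret_p64_u64 (b));

  printf ("vceq_p64: %llx  vtst_p64: %llx\n",
	  (unsigned long long) eq, (unsigned long long) ts);
  return 0;
}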
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index bc9f5a3db70..2f4f57e1296 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,10 @@
+2013-12-20 Kyrylo Tkachov <kyrylo.tkachov@arm.com>
+
+ * config/arm/neon.ml (crypto_intrinsics): Add vceq_p64 and vtst_p64.
+ * config/arm/arm_neon.h: Regenerate.
+ * config/arm/neon-docgen.ml: Add vceq_p64 and vtst_p64.
+ * doc/arm-neon-intrinsics.texi: Regenerate.
+
2013-12-20 Vladimir Makarov <vmakarov@redhat.com>
* config/arm/arm.h (THUMB_SECONDARY_OUTPUT_RELOAD_CLASS): Return NO_REGS
diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index 59ef22c530d..1abbba2256c 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -13278,6 +13278,41 @@ vstrq_p128 (poly128_t * __ptr, poly128_t __val)
#endif
}
+/* The vceq_p64 intrinsic does not map to a single instruction.
+ Instead we emulate it by performing a 32-bit variant of the vceq
+ and applying a pairwise min reduction to the result.
+ vceq_u32 will produce two 32-bit halves, each of which will contain either
+ all ones or all zeros depending on whether the corresponding 32-bit
+ halves of the poly64_t were equal. The whole poly64_t values are equal
+ if and only if both halves are equal, i.e. vceq_u32 returns all ones.
+ If the result is all zeroes for any half then the whole result is zeroes.
+ This is what the pairwise min reduction achieves. */
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vceq_p64 (poly64x1_t __a, poly64x1_t __b)
+{
+ uint32x2_t __t_a = vreinterpret_u32_p64 (__a);
+ uint32x2_t __t_b = vreinterpret_u32_p64 (__b);
+ uint32x2_t __c = vceq_u32 (__t_a, __t_b);
+ uint32x2_t __m = vpmin_u32 (__c, __c);
+ return vreinterpret_u64_u32 (__m);
+}
+
+/* The vtst_p64 intrinsic does not map to a single instruction.
+ We emulate it in a way similar to vceq_p64 above but here we do
+ a reduction with max since if any two corresponding bits
+ in the two poly64_t's match, then the whole result must be all ones. */
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vtst_p64 (poly64x1_t __a, poly64x1_t __b)
+{
+ uint32x2_t __t_a = vreinterpret_u32_p64 (__a);
+ uint32x2_t __t_b = vreinterpret_u32_p64 (__b);
+ uint32x2_t __c = vtst_u32 (__t_a, __t_b);
+ uint32x2_t __m = vpmax_u32 (__c, __c);
+ return vreinterpret_u64_u32 (__m);
+}
+
__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
vaeseq_u8 (uint8x16_t __data, uint8x16_t __key)
{
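For reference, a plain scalar model of the emulation strategy described in the comments above (a sketch only, not part of the header; the names ref_vceq_p64 and ref_vtst_p64 are hypothetical): each 32-bit half is compared or bit-tested separately, and a min (for vceq_p64) or max (for vtst_p64) folds the two per-half masks into the final 64-bit answer, just as vpmin_u32/vpmax_u32 do on the vceq_u32/vtst_u32 results.

#include <stdint.h>
#include <stdio.h>

/* Per-half equality masks folded with a minimum: all-ones only if *both*
   halves compared equal, i.e. the full 64-bit values match.  */
static uint64_t
ref_vceq_p64 (uint64_t a, uint64_t b)
{
  uint32_t lo = ((uint32_t) a == (uint32_t) b) ? 0xffffffffu : 0u;
  uint32_t hi = ((uint32_t) (a >> 32) == (uint32_t) (b >> 32)) ? 0xffffffffu : 0u;
  uint32_t m = lo < hi ? lo : hi;		/* Pairwise min.  */
  return ((uint64_t) m << 32) | m;
}

/* Per-half bit-test masks folded with a maximum: all-ones if *either*
   half has a bit set in both operands.  */
static uint64_t
ref_vtst_p64 (uint64_t a, uint64_t b)
{
  uint32_t lo = ((uint32_t) a & (uint32_t) b) ? 0xffffffffu : 0u;
  uint32_t hi = ((uint32_t) (a >> 32) & (uint32_t) (b >> 32)) ? 0xffffffffu : 0u;
  uint32_t m = lo > hi ? lo : hi;		/* Pairwise max.  */
  return ((uint64_t) m << 32) | m;
}

int
main (void)
{
  /* Equal values -> all-ones; disjoint bit patterns -> all-zeros.  */
  printf ("%llx %llx\n",
	  (unsigned long long) ref_vceq_p64 (0xdeadbeefULL, 0xdeadbeefULL),
	  (unsigned long long) ref_vtst_p64 (0xffff0000ULL, 0x0000ffffULL));
  return 0;
}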
diff --git a/gcc/config/arm/neon-docgen.ml b/gcc/config/arm/neon-docgen.ml
index 66d21cf1139..46cae14fdc2 100644
--- a/gcc/config/arm/neon-docgen.ml
+++ b/gcc/config/arm/neon-docgen.ml
@@ -340,6 +340,14 @@ let crypto_doc =
@end itemize
@itemize @bullet
+@item uint64x1_t vceq_p64 (poly64x1_t, poly64x1_t)
+@end itemize
+
+@itemize @bullet
+@item uint64x1_t vtst_p64 (poly64x1_t, poly64x1_t)
+@end itemize
+
+@itemize @bullet
@item uint32_t vsha1h_u32 (uint32_t)
@*@emph{Form of expected instruction(s):} @code{sha1h.32 @var{q0}, @var{q1}}
@end itemize
diff --git a/gcc/config/arm/neon.ml b/gcc/config/arm/neon.ml
index 968c17121e7..738ee066bb0 100644
--- a/gcc/config/arm/neon.ml
+++ b/gcc/config/arm/neon.ml
@@ -2208,6 +2208,41 @@ vstrq_p128 (poly128_t * __ptr, poly128_t __val)
#endif
}
+/* The vceq_p64 intrinsic does not map to a single instruction.
+ Instead we emulate it by performing a 32-bit variant of the vceq
+ and applying a pairwise min reduction to the result.
+ vceq_u32 will produce two 32-bit halves, each of which will contain either
+ all ones or all zeros depending on whether the corresponding 32-bit
+ halves of the poly64_t were equal. The whole poly64_t values are equal
+ if and only if both halves are equal, i.e. vceq_u32 returns all ones.
+ If the result is all zeroes for any half then the whole result is zeroes.
+ This is what the pairwise min reduction achieves. */
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vceq_p64 (poly64x1_t __a, poly64x1_t __b)
+{
+ uint32x2_t __t_a = vreinterpret_u32_p64 (__a);
+ uint32x2_t __t_b = vreinterpret_u32_p64 (__b);
+ uint32x2_t __c = vceq_u32 (__t_a, __t_b);
+ uint32x2_t __m = vpmin_u32 (__c, __c);
+ return vreinterpret_u64_u32 (__m);
+}
+
+/* The vtst_p64 intrinsic does not map to a single instruction.
+ We emulate it in a way similar to vceq_p64 above but here we do
+ a reduction with max since if any two corresponding bits
+ in the two poly64_t's match, then the whole result must be all ones. */
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vtst_p64 (poly64x1_t __a, poly64x1_t __b)
+{
+ uint32x2_t __t_a = vreinterpret_u32_p64 (__a);
+ uint32x2_t __t_b = vreinterpret_u32_p64 (__b);
+ uint32x2_t __c = vtst_u32 (__t_a, __t_b);
+ uint32x2_t __m = vpmax_u32 (__c, __c);
+ return vreinterpret_u64_u32 (__m);
+}
+
__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
vaeseq_u8 (uint8x16_t __data, uint8x16_t __key)
{
diff --git a/gcc/doc/arm-neon-intrinsics.texi b/gcc/doc/arm-neon-intrinsics.texi
index 610892d6463..b1468683f83 100644
--- a/gcc/doc/arm-neon-intrinsics.texi
+++ b/gcc/doc/arm-neon-intrinsics.texi
@@ -11939,6 +11939,14 @@
@end itemize
@itemize @bullet
+@item uint64x1_t vceq_p64 (poly64x1_t, poly64x1_t)
+@end itemize
+
+@itemize @bullet
+@item uint64x1_t vtst_p64 (poly64x1_t, poly64x1_t)
+@end itemize
+
+@itemize @bullet
@item uint32_t vsha1h_u32 (uint32_t)
@*@emph{Form of expected instruction(s):} @code{sha1h.32 @var{q0}, @var{q1}}
@end itemize
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index fbc6244d4de..95afd485006 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,8 @@
+2013-12-20 Kyrylo Tkachov <kyrylo.tkachov@arm.com>
+
+ * gcc.target/arm/neon-vceq_p64.c: New test.
+ * gcc.target/arm/neon-vtst_p64.c: Likewise.
+
2013-12-20 Bingfeng Mei <bmei@broadcom.com>
PR tree-optimization/59544
diff --git a/gcc/testsuite/gcc.target/arm/neon-vceq_p64.c b/gcc/testsuite/gcc.target/arm/neon-vceq_p64.c
new file mode 100644
index 00000000000..21a6a78a221
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vceq_p64.c
@@ -0,0 +1,38 @@
+/* { dg-do run } */
+/* { dg-require-effective-target arm_crypto_ok } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-add-options arm_crypto } */
+
+#include "arm_neon.h"
+#include <stdio.h>
+
+extern void abort (void);
+
+int
+main (void)
+{
+ uint64_t args[] = { 0x0, 0xdeadbeef, ~0xdeadbeef, 0xffff,
+ ~0xffff, 0xffffffff, ~0xffffffff, ~0x0 };
+ int i, j;
+
+ for (i = 0; i < sizeof (args) / sizeof (args[0]); ++i)
+ {
+ for (j = 0; j < sizeof (args) / sizeof (args[0]); ++j)
+ {
+ uint64_t a1 = args[i];
+ uint64_t a2 = args[j];
+ uint64_t res = vceq_p64 (vreinterpret_p64_u64 (a1),
+ vreinterpret_p64_u64 (a2));
+ uint64_t exp = (a1 == a2) ? ~0x0 : 0x0;
+
+ if (res != exp)
+ {
+ fprintf (stderr, "vceq_p64 (a1= %lx, a2= %lx)"
+ " returned %lx, expected %lx\n",
+ a1, a2, res, exp);
+ abort ();
+ }
+ }
+ }
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/arm/neon-vtst_p64.c b/gcc/testsuite/gcc.target/arm/neon-vtst_p64.c
new file mode 100644
index 00000000000..3a0b117c261
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vtst_p64.c
@@ -0,0 +1,38 @@
+/* { dg-do run } */
+/* { dg-require-effective-target arm_crypto_ok } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-add-options arm_crypto } */
+
+#include "arm_neon.h"
+#include <stdio.h>
+
+extern void abort (void);
+
+int
+main (void)
+{
+ uint64_t args[] = { 0x0, 0xdeadbeef, ~0xdeadbeef, 0xffff,
+ ~0xffff, 0xffffffff, ~0xffffffff, ~0x0 };
+ int i, j;
+
+ for (i = 0; i < sizeof (args) / sizeof (args[0]); ++i)
+ {
+ for (j = 0; j < sizeof (args) / sizeof (args[0]); ++j)
+ {
+ uint64_t a1 = args[i];
+ uint64_t a2 = args[j];
+ uint64_t res = vtst_p64 (vreinterpret_p64_u64 (a1),
+ vreinterpret_p64_u64 (a2));
+ uint64_t exp = (a1 & a2) ? ~0x0 : 0x0;
+
+ if (res != exp)
+ {
+ fprintf (stderr, "vtst_p64 (a1= %lx, a2= %lx)"
+ " returned %lx, expected %lx\n",
+ a1, a2, res, exp);
+ abort ();
+ }
+ }
+ }
+ return 0;
+}