summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorrearnsha <rearnsha@138bc75d-0d04-0410-961f-82ee72b054a4>2012-03-22 17:54:55 +0000
committerrearnsha <rearnsha@138bc75d-0d04-0410-961f-82ee72b054a4>2012-03-22 17:54:55 +0000
commitc434932e63f12b760465c39bd9fcf8031042dce0 (patch)
treefb178b3c53b755463feef6cd03fe3c9044a0caae
parentaf3c0d00a28c5befe0e49254ad1d0ee23e03d035 (diff)
downloadgcc-c434932e63f12b760465c39bd9fcf8031042dce0.tar.gz
* lex.c (search_line_fast): Provide Neon-optimized version for ARM.
git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@185702 138bc75d-0d04-0410-961f-82ee72b054a4
-rw-r--r--libcpp/ChangeLog4
-rw-r--r--libcpp/lex.c63
2 files changed, 67 insertions, 0 deletions
diff --git a/libcpp/ChangeLog b/libcpp/ChangeLog
index 97177e89916..133620b3b70 100644
--- a/libcpp/ChangeLog
+++ b/libcpp/ChangeLog
@@ -1,3 +1,7 @@
+2012-03-22 Richard Earnshaw <rearnsha@arm.com>
+
+ * lex.c (search_line_fast): Provide Neon-optimized version for ARM.
+
2012-03-14 Rainer Orth <ro@CeBiTec.Uni-Bielefeld.DE>
* lex.c: Remove Solaris 8 reference.
diff --git a/libcpp/lex.c b/libcpp/lex.c
index 05b9e2b0b93..9d23002d84b 100644
--- a/libcpp/lex.c
+++ b/libcpp/lex.c
@@ -629,6 +629,69 @@ search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
}
}
+#elif defined (__ARM_NEON__)
+#include "arm_neon.h"
+
+static const uchar *
+search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
+{
+ const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
+ const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
+ const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
+ const uint8x16_t repl_qm = vdupq_n_u8 ('?');
+ const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
+
+ unsigned int misalign, found, mask;
+ const uint8_t *p;
+ uint8x16_t data;
+
+ /* Align the source pointer. */
+ misalign = (uintptr_t)s & 15;
+ p = (const uint8_t *)((uintptr_t)s & -16);
+ data = vld1q_u8 (p);
+
+ /* Create a mask for the bytes that are valid within the first
+ 16-byte block. The Idea here is that the AND with the mask
+ within the loop is "free", since we need some AND or TEST
+ insn in order to set the flags for the branch anyway. */
+ mask = (-1u << misalign) & 0xffff;
+
+ /* Main loop, processing 16 bytes at a time. */
+ goto start;
+
+ do
+ {
+ uint8x8_t l;
+ uint16x4_t m;
+ uint32x2_t n;
+ uint8x16_t t, u, v, w;
+
+ p += 16;
+ data = vld1q_u8 (p);
+ mask = 0xffff;
+
+ start:
+ t = vceqq_u8 (data, repl_nl);
+ u = vceqq_u8 (data, repl_cr);
+ v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
+ w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
+ t = vandq_u8 (vorrq_u8 (v, w), xmask);
+ l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
+ m = vpaddl_u8 (l);
+ n = vpaddl_u16 (m);
+
+ found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
+ vshr_n_u64 ((uint64x1_t) n, 24)), 0);
+ found &= mask;
+ }
+ while (!found);
+
+ /* FOUND contains 1 in bits for which we matched a relevant
+ character. Conversion to the byte index is trivial. */
+ found = __builtin_ctz (found);
+ return (const uchar *)p + found;
+}
+
#else
/* We only have one accellerated alternative. Use a direct call so that