summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Richard Henderson <rth@redhat.com> 2010-08-24 11:35:01 -0700
committer Ulrich Drepper <drepper@redhat.com> 2010-08-24 11:35:01 -0700
commit 73f27d5e722ece05a66c124406cc8ca4305f4cbd (patch)
tree c8f5325a97e7868535b3bd8d6c4cd9de579e03ab
parent 84b9230c404aed4fd3a7bb3d045ca367043dde8c (diff)
download glibc-73f27d5e722ece05a66c124406cc8ca4305f4cbd.tar.gz
Clean up SSE variable shifts
-rw-r--r-- ChangeLog 17
-rw-r--r-- sysdeps/i386/i686/multiarch/Makefile 2
-rw-r--r-- sysdeps/i386/i686/multiarch/varshift.S 1
-rw-r--r-- sysdeps/i386/i686/multiarch/varshift.h 1
-rw-r--r-- sysdeps/x86_64/multiarch/Makefile 2
-rw-r--r-- sysdeps/x86_64/multiarch/strcspn-c.c 154
-rw-r--r-- sysdeps/x86_64/multiarch/strspn-c.c 152
-rw-r--r-- sysdeps/x86_64/multiarch/strstr.c 62
-rw-r--r-- sysdeps/x86_64/multiarch/varshift.S 30
-rw-r--r-- sysdeps/x86_64/multiarch/varshift.h 27
10 files changed, 93 insertions, 355 deletions
diff --git a/ChangeLog b/ChangeLog
index 1da347c7d9..f8050d7b88 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,20 @@
+2010-08-24 Richard Henderson <rth@redhat.com>
+ Ulrich Drepper <drepper@redhat.com>
+ H.J. Lu <hongjiu.lu@intel.com>
+
+ * sysdeps/i386/i686/multiarch/Makefile (sysdep_routines): Add varshift.
+ * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Likewise.
+ * sysdeps/x86_64/multiarch/strcspn-c.c: Include "varshift.h".
+ Replace _mm_srli_si128 with __m128i_shift_right. Replace
+ _mm_alignr_epi8 with _mm_loadu_si128.
+ * sysdeps/x86_64/multiarch/strspn-c.c: Likewise.
+ * sysdeps/x86_64/multiarch/strstr.c: Include "varshift.h".
+ (__m128i_shift_right): Removed.
+ * sysdeps/i386/i686/multiarch/varshift.h: New file.
+ * sysdeps/i386/i686/multiarch/varshift.S: New file.
+ * sysdeps/x86_64/multiarch/varshift.h: New file.
+ * sysdeps/x86_64/multiarch/varshift.S: New file.
+
2010-08-21 Mike Frysinger <vapier@gentoo.org>
* configure.in: Move assembler checks to before sysdep dir checking.
diff --git a/sysdeps/i386/i686/multiarch/Makefile b/sysdeps/i386/i686/multiarch/Makefile
index 12bcfc273f..26f3e58064 100644
--- a/sysdeps/i386/i686/multiarch/Makefile
+++ b/sysdeps/i386/i686/multiarch/Makefile
@@ -9,7 +9,7 @@ sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 \
memmove-ssse3-rep bcopy-ssse3 bcopy-ssse3-rep \
memset-sse2-rep bzero-sse2-rep strcmp-ssse3 \
strcmp-sse4 strncmp-c strncmp-ssse3 strncmp-sse4 \
- memcmp-ssse3 memcmp-sse4 strcasestr-nonascii
+ memcmp-ssse3 memcmp-sse4 strcasestr-nonascii varshift
ifeq (yes,$(config-cflags-sse4))
sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c
CFLAGS-strcspn-c.c += -msse4
diff --git a/sysdeps/i386/i686/multiarch/varshift.S b/sysdeps/i386/i686/multiarch/varshift.S
new file mode 100644
index 0000000000..41afaf721c
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/varshift.S
@@ -0,0 +1 @@
+#include <sysdeps/x86_64/multiarch/varshift.S>
diff --git a/sysdeps/i386/i686/multiarch/varshift.h b/sysdeps/i386/i686/multiarch/varshift.h
new file mode 100644
index 0000000000..7c72c70d67
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/varshift.h
@@ -0,0 +1 @@
+#include <sysdeps/x86_64/multiarch/varshift.h>
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index b124524b2e..27dc56321d 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -10,7 +10,7 @@ sysdep_routines += stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
strncase_l-ssse3
ifeq (yes,$(config-cflags-sse4))
-sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c
+sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
CFLAGS-strcspn-c.c += -msse4
CFLAGS-strpbrk-c.c += -msse4
CFLAGS-strspn-c.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c
index daeebe1bf5..04aba46237 100644
--- a/sysdeps/x86_64/multiarch/strcspn-c.c
+++ b/sysdeps/x86_64/multiarch/strcspn-c.c
@@ -20,6 +20,7 @@
#include <nmmintrin.h>
#include <string.h>
+#include "varshift.h"
/* We use 0x2:
_SIDD_SBYTE_OPS
@@ -86,8 +87,6 @@ STRCSPN_SSE42 (const char *s, const char *a)
const char *aligned;
__m128i mask;
- /* Fake initialization. gcc otherwise will warn. */
- asm ("" : "=xm" (mask));
int offset = (int) ((size_t) a & 15);
if (offset != 0)
{
@@ -95,54 +94,7 @@ STRCSPN_SSE42 (const char *s, const char *a)
aligned = (const char *) ((size_t) a & -16L);
__m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
- switch (offset)
- {
- case 1:
- mask = _mm_srli_si128 (mask0, 1);
- break;
- case 2:
- mask = _mm_srli_si128 (mask0, 2);
- break;
- case 3:
- mask = _mm_srli_si128 (mask0, 3);
- break;
- case 4:
- mask = _mm_srli_si128 (mask0, 4);
- break;
- case 5:
- mask = _mm_srli_si128 (mask0, 5);
- break;
- case 6:
- mask = _mm_srli_si128 (mask0, 6);
- break;
- case 7:
- mask = _mm_srli_si128 (mask0, 7);
- break;
- case 8:
- mask = _mm_srli_si128 (mask0, 8);
- break;
- case 9:
- mask = _mm_srli_si128 (mask0, 9);
- break;
- case 10:
- mask = _mm_srli_si128 (mask0, 10);
- break;
- case 11:
- mask = _mm_srli_si128 (mask0, 11);
- break;
- case 12:
- mask = _mm_srli_si128 (mask0, 12);
- break;
- case 13:
- mask = _mm_srli_si128 (mask0, 13);
- break;
- case 14:
- mask = _mm_srli_si128 (mask0, 14);
- break;
- case 15:
- mask = _mm_srli_si128 (mask0, 15);
- break;
- }
+ mask = __m128i_shift_right (mask0, offset);
/* Find where the NULL terminator is. */
int length = _mm_cmpistri (mask, mask, 0x3a);
@@ -159,55 +111,10 @@ STRCSPN_SSE42 (const char *s, const char *a)
if (index != 0)
{
- /* Combine mask0 and mask1. */
- switch (offset)
- {
- case 1:
- mask = _mm_alignr_epi8 (mask1, mask0, 1);
- break;
- case 2:
- mask = _mm_alignr_epi8 (mask1, mask0, 2);
- break;
- case 3:
- mask = _mm_alignr_epi8 (mask1, mask0, 3);
- break;
- case 4:
- mask = _mm_alignr_epi8 (mask1, mask0, 4);
- break;
- case 5:
- mask = _mm_alignr_epi8 (mask1, mask0, 5);
- break;
- case 6:
- mask = _mm_alignr_epi8 (mask1, mask0, 6);
- break;
- case 7:
- mask = _mm_alignr_epi8 (mask1, mask0, 7);
- break;
- case 8:
- mask = _mm_alignr_epi8 (mask1, mask0, 8);
- break;
- case 9:
- mask = _mm_alignr_epi8 (mask1, mask0, 9);
- break;
- case 10:
- mask = _mm_alignr_epi8 (mask1, mask0, 10);
- break;
- case 11:
- mask = _mm_alignr_epi8 (mask1, mask0, 11);
- break;
- case 12:
- mask = _mm_alignr_epi8 (mask1, mask0, 12);
- break;
- case 13:
- mask = _mm_alignr_epi8 (mask1, mask0, 13);
- break;
- case 14:
- mask = _mm_alignr_epi8 (mask1, mask0, 14);
- break;
- case 15:
- mask = _mm_alignr_epi8 (mask1, mask0, 15);
- break;
- }
+ /* Combine mask0 and mask1. We could play games with
+ palignr, but frankly this data should be in L1 now
+ so do the merge via an unaligned load. */
+ mask = _mm_loadu_si128 ((__m128i *) a);
}
}
}
@@ -234,54 +141,7 @@ STRCSPN_SSE42 (const char *s, const char *a)
aligned = (const char *) ((size_t) s & -16L);
__m128i value = _mm_load_si128 ((__m128i *) aligned);
- switch (offset)
- {
- case 1:
- value = _mm_srli_si128 (value, 1);
- break;
- case 2:
- value = _mm_srli_si128 (value, 2);
- break;
- case 3:
- value = _mm_srli_si128 (value, 3);
- break;
- case 4:
- value = _mm_srli_si128 (value, 4);
- break;
- case 5:
- value = _mm_srli_si128 (value, 5);
- break;
- case 6:
- value = _mm_srli_si128 (value, 6);
- break;
- case 7:
- value = _mm_srli_si128 (value, 7);
- break;
- case 8:
- value = _mm_srli_si128 (value, 8);
- break;
- case 9:
- value = _mm_srli_si128 (value, 9);
- break;
- case 10:
- value = _mm_srli_si128 (value, 10);
- break;
- case 11:
- value = _mm_srli_si128 (value, 11);
- break;
- case 12:
- value = _mm_srli_si128 (value, 12);
- break;
- case 13:
- value = _mm_srli_si128 (value, 13);
- break;
- case 14:
- value = _mm_srli_si128 (value, 14);
- break;
- case 15:
- value = _mm_srli_si128 (value, 15);
- break;
- }
+ value = __m128i_shift_right (value, offset);
int length = _mm_cmpistri (mask, value, 0x2);
/* No need to check ZFlag since ZFlag is always 1. */
diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c
index be9e8ac0a8..ab58549f9b 100644
--- a/sysdeps/x86_64/multiarch/strspn-c.c
+++ b/sysdeps/x86_64/multiarch/strspn-c.c
@@ -20,6 +20,7 @@
#include <nmmintrin.h>
#include <string.h>
+#include "varshift.h"
/* We use 0x12:
_SIDD_SBYTE_OPS
@@ -71,54 +72,7 @@ __strspn_sse42 (const char *s, const char *a)
aligned = (const char *) ((size_t) a & -16L);
__m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
- switch (offset)
- {
- case 1:
- mask = _mm_srli_si128 (mask0, 1);
- break;
- case 2:
- mask = _mm_srli_si128 (mask0, 2);
- break;
- case 3:
- mask = _mm_srli_si128 (mask0, 3);
- break;
- case 4:
- mask = _mm_srli_si128 (mask0, 4);
- break;
- case 5:
- mask = _mm_srli_si128 (mask0, 5);
- break;
- case 6:
- mask = _mm_srli_si128 (mask0, 6);
- break;
- case 7:
- mask = _mm_srli_si128 (mask0, 7);
- break;
- case 8:
- mask = _mm_srli_si128 (mask0, 8);
- break;
- case 9:
- mask = _mm_srli_si128 (mask0, 9);
- break;
- case 10:
- mask = _mm_srli_si128 (mask0, 10);
- break;
- case 11:
- mask = _mm_srli_si128 (mask0, 11);
- break;
- case 12:
- mask = _mm_srli_si128 (mask0, 12);
- break;
- case 13:
- mask = _mm_srli_si128 (mask0, 13);
- break;
- case 14:
- mask = _mm_srli_si128 (mask0, 14);
- break;
- case 15:
- mask = _mm_srli_si128 (mask0, 15);
- break;
- }
+ mask = __m128i_shift_right (mask0, offset);
/* Find where the NULL terminator is. */
int length = _mm_cmpistri (mask, mask, 0x3a);
@@ -135,55 +89,10 @@ __strspn_sse42 (const char *s, const char *a)
if (index != 0)
{
- /* Combine mask0 and mask1. */
- switch (offset)
- {
- case 1:
- mask = _mm_alignr_epi8 (mask1, mask0, 1);
- break;
- case 2:
- mask = _mm_alignr_epi8 (mask1, mask0, 2);
- break;
- case 3:
- mask = _mm_alignr_epi8 (mask1, mask0, 3);
- break;
- case 4:
- mask = _mm_alignr_epi8 (mask1, mask0, 4);
- break;
- case 5:
- mask = _mm_alignr_epi8 (mask1, mask0, 5);
- break;
- case 6:
- mask = _mm_alignr_epi8 (mask1, mask0, 6);
- break;
- case 7:
- mask = _mm_alignr_epi8 (mask1, mask0, 7);
- break;
- case 8:
- mask = _mm_alignr_epi8 (mask1, mask0, 8);
- break;
- case 9:
- mask = _mm_alignr_epi8 (mask1, mask0, 9);
- break;
- case 10:
- mask = _mm_alignr_epi8 (mask1, mask0, 10);
- break;
- case 11:
- mask = _mm_alignr_epi8 (mask1, mask0, 11);
- break;
- case 12:
- mask = _mm_alignr_epi8 (mask1, mask0, 12);
- break;
- case 13:
- mask = _mm_alignr_epi8 (mask1, mask0, 13);
- break;
- case 14:
- mask = _mm_alignr_epi8 (mask1, mask0, 14);
- break;
- case 15:
- mask = _mm_alignr_epi8 (mask1, mask0, 15);
- break;
- }
+ /* Combine mask0 and mask1. We could play games with
+ palignr, but frankly this data should be in L1 now
+ so do the merge via an unaligned load. */
+ mask = _mm_loadu_si128 ((__m128i *) a);
}
}
}
@@ -210,54 +119,7 @@ __strspn_sse42 (const char *s, const char *a)
aligned = (const char *) ((size_t) s & -16L);
__m128i value = _mm_load_si128 ((__m128i *) aligned);
- switch (offset)
- {
- case 1:
- value = _mm_srli_si128 (value, 1);
- break;
- case 2:
- value = _mm_srli_si128 (value, 2);
- break;
- case 3:
- value = _mm_srli_si128 (value, 3);
- break;
- case 4:
- value = _mm_srli_si128 (value, 4);
- break;
- case 5:
- value = _mm_srli_si128 (value, 5);
- break;
- case 6:
- value = _mm_srli_si128 (value, 6);
- break;
- case 7:
- value = _mm_srli_si128 (value, 7);
- break;
- case 8:
- value = _mm_srli_si128 (value, 8);
- break;
- case 9:
- value = _mm_srli_si128 (value, 9);
- break;
- case 10:
- value = _mm_srli_si128 (value, 10);
- break;
- case 11:
- value = _mm_srli_si128 (value, 11);
- break;
- case 12:
- value = _mm_srli_si128 (value, 12);
- break;
- case 13:
- value = _mm_srli_si128 (value, 13);
- break;
- case 14:
- value = _mm_srli_si128 (value, 14);
- break;
- case 15:
- value = _mm_srli_si128 (value, 15);
- break;
- }
+ value = __m128i_shift_right (value, offset);
int length = _mm_cmpistri (mask, value, 0x12);
/* No need to check CFlag since it is always 1. */
diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c
index 45d7a550ac..b408b752fa 100644
--- a/sysdeps/x86_64/multiarch/strstr.c
+++ b/sysdeps/x86_64/multiarch/strstr.c
@@ -19,6 +19,7 @@
02111-1307 USA. */
#include <nmmintrin.h>
+#include "varshift.h"
#ifndef STRSTR_SSE42
# define STRSTR_SSE42 __strstr_sse42
@@ -82,67 +83,6 @@
5. failed string compare, go back to scanning
*/
-/* Fix-up of removal of unneeded data due to 16B aligned load
- parameters:
- value: 16B data loaded from 16B aligned address.
- offset: Offset of target data address relative to 16B aligned load
- address.
- */
-
-static __inline__ __m128i
-__m128i_shift_right (__m128i value, int offset)
-{
- switch (offset)
- {
- case 1:
- value = _mm_srli_si128 (value, 1);
- break;
- case 2:
- value = _mm_srli_si128 (value, 2);
- break;
- case 3:
- value = _mm_srli_si128 (value, 3);
- break;
- case 4:
- value = _mm_srli_si128 (value, 4);
- break;
- case 5:
- value = _mm_srli_si128 (value, 5);
- break;
- case 6:
- value = _mm_srli_si128 (value, 6);
- break;
- case 7:
- value = _mm_srli_si128 (value, 7);
- break;
- case 8:
- value = _mm_srli_si128 (value, 8);
- break;
- case 9:
- value = _mm_srli_si128 (value, 9);
- break;
- case 10:
- value = _mm_srli_si128 (value, 10);
- break;
- case 11:
- value = _mm_srli_si128 (value, 11);
- break;
- case 12:
- value = _mm_srli_si128 (value, 12);
- break;
- case 13:
- value = _mm_srli_si128 (value, 13);
- break;
- case 14:
- value = _mm_srli_si128 (value, 14);
- break;
- case 15:
- value = _mm_srli_si128 (value, 15);
- break;
- }
- return value;
-}
-
/* Simple replacement of movdqu to address 4KB boundary cross issue.
If EOS occurs within less than 16B before 4KB boundary, we don't
cross to next page. */
diff --git a/sysdeps/x86_64/multiarch/varshift.S b/sysdeps/x86_64/multiarch/varshift.S
new file mode 100644
index 0000000000..b50f98bb55
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/varshift.S
@@ -0,0 +1,30 @@
+/* Helper for variable shifts of SSE registers.
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+
+
+ .section .rodata
+ .hidden ___m128i_shift_right
+ .globl ___m128i_shift_right
+ .size ___m128i_shift_right, 31
+
+___m128i_shift_right:
+ .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ .byte -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
diff --git a/sysdeps/x86_64/multiarch/varshift.h b/sysdeps/x86_64/multiarch/varshift.h
new file mode 100644
index 0000000000..d679739f69
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/varshift.h
@@ -0,0 +1,27 @@
+/* Helper for variable shifts of SSE registers.
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+
+extern char ___m128i_shift_right[31] __attribute__((visibility("hidden")));
+
+/* Shift VALUE right by OFFSET bytes, filling the vacated high bytes
+   with zero.
+
+   A 16-byte shuffle control vector is loaded from
+   ___m128i_shift_right + OFFSET.  That table holds the byte indexes
+   0..15 followed by 15 bytes of -1, and a control byte with its high
+   bit set makes the byte shuffle produce zero for that lane.
+
+   OFFSET must be in the range 0..15; the table is only 31 bytes long,
+   so a larger offset would read past its end.  */
+static __inline__ __m128i
+__m128i_shift_right (__m128i value, unsigned long offset)
+{
+  /* _mm_loadu_si128 takes a pointer to __m128i (integer vector), not
+     __m128 (float vector).  */
+  return _mm_shuffle_epi8 (value,
+                           _mm_loadu_si128 ((__m128i *)
+                                            (___m128i_shift_right + offset)));
+}