summaryrefslogtreecommitdiff
path: root/sysdeps
diff options
context:
space:
mode:
Diffstat (limited to 'sysdeps')
-rwxr-xr-xsysdeps/i386/configure550
-rw-r--r--sysdeps/i386/configure.in8
-rw-r--r--sysdeps/i386/i686/multiarch/Makefile11
-rw-r--r--sysdeps/i386/i686/multiarch/strcasestr-c.c2
-rw-r--r--sysdeps/i386/i686/multiarch/strcasestr.c1
-rw-r--r--sysdeps/i386/i686/multiarch/strcspn-c.c2
-rw-r--r--sysdeps/i386/i686/multiarch/strcspn.S114
-rw-r--r--sysdeps/i386/i686/multiarch/strlen.S154
-rw-r--r--sysdeps/i386/i686/multiarch/strpbrk-c.c2
-rw-r--r--sysdeps/i386/i686/multiarch/strpbrk.S3
-rw-r--r--sysdeps/i386/i686/multiarch/strspn-c.c2
-rw-r--r--sysdeps/i386/i686/multiarch/strspn.S95
-rw-r--r--sysdeps/i386/i686/multiarch/strstr-c.c12
-rw-r--r--sysdeps/i386/i686/multiarch/strstr.c1
-rw-r--r--sysdeps/x86_64/cacheinfo.c42
-rw-r--r--sysdeps/x86_64/dl-trampoline.S244
-rw-r--r--sysdeps/x86_64/dl-trampoline.h269
-rw-r--r--sysdeps/x86_64/multiarch/Makefile2
-rw-r--r--sysdeps/x86_64/multiarch/rawmemchr.S1
-rw-r--r--sysdeps/x86_64/multiarch/strcmp-ssse3.S3
-rw-r--r--sysdeps/x86_64/multiarch/strcmp.S12
-rw-r--r--sysdeps/x86_64/multiarch/strcspn-c.c6
-rw-r--r--sysdeps/x86_64/multiarch/strlen.S1
-rw-r--r--sysdeps/x86_64/multiarch/strncmp-ssse3.S4
-rw-r--r--sysdeps/x86_64/multiarch/strspn-c.c4
-rw-r--r--sysdeps/x86_64/strcmp.S211
26 files changed, 1450 insertions, 306 deletions
diff --git a/sysdeps/i386/configure b/sysdeps/i386/configure
index ced0b31d0f..d1c4f7f501 100755
--- a/sysdeps/i386/configure
+++ b/sysdeps/i386/configure
@@ -1,13 +1,468 @@
+# Factoring default headers for most tests.
+ac_includes_default="\
+#include <stdio.h>
+#ifdef HAVE_SYS_TYPES_H
+# include <sys/types.h>
+#endif
+#ifdef HAVE_SYS_STAT_H
+# include <sys/stat.h>
+#endif
+#ifdef STDC_HEADERS
+# include <stdlib.h>
+# include <stddef.h>
+#else
+# ifdef HAVE_STDLIB_H
+# include <stdlib.h>
+# endif
+#endif
+#ifdef HAVE_STRING_H
+# if !defined STDC_HEADERS && defined HAVE_MEMORY_H
+# include <memory.h>
+# endif
+# include <string.h>
+#endif
+#ifdef HAVE_STRINGS_H
+# include <strings.h>
+#endif
+#ifdef HAVE_INTTYPES_H
+# include <inttypes.h>
+#endif
+#ifdef HAVE_STDINT_H
+# include <stdint.h>
+#endif
+#ifdef HAVE_UNISTD_H
+# include <unistd.h>
+#endif"
+
# This file is generated from configure.in by Autoconf. DO NOT EDIT!
# Local configure fragment for sysdeps/i386.
-{ echo "$as_me:$LINENO: checking if gcc provides <cpuid.h>" >&5
-echo $ECHO_N "checking if gcc provides <cpuid.h>... $ECHO_C" >&6; }
-if test "${libc_cv_gcc_cpuid+set}" = set; then
+
+{ echo "$as_me:$LINENO: checking for grep that handles long lines and -e" >&5
+echo $ECHO_N "checking for grep that handles long lines and -e... $ECHO_C" >&6; }
+if test "${ac_cv_path_GREP+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+ # Extract the first word of "grep ggrep" to use in msg output
+if test -z "$GREP"; then
+set dummy grep ggrep; ac_prog_name=$2
+if test "${ac_cv_path_GREP+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+ ac_path_GREP_found=false
+# Loop through the user's path and test for each of PROGNAME-LIST
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ for ac_prog in grep ggrep; do
+ for ac_exec_ext in '' $ac_executable_extensions; do
+ ac_path_GREP="$as_dir/$ac_prog$ac_exec_ext"
+ { test -f "$ac_path_GREP" && $as_test_x "$ac_path_GREP"; } || continue
+ # Check for GNU ac_path_GREP and select it if it is found.
+ # Check for GNU $ac_path_GREP
+case `"$ac_path_GREP" --version 2>&1` in
+*GNU*)
+ ac_cv_path_GREP="$ac_path_GREP" ac_path_GREP_found=:;;
+*)
+ ac_count=0
+ echo $ECHO_N "0123456789$ECHO_C" >"conftest.in"
+ while :
+ do
+ cat "conftest.in" "conftest.in" >"conftest.tmp"
+ mv "conftest.tmp" "conftest.in"
+ cp "conftest.in" "conftest.nl"
+ echo 'GREP' >> "conftest.nl"
+ "$ac_path_GREP" -e 'GREP$' -e '-(cannot match)-' < "conftest.nl" >"conftest.out" 2>/dev/null || break
+ diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break
+ ac_count=`expr $ac_count + 1`
+ if test $ac_count -gt ${ac_path_GREP_max-0}; then
+ # Best one so far, save it but keep looking for a better one
+ ac_cv_path_GREP="$ac_path_GREP"
+ ac_path_GREP_max=$ac_count
+ fi
+ # 10*(2^10) chars as input seems more than enough
+ test $ac_count -gt 10 && break
+ done
+ rm -f conftest.in conftest.tmp conftest.nl conftest.out;;
+esac
+
+
+ $ac_path_GREP_found && break 3
+ done
+done
+
+done
+IFS=$as_save_IFS
+
+
+fi
+
+GREP="$ac_cv_path_GREP"
+if test -z "$GREP"; then
+ { { echo "$as_me:$LINENO: error: no acceptable $ac_prog_name could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" >&5
+echo "$as_me: error: no acceptable $ac_prog_name could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" >&2;}
+ { (exit 1); exit 1; }; }
+fi
+
+else
+ ac_cv_path_GREP=$GREP
+fi
+
+
+fi
+{ echo "$as_me:$LINENO: result: $ac_cv_path_GREP" >&5
+echo "${ECHO_T}$ac_cv_path_GREP" >&6; }
+ GREP="$ac_cv_path_GREP"
+
+
+{ echo "$as_me:$LINENO: checking for egrep" >&5
+echo $ECHO_N "checking for egrep... $ECHO_C" >&6; }
+if test "${ac_cv_path_EGREP+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+ if echo a | $GREP -E '(a|b)' >/dev/null 2>&1
+ then ac_cv_path_EGREP="$GREP -E"
+ else
+ # Extract the first word of "egrep" to use in msg output
+if test -z "$EGREP"; then
+set dummy egrep; ac_prog_name=$2
+if test "${ac_cv_path_EGREP+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+ ac_path_EGREP_found=false
+# Loop through the user's path and test for each of PROGNAME-LIST
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ for ac_prog in egrep; do
+ for ac_exec_ext in '' $ac_executable_extensions; do
+ ac_path_EGREP="$as_dir/$ac_prog$ac_exec_ext"
+ { test -f "$ac_path_EGREP" && $as_test_x "$ac_path_EGREP"; } || continue
+ # Check for GNU ac_path_EGREP and select it if it is found.
+ # Check for GNU $ac_path_EGREP
+case `"$ac_path_EGREP" --version 2>&1` in
+*GNU*)
+ ac_cv_path_EGREP="$ac_path_EGREP" ac_path_EGREP_found=:;;
+*)
+ ac_count=0
+ echo $ECHO_N "0123456789$ECHO_C" >"conftest.in"
+ while :
+ do
+ cat "conftest.in" "conftest.in" >"conftest.tmp"
+ mv "conftest.tmp" "conftest.in"
+ cp "conftest.in" "conftest.nl"
+ echo 'EGREP' >> "conftest.nl"
+ "$ac_path_EGREP" 'EGREP$' < "conftest.nl" >"conftest.out" 2>/dev/null || break
+ diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break
+ ac_count=`expr $ac_count + 1`
+ if test $ac_count -gt ${ac_path_EGREP_max-0}; then
+ # Best one so far, save it but keep looking for a better one
+ ac_cv_path_EGREP="$ac_path_EGREP"
+ ac_path_EGREP_max=$ac_count
+ fi
+ # 10*(2^10) chars as input seems more than enough
+ test $ac_count -gt 10 && break
+ done
+ rm -f conftest.in conftest.tmp conftest.nl conftest.out;;
+esac
+
+
+ $ac_path_EGREP_found && break 3
+ done
+done
+
+done
+IFS=$as_save_IFS
+
+
+fi
+
+EGREP="$ac_cv_path_EGREP"
+if test -z "$EGREP"; then
+ { { echo "$as_me:$LINENO: error: no acceptable $ac_prog_name could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" >&5
+echo "$as_me: error: no acceptable $ac_prog_name could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" >&2;}
+ { (exit 1); exit 1; }; }
+fi
+
+else
+ ac_cv_path_EGREP=$EGREP
+fi
+
+
+ fi
+fi
+{ echo "$as_me:$LINENO: result: $ac_cv_path_EGREP" >&5
+echo "${ECHO_T}$ac_cv_path_EGREP" >&6; }
+ EGREP="$ac_cv_path_EGREP"
+
+
+{ echo "$as_me:$LINENO: checking for ANSI C header files" >&5
+echo $ECHO_N "checking for ANSI C header files... $ECHO_C" >&6; }
+if test "${ac_cv_header_stdc+set}" = set; then
echo $ECHO_N "(cached) $ECHO_C" >&6
else
cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h. */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h. */
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <float.h>
+
+int
+main ()
+{
+
+ ;
+ return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+ (eval "$ac_compile") 2>conftest.er1
+ ac_status=$?
+ grep -v '^ *+' conftest.er1 >conftest.err
+ rm -f conftest.er1
+ cat conftest.err >&5
+ echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); } && {
+ test -z "$ac_c_werror_flag" ||
+ test ! -s conftest.err
+ } && test -s conftest.$ac_objext; then
+ ac_cv_header_stdc=yes
+else
+ echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+ ac_cv_header_stdc=no
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+
+if test $ac_cv_header_stdc = yes; then
+ # SunOS 4.x string.h does not declare mem*, contrary to ANSI.
+ cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h. */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h. */
+#include <string.h>
+
+_ACEOF
+if (eval "$ac_cpp conftest.$ac_ext") 2>&5 |
+ $EGREP "memchr" >/dev/null 2>&1; then
+ :
+else
+ ac_cv_header_stdc=no
+fi
+rm -f conftest*
+
+fi
+
+if test $ac_cv_header_stdc = yes; then
+ # ISC 2.0.2 stdlib.h does not declare free, contrary to ANSI.
+ cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h. */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h. */
+#include <stdlib.h>
+
+_ACEOF
+if (eval "$ac_cpp conftest.$ac_ext") 2>&5 |
+ $EGREP "free" >/dev/null 2>&1; then
+ :
+else
+ ac_cv_header_stdc=no
+fi
+rm -f conftest*
+
+fi
+
+if test $ac_cv_header_stdc = yes; then
+ # /bin/cc in Irix-4.0.5 gets non-ANSI ctype macros unless using -ansi.
+ if test "$cross_compiling" = yes; then
+ :
+else
+ cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h. */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h. */
+#include <ctype.h>
+#include <stdlib.h>
+#if ((' ' & 0x0FF) == 0x020)
+# define ISLOWER(c) ('a' <= (c) && (c) <= 'z')
+# define TOUPPER(c) (ISLOWER(c) ? 'A' + ((c) - 'a') : (c))
+#else
+# define ISLOWER(c) \
+ (('a' <= (c) && (c) <= 'i') \
+ || ('j' <= (c) && (c) <= 'r') \
+ || ('s' <= (c) && (c) <= 'z'))
+# define TOUPPER(c) (ISLOWER(c) ? ((c) | 0x40) : (c))
+#endif
+
+#define XOR(e, f) (((e) && !(f)) || (!(e) && (f)))
+int
+main ()
+{
+ int i;
+ for (i = 0; i < 256; i++)
+ if (XOR (islower (i), ISLOWER (i))
+ || toupper (i) != TOUPPER (i))
+ return 2;
+ return 0;
+}
+_ACEOF
+rm -f conftest$ac_exeext
+if { (ac_try="$ac_link"
+case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+ (eval "$ac_link") 2>&5
+ ac_status=$?
+ echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); } && { ac_try='./conftest$ac_exeext'
+ { (case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+ (eval "$ac_try") 2>&5
+ ac_status=$?
+ echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); }; }; then
+ :
+else
+ echo "$as_me: program exited with status $ac_status" >&5
+echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+( exit $ac_status )
+ac_cv_header_stdc=no
+fi
+rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext conftest.$ac_objext conftest.$ac_ext
+fi
+
+
+fi
+fi
+{ echo "$as_me:$LINENO: result: $ac_cv_header_stdc" >&5
+echo "${ECHO_T}$ac_cv_header_stdc" >&6; }
+if test $ac_cv_header_stdc = yes; then
+
+cat >>confdefs.h <<\_ACEOF
+#define STDC_HEADERS 1
+_ACEOF
+
+fi
+
+# On IRIX 5.3, sys/types and inttypes.h are conflicting.
+
+
+
+
+
+
+
+
+
+for ac_header in sys/types.h sys/stat.h stdlib.h string.h memory.h strings.h \
+ inttypes.h stdint.h unistd.h
+do
+as_ac_Header=`echo "ac_cv_header_$ac_header" | $as_tr_sh`
+{ echo "$as_me:$LINENO: checking for $ac_header" >&5
+echo $ECHO_N "checking for $ac_header... $ECHO_C" >&6; }
+if { as_var=$as_ac_Header; eval "test \"\${$as_var+set}\" = set"; }; then
+ echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+ cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h. */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h. */
+$ac_includes_default
+
+#include <$ac_header>
+_ACEOF
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+ (eval "$ac_compile") 2>conftest.er1
+ ac_status=$?
+ grep -v '^ *+' conftest.er1 >conftest.err
+ rm -f conftest.er1
+ cat conftest.err >&5
+ echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); } && {
+ test -z "$ac_c_werror_flag" ||
+ test ! -s conftest.err
+ } && test -s conftest.$ac_objext; then
+ eval "$as_ac_Header=yes"
+else
+ echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+ eval "$as_ac_Header=no"
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+ac_res=`eval echo '${'$as_ac_Header'}'`
+ { echo "$as_me:$LINENO: result: $ac_res" >&5
+echo "${ECHO_T}$ac_res" >&6; }
+if test `eval echo '${'$as_ac_Header'}'` = yes; then
+ cat >>confdefs.h <<_ACEOF
+#define `echo "HAVE_$ac_header" | $as_tr_cpp` 1
+_ACEOF
+
+fi
+
+done
+
+
+if test "${ac_cv_header_cpuid_h+set}" = set; then
+ { echo "$as_me:$LINENO: checking for cpuid.h" >&5
+echo $ECHO_N "checking for cpuid.h... $ECHO_C" >&6; }
+if test "${ac_cv_header_cpuid_h+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" >&6
+fi
+{ echo "$as_me:$LINENO: result: $ac_cv_header_cpuid_h" >&5
+echo "${ECHO_T}$ac_cv_header_cpuid_h" >&6; }
+else
+ # Is the header compilable?
+{ echo "$as_me:$LINENO: checking cpuid.h usability" >&5
+echo $ECHO_N "checking cpuid.h usability... $ECHO_C" >&6; }
+cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h. */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h. */
+$ac_includes_default
#include <cpuid.h>
_ACEOF
rm -f conftest.$ac_objext
@@ -27,24 +482,103 @@ eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
test -z "$ac_c_werror_flag" ||
test ! -s conftest.err
} && test -s conftest.$ac_objext; then
- libc_cv_gcc_cpuid=yes
+ ac_header_compiler=yes
else
echo "$as_me: failed program was:" >&5
sed 's/^/| /' conftest.$ac_ext >&5
- libc_cv_gcc_cpuid=no
+ ac_header_compiler=no
fi
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+{ echo "$as_me:$LINENO: result: $ac_header_compiler" >&5
+echo "${ECHO_T}$ac_header_compiler" >&6; }
+
+# Is the header present?
+{ echo "$as_me:$LINENO: checking cpuid.h presence" >&5
+echo $ECHO_N "checking cpuid.h presence... $ECHO_C" >&6; }
+cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h. */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h. */
+#include <cpuid.h>
+_ACEOF
+if { (ac_try="$ac_cpp conftest.$ac_ext"
+case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+ (eval "$ac_cpp conftest.$ac_ext") 2>conftest.er1
+ ac_status=$?
+ grep -v '^ *+' conftest.er1 >conftest.err
+ rm -f conftest.er1
+ cat conftest.err >&5
+ echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); } >/dev/null && {
+ test -z "$ac_c_preproc_warn_flag$ac_c_werror_flag" ||
+ test ! -s conftest.err
+ }; then
+ ac_header_preproc=yes
+else
+ echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+ ac_header_preproc=no
fi
-{ echo "$as_me:$LINENO: result: $libc_cv_gcc_cpuid" >&5
-echo "${ECHO_T}$libc_cv_gcc_cpuid" >&6; }
-if test $libc_cv_gcc_cpuid != yes; then
+
+rm -f conftest.err conftest.$ac_ext
+{ echo "$as_me:$LINENO: result: $ac_header_preproc" >&5
+echo "${ECHO_T}$ac_header_preproc" >&6; }
+
+# So? What about this header?
+case $ac_header_compiler:$ac_header_preproc:$ac_c_preproc_warn_flag in
+ yes:no: )
+ { echo "$as_me:$LINENO: WARNING: cpuid.h: accepted by the compiler, rejected by the preprocessor!" >&5
+echo "$as_me: WARNING: cpuid.h: accepted by the compiler, rejected by the preprocessor!" >&2;}
+ { echo "$as_me:$LINENO: WARNING: cpuid.h: proceeding with the compiler's result" >&5
+echo "$as_me: WARNING: cpuid.h: proceeding with the compiler's result" >&2;}
+ ac_header_preproc=yes
+ ;;
+ no:yes:* )
+ { echo "$as_me:$LINENO: WARNING: cpuid.h: present but cannot be compiled" >&5
+echo "$as_me: WARNING: cpuid.h: present but cannot be compiled" >&2;}
+ { echo "$as_me:$LINENO: WARNING: cpuid.h: check for missing prerequisite headers?" >&5
+echo "$as_me: WARNING: cpuid.h: check for missing prerequisite headers?" >&2;}
+ { echo "$as_me:$LINENO: WARNING: cpuid.h: see the Autoconf documentation" >&5
+echo "$as_me: WARNING: cpuid.h: see the Autoconf documentation" >&2;}
+ { echo "$as_me:$LINENO: WARNING: cpuid.h: section \"Present But Cannot Be Compiled\"" >&5
+echo "$as_me: WARNING: cpuid.h: section \"Present But Cannot Be Compiled\"" >&2;}
+ { echo "$as_me:$LINENO: WARNING: cpuid.h: proceeding with the preprocessor's result" >&5
+echo "$as_me: WARNING: cpuid.h: proceeding with the preprocessor's result" >&2;}
+ { echo "$as_me:$LINENO: WARNING: cpuid.h: in the future, the compiler will take precedence" >&5
+echo "$as_me: WARNING: cpuid.h: in the future, the compiler will take precedence" >&2;}
+
+ ;;
+esac
+{ echo "$as_me:$LINENO: checking for cpuid.h" >&5
+echo $ECHO_N "checking for cpuid.h... $ECHO_C" >&6; }
+if test "${ac_cv_header_cpuid_h+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+ ac_cv_header_cpuid_h=$ac_header_preproc
+fi
+{ echo "$as_me:$LINENO: result: $ac_cv_header_cpuid_h" >&5
+echo "${ECHO_T}$ac_cv_header_cpuid_h" >&6; }
+
+fi
+if test $ac_cv_header_cpuid_h = yes; then
+ :
+else
{ { echo "$as_me:$LINENO: error: gcc must provide the <cpuid.h> header" >&5
echo "$as_me: error: gcc must provide the <cpuid.h> header" >&2;}
{ (exit 1); exit 1; }; }
fi
+
+
{ echo "$as_me:$LINENO: checking if -g produces usable source locations for assembler-with-cpp" >&5
echo $ECHO_N "checking if -g produces usable source locations for assembler-with-cpp... $ECHO_C" >&6; }
if test "${libc_cv_cpp_asm_debuginfo+set}" = set; then
diff --git a/sysdeps/i386/configure.in b/sysdeps/i386/configure.in
index 800f928fbd..12dceaf844 100644
--- a/sysdeps/i386/configure.in
+++ b/sysdeps/i386/configure.in
@@ -1,12 +1,8 @@
GLIBC_PROVIDES dnl See aclocal.m4 in the top level source directory.
# Local configure fragment for sysdeps/i386.
-AC_CACHE_CHECK([if gcc provides <cpuid.h>], libc_cv_gcc_cpuid, [dnl
-AC_COMPILE_IFELSE([#include <cpuid.h>], libc_cv_gcc_cpuid=yes,
- libc_cv_gcc_cpuid=no)])
-if test $libc_cv_gcc_cpuid != yes; then
- AC_MSG_ERROR([gcc must provide the <cpuid.h> header])
-fi
+AC_HEADER_CHECK([cpuid.h], ,
+ [AC_MSG_ERROR([gcc must provide the <cpuid.h> header])])
AC_CACHE_CHECK(if -g produces usable source locations for assembler-with-cpp,
libc_cv_cpp_asm_debuginfo, [dnl
diff --git a/sysdeps/i386/i686/multiarch/Makefile b/sysdeps/i386/i686/multiarch/Makefile
index 33d98c36e6..e1553b284e 100644
--- a/sysdeps/i386/i686/multiarch/Makefile
+++ b/sysdeps/i386/i686/multiarch/Makefile
@@ -2,3 +2,14 @@ ifeq ($(subdir),csu)
aux += init-arch
gen-as-const-headers += ifunc-defines.sym
endif
+
+ifeq ($(subdir),string)
+ifeq (yes,$(config-cflags-sse4))
+sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c
+CFLAGS-strcspn-c.c += -msse4
+CFLAGS-strpbrk-c.c += -msse4
+CFLAGS-strspn-c.c += -msse4
+CFLAGS-strstr.c += -msse4
+CFLAGS-strcasestr.c += -msse4
+endif
+endif
diff --git a/sysdeps/i386/i686/multiarch/strcasestr-c.c b/sysdeps/i386/i686/multiarch/strcasestr-c.c
new file mode 100644
index 0000000000..0d52b0e47a
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strcasestr-c.c
@@ -0,0 +1,2 @@
+#define __strcasestr_sse2 __strcasestr_ia32
+#include <sysdeps/x86_64/multiarch/strcasestr-c.c>
diff --git a/sysdeps/i386/i686/multiarch/strcasestr.c b/sysdeps/i386/i686/multiarch/strcasestr.c
new file mode 100644
index 0000000000..511bb29ede
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strcasestr.c
@@ -0,0 +1 @@
+#include <sysdeps/x86_64/multiarch/strcasestr.c>
diff --git a/sysdeps/i386/i686/multiarch/strcspn-c.c b/sysdeps/i386/i686/multiarch/strcspn-c.c
new file mode 100644
index 0000000000..6d61e190a8
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strcspn-c.c
@@ -0,0 +1,2 @@
+#define __strcspn_sse2 __strcspn_ia32
+#include <sysdeps/x86_64/multiarch/strcspn-c.c>
diff --git a/sysdeps/i386/i686/multiarch/strcspn.S b/sysdeps/i386/i686/multiarch/strcspn.S
new file mode 100644
index 0000000000..73e7eb45a8
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strcspn.S
@@ -0,0 +1,114 @@
+/* Multiple versions of strcspn
+ Copyright (C) 2009 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <config.h>
+
+#ifdef HAVE_SSE4_SUPPORT
+
+#include <sysdep.h>
+#include <ifunc-defines.h>
+
+#ifdef USE_AS_STRPBRK
+#define STRCSPN_SSE42 __strpbrk_sse42
+#define STRCSPN_IA32 __strpbrk_ia32
+#define __GI_STRCSPN __GI_strpbrk
+#else
+#ifndef STRCSPN
+#define STRCSPN strcspn
+#define STRCSPN_SSE42 __strcspn_sse42
+#define STRCSPN_IA32 __strcspn_ia32
+#define __GI_STRCSPN __GI_strcspn
+#endif
+#endif
+
+/* Define multiple versions only for the definition in libc. Don't
+ define multiple versions for strpbrk in static library since we
+ need strpbrk before the initialization happened. */
+#if (defined SHARED || !defined USE_AS_STRPBRK) && !defined NOT_IN_libc
+# ifdef SHARED
+ .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+ .globl __i686.get_pc_thunk.bx
+ .hidden __i686.get_pc_thunk.bx
+ .p2align 4
+ .type __i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+ movl (%esp), %ebx
+ ret
+
+ .text
+ENTRY(STRCSPN)
+ .type STRCSPN, @gnu_indirect_function
+ pushl %ebx
+ cfi_adjust_cfa_offset (4)
+ cfi_rel_offset (ebx, 0)
+ call __i686.get_pc_thunk.bx
+ addl $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+ jne 1f
+ call __init_cpu_features
+1: leal STRCSPN_IA32@GOTOFF(%ebx), %eax
+ testl $(1<<20), CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET+__cpu_features@GOTOFF(%ebx)
+ jz 2f
+ leal STRCSPN_SSE42@GOTOFF(%ebx), %eax
+2: popl %ebx
+ cfi_adjust_cfa_offset (-4);
+ cfi_restore (ebx)
+ ret
+END(STRCSPN)
+# else
+ .text
+ENTRY(STRCSPN)
+ .type STRCSPN, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features
+ jne 1f
+ call __init_cpu_features
+1: leal STRCSPN_IA32, %eax
+ testl $(1<<20), CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET+__cpu_features
+ jz 2f
+ leal STRCSPN_SSE42, %eax
+2: ret
+END(STRCSPN)
+# endif
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type STRCSPN_IA32, @function; \
+ .globl STRCSPN_IA32; \
+ .p2align 4; \
+ STRCSPN_IA32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size STRCSPN_IA32, .-STRCSPN_IA32
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in shared library since
+ they will be called without setting up EBX needed for PLT which is
+ used by IFUNC. */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_STRCSPN; __GI_STRCSPN = STRCSPN_IA32
+#endif
+
+#endif /* HAVE_SSE4_SUPPORT */
+
+#ifdef USE_AS_STRPBRK
+#include "../../strpbrk.S"
+#else
+#include "../../strcspn.S"
+#endif
diff --git a/sysdeps/i386/i686/multiarch/strlen.S b/sysdeps/i386/i686/multiarch/strlen.S
new file mode 100644
index 0000000000..0c1e8646ff
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strlen.S
@@ -0,0 +1,154 @@
+/* Multiple versions of strlen
+ Copyright (C) 2009 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+#include <ifunc-defines.h>
+
+/* Define multiple versions only for the definition in libc and for the
+ DSO. In static binaries, we need strlen before the initialization
+ happened. */
+#if defined SHARED && !defined NOT_IN_libc
+ .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+ .globl __i686.get_pc_thunk.bx
+ .hidden __i686.get_pc_thunk.bx
+ .p2align 4
+ .type __i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+ movl (%esp), %ebx
+ ret
+
+ .text
+ENTRY(strlen)
+ .type strlen, @gnu_indirect_function
+ pushl %ebx
+ cfi_adjust_cfa_offset (4)
+ cfi_rel_offset (ebx, 0)
+ call __i686.get_pc_thunk.bx
+ addl $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+ jne 1f
+ call __init_cpu_features
+1: leal __strlen_ia32@GOTOFF(%ebx), %eax
+ testl $(1<<26), CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_EDX_OFFSET+__cpu_features@GOTOFF(%ebx)
+ jz 2f
+ leal __strlen_sse2@GOTOFF(%ebx), %eax
+2: popl %ebx
+ cfi_adjust_cfa_offset (-4);
+ cfi_restore (ebx)
+ ret
+END(strlen)
+
+#define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+#define RETURN popl %esi; CFI_POP (esi); ret
+
+ .text
+ENTRY (__strlen_sse2)
+/*
+ * This implementation uses SSE instructions to compare up to 16 bytes
+ * at a time looking for the end of string (null char).
+ */
+ pushl %esi
+ cfi_adjust_cfa_offset (4)
+ cfi_rel_offset (%esi, 0)
+ mov 8(%esp), %eax
+ mov %eax, %ecx
+ pxor %xmm0, %xmm0 /* 16 null chars */
+ mov %eax, %esi
+ and $15, %ecx
+ jz 1f /* string is 16 byte aligned */
+
+ /*
+ * Unaligned case. Round down to 16-byte boundary before comparing
+ * 16 bytes for a null char. The code then compensates for any extra chars
+ * preceding the start of the string.
+ */
+ and $-16, %esi
+
+ pcmpeqb (%esi), %xmm0
+ lea 16(%eax), %esi
+ pmovmskb %xmm0, %edx
+
+ shr %cl, %edx /* Compensate for bytes preceding the string */
+ test %edx, %edx
+ jnz 2f
+ sub %ecx, %esi /* no null, adjust to next 16-byte boundary */
+ pxor %xmm0, %xmm0 /* clear xmm0, may have been changed... */
+
+ .p2align 4
+1: /* 16 byte aligned */
+ pcmpeqb (%esi), %xmm0 /* look for null bytes */
+ pmovmskb %xmm0, %edx /* move each byte mask of %xmm0 to edx */
+
+ add $16, %esi /* prepare to search next 16 bytes */
+ test %edx, %edx /* if no null byte, %edx must be 0 */
+ jnz 2f /* found a null */
+
+ pcmpeqb (%esi), %xmm0
+ pmovmskb %xmm0, %edx
+ add $16, %esi
+ test %edx, %edx
+ jnz 2f
+
+ pcmpeqb (%esi), %xmm0
+ pmovmskb %xmm0, %edx
+ add $16, %esi
+ test %edx, %edx
+ jnz 2f
+
+ pcmpeqb (%esi), %xmm0
+ pmovmskb %xmm0, %edx
+ add $16, %esi
+ test %edx, %edx
+ jz 1b
+
+2:
+ neg %eax
+ lea -16(%eax, %esi), %eax /* calculate exact offset */
+ bsf %edx, %ecx /* Least significant 1 bit is index of null */
+ add %ecx, %eax
+ popl %esi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (%esi)
+ ret
+
+END (__strlen_sse2)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __strlen_ia32, @function; \
+ .globl __strlen_ia32; \
+ .p2align 4; \
+ __strlen_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __strlen_ia32, .-__strlen_ia32
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in shared library since
+ they will be called without setting up EBX needed for PLT which is
+ used by IFUNC. */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_strlen; __GI_strlen = __strlen_ia32
+#endif
+
+#include "../../i586/strlen.S"
diff --git a/sysdeps/i386/i686/multiarch/strpbrk-c.c b/sysdeps/i386/i686/multiarch/strpbrk-c.c
new file mode 100644
index 0000000000..5db62053b3
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strpbrk-c.c
@@ -0,0 +1,2 @@
+#define __strpbrk_sse2 __strpbrk_ia32
+#include <sysdeps/x86_64/multiarch/strpbrk-c.c>
diff --git a/sysdeps/i386/i686/multiarch/strpbrk.S b/sysdeps/i386/i686/multiarch/strpbrk.S
new file mode 100644
index 0000000000..ed5bca6a94
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strpbrk.S
@@ -0,0 +1,3 @@
+#define STRCSPN strpbrk
+#define USE_AS_STRPBRK
+#include "strcspn.S"
diff --git a/sysdeps/i386/i686/multiarch/strspn-c.c b/sysdeps/i386/i686/multiarch/strspn-c.c
new file mode 100644
index 0000000000..bea09dea71
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strspn-c.c
@@ -0,0 +1,2 @@
+#define __strspn_sse2 __strspn_ia32
+#include <sysdeps/x86_64/multiarch/strspn-c.c>
diff --git a/sysdeps/i386/i686/multiarch/strspn.S b/sysdeps/i386/i686/multiarch/strspn.S
new file mode 100644
index 0000000000..f306d2d1fb
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strspn.S
@@ -0,0 +1,95 @@
+/* Multiple versions of strspn
+ Copyright (C) 2009 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <config.h>
+
+#ifdef HAVE_SSE4_SUPPORT
+
+#include <sysdep.h>
+#include <ifunc-defines.h>
+
+/* Define multiple versions only for the definition in libc. */
+#ifndef NOT_IN_libc
+# ifdef SHARED
+ .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+ .globl __i686.get_pc_thunk.bx
+ .hidden __i686.get_pc_thunk.bx
+ .p2align 4
+ .type __i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+ movl (%esp), %ebx
+ ret
+
+ .text
+ENTRY(strspn)
+ .type strspn, @gnu_indirect_function
+ pushl %ebx
+ cfi_adjust_cfa_offset (4)
+ cfi_rel_offset (ebx, 0)
+ call __i686.get_pc_thunk.bx
+ addl $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+ jne 1f
+ call __init_cpu_features
+1: leal __strspn_ia32@GOTOFF(%ebx), %eax
+ testl $(1<<20), CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET+__cpu_features@GOTOFF(%ebx)
+ jz 2f
+ leal __strspn_sse42@GOTOFF(%ebx), %eax
+2: popl %ebx
+ cfi_adjust_cfa_offset (-4);
+ cfi_restore (ebx)
+ ret
+END(strspn)
+# else
+ .text
+ENTRY(strspn)
+ .type strspn, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features
+ jne 1f
+ call __init_cpu_features
+1: leal __strspn_ia32, %eax
+ testl $(1<<20), CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET+__cpu_features
+ jz 2f
+ leal __strspn_sse42, %eax
+2: ret
+END(strspn)
+# endif
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __strspn_ia32, @function; \
+ .globl __strspn_ia32; \
+ .p2align 4
+ __strspn_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __strspn_ia32, .-__strspn_ia32
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in shared library since
+ they will be called without setting up EBX needed for PLT which is
+ used by IFUNC. */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_strspn; __GI_strspn = __strspn_ia32
+#endif
+
+#endif /* HAVE_SSE4_SUPPORT */
+
+#include "../../strspn.S"
diff --git a/sysdeps/i386/i686/multiarch/strstr-c.c b/sysdeps/i386/i686/multiarch/strstr-c.c
new file mode 100644
index 0000000000..7ef1157ce4
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strstr-c.c
@@ -0,0 +1,12 @@
+#include "init-arch.h"
+
+#define STRSTR __strstr_ia32
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name) \
+ __hidden_ver1 (__strstr_ia32, __GI_strstr, __strstr_ia32);
+
+#include "string/strstr.c"
+
+extern char *__strstr_sse42 (const char *, const char *);
+
+libc_ifunc (strstr, HAS_SSE4_2 ? __strstr_sse42 : __strstr_ia32);
diff --git a/sysdeps/i386/i686/multiarch/strstr.c b/sysdeps/i386/i686/multiarch/strstr.c
new file mode 100644
index 0000000000..a97428c125
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strstr.c
@@ -0,0 +1 @@
+#include <sysdeps/x86_64/multiarch/strstr.c>
diff --git a/sysdeps/x86_64/cacheinfo.c b/sysdeps/x86_64/cacheinfo.c
index f252fc2c6c..5b66c62eb3 100644
--- a/sysdeps/x86_64/cacheinfo.c
+++ b/sysdeps/x86_64/cacheinfo.c
@@ -516,13 +516,15 @@ init_cacheinfo (void)
shared = handle_intel (_SC_LEVEL2_CACHE_SIZE, max_cpuid);
}
+ unsigned int ebx_1;
+
#ifdef USE_MULTIARCH
eax = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].eax;
- ebx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ebx;
+ ebx_1 = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ebx;
ecx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx;
edx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].edx;
#else
- __cpuid (1, eax, ebx, ecx, edx);
+ __cpuid (1, eax, ebx_1, ecx, edx);
#endif
#ifndef DISABLE_PREFERRED_MEMORY_INSTRUCTION
@@ -554,14 +556,46 @@ init_cacheinfo (void)
}
while (((eax >> 5) & 0x7) != level);
- threads = ((eax >> 14) & 0x3ff) + 1;
+ threads = (eax >> 14) & 0x3ff;
+
+ /* If max_cpuid >= 11, THREADS is the maximum number of
+ addressable IDs for logical processors sharing the
+ cache, instead of the maximum number of threads
+ sharing the cache. */
+ if (threads && max_cpuid >= 11)
+ {
+ /* Find the number of logical processors shipped in
+ one core and apply count mask. */
+ i = 0;
+ while (1)
+ {
+ __cpuid_count (11, i++, eax, ebx, ecx, edx);
+
+ int shipped = ebx & 0xff;
+ int type = ecx & 0xff0;
+ if (shipped == 0 || type == 0)
+ break;
+ else if (type == 0x200)
+ {
+ int count_mask;
+
+ /* Compute count mask. */
+ asm ("bsr %1, %0"
+ : "=r" (count_mask) : "g" (threads));
+ count_mask = ~(-1 << (count_mask + 1));
+ threads = (shipped - 1) & count_mask;
+ break;
+ }
+ }
+ }
+ threads += 1;
}
else
{
intel_bug_no_cache_info:
/* Assume that all logical threads share the highest cache level. */
- threads = (ebx >> 16) & 0xff;
+ threads = (ebx_1 >> 16) & 0xff;
}
/* Cap usage of highest cache level to the number of supported
diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
index 20da6956f1..f9c60ad5cf 100644
--- a/sysdeps/x86_64/dl-trampoline.S
+++ b/sysdeps/x86_64/dl-trampoline.S
@@ -146,247 +146,17 @@ L(have_avx):
2: movl %eax, L(have_avx)(%rip)
cmpl $0, %eax
-1: js L(no_avx1)
+1: js L(no_avx)
- /* This is to support AVX audit modules. */
- vmovdqu %ymm0, (LR_VECTOR_OFFSET)(%rsp)
- vmovdqu %ymm1, (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp)
- vmovdqu %ymm2, (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp)
- vmovdqu %ymm3, (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp)
- vmovdqu %ymm4, (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp)
- vmovdqu %ymm5, (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp)
- vmovdqu %ymm6, (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp)
- vmovdqu %ymm7, (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp)
+# define RESTORE_AVX
+# include "dl-trampoline.h"
- /* Save xmm0-xmm7 registers to detect if any of them are
- changed by audit module. */
- vmovdqa %xmm0, (LR_SIZE)(%rsp)
- vmovdqa %xmm1, (LR_SIZE + XMM_SIZE)(%rsp)
- vmovdqa %xmm2, (LR_SIZE + XMM_SIZE*2)(%rsp)
- vmovdqa %xmm3, (LR_SIZE + XMM_SIZE*3)(%rsp)
- vmovdqa %xmm4, (LR_SIZE + XMM_SIZE*4)(%rsp)
- vmovdqa %xmm5, (LR_SIZE + XMM_SIZE*5)(%rsp)
- vmovdqa %xmm6, (LR_SIZE + XMM_SIZE*6)(%rsp)
- vmovdqa %xmm7, (LR_SIZE + XMM_SIZE*7)(%rsp)
-
-L(no_avx1):
-# endif
-
- movq %rsp, %rcx # La_x86_64_regs pointer to %rcx.
- movq 48(%rbx), %rdx # Load return address if needed.
- movq 40(%rbx), %rsi # Copy args pushed by PLT in register.
- movq 32(%rbx), %rdi # %rdi: link_map, %rsi: reloc_index
- leaq 16(%rbx), %r8
- call _dl_profile_fixup # Call resolver.
-
- movq %rax, %r11 # Save return value.
-
- movq 8(%rbx), %rax # Get back register content.
- movq LR_RDX_OFFSET(%rsp), %rdx
- movq LR_R8_OFFSET(%rsp), %r8
- movq LR_R9_OFFSET(%rsp), %r9
-
- movaps (LR_XMM_OFFSET)(%rsp), %xmm0
- movaps (LR_XMM_OFFSET + XMM_SIZE)(%rsp), %xmm1
- movaps (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp), %xmm2
- movaps (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp), %xmm3
- movaps (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp), %xmm4
- movaps (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp), %xmm5
- movaps (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6
- movaps (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7
-
-# ifdef HAVE_AVX_SUPPORT
- cmpl $0, L(have_avx)(%rip)
- js L(no_avx2)
-
- /* Check if any xmm0-xmm7 registers are changed by audit
- module. */
- vpcmpeqq (LR_SIZE)(%rsp), %xmm0, %xmm8
- vpmovmskb %xmm8, %esi
- cmpl $0xffff, %esi
- jne 1f
- vmovdqu (LR_VECTOR_OFFSET)(%rsp), %ymm0
-
-1: vpcmpeqq (LR_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm8
- vpmovmskb %xmm8, %esi
- cmpl $0xffff, %esi
- jne 1f
- vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp), %ymm1
-
-1: vpcmpeqq (LR_SIZE + XMM_SIZE*2)(%rsp), %xmm2, %xmm8
- vpmovmskb %xmm8, %esi
- cmpl $0xffff, %esi
- jne 1f
- vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp), %ymm2
-
-1: vpcmpeqq (LR_SIZE + XMM_SIZE*3)(%rsp), %xmm3, %xmm8
- vpmovmskb %xmm8, %esi
- cmpl $0xffff, %esi
- jne 1f
- vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp), %ymm3
-
-1: vpcmpeqq (LR_SIZE + XMM_SIZE*4)(%rsp), %xmm4, %xmm8
- vpmovmskb %xmm8, %esi
- cmpl $0xffff, %esi
- jne 1f
- vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp), %ymm4
-
-1: vpcmpeqq (LR_SIZE + XMM_SIZE*5)(%rsp), %xmm5, %xmm8
- vpmovmskb %xmm8, %esi
- cmpl $0xffff, %esi
- jne 1f
- vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp), %ymm5
-
-1: vpcmpeqq (LR_SIZE + XMM_SIZE*6)(%rsp), %xmm6, %xmm8
- vpmovmskb %xmm8, %esi
- cmpl $0xffff, %esi
- jne 1f
- vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp), %ymm6
-
-1: vpcmpeqq (LR_SIZE + XMM_SIZE*7)(%rsp), %xmm7, %xmm8
- vpmovmskb %xmm8, %esi
- cmpl $0xffff, %esi
- jne 1f
- vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp), %ymm7
-
-L(no_avx2):
-1:
-# endif
- movq 16(%rbx), %r10 # Anything in framesize?
- testq %r10, %r10
- jns 3f
-
- /* There's nothing in the frame size, so there
- will be no call to the _dl_call_pltexit. */
-
- /* Get back registers content. */
- movq LR_RCX_OFFSET(%rsp), %rcx
- movq LR_RSI_OFFSET(%rsp), %rsi
- movq LR_RDI_OFFSET(%rsp), %rdi
-
- movq %rbx, %rsp
- movq (%rsp), %rbx
- cfi_restore(rbx)
- cfi_def_cfa_register(%rsp)
-
- addq $48, %rsp # Adjust the stack to the return value
- # (eats the reloc index and link_map)
- cfi_adjust_cfa_offset(-48)
- jmp *%r11 # Jump to function address.
-
-3:
- cfi_adjust_cfa_offset(48)
- cfi_rel_offset(%rbx, 0)
- cfi_def_cfa_register(%rbx)
-
- /* At this point we need to prepare new stack for the function
- which has to be called. We copy the original stack to a
- temporary buffer of the size specified by the 'framesize'
- returned from _dl_profile_fixup */
-
- leaq LR_RSP_OFFSET(%rbx), %rsi # stack
- addq $8, %r10
- andq $0xfffffffffffffff0, %r10
- movq %r10, %rcx
- subq %r10, %rsp
- movq %rsp, %rdi
- shrq $3, %rcx
- rep
- movsq
-
- movq 24(%rdi), %rcx # Get back register content.
- movq 32(%rdi), %rsi
- movq 40(%rdi), %rdi
-
- call *%r11
-
- mov 24(%rbx), %rsp # Drop the copied stack content
-
- /* Now we have to prepare the La_x86_64_retval structure for the
- _dl_call_pltexit. The La_x86_64_regs is being pointed by rsp now,
- so we just need to allocate the sizeof(La_x86_64_retval) space on
- the stack, since the alignment has already been taken care of. */
-# ifdef HAVE_AVX_SUPPORT
- /* sizeof(La_x86_64_retval). Need extra space for 2 SSE
- registers to detect if xmm0/xmm1 registers are changed
- by audit module. */
- subq $(LRV_SIZE + XMM_SIZE*2), %rsp
-# else
- subq $LRV_SIZE, %rsp # sizeof(La_x86_64_retval)
-# endif
- movq %rsp, %rcx # La_x86_64_retval argument to %rcx.
-
- /* Fill in the La_x86_64_retval structure. */
- movq %rax, LRV_RAX_OFFSET(%rcx)
- movq %rdx, LRV_RDX_OFFSET(%rcx)
-
- movaps %xmm0, LRV_XMM0_OFFSET(%rcx)
- movaps %xmm1, LRV_XMM1_OFFSET(%rcx)
-
-# ifdef HAVE_AVX_SUPPORT
- cmpl $0, L(have_avx)(%rip)
- js L(no_avx3)
-
- /* This is to support AVX audit modules. */
- vmovdqu %ymm0, LRV_VECTOR0_OFFSET(%rcx)
- vmovdqu %ymm1, LRV_VECTOR1_OFFSET(%rcx)
-
- /* Save xmm0/xmm1 registers to detect if they are changed
- by audit module. */
- vmovdqa %xmm0, (LRV_SIZE)(%rcx)
- vmovdqa %xmm1, (LRV_SIZE + XMM_SIZE)(%rcx)
-
-L(no_avx3):
-# endif
-
- fstpt LRV_ST0_OFFSET(%rcx)
- fstpt LRV_ST1_OFFSET(%rcx)
-
- movq 24(%rbx), %rdx # La_x86_64_regs argument to %rdx.
- movq 40(%rbx), %rsi # Copy args pushed by PLT in register.
- movq 32(%rbx), %rdi # %rdi: link_map, %rsi: reloc_index
- call _dl_call_pltexit
-
- /* Restore return registers. */
- movq LRV_RAX_OFFSET(%rsp), %rax
- movq LRV_RDX_OFFSET(%rsp), %rdx
-
- movaps LRV_XMM0_OFFSET(%rsp), %xmm0
- movaps LRV_XMM1_OFFSET(%rsp), %xmm1
-
-# ifdef HAVE_AVX_SUPPORT
- cmpl $0, L(have_avx)(%rip)
- js L(no_avx4)
-
- /* Check if xmm0/xmm1 registers are changed by audit module. */
- vpcmpeqq (LRV_SIZE)(%rsp), %xmm0, %xmm2
- vpmovmskb %xmm2, %esi
- cmpl $0xffff, %esi
- jne 1f
- vmovdqu LRV_VECTOR0_OFFSET(%rsp), %ymm0
-
-1: vpcmpeqq (LRV_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm2
- vpmovmskb %xmm2, %esi
- cmpl $0xffff, %esi
- jne 1f
- vmovdqu LRV_VECTOR1_OFFSET(%rsp), %ymm1
-
-L(no_avx4):
-1:
+ .align 16
+L(no_avx):
# endif
- fldt LRV_ST1_OFFSET(%rsp)
- fldt LRV_ST0_OFFSET(%rsp)
-
- movq %rbx, %rsp
- movq (%rsp), %rbx
- cfi_restore(rbx)
- cfi_def_cfa_register(%rsp)
-
- addq $48, %rsp # Adjust the stack to the return value
- # (eats the reloc index and link_map)
- cfi_adjust_cfa_offset(-48)
- retq
+# undef RESTORE_AVX
+# include "dl-trampoline.h"
cfi_endproc
.size _dl_runtime_profile, .-_dl_runtime_profile
diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h
new file mode 100644
index 0000000000..5d49ed4408
--- /dev/null
+++ b/sysdeps/x86_64/dl-trampoline.h
@@ -0,0 +1,269 @@
+/* Partial PLT profile trampoline to save and restore x86-64 vector
+ registers.
+ Copyright (C) 2009 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#ifdef RESTORE_AVX
+ /* This is to support AVX audit modules. */
+ vmovdqu %ymm0, (LR_VECTOR_OFFSET)(%rsp)
+ vmovdqu %ymm1, (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp)
+ vmovdqu %ymm2, (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp)
+ vmovdqu %ymm3, (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp)
+ vmovdqu %ymm4, (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp)
+ vmovdqu %ymm5, (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp)
+ vmovdqu %ymm6, (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp)
+ vmovdqu %ymm7, (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp)
+
+ /* Save xmm0-xmm7 registers to detect if any of them are
+ changed by audit module. */
+ vmovdqa %xmm0, (LR_SIZE)(%rsp)
+ vmovdqa %xmm1, (LR_SIZE + XMM_SIZE)(%rsp)
+ vmovdqa %xmm2, (LR_SIZE + XMM_SIZE*2)(%rsp)
+ vmovdqa %xmm3, (LR_SIZE + XMM_SIZE*3)(%rsp)
+ vmovdqa %xmm4, (LR_SIZE + XMM_SIZE*4)(%rsp)
+ vmovdqa %xmm5, (LR_SIZE + XMM_SIZE*5)(%rsp)
+ vmovdqa %xmm6, (LR_SIZE + XMM_SIZE*6)(%rsp)
+ vmovdqa %xmm7, (LR_SIZE + XMM_SIZE*7)(%rsp)
+#endif
+
+ movq %rsp, %rcx # La_x86_64_regs pointer to %rcx.
+ movq 48(%rbx), %rdx # Load return address if needed.
+ movq 40(%rbx), %rsi # Copy args pushed by PLT in register.
+ movq 32(%rbx), %rdi # %rdi: link_map, %rsi: reloc_index
+ leaq 16(%rbx), %r8
+ call _dl_profile_fixup # Call resolver.
+
+ movq %rax, %r11 # Save return value.
+
+ movq 8(%rbx), %rax # Get back register content.
+ movq LR_RDX_OFFSET(%rsp), %rdx
+ movq LR_R8_OFFSET(%rsp), %r8
+ movq LR_R9_OFFSET(%rsp), %r9
+
+ movaps (LR_XMM_OFFSET)(%rsp), %xmm0
+ movaps (LR_XMM_OFFSET + XMM_SIZE)(%rsp), %xmm1
+ movaps (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp), %xmm2
+ movaps (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp), %xmm3
+ movaps (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp), %xmm4
+ movaps (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp), %xmm5
+ movaps (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6
+ movaps (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7
+
+#ifdef RESTORE_AVX
+ /* Check if any xmm0-xmm7 registers are changed by audit
+ module. */
+ vpcmpeqq (LR_SIZE)(%rsp), %xmm0, %xmm8
+ vpmovmskb %xmm8, %esi
+ cmpl $0xffff, %esi
+ je 2f
+ vmovdqa %xmm0, (LR_VECTOR_OFFSET)(%rsp)
+ jmp 1f
+2: vmovdqu (LR_VECTOR_OFFSET)(%rsp), %ymm0
+ vmovdqa %xmm0, (LR_XMM_OFFSET)(%rsp)
+
+1: vpcmpeqq (LR_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm8
+ vpmovmskb %xmm8, %esi
+ cmpl $0xffff, %esi
+ je 2f
+ vmovdqa %xmm1, (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp)
+ jmp 1f
+2: vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp), %ymm1
+ vmovdqa %xmm1, (LR_XMM_OFFSET + XMM_SIZE)(%rsp)
+
+1: vpcmpeqq (LR_SIZE + XMM_SIZE*2)(%rsp), %xmm2, %xmm8
+ vpmovmskb %xmm8, %esi
+ cmpl $0xffff, %esi
+ je 2f
+ vmovdqa %xmm2, (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp)
+ jmp 1f
+2: vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp), %ymm2
+ vmovdqa %xmm2, (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp)
+
+1: vpcmpeqq (LR_SIZE + XMM_SIZE*3)(%rsp), %xmm3, %xmm8
+ vpmovmskb %xmm8, %esi
+ cmpl $0xffff, %esi
+ je 2f
+ vmovdqa %xmm3, (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp)
+ jmp 1f
+2: vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp), %ymm3
+ vmovdqa %xmm3, (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp)
+
+1: vpcmpeqq (LR_SIZE + XMM_SIZE*4)(%rsp), %xmm4, %xmm8
+ vpmovmskb %xmm8, %esi
+ cmpl $0xffff, %esi
+ je 2f
+ vmovdqa %xmm4, (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp)
+ jmp 1f
+2: vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp), %ymm4
+ vmovdqa %xmm4, (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp)
+
+1: vpcmpeqq (LR_SIZE + XMM_SIZE*5)(%rsp), %xmm5, %xmm8
+ vpmovmskb %xmm8, %esi
+ cmpl $0xffff, %esi
+ je 2f
+ vmovdqa %xmm5, (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp)
+ jmp 1f
+2: vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp), %ymm5
+ vmovdqa %xmm5, (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp)
+
+1: vpcmpeqq (LR_SIZE + XMM_SIZE*6)(%rsp), %xmm6, %xmm8
+ vpmovmskb %xmm8, %esi
+ cmpl $0xffff, %esi
+ je 2f
+ vmovdqa %xmm6, (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp)
+ jmp 1f
+2: vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp), %ymm6
+ vmovdqa %xmm6, (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp)
+
+1: vpcmpeqq (LR_SIZE + XMM_SIZE*7)(%rsp), %xmm7, %xmm8
+ vpmovmskb %xmm8, %esi
+ cmpl $0xffff, %esi
+ je 2f
+ vmovdqa %xmm7, (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp)
+ jmp 1f
+2: vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp), %ymm7
+ vmovdqa %xmm7, (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp)
+
+1:
+#endif
+ movq 16(%rbx), %r10 # Anything in framesize?
+ testq %r10, %r10
+ jns 3f
+
+ /* There's nothing in the frame size, so there
+ will be no call to the _dl_call_pltexit. */
+
+ /* Get back registers content. */
+ movq LR_RCX_OFFSET(%rsp), %rcx
+ movq LR_RSI_OFFSET(%rsp), %rsi
+ movq LR_RDI_OFFSET(%rsp), %rdi
+
+ movq %rbx, %rsp
+ movq (%rsp), %rbx
+ cfi_restore(rbx)
+ cfi_def_cfa_register(%rsp)
+
+ addq $48, %rsp # Adjust the stack to the return value
+ # (eats the reloc index and link_map)
+ cfi_adjust_cfa_offset(-48)
+ jmp *%r11 # Jump to function address.
+
+3:
+ cfi_adjust_cfa_offset(48)
+ cfi_rel_offset(%rbx, 0)
+ cfi_def_cfa_register(%rbx)
+
+ /* At this point we need to prepare new stack for the function
+ which has to be called. We copy the original stack to a
+ temporary buffer of the size specified by the 'framesize'
+ returned from _dl_profile_fixup */
+
+ leaq LR_RSP_OFFSET(%rbx), %rsi # stack
+ addq $8, %r10
+ andq $0xfffffffffffffff0, %r10
+ movq %r10, %rcx
+ subq %r10, %rsp
+ movq %rsp, %rdi
+ shrq $3, %rcx
+ rep
+ movsq
+
+ movq 24(%rdi), %rcx # Get back register content.
+ movq 32(%rdi), %rsi
+ movq 40(%rdi), %rdi
+
+ call *%r11
+
+ mov 24(%rbx), %rsp # Drop the copied stack content
+
+ /* Now we have to prepare the La_x86_64_retval structure for the
+ _dl_call_pltexit. The La_x86_64_regs is being pointed by rsp now,
+ so we just need to allocate the sizeof(La_x86_64_retval) space on
+ the stack, since the alignment has already been taken care of. */
+# ifdef RESTORE_AVX
+ /* sizeof(La_x86_64_retval). Need extra space for 2 SSE
+ registers to detect if xmm0/xmm1 registers are changed
+ by audit module. */
+ subq $(LRV_SIZE + XMM_SIZE*2), %rsp
+# else
+ subq $LRV_SIZE, %rsp # sizeof(La_x86_64_retval)
+# endif
+ movq %rsp, %rcx # La_x86_64_retval argument to %rcx.
+
+ /* Fill in the La_x86_64_retval structure. */
+ movq %rax, LRV_RAX_OFFSET(%rcx)
+ movq %rdx, LRV_RDX_OFFSET(%rcx)
+
+ movaps %xmm0, LRV_XMM0_OFFSET(%rcx)
+ movaps %xmm1, LRV_XMM1_OFFSET(%rcx)
+
+# ifdef RESTORE_AVX
+ /* This is to support AVX audit modules. */
+ vmovdqu %ymm0, LRV_VECTOR0_OFFSET(%rcx)
+ vmovdqu %ymm1, LRV_VECTOR1_OFFSET(%rcx)
+
+ /* Save xmm0/xmm1 registers to detect if they are changed
+ by audit module. */
+ vmovdqa %xmm0, (LRV_SIZE)(%rcx)
+ vmovdqa %xmm1, (LRV_SIZE + XMM_SIZE)(%rcx)
+# endif
+
+ fstpt LRV_ST0_OFFSET(%rcx)
+ fstpt LRV_ST1_OFFSET(%rcx)
+
+ movq 24(%rbx), %rdx # La_x86_64_regs argument to %rdx.
+ movq 40(%rbx), %rsi # Copy args pushed by PLT in register.
+ movq 32(%rbx), %rdi # %rdi: link_map, %rsi: reloc_index
+ call _dl_call_pltexit
+
+ /* Restore return registers. */
+ movq LRV_RAX_OFFSET(%rsp), %rax
+ movq LRV_RDX_OFFSET(%rsp), %rdx
+
+ movaps LRV_XMM0_OFFSET(%rsp), %xmm0
+ movaps LRV_XMM1_OFFSET(%rsp), %xmm1
+
+# ifdef RESTORE_AVX
+ /* Check if xmm0/xmm1 registers are changed by audit module. */
+ vpcmpeqq (LRV_SIZE)(%rsp), %xmm0, %xmm2
+ vpmovmskb %xmm2, %esi
+ cmpl $0xffff, %esi
+ jne 1f
+ vmovdqu LRV_VECTOR0_OFFSET(%rsp), %ymm0
+
+1: vpcmpeqq (LRV_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm2
+ vpmovmskb %xmm2, %esi
+ cmpl $0xffff, %esi
+ jne 1f
+ vmovdqu LRV_VECTOR1_OFFSET(%rsp), %ymm1
+
+1:
+# endif
+
+ fldt LRV_ST1_OFFSET(%rsp)
+ fldt LRV_ST0_OFFSET(%rsp)
+
+ movq %rbx, %rsp
+ movq (%rsp), %rbx
+ cfi_restore(rbx)
+ cfi_def_cfa_register(%rsp)
+
+ addq $48, %rsp # Adjust the stack to the return value
+ # (eats the reloc index and link_map)
+ cfi_adjust_cfa_offset(-48)
+ retq
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index b066402204..0ded3b3261 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -4,7 +4,7 @@ gen-as-const-headers += ifunc-defines.sym
endif
ifeq ($(subdir),string)
-sysdep_routines += stpncpy-c strncpy-c
+sysdep_routines += stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3
ifeq (yes,$(config-cflags-sse4))
sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c
CFLAGS-strcspn-c.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/rawmemchr.S b/sysdeps/x86_64/multiarch/rawmemchr.S
index d4f265f430..08fd8769fc 100644
--- a/sysdeps/x86_64/multiarch/rawmemchr.S
+++ b/sysdeps/x86_64/multiarch/rawmemchr.S
@@ -38,6 +38,7 @@ END(rawmemchr)
strong_alias (rawmemchr, __rawmemchr)
+ .section .text.sse4.2,"ax",@progbits
.align 16
.type __rawmemchr_sse42, @function
__rawmemchr_sse42:
diff --git a/sysdeps/x86_64/multiarch/strcmp-ssse3.S b/sysdeps/x86_64/multiarch/strcmp-ssse3.S
new file mode 100644
index 0000000000..98cecb8942
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcmp-ssse3.S
@@ -0,0 +1,3 @@
+#define USE_SSSE3 1
+#define STRCMP __strcmp_ssse3
+#include "../strcmp.S"
diff --git a/sysdeps/x86_64/multiarch/strcmp.S b/sysdeps/x86_64/multiarch/strcmp.S
index 1a315737af..05adf1e2e6 100644
--- a/sysdeps/x86_64/multiarch/strcmp.S
+++ b/sysdeps/x86_64/multiarch/strcmp.S
@@ -34,6 +34,7 @@
mov %r9, %r11
#define STRCMP_SSE42 __strncmp_sse42
+#define STRCMP_SSSE3 __strncmp_ssse3
#define STRCMP_SSE2 __strncmp_sse2
#define __GI_STRCMP __GI_strncmp
#else
@@ -41,6 +42,7 @@
#ifndef STRCMP
#define STRCMP strcmp
#define STRCMP_SSE42 __strcmp_sse42
+#define STRCMP_SSSE3 __strcmp_ssse3
#define STRCMP_SSE2 __strcmp_sse2
#define __GI_STRCMP __GI_strcmp
#endif
@@ -60,10 +62,14 @@ ENTRY(STRCMP)
cmpl $0, __cpu_features+KIND_OFFSET(%rip)
jne 1f
call __init_cpu_features
-1: leaq STRCMP_SSE2(%rip), %rax
- testl $(1<<20), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip)
- jz 2f
+1:
leaq STRCMP_SSE42(%rip), %rax
+ testl $(1<<20), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip)
+ jnz 2f
+ leaq STRCMP_SSSE3(%rip), %rax
+ testl $(1<<9), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip)
+ jnz 2f
+ leaq STRCMP_SSE2(%rip), %rax
2: ret
END(STRCMP)
diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c
index 4512267d3f..daeebe1bf5 100644
--- a/sysdeps/x86_64/multiarch/strcspn-c.c
+++ b/sysdeps/x86_64/multiarch/strcspn-c.c
@@ -86,11 +86,13 @@ STRCSPN_SSE42 (const char *s, const char *a)
const char *aligned;
__m128i mask;
+ /* Fake initialization. gcc otherwise will warn. */
+ asm ("" : "=xm" (mask));
int offset = (int) ((size_t) a & 15);
if (offset != 0)
{
/* Load masks. */
- aligned = (const char *) ((size_t) a & 0xfffffffffffffff0L);
+ aligned = (const char *) ((size_t) a & -16L);
__m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
switch (offset)
@@ -229,7 +231,7 @@ STRCSPN_SSE42 (const char *s, const char *a)
if (offset != 0)
{
/* Check partial string. */
- aligned = (const char *) ((size_t) s & 0xfffffffffffffff0L);
+ aligned = (const char *) ((size_t) s & -16L);
__m128i value = _mm_load_si128 ((__m128i *) aligned);
switch (offset)
diff --git a/sysdeps/x86_64/multiarch/strlen.S b/sysdeps/x86_64/multiarch/strlen.S
index 82b03ccc28..4342c6cdab 100644
--- a/sysdeps/x86_64/multiarch/strlen.S
+++ b/sysdeps/x86_64/multiarch/strlen.S
@@ -40,6 +40,7 @@ ENTRY(strlen)
END(strlen)
+ .section .text.sse4.2,"ax",@progbits
.align 16
.type __strlen_sse42, @function
__strlen_sse42:
diff --git a/sysdeps/x86_64/multiarch/strncmp-ssse3.S b/sysdeps/x86_64/multiarch/strncmp-ssse3.S
new file mode 100644
index 0000000000..a320a3e949
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncmp-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_SSSE3 1
+#define STRCMP __strncmp_ssse3
+#define USE_AS_STRNCMP
+#include "../strcmp.S"
diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c
index 5b99f0d383..be9e8ac0a8 100644
--- a/sysdeps/x86_64/multiarch/strspn-c.c
+++ b/sysdeps/x86_64/multiarch/strspn-c.c
@@ -68,7 +68,7 @@ __strspn_sse42 (const char *s, const char *a)
if (offset != 0)
{
/* Load masks. */
- aligned = (const char *) ((size_t) a & 0xfffffffffffffff0L);
+ aligned = (const char *) ((size_t) a & -16L);
__m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
switch (offset)
@@ -207,7 +207,7 @@ __strspn_sse42 (const char *s, const char *a)
if (offset != 0)
{
/* Check partial string. */
- aligned = (const char *) ((size_t) s & 0xfffffffffffffff0L);
+ aligned = (const char *) ((size_t) s & -16L);
__m128i value = _mm_load_si128 ((__m128i *) aligned);
switch (offset)
diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
index 340a64ba35..650ec173b6 100644
--- a/sysdeps/x86_64/strcmp.S
+++ b/sysdeps/x86_64/strcmp.S
@@ -51,7 +51,12 @@
# endif
#endif
+#ifndef USE_SSSE3
.text
+#else
+ .section .text.ssse3,"ax",@progbits
+#endif
+
ENTRY (BP_SYM (STRCMP))
#ifdef NOT_IN_libc
/* Simple version since we can't use SSE registers in ld.so. */
@@ -244,9 +249,13 @@ LABEL(gobble_ashr_1):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4 /* store for next cycle */
+#ifndef USE_SSSE3
psrldq $1, %xmm3
pslldq $15, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
+#else
+ palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */
+#endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -269,9 +278,13 @@ LABEL(gobble_ashr_1):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4 /* store for next cycle */
+#ifndef USE_SSSE3
psrldq $1, %xmm3
- pslldq $15, %xmm2
+ pslldq $15, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
+#else
+ palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */
+#endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -363,9 +376,13 @@ LABEL(gobble_ashr_2):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
+#ifndef USE_SSSE3
psrldq $2, %xmm3
pslldq $14, %xmm2
- por %xmm3, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+#else
+ palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */
+#endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -389,9 +406,13 @@ LABEL(gobble_ashr_2):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
+#ifndef USE_SSSE3
psrldq $2, %xmm3
- pslldq $14, %xmm2
- por %xmm3, %xmm2
+ pslldq $14, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+#else
+ palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */
+#endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -477,9 +498,13 @@ LABEL(gobble_ashr_3):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
+#ifndef USE_SSSE3
psrldq $3, %xmm3
pslldq $13, %xmm2
- por %xmm3, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+#else
+ palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */
+#endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -503,9 +528,13 @@ LABEL(gobble_ashr_3):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
+#ifndef USE_SSSE3
psrldq $3, %xmm3
- pslldq $13, %xmm2
- por %xmm3, %xmm2
+ pslldq $13, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+#else
+ palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */
+#endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -591,9 +620,13 @@ LABEL(gobble_ashr_4):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
+#ifndef USE_SSSE3
psrldq $4, %xmm3
pslldq $12, %xmm2
- por %xmm3, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+#else
+ palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */
+#endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -617,9 +650,13 @@ LABEL(gobble_ashr_4):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
+#ifndef USE_SSSE3
psrldq $4, %xmm3
- pslldq $12, %xmm2
- por %xmm3, %xmm2
+ pslldq $12, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+#else
+ palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */
+#endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -705,9 +742,13 @@ LABEL(gobble_ashr_5):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
+#ifndef USE_SSSE3
psrldq $5, %xmm3
pslldq $11, %xmm2
- por %xmm3, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+#else
+ palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */
+#endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -731,9 +772,13 @@ LABEL(gobble_ashr_5):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
+#ifndef USE_SSSE3
psrldq $5, %xmm3
- pslldq $11, %xmm2
- por %xmm3, %xmm2
+ pslldq $11, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+#else
+ palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */
+#endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -819,9 +864,13 @@ LABEL(gobble_ashr_6):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
+#ifndef USE_SSSE3
psrldq $6, %xmm3
pslldq $10, %xmm2
- por %xmm3, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+#else
+ palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */
+#endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -845,9 +894,13 @@ LABEL(gobble_ashr_6):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
+#ifndef USE_SSSE3
psrldq $6, %xmm3
- pslldq $10, %xmm2
- por %xmm3, %xmm2
+ pslldq $10, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+#else
+ palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */
+#endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -933,9 +986,13 @@ LABEL(gobble_ashr_7):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
+#ifndef USE_SSSE3
psrldq $7, %xmm3
pslldq $9, %xmm2
- por %xmm3, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+#else
+ palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */
+#endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -959,9 +1016,13 @@ LABEL(gobble_ashr_7):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
+#ifndef USE_SSSE3
psrldq $7, %xmm3
- pslldq $9, %xmm2
- por %xmm3, %xmm2
+ pslldq $9, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+#else
+ palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */
+#endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -1047,9 +1108,13 @@ LABEL(gobble_ashr_8):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
+#ifndef USE_SSSE3
psrldq $8, %xmm3
pslldq $8, %xmm2
- por %xmm3, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+#else
+ palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */
+#endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -1073,9 +1138,13 @@ LABEL(gobble_ashr_8):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
+#ifndef USE_SSSE3
psrldq $8, %xmm3
- pslldq $8, %xmm2
- por %xmm3, %xmm2
+ pslldq $8, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+#else
+ palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */
+#endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -1161,9 +1230,13 @@ LABEL(gobble_ashr_9):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
+#ifndef USE_SSSE3
psrldq $9, %xmm3
pslldq $7, %xmm2
- por %xmm3, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+#else
+ palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */
+#endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -1187,9 +1260,13 @@ LABEL(gobble_ashr_9):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
+#ifndef USE_SSSE3
psrldq $9, %xmm3
- pslldq $7, %xmm2
- por %xmm3, %xmm2
+ pslldq $7, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+#else
+ palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */
+#endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -1275,9 +1352,13 @@ LABEL(gobble_ashr_10):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
+#ifndef USE_SSSE3
psrldq $10, %xmm3
pslldq $6, %xmm2
- por %xmm3, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+#else
+ palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */
+#endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -1301,9 +1382,13 @@ LABEL(gobble_ashr_10):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
+#ifndef USE_SSSE3
psrldq $10, %xmm3
- pslldq $6, %xmm2
- por %xmm3, %xmm2
+ pslldq $6, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+#else
+ palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */
+#endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -1389,9 +1474,13 @@ LABEL(gobble_ashr_11):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
+#ifndef USE_SSSE3
psrldq $11, %xmm3
pslldq $5, %xmm2
- por %xmm3, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+#else
+ palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */
+#endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -1415,9 +1504,13 @@ LABEL(gobble_ashr_11):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
+#ifndef USE_SSSE3
psrldq $11, %xmm3
- pslldq $5, %xmm2
- por %xmm3, %xmm2
+ pslldq $5, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+#else
+ palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */
+#endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -1503,9 +1596,13 @@ LABEL(gobble_ashr_12):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
+#ifndef USE_SSSE3
psrldq $12, %xmm3
pslldq $4, %xmm2
- por %xmm3, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+#else
+ palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */
+#endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -1529,9 +1626,13 @@ LABEL(gobble_ashr_12):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
+#ifndef USE_SSSE3
psrldq $12, %xmm3
- pslldq $4, %xmm2
- por %xmm3, %xmm2
+ pslldq $4, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+#else
+ palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */
+#endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -1617,9 +1718,13 @@ LABEL(gobble_ashr_13):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
+#ifndef USE_SSSE3
psrldq $13, %xmm3
pslldq $3, %xmm2
- por %xmm3, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+#else
+ palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */
+#endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -1643,9 +1748,13 @@ LABEL(gobble_ashr_13):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
+#ifndef USE_SSSE3
psrldq $13, %xmm3
- pslldq $3, %xmm2
- por %xmm3, %xmm2
+ pslldq $3, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+#else
+ palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */
+#endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -1731,9 +1840,13 @@ LABEL(gobble_ashr_14):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
+#ifndef USE_SSSE3
psrldq $14, %xmm3
pslldq $2, %xmm2
- por %xmm3, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+#else
+ palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */
+#endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -1757,9 +1870,13 @@ LABEL(gobble_ashr_14):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
+#ifndef USE_SSSE3
psrldq $14, %xmm3
- pslldq $2, %xmm2
- por %xmm3, %xmm2
+ pslldq $2, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+#else
+ palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */
+#endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -1847,9 +1964,13 @@ LABEL(gobble_ashr_15):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
+#ifndef USE_SSSE3
psrldq $15, %xmm3
pslldq $1, %xmm2
- por %xmm3, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+#else
+ palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */
+#endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -1873,9 +1994,13 @@ LABEL(gobble_ashr_15):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
+#ifndef USE_SSSE3
psrldq $15, %xmm3
- pslldq $1, %xmm2
- por %xmm3, %xmm2
+ pslldq $1, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+#else
+ palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */
+#endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1