diff options
40 files changed, 1434 insertions, 1545 deletions
diff --git a/libc/ChangeLog b/libc/ChangeLog index f74e0a5d8..e81ffc170 100644 --- a/libc/ChangeLog +++ b/libc/ChangeLog @@ -1,3 +1,72 @@ +2011-12-23 Ulrich Drepper <drepper@gmail.com> + + * version.h (RELEASE): Bump for 2.15 release. + * include/features.h (__GLIBC_MINOR__): Bump to 15. + + * sysdeps/x86_64/dl-machine.h: Fix typos in comments. + Patch by Marek Polacek <mpolacek@redhat.com>. + + * bits/byteswap.h: Protect long long constants with __extension__. + * sysdeps/i386/bits/byteswap.h: Likewise. + * sysdeps/ia64/bits/byteswap.h: Likewise. + * sysdeps/s390/bits/byteswap.h: Likewise. + * sysdeps/x86_64/bits/byteswap.h: Likewise. + +2011-12-23 Liubov Dmitrieva <liubov.dmitrieva@gmail.com> + + [BZ #13540] + * sysdeps/x86_64/multiarch/strcpy-ssse3.S: Fix overrun in + destination buffer. + * sysdeps/x86_64/multiarch/wcscpy-ssse3.S: Likewise. + +2011-12-23 Marek Polacek <polacek@redhat.com> + + * elf/dl-addr.c (determine_info): Add inline keyword. + * elf/tst-auditmod4b.c (check_avx): Likewise. + * elf/tst-auditmod6b.c (check_avx): Likewise. + * elf/tst-auditmod6c.c (check_avx): Likewise. + * elf/tst-auditmod7b.c (check_avx): Likewise. + +2011-12-23 Ulrich Drepper <drepper@gmail.com> + + * sysdeps/i386/fpu/bits/fenv.h (feraiseexcept): Also enable for + !__SSE_MATH__. + +2011-12-23 Liubov Dmitrieva <liubov.dmitrieva@gmail.com> + + [BZ #13540] + * sysdeps/i386/i686/multiarch/wcscpy-ssse3.S: Fix wrong copying + processing for last bytes. + +2011-08-06 Bruno Haible <bruno@clisp.org> + + [BZ #13061] + * iconvdata/cp1258.c (comp_table_data): Combine U+00A8 U+0301 to + U+0385, not to U+1FEE. + + [BZ #13062] + * iconvdata/tcvn5712-1.c (comp_table_data): Remove useless and wrong + entry for U+00A5 U+0301. + +2011-12-22 Ulrich Drepper <drepper@gmail.com> + + [BZ #13166] + * inet/getnameinfo.c (getnameinfo): Return EAI_OVERFLOW if the + buffer for the output is too small. + + * sysdeps/i386/fpu/bits/fenv.h [__SSE_MATH__]: Add feraiseexcept + optimization. + + [BZ #13185] + * sysdeps/i386/fpu/fgetexcptflg.c (__fegetexceptflag): Also return + SSE flags if possible. + +2011-12-22 Liubov Dmitrieva <liubov.dmitrieva@gmail.com> + + [BZ #13540] + * sysdeps/i386/i686/multiarch/strcpy-ssse3.S: Fix wrong copying + processing for last bytes. + 2011-12-22 Joseph Myers <joseph@codesourcery.com> * sysdeps/unix/sysv/linux/Makefile (syscall-list-variants) @@ -53,6 +122,7 @@ 2011-11-18 Richard B. Kreckel <kreckel@ginac.de> [BZ #13305] + [BZ #12786] * math/s_cacosh.c: Fix rare miscomputation in cacosh(). * math/s_cacoshf.c: Likewise. * math/s_cacoshl.c: Likewise. @@ -1,4 +1,4 @@ -GNU C Library NEWS -- history of user-visible changes. 2011-12-22 +GNU C Library NEWS -- history of user-visible changes. 2011-12-23 Copyright (C) 1992-2009, 2010, 2011 Free Software Foundation, Inc. See the end for copying conditions. @@ -9,12 +9,13 @@ Version 2.15 * The following bugs are resolved with this release: - 6779, 6783, 9696, 10103, 10709, 11589, 12403, 12847, 12868, 12852, 12874, - 12885, 12892, 12907, 12922, 12935, 13007, 13021, 13067, 13068, 13090, - 13092, 13114, 13118, 13123, 13134, 13138, 13147, 13150, 13179, 13192, - 13268, 13276, 13282, 13291, 13305, 13328, 13335, 13337, 13344, 13358, - 13367, 13413, 13416, 13423, 13439, 13446, 13472, 13484, 13506, 13515, - 13523, 13524, 13538 + 6779, 6783, 9696, 10103, 10709, 11589, 12403, 12786, 12840, 12847, 12868, + 12852, 12874, 12885, 12892, 12906, 12907, 12922, 12935, 12962, 13007, + 13021, 13061, 13062, 13067, 13068, 13085, 13088, 13090, 13092, 13096, + 13114, 13118, 13123, 13134, 13138, 13147, 13150, 13166, 13179, 13185, + 13189, 13192, 13268, 13276, 13282, 13291, 13305, 13328, 13335, 13337, + 13344, 13358, 13367, 13413, 13416, 13423, 13439, 13446, 13472, 13484, + 13506, 13515, 13523, 13524, 13538, 13540 * New program pldd to list loaded object of a process Implemented by Ulrich Drepper. @@ -66,7 +67,7 @@ Version 2.15 * Optimized nearbyint and strcasecmp for PPC. Implemented by Adhemerval Zanella. -* New locales: bho_IN, unm_US, es_CU +* New locales: bho_IN, unm_US, es_CU, ta_LK Version 2.14 diff --git a/libc/bits/byteswap.h b/libc/bits/byteswap.h index 45cb9471e..6df2f28c0 100644 --- a/libc/bits/byteswap.h +++ b/libc/bits/byteswap.h @@ -1,5 +1,6 @@ /* Macros to swap the order of bytes in integer values. - Copyright (C) 1997,1998,2000-2002,2005,2008 Free Software Foundation, Inc. + Copyright (C) 1997,1998,2000-2002,2005,2008,2011 + Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -60,20 +61,20 @@ __bswap_32 (unsigned int __bsx) #if defined __GNUC__ && __GNUC__ >= 2 /* Swap bytes in 64 bit value. */ # define __bswap_constant_64(x) \ - ((((x) & 0xff00000000000000ull) >> 56) \ - | (((x) & 0x00ff000000000000ull) >> 40) \ - | (((x) & 0x0000ff0000000000ull) >> 24) \ - | (((x) & 0x000000ff00000000ull) >> 8) \ - | (((x) & 0x00000000ff000000ull) << 8) \ - | (((x) & 0x0000000000ff0000ull) << 24) \ - | (((x) & 0x000000000000ff00ull) << 40) \ - | (((x) & 0x00000000000000ffull) << 56)) + (__extension__ ((((x) & 0xff00000000000000ull) >> 56) \ + | (((x) & 0x00ff000000000000ull) >> 40) \ + | (((x) & 0x0000ff0000000000ull) >> 24) \ + | (((x) & 0x000000ff00000000ull) >> 8) \ + | (((x) & 0x00000000ff000000ull) << 8) \ + | (((x) & 0x0000000000ff0000ull) << 24) \ + | (((x) & 0x000000000000ff00ull) << 40) \ + | (((x) & 0x00000000000000ffull) << 56))) # define __bswap_64(x) \ (__extension__ \ ({ union { __extension__ unsigned long long int __ll; \ unsigned int __l[2]; } __w, __r; \ - if (__builtin_constant_p (x)) \ + if (__builtin_constant_p (x)) \ __r.__ll = __bswap_constant_64 (x); \ else \ { \ diff --git a/libc/elf/dl-addr.c b/libc/elf/dl-addr.c index 2b53a5ed0..788225635 100644 --- a/libc/elf/dl-addr.c +++ b/libc/elf/dl-addr.c @@ -1,5 +1,5 @@ /* Locate the shared object symbol nearest a given address. - Copyright (C) 1996-2007, 2009 Free Software Foundation, Inc. + Copyright (C) 1996-2007, 2009, 2011 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -22,7 +22,7 @@ #include <ldsodefs.h> -static void +static inline void __attribute ((always_inline)) determine_info (const ElfW(Addr) addr, struct link_map *match, Dl_info *info, struct link_map **mapp, const ElfW(Sym) **symbolp) diff --git a/libc/elf/tst-auditmod4b.c b/libc/elf/tst-auditmod4b.c index a6d3c6a6c..761d97ce9 100644 --- a/libc/elf/tst-auditmod4b.c +++ b/libc/elf/tst-auditmod4b.c @@ -108,7 +108,7 @@ la_symbind64 (Elf64_Sym *sym, unsigned int ndx, uintptr_t *refcook, static int avx = -1; -static int +static inline int __attribute ((always_inline)) check_avx (void) { diff --git a/libc/elf/tst-auditmod6b.c b/libc/elf/tst-auditmod6b.c index f756b5022..a7a60b992 100644 --- a/libc/elf/tst-auditmod6b.c +++ b/libc/elf/tst-auditmod6b.c @@ -108,7 +108,7 @@ la_symbind64 (Elf64_Sym *sym, unsigned int ndx, uintptr_t *refcook, static int avx = -1; -static int +static inline int __attribute ((always_inline)) check_avx (void) { diff --git a/libc/elf/tst-auditmod6c.c b/libc/elf/tst-auditmod6c.c index 49cbf0549..e0b5ac231 100644 --- a/libc/elf/tst-auditmod6c.c +++ b/libc/elf/tst-auditmod6c.c @@ -108,7 +108,7 @@ la_symbind64 (Elf64_Sym *sym, unsigned int ndx, uintptr_t *refcook, static int avx = -1; -static int +static inline int __attribute ((always_inline)) check_avx (void) { diff --git a/libc/elf/tst-auditmod7b.c b/libc/elf/tst-auditmod7b.c index eb237586f..a27d38540 100644 --- a/libc/elf/tst-auditmod7b.c +++ b/libc/elf/tst-auditmod7b.c @@ -108,7 +108,7 @@ la_symbind64 (Elf64_Sym *sym, unsigned int ndx, uintptr_t *refcook, static int avx = -1; -static int +static inline int __attribute ((always_inline)) check_avx (void) { diff --git a/libc/iconvdata/cp1258.c b/libc/iconvdata/cp1258.c index 2b741ba96..b7d23182e 100644 --- a/libc/iconvdata/cp1258.c +++ b/libc/iconvdata/cp1258.c @@ -197,8 +197,7 @@ static const struct { 0x0077, 0x1E83 }, { 0x0079, 0x00FD }, { 0x007A, 0x017A }, - /* { 0x00A5, 0x0385 }, Wrong, A5 is Yen sign */ - { 0x00A8, 0x1FEE }, + { 0x00A8, 0x0385 }, /* prefer U+0385 over U+1FEE */ { 0x00C2, 0x1EA4 }, { 0x00C5, 0x01FA }, { 0x00C6, 0x01FC }, diff --git a/libc/iconvdata/tcvn5712-1.c b/libc/iconvdata/tcvn5712-1.c index c94dadb2e..3cfdf468d 100644 --- a/libc/iconvdata/tcvn5712-1.c +++ b/libc/iconvdata/tcvn5712-1.c @@ -1,5 +1,5 @@ /* Conversion to and from TCVN5712-1. - Copyright (C) 2001, 2002, 2004 Free Software Foundation, Inc. + Copyright (C) 2001, 2002, 2004, 2011 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper <drepper@redhat.com>, 2001. @@ -158,7 +158,7 @@ static const struct { 0x01AF, 0x1EEA }, { 0x01B0, 0x1EEB }, #define COMP_TABLE_IDX_0301 (COMP_TABLE_IDX_0300 + COMP_TABLE_LEN_0300) -#define COMP_TABLE_LEN_0301 51 +#define COMP_TABLE_LEN_0301 50 { 0x0041, 0x00C1 }, { 0x0043, 0x0106 }, { 0x0045, 0x00C9 }, @@ -193,8 +193,7 @@ static const struct { 0x0077, 0x1E83 }, { 0x0079, 0x00FD }, { 0x007A, 0x017A }, - { 0x00A5, 0x0385 }, - /*{ 0x00A8, 0x1FEE },*/ + /*{ 0x00A8, 0x0385 },*//* prefer U+0385 over U+1FEE */ { 0x00C2, 0x1EA4 }, /*{ 0x00C5, 0x01FA },*/ /*{ 0x00C6, 0x01FC },*/ @@ -492,7 +491,7 @@ static const struct #include <iconv/loop.c> -/* Next, define the conversion function from UCS4 to CP1258. */ +/* Next, define the conversion function from UCS4 to TCVN5712-1. */ static const unsigned char from_ucs4[] = { diff --git a/libc/include/features.h b/libc/include/features.h index b16129562..f34dc3159 100644 --- a/libc/include/features.h +++ b/libc/include/features.h @@ -338,7 +338,7 @@ /* Major and minor version number of the GNU C library package. Use these macros to test for features in specific releases. */ #define __GLIBC__ 2 -#define __GLIBC_MINOR__ 14 +#define __GLIBC_MINOR__ 15 #define __GLIBC_PREREQ(maj, min) \ ((__GLIBC__ << 16) + __GLIBC_MINOR__ >= ((maj) << 16) + (min)) diff --git a/libc/inet/getnameinfo.c b/libc/inet/getnameinfo.c index 6fb6ad6e1..436604b75 100644 --- a/libc/inet/getnameinfo.c +++ b/libc/inet/getnameinfo.c @@ -346,10 +346,11 @@ getnameinfo (const struct sockaddr *sa, socklen_t addrlen, char *host, "%u", scopeid); if (real_hostlen + scopelen + 1 > hostlen) - /* XXX We should not fail here. Simply enlarge - the buffer or return with out of memory. */ - return EAI_SYSTEM; - memcpy (host + real_hostlen, scopebuf, scopelen + 1); + /* Signal the buffer is too small. This is + what inet_ntop does. */ + c = NULL; + else + memcpy (host + real_hostlen, scopebuf, scopelen + 1); } } else @@ -357,7 +358,7 @@ getnameinfo (const struct sockaddr *sa, socklen_t addrlen, char *host, (const void *) &(((const struct sockaddr_in *) sa)->sin_addr), host, hostlen); if (c == NULL) - return EAI_SYSTEM; + return EAI_OVERFLOW; } ok = 1; } diff --git a/libc/localedata/ChangeLog b/libc/localedata/ChangeLog index 17c135dfe..bb9c3886e 100644 --- a/libc/localedata/ChangeLog +++ b/libc/localedata/ChangeLog @@ -1,5 +1,31 @@ +2011-12-23 Ulrich Drepper <drepper@gmail.com> + + [BZ #12840] + * locales/sv_SE: Modernize date format. + + [BZ #12906] + * SUPPORTED (SUPPORTED-LOCALES): Add wal_ET entry. + + * locales/wal_ET: Remove lang_ab entry. + + [BZ #12962] + * locales/fi_FI: Various fixups. + Patch by Marko Myllynen <myllynen@redhat.com>. + + [BZ #13085] + * locales/ta_LK: New file. + * SUPPORTED (SUPPORTED-LOCALES): Add appropriate entry + + * locales/si_LK: Add country_ab2, country_ab3, country_num. + 2011-12-22 Ulrich Drepper <drepper@gmail.com> + [BZ #13096] + * locales/fi_FI: Fix collation reordering rules. + + [BZ #13189] + * SUPPORTED (SUPPORTED-LOCALES): Add ur_IN entry. + [BZ #13282] * locales/brx_IN: New file. * SUPPORTED (SUPPORTED-LOCALES): Add appropriate entry diff --git a/libc/localedata/SUPPORTED b/libc/localedata/SUPPORTED index a4e639794..1fd78472c 100644 --- a/libc/localedata/SUPPORTED +++ b/libc/localedata/SUPPORTED @@ -380,6 +380,7 @@ sv_SE/ISO-8859-1 \ sw_KE/UTF-8 \ sw_TZ/UTF-8 \ ta_IN/UTF-8 \ +ta_LK/UTF-8 \ te_IN/UTF-8 \ tg_TJ.UTF-8/UTF-8 \ tg_TJ/KOI8-T \ @@ -403,6 +404,7 @@ ug_CN/UTF-8 \ uk_UA.UTF-8/UTF-8 \ uk_UA/KOI8-U \ unm_US/UTF-8 \ +ur_IN/UTF-8 \ ur_PK/UTF-8 \ uz_UZ/ISO-8859-1 \ uz_UZ@cyrillic/UTF-8 \ @@ -413,6 +415,7 @@ wa_BE/ISO-8859-1 \ wa_BE@euro/ISO-8859-15 \ wa_BE.UTF-8/UTF-8 \ wae_CH/UTF-8 \ +wal_ET/UTF-8 \ wo_SN/UTF-8 \ xh_ZA.UTF-8/UTF-8 \ xh_ZA/ISO-8859-1 \ diff --git a/libc/localedata/locales/fi_FI b/libc/localedata/locales/fi_FI index c3604871f..f51700c28 100644 --- a/libc/localedata/locales/fi_FI +++ b/libc/localedata/locales/fi_FI @@ -1,4 +1,4 @@ -escape_char / +escape_char / comment_char % % Finnish language locale for Finland @@ -45,10 +45,10 @@ category "fi_FI:2000";LC_NUMERIC category "fi_FI:2000";LC_MONETARY category "fi_FI:2000";LC_MESSAGES category "fi_FI:2000";LC_PAPER +category "fi_FI:2000";LC_MEASUREMENT category "fi_FI:2000";LC_NAME category "fi_FI:2000";LC_ADDRESS category "fi_FI:2000";LC_TELEPHONE - END LC_IDENTIFICATION LC_COLLATE @@ -63,64 +63,68 @@ reorder-after <z> <a-diaerisis> <o-diaerisis> -reorder-after <U005A> +reorder-after <U007A> <U00E5> <a-ring>;<BAS>;<MIN>;IGNORE -<U00C5> <a-ring>;<BAS>;<CAP>;IGNORE <U01FB> <a-ring>;<ACA>;<MIN>;IGNORE -<U01FA> <a-ring>;<ACA>;<CAP>;IGNORE <U00E4> <a-diaerisis>;<BAS>;<MIN>;IGNORE -<U00C4> <a-diaerisis>;<BAS>;<CAP>;IGNORE <U00E6> <a-diaerisis>;<REU>;<MIN>;IGNORE -<U00C6> <a-diaerisis>;<REU>;<CAP>;IGNORE <U01FD> <a-diaerisis>;<U01FD>;<MIN>;IGNORE -<U01FC> <a-diaerisis>;<U01FD>;<CAP>;IGNORE <U01E3> <a-diaerisis>;<MAC>;<MIN>;IGNORE -<U01E2> <a-diaerisis>;<MAC>;<CAP>;IGNORE <U00F6> <o-diaerisis>;<BAS>;<MIN>;IGNORE -<U00D6> <o-diaerisis>;<BAS>;<CAP>;IGNORE <U00F8> <o-diaerisis>;<U00D8>;<MIN>;IGNORE -<U00D8> <o-diaerisis>;<U00D8>;<CAP>;IGNORE <U01FF> <o-diaerisis>;<U01FF>;<MIN>;IGNORE -<U01FE> <o-diaerisis>;<U01FF>;<CAP>;IGNORE <U00F5> <o-diaerisis>;<TIL>;<MIN>;IGNORE +reorder-after <U005A> +<U00C5> <a-ring>;<BAS>;<CAP>;IGNORE +<U01FA> <a-ring>;<ACA>;<CAP>;IGNORE +<U00C4> <a-diaerisis>;<BAS>;<CAP>;IGNORE +<U00C6> <a-diaerisis>;<REU>;<CAP>;IGNORE +<U01FC> <a-diaerisis>;<U01FD>;<CAP>;IGNORE +<U01E2> <a-diaerisis>;<MAC>;<CAP>;IGNORE +<U00D6> <o-diaerisis>;<BAS>;<CAP>;IGNORE +<U00D8> <o-diaerisis>;<U00D8>;<CAP>;IGNORE +<U01FE> <o-diaerisis>;<U01FF>;<CAP>;IGNORE <U00D5> <o-diaerisis>;<TIL>;<CAP>;IGNORE -reorder-after <U016A> +reorder-after <U016B> <U0076> <v>;<U0056>;<BAS>;<MIN> -<U0056> <v>;<U0056>;<BAS>;<CAP> <U1E7D> <v>;<U0056>;<TIL>;<MIN> -<U1E7C> <v>;<U0056>;<TIL>;<CAP> <U0077> <w>;<U0057>;<BAS>;<MIN> -<U0057> <w>;<U0057>;<BAS>;<CAP> <U1E83> <w>;<U0057>;<ACA>;<MIN> -<U1E82> <w>;<U0057>;<ACA>;<CAP> <U1E81> <w>;<U0057>;<GRA>;<MIN> -<U1E80> <w>;<U0057>;<GRA>;<CAP> <U0175> <w>;<U0057>;<CIR>;<MIN> -<U0174> <w>;<U0057>;<CIR>;<CAP> <U1E85> <w>;<U0057>;<REU>;<MIN> -<U1E84> <w>;<U0057>;<REU>;<CAP> <U1E87> <w>;<U0057>;<PCT>;<MIN> +reorder-after <U016A> +<U0056> <v>;<U0056>;<BAS>;<CAP> +<U1E7C> <v>;<U0056>;<TIL>;<CAP> +<U0057> <w>;<U0057>;<BAS>;<CAP> +<U1E82> <w>;<U0057>;<ACA>;<CAP> +<U1E80> <w>;<U0057>;<GRA>;<CAP> +<U0174> <w>;<U0057>;<CIR>;<CAP> +<U1E84> <w>;<U0057>;<REU>;<CAP> <U1E86> <w>;<U0057>;<PCT>;<CAP> reorder-after <U00FF> <U00FC> <y>;<DTT>;<MIN>;IGNORE +reorder-after <U0178> <U00DC> <y>;<DTT>;<CAP>;IGNORE % Present in iso14651_t1, but these definitions seem to have been % removed from latest iso14651 tables. -reorder-after <U0162> +reorder-after <U0163> <U00FE> "<t><h>";"<LIG><LIG>";"<MIN><MIN>";IGNORE +reorder-after <U0162> <U00DE> "<t><h>";"<LIG><LIG>";"<CAP><CAP>";IGNORE reorder-after <U0064> <U00F0> <d>;<PCL>;<MIN>;IGNORE -<U00D0> <d>;<PCL>;<CAP>;IGNORE <U0111> <d>;<OBL>;<MIN>;IGNORE +reorder-after <U0044> +<U00D0> <d>;<PCL>;<CAP>;IGNORE <U0110> <d>;<OBL>;<CAP>;IGNORE reorder-end - END LC_COLLATE LC_CTYPE @@ -141,12 +145,10 @@ positive_sign "" negative_sign "<U002D>" int_frac_digits 2 frac_digits 2 -% int_curr_symbol precedes -% curr_symbol succeeds p_cs_precedes 0 -p_sep_by_space 2 +p_sep_by_space 1 n_cs_precedes 0 -n_sep_by_space 2 +n_sep_by_space 1 p_sign_posn 1 n_sign_posn 1 END LC_MONETARY @@ -168,18 +170,18 @@ day "<U0073><U0075><U006E><U006E><U0075><U006E><U0074><U0061><U0069>";/ "<U0074><U006F><U0072><U0073><U0074><U0061><U0069>";/ "<U0070><U0065><U0072><U006A><U0061><U006E><U0074><U0061><U0069>";/ "<U006C><U0061><U0075><U0061><U006E><U0074><U0061><U0069>" -abmon "<U0074><U0061><U006D><U006D><U0069><U00A0>";/ - "<U0068><U0065><U006C><U006D><U0069><U00A0>";/ +abmon "<U0074><U0061><U006D><U006D><U0069>";/ + "<U0068><U0065><U006C><U006D><U0069>";/ "<U006D><U0061><U0061><U006C><U0069><U0073>";/ - "<U0068><U0075><U0068><U0074><U0069><U00A0>";/ - "<U0074><U006F><U0075><U006B><U006F><U00A0>";/ - "<U006B><U0065><U0073><U00E4><U00A0><U00A0>";/ - "<U0068><U0065><U0069><U006E><U00E4><U00A0>";/ - "<U0065><U006C><U006F><U00A0><U00A0><U00A0>";/ - "<U0073><U0079><U0079><U0073><U00A0><U00A0>";/ - "<U006C><U006F><U006B><U0061><U00A0><U00A0>";/ + "<U0068><U0075><U0068><U0074><U0069>";/ + "<U0074><U006F><U0075><U006B><U006F>";/ + "<U006B><U0065><U0073><U00E4>";/ + "<U0068><U0065><U0069><U006E><U00E4>";/ + "<U0065><U006C><U006F>";/ + "<U0073><U0079><U0079><U0073>";/ + "<U006C><U006F><U006B><U0061>";/ "<U006D><U0061><U0072><U0072><U0061><U0073>";/ - "<U006A><U006F><U0075><U006C><U0075><U00A0>" + "<U006A><U006F><U0075><U006C><U0075>" mon "<U0074><U0061><U006D><U006D><U0069><U006B><U0075><U0075>";/ "<U0068><U0065><U006C><U006D><U0069><U006B><U0075><U0075>";/ "<U006D><U0061><U0061><U006C><U0069><U0073><U006B><U0075><U0075>";/ @@ -202,13 +204,14 @@ t_fmt_ampm "" date_fmt "<U0025><U0061><U0020><U0025><U002D><U0064><U002E><U0025>/ <U002D><U006D><U002E><U0025><U0059><U0020><U0025><U0048><U002E><U0025>/ <U004D><U002E><U0025><U0053><U0020><U0025><U007A>" +week 7;19971130;4 first_weekday 2 % Monday first_workday 2 % Monday END LC_TIME LC_MESSAGES -yesexpr "<U005E><U005B><U004B><U006B><U004A><U006A><U0059><U0079><U005D><U002E><U002A>" -noexpr "<U005E><U005B><U004E><U006E><U0045><U0065><U005D><U002E><U002A>" +yesexpr "<U005E><U005B><U004B><U006B><U0059><U0079><U005D><U002E><U002A>" +noexpr "<U005E><U005B><U0045><U0065><U004E><U006E><U005D><U002E><U002A>" END LC_MESSAGES LC_PAPER @@ -217,6 +220,7 @@ width 210 END LC_PAPER LC_TELEPHONE +tel_dom_fmt "<U0028><U0025><U0041><U0029><U0020><U0025><U006C>" tel_int_fmt "<U002B><U0025><U0063><U0020><U0025><U0061><U0020><U0025>/ <U006C>" int_prefix "<U0033><U0035><U0038>" @@ -230,15 +234,25 @@ END LC_MEASUREMENT LC_NAME name_fmt "<U0025><U0064><U0025><U0074><U0025><U0067><U0025><U0074>/ <U0025><U006D><U0025><U0074><U0025><U0066>" +% Finnish equivalents for Mr/Mrs/Miss/Ms are herra/rouva/rouva/neiti +% but they are practically never used, thus we don't define them here. END LC_NAME LC_ADDRESS -postal_fmt "<U0025><U0066><U0025><U004E><U0025><U0061><U0025><U004E>/ -<U0025><U0064><U0025><U004E><U0025><U0062><U0025><U004E><U0025><U0073>/ -<U0020><U0025><U0068><U0020><U0025><U0065><U0020><U0025><U0072><U0025>/ -<U004E><U0025><U007A><U0020><U0025><U0054><U0025>/ +postal_fmt "<U0025><U0066><U0025><U004E><U0025><U0064><U0025><U004E>/ +<U0025><U0062><U0025><U004E><U0025><U0061><U0025><U004E><U0025><U0073>/ +<U0020><U0025><U0068><U0025><U0074><U0025><U0065><U0025><U0074><U0025>/ +<U0072><U0025><U004E><U0025><U007A><U0020><U0025><U0054><U0025>/ <U004E><U0025><U0063><U0025><U004E>" country_ab2 "<U0046><U0049>" country_ab3 "<U0046><U0049><U004E>" country_num 246 +country_name "<U0053><U0075><U006F><U006D><U0069>" +country_post "<U0046><U0049>" +country_car "<U0046><U0049><U004E>" +country_isbn 952 +lang_name "<U0073><U0075><U006F><U006D><U0069>" +lang_ab "<U0066><U0069>" +lang_term "<U0066><U0069><U006E>" +lang_lib "<U0066><U0069><U006E>" END LC_ADDRESS diff --git a/libc/localedata/locales/si_LK b/libc/localedata/locales/si_LK index daf618caf..8d8643f78 100644 --- a/libc/localedata/locales/si_LK +++ b/libc/localedata/locales/si_LK @@ -85,46 +85,46 @@ LC_TIME % % Abbreviated weekday names (%a) abday "<U0D89>";"<U0DC3>";/ - "<U0D85>";"<U0DB6>";/ - "<U0DB6><U0DCA><U200D><U0DBB>";"<U0DC3><U0DD2>";/ - "<U0DC3><U0DD9>" + "<U0D85>";"<U0DB6>";/ + "<U0DB6><U0DCA><U200D><U0DBB>";"<U0DC3><U0DD2>";/ + "<U0DC3><U0DD9>" % % Full weekday names (%A) day "<U0D89><U0DBB><U0DD2><U0DAF><U0DCF>";/ - "<U0DC3><U0DB3><U0DD4><U0DAF><U0DCF>";/ - "<U0D85><U0D9F><U0DC4><U0DBB><U0DD4><U0DC0><U0DCF><U0DAF><U0DCF>";/ - "<U0DB6><U0DAF><U0DCF><U0DAF><U0DCF>";/ - "<U0DB6><U0DCA><U200D><U0DBB><U0DC4><U0DC3><U0DCA><U0DB4><U0DAD><U0DD2><U0DB1><U0DCA><U0DAF><U0DCF>";/ + "<U0DC3><U0DB3><U0DD4><U0DAF><U0DCF>";/ + "<U0D85><U0D9F><U0DC4><U0DBB><U0DD4><U0DC0><U0DCF><U0DAF><U0DCF>";/ + "<U0DB6><U0DAF><U0DCF><U0DAF><U0DCF>";/ + "<U0DB6><U0DCA><U200D><U0DBB><U0DC4><U0DC3><U0DCA><U0DB4><U0DAD><U0DD2><U0DB1><U0DCA><U0DAF><U0DCF>";/ "<U0DC3><U0DD2><U0D9A><U0DD4><U0DBB><U0DCF><U0DAF><U0DCF>";/ - "<U0DC3><U0DD9><U0DB1><U0DC3><U0DD4><U0DBB><U0DCF><U0DAF><U0DCF>" + "<U0DC3><U0DD9><U0DB1><U0DC3><U0DD4><U0DBB><U0DCF><U0DAF><U0DCF>" % % Abbreviated month names (%b) abmon "<U0DA2><U0DB1>";/ - "<U0DB4><U0DD9><U0DB6>";/ - "<U0DB8><U0DCF><U0DBB><U0DCA>";/ - "<U0D85><U0DB4><U0DCA><U200D><U0DBB><U0DD2>";/ - "<U0DB8><U0DD0><U0DBA><U0DD2>";/ - "<U0DA2><U0DD6><U0DB1><U0DD2>";/ - "<U0DA2><U0DD6><U0DBD><U0DD2>";/ - "<U0D85><U0D9C><U0DDD>";/ - "<U0DC3><U0DD0><U0DB4><U0DCA>";/ - "<U0D94><U0D9A><U0DCA>";/ - "<U0DB1><U0DD9><U0DC0><U0DD0>";/ - "<U0DAF><U0DD9><U0DC3><U0DD0>" + "<U0DB4><U0DD9><U0DB6>";/ + "<U0DB8><U0DCF><U0DBB><U0DCA>";/ + "<U0D85><U0DB4><U0DCA><U200D><U0DBB><U0DD2>";/ + "<U0DB8><U0DD0><U0DBA><U0DD2>";/ + "<U0DA2><U0DD6><U0DB1><U0DD2>";/ + "<U0DA2><U0DD6><U0DBD><U0DD2>";/ + "<U0D85><U0D9C><U0DDD>";/ + "<U0DC3><U0DD0><U0DB4><U0DCA>";/ + "<U0D94><U0D9A><U0DCA>";/ + "<U0DB1><U0DD9><U0DC0><U0DD0>";/ + "<U0DAF><U0DD9><U0DC3><U0DD0>" % % Full month names (%B) mon "<U0DA2><U0DB1><U0DC0><U0DCF><U0DBB><U0DD2>";/ "<U0DB4><U0DD9><U0DB6><U0DBB><U0DC0><U0DCF><U0DBB><U0DD2>";/ - "<U0DB8><U0DCF><U0DBB><U0DCA><U0DAD><U0DD4>";/ - "<U0D85><U0DB4><U0DCA><U200D><U0DBB><U0DD2><U0DBA><U0DD9><U0DBD><U0DCA>";/ - "<U0DB8><U0DD0><U0DBA><U0DD2>";/ - "<U0DA2><U0DD6><U0DB1><U0DD2>";/ - "<U0DA2><U0DD6><U0DBD><U0DD2>";/ - "<U0D85><U0D9C><U0DDD><U0DC3><U0DCA><U0DAD><U0DD4>";/ - "<U0DC3><U0DD0><U0DB4><U0DCA><U0DAD><U0DD0><U0DB8><U0DCA><U0DB6><U0DBB><U0DCA>";/ - "<U0D94><U0D9A><U0DCA><U0DAD><U0DDD><U0DB6><U0DBB><U0DCA>";/ - "<U0DB1><U0DDC><U0DC0><U0DD0><U0DB8><U0DCA><U0DB6><U0DBB><U0DCA>";/ - "<U0DAF><U0DD9><U0DC3><U0DD0><U0DB8><U0DCA><U0DB6><U0DBB><U0DCA>" + "<U0DB8><U0DCF><U0DBB><U0DCA><U0DAD><U0DD4>";/ + "<U0D85><U0DB4><U0DCA><U200D><U0DBB><U0DD2><U0DBA><U0DD9><U0DBD><U0DCA>";/ + "<U0DB8><U0DD0><U0DBA><U0DD2>";/ + "<U0DA2><U0DD6><U0DB1><U0DD2>";/ + "<U0DA2><U0DD6><U0DBD><U0DD2>";/ + "<U0D85><U0D9C><U0DDD><U0DC3><U0DCA><U0DAD><U0DD4>";/ + "<U0DC3><U0DD0><U0DB4><U0DCA><U0DAD><U0DD0><U0DB8><U0DCA><U0DB6><U0DBB><U0DCA>";/ + "<U0D94><U0D9A><U0DCA><U0DAD><U0DDD><U0DB6><U0DBB><U0DCA>";/ + "<U0DB1><U0DDC><U0DC0><U0DD0><U0DB8><U0DCA><U0DB6><U0DBB><U0DCA>";/ + "<U0DAF><U0DD9><U0DC3><U0DD0><U0DB8><U0DCA><U0DB6><U0DBB><U0DCA>" % % Equivalent of AM PM am_pm "<U0DB4><U0DD9><U002E><U0DC0><U002E>";"<U0DB4><U002E><U0DC0><U002E>" @@ -196,6 +196,10 @@ LC_ADDRESS % postal_fmt "<U0025><U007A><U0025><U0063><U0025><U0054><U0025><U0073><U0025><U0062><U0025><U0065><U0025><U0072>" +country_ab2 "<U004C><U004B>" +country_ab3 "<U004C><U004B><U0041>" +country_num 144 + END LC_ADDRESS diff --git a/libc/localedata/locales/sv_SE b/libc/localedata/locales/sv_SE index f558e2471..2dbc85364 100644 --- a/libc/localedata/locales/sv_SE +++ b/libc/localedata/locales/sv_SE @@ -169,9 +169,9 @@ mon "<U006A><U0061><U006E><U0075><U0061><U0072><U0069>";/ "<U006F><U006B><U0074><U006F><U0062><U0065><U0072>";/ "<U006E><U006F><U0076><U0065><U006D><U0062><U0065><U0072>";/ "<U0064><U0065><U0063><U0065><U006D><U0062><U0065><U0072>" -d_t_fmt "<U0025><U0061><U0020><U0025><U0065><U0020><U0025><U0062><U0020><U0025><U0059><U0020><U0025><U0048><U002E><U0025><U004D><U002E><U0025><U0053>" +d_t_fmt "<U0025><U0061><U0020><U0025><U0065><U0020><U0025><U0062><U0020><U0025><U0059><U0020><U0025><U0048><U003A><U0025><U004D><U003A><U0025><U0053>" d_fmt "<U0025><U0059><U002D><U0025><U006D><U002D><U0025><U0064>" -t_fmt "<U0025><U0048><U002E><U0025><U004D><U002E><U0025><U0053>" +t_fmt "<U0025><U0048><U003A><U0025><U004D><U003A><U0025><U0053>" am_pm "";"" t_fmt_ampm "" date_fmt "<U0025><U0061><U0020><U0025><U0062><U0020><U0025><U0065>/ diff --git a/libc/localedata/locales/ta_LK b/libc/localedata/locales/ta_LK new file mode 100644 index 000000000..80b2f69a7 --- /dev/null +++ b/libc/localedata/locales/ta_LK @@ -0,0 +1,85 @@ +comment_char % +escape_char / +% +% Tamil language locale for Sri Lanka +% Language: ta +% Territory: LK +% Revision: 1.0 +% Date: 2011,August,11 +% Application: general +% Users: general +% Charset: SLS 1326:2008 +% Distribution and use is free, also +% for commercial purposes. + +LC_IDENTIFICATION +title "Tamil language locale for Sri Lanka" +source "J.Yogaraj" +address "30/36Q -2/1,Charles Apartments, De Silva Cross Rd,/ + Kalubowila,Dehiwela,SriLanka." +contact "94-777-315206" +email "yogaraj.ubuntu@gmail.com" +tel "94-112-765773" +fax "" +language "Tamil" +territory "Sri Lanka" +revision "1.0" +date "2011,August,11" +% +category "ta_LK:2000";LC_IDENTIFICATION +category "ta_LK:2000";LC_CTYPE +category "ta_LK:2000";LC_COLLATE +category "ta_LK:2000";LC_TIME +category "ta_LK:2000";LC_NUMERIC +category "ta_LK:2000";LC_MONETARY +category "ta_LK:2000";LC_MESSAGES +category "ta_LK:2000";LC_PAPER +category "ta_LK:2000";LC_NAME +category "ta_LK:2000";LC_ADDRESS +category "ta_LK:2000";LC_TELEPHONE + +END LC_IDENTIFICATION + +LC_COLLATE +copy "ta_IN" +END LC_COLLATE + +LC_CTYPE +copy "ta_IN" +END LC_CTYPE + +LC_MESSAGES +copy "ta_IN" +END LC_MESSAGES + +LC_MONETARY +copy "ta_IN" +END LC_MONETARY + +LC_NUMERIC +copy "ta_IN" +END LC_NUMERIC + +LC_TIME +copy "ta_IN" +END LC_TIME + +LC_PAPER +copy "si_LK" +END LC_PAPER + +LC_TELEPHONE +copy "si_LK" +END LC_TELEPHONE + +LC_MEASUREMENT +copy "si_LK" +END LC_MEASUREMENT + +LC_NAME +copy "ta_IN" +END LC_NAME + +LC_ADDRESS +copy "si_LK" +END LC_ADDRESS diff --git a/libc/localedata/locales/wal_ET b/libc/localedata/locales/wal_ET index 33953cd60..7846531f4 100644 --- a/libc/localedata/locales/wal_ET +++ b/libc/localedata/locales/wal_ET @@ -110,7 +110,6 @@ country_num 231 % 210 found in at least one ISO 3166 doc % country_car unknown % country_isbn unknown, Need ISO 2108 lang_name "<U12C8><U120B><U12ED><U1273><U1271>" -lang_ab "<U0077><U0061><U006C>" lang_term "<U0077><U0061><U006C>" lang_lib "<U0077><U0061><U006C>" @@ -144,12 +143,12 @@ LC_TIME % Abbreviated weekday names (%a) % abday "<U12C8><U130B> ";/ - "<U1233><U12ED><U1296>";/ - "<U121B><U1246><U1233>";/ - "<U12A0><U1229><U12CB>";/ - "<U1203><U1219><U1233>";/ - "<U12A0><U122D><U1263>";/ - "<U1244><U122B> " + "<U1233><U12ED><U1296>";/ + "<U121B><U1246><U1233>";/ + "<U12A0><U1229><U12CB>";/ + "<U1203><U1219><U1233>";/ + "<U12A0><U122D><U1263>";/ + "<U1244><U122B> " % % Full weekday names (%A) % @@ -169,32 +168,32 @@ day "<U12C8><U130B>";/ % Abbreviated month names (%b) % abmon "<U1303><U1295><U12E9>";/ - "<U134C><U1265><U1229>";/ - "<U121B><U122D><U127D>";/ - "<U12A4><U1355><U1228>";/ - "<U121C><U12ED><U0020>";/ - "<U1301><U1295><U0020>";/ - "<U1301><U120B><U12ED>";/ - "<U12A6><U1308><U1235>";/ - "<U1234><U1355><U1274>";/ - "<U12A6><U12AD><U1270>";/ - "<U1296><U126C><U121D>";/ - "<U12F2><U1234><U121D>" + "<U134C><U1265><U1229>";/ + "<U121B><U122D><U127D>";/ + "<U12A4><U1355><U1228>";/ + "<U121C><U12ED><U0020>";/ + "<U1301><U1295><U0020>";/ + "<U1301><U120B><U12ED>";/ + "<U12A6><U1308><U1235>";/ + "<U1234><U1355><U1274>";/ + "<U12A6><U12AD><U1270>";/ + "<U1296><U126C><U121D>";/ + "<U12F2><U1234><U121D>" % % Full month names (%B) % mon "<U1303><U1295><U12E9><U12C8><U122A>";/ - "<U134C><U1265><U1229><U12C8><U122A>";/ - "<U121B><U122D><U127D>";/ - "<U12A4><U1355><U1228><U120D>";/ - "<U121C><U12ED>";/ - "<U1301><U1295>";/ - "<U1301><U120B><U12ED>";/ - "<U12A6><U1308><U1235><U1275>";/ - "<U1234><U1355><U1274><U121D><U1260><U122D>";/ - "<U12A6><U12AD><U1270><U12CD><U1260><U122D>";/ - "<U1296><U126C><U121D><U1260><U122D>";/ - "<U12F2><U1234><U121D><U1260><U122D>" + "<U134C><U1265><U1229><U12C8><U122A>";/ + "<U121B><U122D><U127D>";/ + "<U12A4><U1355><U1228><U120D>";/ + "<U121C><U12ED>";/ + "<U1301><U1295>";/ + "<U1301><U120B><U12ED>";/ + "<U12A6><U1308><U1235><U1275>";/ + "<U1234><U1355><U1274><U121D><U1260><U122D>";/ + "<U12A6><U12AD><U1270><U12CD><U1260><U122D>";/ + "<U1296><U126C><U121D><U1260><U122D>";/ + "<U12F2><U1234><U121D><U1260><U122D>" % % Equivalent of AM PM % @@ -202,7 +201,7 @@ mon "<U1303><U1295><U12E9><U12C8><U122A>";/ % also <U12A1><U1218><U122D><U1232> % am_pm "<U121B><U1208><U12F6>";/ - "<U1243><U121B>" + "<U1243><U121B>" % % Appropriate date representation (%x) % "%d/%m/%Y" diff --git a/libc/nptl/ChangeLog b/libc/nptl/ChangeLog index 31177bf17..e8ff69ab0 100644 --- a/libc/nptl/ChangeLog +++ b/libc/nptl/ChangeLog @@ -1,3 +1,18 @@ +2011-12-22 Ulrich Drepper <drepper@gmail.com> + + * sysdeps/pthread/gai_misc.h (__gai_create_helper_thread): Use + __pthread_get_minstack. + * sysdeps/unix/sysv/linux/mq_notify.c (init_mq_netlink): Likewise. + + [BZ #13088] + * sysdeps/unix/sysv/linux/timer_routines.c: Get minimum stack size + through __pthread_get_minstack. + * nptl-init.c (__pthread_initialize_minimal_internal): Get page size + directly from _rtld_global_ro. + (__pthread_get_minstack): New function. + * pthreadP.h: Declare __pthread_get_minstack. + * Versions (libpthread) [GLIBC_PRIVATE]: Add __pthread_get_minstack. + 2011-12-21 Ulrich Drepper <drepper@gmail.com> [BZ #13515] diff --git a/libc/nptl/Versions b/libc/nptl/Versions index 5a884202f..6a1037550 100644 --- a/libc/nptl/Versions +++ b/libc/nptl/Versions @@ -255,6 +255,6 @@ libpthread { GLIBC_PRIVATE { __pthread_initialize_minimal; __pthread_clock_gettime; __pthread_clock_settime; - __pthread_unwind; + __pthread_unwind; __pthread_get_minstack; } } diff --git a/libc/nptl/nptl-init.c b/libc/nptl/nptl-init.c index db45cab23..434922446 100644 --- a/libc/nptl/nptl-init.c +++ b/libc/nptl/nptl-init.c @@ -427,7 +427,7 @@ __pthread_initialize_minimal_internal (void) /* Make sure it meets the minimum size that allocate_stack (allocatestack.c) will demand, which depends on the page size. */ - const uintptr_t pagesz = __sysconf (_SC_PAGESIZE); + const uintptr_t pagesz = GLRO(dl_pagesize); const size_t minstack = pagesz + __static_tls_size + MINIMAL_REST_STACK; if (limit.rlim_cur < minstack) limit.rlim_cur = minstack; @@ -469,3 +469,13 @@ __pthread_initialize_minimal_internal (void) } strong_alias (__pthread_initialize_minimal_internal, __pthread_initialize_minimal) + + +size_t +__pthread_get_minstack (const pthread_attr_t *attr) +{ + struct pthread_attr *iattr = (struct pthread_attr *) attr; + + return (GLRO(dl_pagesize) + __static_tls_size + PTHREAD_STACK_MIN + + iattr->guardsize); +} diff --git a/libc/nptl/pthreadP.h b/libc/nptl/pthreadP.h index df4f4d769..845434e50 100644 --- a/libc/nptl/pthreadP.h +++ b/libc/nptl/pthreadP.h @@ -397,6 +397,7 @@ weak_function; extern void __pthread_init_static_tls (struct link_map *) attribute_hidden; +extern size_t __pthread_get_minstack (const pthread_attr_t *attr); /* Namespace save aliases. */ extern int __pthread_getschedparam (pthread_t thread_id, int *policy, diff --git a/libc/nptl/sysdeps/pthread/gai_misc.h b/libc/nptl/sysdeps/pthread/gai_misc.h index 9094c1e37..cbbe47657 100644 --- a/libc/nptl/sysdeps/pthread/gai_misc.h +++ b/libc/nptl/sysdeps/pthread/gai_misc.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2006, 2007, 2008 Free Software Foundation, Inc. +/* Copyright (C) 2006, 2007, 2008, 2011 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -97,7 +97,9 @@ __gai_create_helper_thread (pthread_t *threadp, void *(*tf) (void *), pthread_attr_setdetachstate (&attr, PTHREAD_CREATE_DETACHED); /* The helper thread needs only very little resources. */ - (void) pthread_attr_setstacksize (&attr, 4 * PTHREAD_STACK_MIN); + (void) pthread_attr_setstacksize (&attr, + __pthread_get_minstack (&attr) + + 4 * PTHREAD_STACK_MIN); /* Block all signals in the helper thread. To do this thoroughly we temporarily have to block all signals here. */ diff --git a/libc/nptl/sysdeps/unix/sysv/linux/mq_notify.c b/libc/nptl/sysdeps/unix/sysv/linux/mq_notify.c index 49ddeae05..11ffc328e 100644 --- a/libc/nptl/sysdeps/unix/sysv/linux/mq_notify.c +++ b/libc/nptl/sysdeps/unix/sysv/linux/mq_notify.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2004, 2005, 2008 Free Software Foundation, Inc. +/* Copyright (C) 2004, 2005, 2008, 2011 Free Software Foundation, Inc. This file is part of the GNU C Library. Contribute by Ulrich Drepper <drepper@redhat.com>, 2004. @@ -201,7 +201,7 @@ init_mq_netlink (void) (void) pthread_attr_init (&attr); (void) pthread_attr_setdetachstate (&attr, PTHREAD_CREATE_DETACHED); /* We do not need much stack space, the bare minimum will be enough. */ - (void) pthread_attr_setstacksize (&attr, PTHREAD_STACK_MIN); + (void) pthread_attr_setstacksize (&attr, __pthread_get_minstack (&attr)); /* Temporarily block all signals so that the newly created thread inherits the mask. */ diff --git a/libc/nptl/sysdeps/unix/sysv/linux/timer_routines.c b/libc/nptl/sysdeps/unix/sysv/linux/timer_routines.c index b159316fb..44da8563d 100644 --- a/libc/nptl/sysdeps/unix/sysv/linux/timer_routines.c +++ b/libc/nptl/sysdeps/unix/sysv/linux/timer_routines.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc. +/* Copyright (C) 2003-2007, 2011 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper <drepper@redhat.com>, 2003. @@ -165,7 +165,7 @@ __start_helper_thread (void) and should go away automatically when canceled. */ pthread_attr_t attr; (void) pthread_attr_init (&attr); - (void) pthread_attr_setstacksize (&attr, PTHREAD_STACK_MIN); + (void) pthread_attr_setstacksize (&attr, __pthread_get_minstack (&attr)); /* Block all signals in the helper thread but SIGSETXID. To do this thoroughly we temporarily have to block all signals here. The diff --git a/libc/sysdeps/i386/bits/byteswap.h b/libc/sysdeps/i386/bits/byteswap.h index c246ae86c..ddfb785c6 100644 --- a/libc/sysdeps/i386/bits/byteswap.h +++ b/libc/sysdeps/i386/bits/byteswap.h @@ -1,5 +1,5 @@ /* Macros to swap the order of bytes in integer values. - Copyright (C) 1997, 1998, 2000, 2002, 2003, 2006, 2007, 2008, 2010 + Copyright (C) 1997, 1998, 2000, 2002, 2003, 2006, 2007, 2008, 2010, 2011 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -109,15 +109,15 @@ __bswap_32 (unsigned int __bsx) #if defined __GNUC__ && __GNUC__ >= 2 /* Swap bytes in 64 bit value. */ -#define __bswap_constant_64(x) \ - ((((x) & 0xff00000000000000ull) >> 56) \ - | (((x) & 0x00ff000000000000ull) >> 40) \ - | (((x) & 0x0000ff0000000000ull) >> 24) \ - | (((x) & 0x000000ff00000000ull) >> 8) \ - | (((x) & 0x00000000ff000000ull) << 8) \ - | (((x) & 0x0000000000ff0000ull) << 24) \ - | (((x) & 0x000000000000ff00ull) << 40) \ - | (((x) & 0x00000000000000ffull) << 56)) +# define __bswap_constant_64(x) \ + (__extension__ ((((x) & 0xff00000000000000ull) >> 56) \ + | (((x) & 0x00ff000000000000ull) >> 40) \ + | (((x) & 0x0000ff0000000000ull) >> 24) \ + | (((x) & 0x000000ff00000000ull) >> 8) \ + | (((x) & 0x00000000ff000000ull) << 8) \ + | (((x) & 0x0000000000ff0000ull) << 24) \ + | (((x) & 0x000000000000ff00ull) << 40) \ + | (((x) & 0x00000000000000ffull) << 56))) # define __bswap_64(x) \ (__extension__ \ diff --git a/libc/sysdeps/i386/fpu/bits/fenv.h b/libc/sysdeps/i386/fpu/bits/fenv.h index ef3fcb384..8c00771cc 100644 --- a/libc/sysdeps/i386/fpu/bits/fenv.h +++ b/libc/sysdeps/i386/fpu/bits/fenv.h @@ -1,4 +1,4 @@ -/* Copyright (C) 1997, 1998, 1999, 2000 Free Software Foundation, Inc. +/* Copyright (C) 1997, 1998, 1999, 2000, 2011 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -88,3 +88,51 @@ fenv_t; /* Floating-point environment where none of the exception is masked. */ # define FE_NOMASK_ENV ((__const fenv_t *) -2) #endif + + +#ifdef __USE_EXTERN_INLINES +__BEGIN_DECLS + +/* Optimized versions. */ +extern int __REDIRECT_NTH (__feraiseexcept_renamed, (int), feraiseexcept); +__extern_inline int +__NTH (feraiseexcept (int __excepts)) +{ + if (__builtin_constant_p (__excepts) + && (__excepts & ~(FE_INVALID | FE_DIVBYZERO)) == 0) + { + if ((FE_INVALID & __excepts) != 0) + { + /* One example of a invalid operation is 0.0 / 0.0. */ + float __f = 0.0; + +# ifdef __SSE_MATH__ + __asm__ __volatile__ ("divss %0, %0 " : : "x" (__f)); +# else + __asm__ __volatile__ ("fdiv %%st, %%st(0); fwait" + : "=t" (__f) : "0" (__f)); +# endif + (void) &__f; + } + if ((FE_DIVBYZERO & __excepts) != 0) + { + float __f = 1.0; + float __g = 0.0; + +# ifdef __SSE_MATH__ + __asm__ __volatile__ ("divss %1, %0" : : "x" (__f), "x" (__g)); +# else + __asm__ __volatile__ ("fdivp %%st(1), %%st; fwait" + : "=t" (__f) : "0" (__f), "u" (__g) : "st(1)"); +# endif + (void) &__f; + } + + return 0; + } + + return __feraiseexcept_renamed (__excepts); +} + +__END_DECLS +#endif diff --git a/libc/sysdeps/i386/fpu/fgetexcptflg.c b/libc/sysdeps/i386/fpu/fgetexcptflg.c index 5f60511b6..1a0e6df63 100644 --- a/libc/sysdeps/i386/fpu/fgetexcptflg.c +++ b/libc/sysdeps/i386/fpu/fgetexcptflg.c @@ -1,5 +1,5 @@ /* Store current representation for exceptions. - Copyright (C) 1997,99,2000,01 Free Software Foundation, Inc. + Copyright (C) 1997,99,2000,01,11 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. @@ -19,7 +19,10 @@ 02111-1307 USA. */ #include <fenv.h> -#include <bp-sym.h> +#include <unistd.h> +#include <ldsodefs.h> +#include <dl-procinfo.h> + int __fegetexceptflag (fexcept_t *flagp, int excepts) @@ -31,6 +34,17 @@ __fegetexceptflag (fexcept_t *flagp, int excepts) *flagp = temp & excepts & FE_ALL_EXCEPT; + /* If the CPU supports SSE, we clear the MXCSR as well. */ + if ((GLRO(dl_hwcap) & HWCAP_I386_XMM) != 0) + { + unsigned int sse_exc; + + /* Get the current MXCSR. */ + __asm__ ("stmxcsr %0" : "=m" (*&sse_exc)); + + *flagp |= sse_exc & excepts & FE_ALL_EXCEPT; + } + /* Success. */ return 0; } @@ -38,7 +52,7 @@ __fegetexceptflag (fexcept_t *flagp, int excepts) #include <shlib-compat.h> #if SHLIB_COMPAT (libm, GLIBC_2_1, GLIBC_2_2) strong_alias (__fegetexceptflag, __old_fegetexceptflag) -compat_symbol (libm, BP_SYM (__old_fegetexceptflag), BP_SYM (fegetexceptflag), GLIBC_2_1); +compat_symbol (libm, __old_fegetexceptflag, fegetexceptflag, GLIBC_2_1); #endif -versioned_symbol (libm, BP_SYM (__fegetexceptflag), BP_SYM (fegetexceptflag), GLIBC_2_2); +versioned_symbol (libm, __fegetexceptflag, fegetexceptflag, GLIBC_2_2); diff --git a/libc/sysdeps/i386/i686/multiarch/strcpy-ssse3.S b/libc/sysdeps/i386/i686/multiarch/strcpy-ssse3.S index 073856ff8..470ddbe27 100644 --- a/libc/sysdeps/i386/i686/multiarch/strcpy-ssse3.S +++ b/libc/sysdeps/i386/i686/multiarch/strcpy-ssse3.S @@ -20,6 +20,7 @@ #ifndef NOT_IN_libc + # ifndef USE_AS_STRCAT # include <sysdep.h> @@ -31,8 +32,8 @@ cfi_adjust_cfa_offset (-4); \ cfi_restore (REG) -# define PUSH(REG) pushl REG; CFI_PUSH (REG) -# define POP(REG) popl REG; CFI_POP (REG) +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) # ifndef STRCPY # define STRCPY __strcpy_ssse3 @@ -40,14 +41,22 @@ # ifdef USE_AS_STRNCPY # define PARMS 8 -# define ENTRANCE PUSH(%ebx) -# define RETURN POP(%ebx); ret; CFI_PUSH(%ebx); -# define RETURN1 POP(%edi); POP(%ebx); ret; CFI_PUSH(%ebx); CFI_PUSH(%edi) +# define ENTRANCE PUSH (%ebx) +# define RETURN POP (%ebx); ret; CFI_PUSH (%ebx); +# define RETURN1 POP (%edi); POP (%ebx); ret; CFI_PUSH (%ebx); CFI_PUSH (%edi) # else # define PARMS 4 # define ENTRANCE # define RETURN ret -# define RETURN1 POP(%edi); ret; CFI_PUSH(%edi) +# define RETURN1 POP (%edi); ret; CFI_PUSH (%edi) +# endif + +# ifdef USE_AS_STPCPY +# define SAVE_RESULT(n) lea n(%edx), %eax +# define SAVE_RESULT_TAIL(n) lea n(%edx), %eax +# else +# define SAVE_RESULT(n) movl %edi, %eax +# define SAVE_RESULT_TAIL(n) movl %edx, %eax # endif # define STR1 PARMS @@ -60,9 +69,7 @@ movl - 4 byte movlpd - 8 byte movaps - 16 byte - requires 16 byte alignment - of sourse and destination adresses. - 16 byte alignment: adress is 32bit value, - right four bit of adress shall be 0. + of sourse and destination adresses. */ .text @@ -72,8 +79,6 @@ ENTRY (STRCPY) mov STR2(%esp), %ecx # ifdef USE_AS_STRNCPY movl LEN(%esp), %ebx - test %ebx, %ebx - jz L(ExitTail0) cmp $8, %ebx jbe L(StrncpyExit8Bytes) # endif @@ -127,39 +132,23 @@ ENTRY (STRCPY) sub $16, %ebx and $0xf, %esi -/* add 16 bytes ecx_shift to ebx */ +/* add 16 bytes ecx_offset to ebx */ add %esi, %ebx # endif lea 16(%ecx), %esi -/* Now: - esi = alignment_16(ecx) + ecx_shift + 16; - ecx_shift = ecx - alignment_16(ecx) -*/ and $-16, %esi -/* Now: - esi = alignment_16(ecx) + 16 -*/ pxor %xmm0, %xmm0 movlpd (%ecx), %xmm1 movlpd %xmm1, (%edx) -/* - look if there is zero symbol in next 16 bytes of string - from esi to esi + 15 and form mask in xmm0 -*/ + pcmpeqb (%esi), %xmm0 movlpd 8(%ecx), %xmm1 movlpd %xmm1, 8(%edx) -/* convert byte mask in xmm0 to bit mask */ - pmovmskb %xmm0, %eax sub %ecx, %esi -/* esi = 16 - ecx_shift */ - -/* eax = 0: there isn't end of string from position esi to esi+15 */ - # ifdef USE_AS_STRNCPY sub $16, %ebx jbe L(CopyFrom1To16BytesCase2OrCase3) @@ -169,17 +158,9 @@ ENTRY (STRCPY) mov %edx, %eax lea 16(%edx), %edx -/* Now: - edx = edx + 16 = alignment_16(edx) + edx_shift + 16 -*/ and $-16, %edx - -/* Now: edx = alignment_16(edx) + 16 */ - sub %edx, %eax -/* Now: eax = edx_shift - 16 */ - # ifdef USE_AS_STRNCPY add %eax, %esi lea -1(%esi), %esi @@ -191,22 +172,11 @@ ENTRY (STRCPY) L(ContinueCopy): # endif sub %eax, %ecx -/* Now: - case ecx_shift >= edx_shift: - ecx = alignment_16(ecx) + (ecx_shift - edx_shift) + 16 - case ecx_shift < edx_shift: - ecx = alignment_16(ecx) + (16 + ecx_shift - edx_shift) -*/ mov %ecx, %eax and $0xf, %eax -/* Now: - case ecx_shift >= edx_shift: eax = ecx_shift - edx_shift - case ecx_shift < edx_shift: eax = (16 + ecx_shift - edx_shift) - eax can be 0, 1, ..., 15 -*/ mov $0, %esi -/* case: ecx_shift == edx_shift */ +/* case: ecx_offset == edx_offset */ jz L(Align16Both) @@ -323,7 +293,7 @@ L(Align16Both): sub %ecx, %eax sub %eax, %edx # ifdef USE_AS_STRNCPY - lea 48+64(%ebx, %eax), %ebx + lea 112(%ebx, %eax), %ebx # endif mov $-0x40, %esi @@ -441,7 +411,6 @@ L(Shl1Start): jnz L(Shl1LoopExit) palignr $1, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 31(%ecx), %xmm2 @@ -449,7 +418,6 @@ L(Shl1Start): lea 16(%edx), %edx pmovmskb %xmm0, %eax lea 16(%ecx), %ecx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %ebx jbe L(StrncpyExit1Case2OrCase3) @@ -457,8 +425,7 @@ L(Shl1Start): test %eax, %eax jnz L(Shl1LoopExit) - palignr $1, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $1, %xmm3, %xmm2 movaps %xmm2, (%edx) lea 31(%ecx), %ecx lea 16(%edx), %edx @@ -506,11 +473,11 @@ L(Shl1LoopStart): jmp L(Shl1LoopStart) L(Shl1LoopExit): - movaps (%edx), %xmm6 - psrldq $15, %xmm6 + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 7(%ecx), %xmm0 + movlpd %xmm0, 7(%edx) mov $15, %esi - palignr $1, %xmm1, %xmm6 - movaps %xmm6, (%edx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -563,7 +530,6 @@ L(Shl2Start): jnz L(Shl2LoopExit) palignr $2, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 30(%ecx), %xmm2 @@ -571,7 +537,6 @@ L(Shl2Start): lea 16(%edx), %edx pmovmskb %xmm0, %eax lea 16(%ecx), %ecx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %ebx jbe L(StrncpyExit2Case2OrCase3) @@ -579,8 +544,7 @@ L(Shl2Start): test %eax, %eax jnz L(Shl2LoopExit) - palignr $2, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $2, %xmm3, %xmm2 movaps %xmm2, (%edx) lea 30(%ecx), %ecx lea 16(%edx), %edx @@ -628,11 +592,11 @@ L(Shl2LoopStart): jmp L(Shl2LoopStart) L(Shl2LoopExit): - movaps (%edx), %xmm6 - psrldq $14, %xmm6 + movlpd (%ecx), %xmm0 + movlpd 6(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 6(%edx) mov $14, %esi - palignr $2, %xmm1, %xmm6 - movaps %xmm6, (%edx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -685,7 +649,6 @@ L(Shl3Start): jnz L(Shl3LoopExit) palignr $3, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 29(%ecx), %xmm2 @@ -693,7 +656,6 @@ L(Shl3Start): lea 16(%edx), %edx pmovmskb %xmm0, %eax lea 16(%ecx), %ecx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %ebx jbe L(StrncpyExit3Case2OrCase3) @@ -701,8 +663,7 @@ L(Shl3Start): test %eax, %eax jnz L(Shl3LoopExit) - palignr $3, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $3, %xmm3, %xmm2 movaps %xmm2, (%edx) lea 29(%ecx), %ecx lea 16(%edx), %edx @@ -750,11 +711,11 @@ L(Shl3LoopStart): jmp L(Shl3LoopStart) L(Shl3LoopExit): - movaps (%edx), %xmm6 - psrldq $13, %xmm6 + movlpd (%ecx), %xmm0 + movlpd 5(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 5(%edx) mov $13, %esi - palignr $3, %xmm1, %xmm6 - movaps %xmm6, (%edx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -807,7 +768,6 @@ L(Shl4Start): jnz L(Shl4LoopExit) palignr $4, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 28(%ecx), %xmm2 @@ -815,7 +775,6 @@ L(Shl4Start): lea 16(%edx), %edx pmovmskb %xmm0, %eax lea 16(%ecx), %ecx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %ebx jbe L(StrncpyExit4Case2OrCase3) @@ -823,8 +782,7 @@ L(Shl4Start): test %eax, %eax jnz L(Shl4LoopExit) - palignr $4, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $4, %xmm3, %xmm2 movaps %xmm2, (%edx) lea 28(%ecx), %ecx lea 16(%edx), %edx @@ -872,11 +830,11 @@ L(Shl4LoopStart): jmp L(Shl4LoopStart) L(Shl4LoopExit): - movaps (%edx), %xmm6 - psrldq $12, %xmm6 + movlpd (%ecx), %xmm0 + movl 8(%ecx), %esi + movlpd %xmm0, (%edx) + movl %esi, 8(%edx) mov $12, %esi - palignr $4, %xmm1, %xmm6 - movaps %xmm6, (%edx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -929,7 +887,6 @@ L(Shl5Start): jnz L(Shl5LoopExit) palignr $5, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 27(%ecx), %xmm2 @@ -937,7 +894,6 @@ L(Shl5Start): lea 16(%edx), %edx pmovmskb %xmm0, %eax lea 16(%ecx), %ecx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %ebx jbe L(StrncpyExit5Case2OrCase3) @@ -945,8 +901,7 @@ L(Shl5Start): test %eax, %eax jnz L(Shl5LoopExit) - palignr $5, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $5, %xmm3, %xmm2 movaps %xmm2, (%edx) lea 27(%ecx), %ecx lea 16(%edx), %edx @@ -994,11 +949,11 @@ L(Shl5LoopStart): jmp L(Shl5LoopStart) L(Shl5LoopExit): - movaps (%edx), %xmm6 - psrldq $11, %xmm6 + movlpd (%ecx), %xmm0 + movl 7(%ecx), %esi + movlpd %xmm0, (%edx) + movl %esi, 7(%edx) mov $11, %esi - palignr $5, %xmm1, %xmm6 - movaps %xmm6, (%edx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -1051,7 +1006,6 @@ L(Shl6Start): jnz L(Shl6LoopExit) palignr $6, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 26(%ecx), %xmm2 @@ -1059,7 +1013,6 @@ L(Shl6Start): lea 16(%edx), %edx pmovmskb %xmm0, %eax lea 16(%ecx), %ecx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %ebx jbe L(StrncpyExit6Case2OrCase3) @@ -1067,8 +1020,7 @@ L(Shl6Start): test %eax, %eax jnz L(Shl6LoopExit) - palignr $6, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $6, %xmm3, %xmm2 movaps %xmm2, (%edx) lea 26(%ecx), %ecx lea 16(%edx), %edx @@ -1116,11 +1068,11 @@ L(Shl6LoopStart): jmp L(Shl6LoopStart) L(Shl6LoopExit): - movaps (%edx), %xmm6 - psrldq $10, %xmm6 + movlpd (%ecx), %xmm0 + movl 6(%ecx), %esi + movlpd %xmm0, (%edx) + movl %esi, 6(%edx) mov $10, %esi - palignr $6, %xmm1, %xmm6 - movaps %xmm6, (%edx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -1173,7 +1125,6 @@ L(Shl7Start): jnz L(Shl7LoopExit) palignr $7, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 25(%ecx), %xmm2 @@ -1181,7 +1132,6 @@ L(Shl7Start): lea 16(%edx), %edx pmovmskb %xmm0, %eax lea 16(%ecx), %ecx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %ebx jbe L(StrncpyExit7Case2OrCase3) @@ -1189,8 +1139,7 @@ L(Shl7Start): test %eax, %eax jnz L(Shl7LoopExit) - palignr $7, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $7, %xmm3, %xmm2 movaps %xmm2, (%edx) lea 25(%ecx), %ecx lea 16(%edx), %edx @@ -1238,11 +1187,11 @@ L(Shl7LoopStart): jmp L(Shl7LoopStart) L(Shl7LoopExit): - movaps (%edx), %xmm6 - psrldq $9, %xmm6 + movlpd (%ecx), %xmm0 + movl 5(%ecx), %esi + movlpd %xmm0, (%edx) + movl %esi, 5(%edx) mov $9, %esi - palignr $7, %xmm1, %xmm6 - movaps %xmm6, (%edx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -1295,7 +1244,6 @@ L(Shl8Start): jnz L(Shl8LoopExit) palignr $8, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 24(%ecx), %xmm2 @@ -1303,7 +1251,6 @@ L(Shl8Start): lea 16(%edx), %edx pmovmskb %xmm0, %eax lea 16(%ecx), %ecx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %ebx jbe L(StrncpyExit8Case2OrCase3) @@ -1311,8 +1258,7 @@ L(Shl8Start): test %eax, %eax jnz L(Shl8LoopExit) - palignr $8, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $8, %xmm3, %xmm2 movaps %xmm2, (%edx) lea 24(%ecx), %ecx lea 16(%edx), %edx @@ -1360,11 +1306,9 @@ L(Shl8LoopStart): jmp L(Shl8LoopStart) L(Shl8LoopExit): - movaps (%edx), %xmm6 - psrldq $8, %xmm6 + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) mov $8, %esi - palignr $8, %xmm1, %xmm6 - movaps %xmm6, (%edx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -1417,7 +1361,6 @@ L(Shl9Start): jnz L(Shl9LoopExit) palignr $9, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 23(%ecx), %xmm2 @@ -1425,7 +1368,6 @@ L(Shl9Start): lea 16(%edx), %edx pmovmskb %xmm0, %eax lea 16(%ecx), %ecx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %ebx jbe L(StrncpyExit9Case2OrCase3) @@ -1433,8 +1375,7 @@ L(Shl9Start): test %eax, %eax jnz L(Shl9LoopExit) - palignr $9, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $9, %xmm3, %xmm2 movaps %xmm2, (%edx) lea 23(%ecx), %ecx lea 16(%edx), %edx @@ -1482,11 +1423,9 @@ L(Shl9LoopStart): jmp L(Shl9LoopStart) L(Shl9LoopExit): - movaps (%edx), %xmm6 - psrldq $7, %xmm6 + movlpd -1(%ecx), %xmm0 + movlpd %xmm0, -1(%edx) mov $7, %esi - palignr $9, %xmm1, %xmm6 - movaps %xmm6, (%edx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -1539,7 +1478,6 @@ L(Shl10Start): jnz L(Shl10LoopExit) palignr $10, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 22(%ecx), %xmm2 @@ -1547,7 +1485,6 @@ L(Shl10Start): lea 16(%edx), %edx pmovmskb %xmm0, %eax lea 16(%ecx), %ecx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %ebx jbe L(StrncpyExit10Case2OrCase3) @@ -1555,8 +1492,7 @@ L(Shl10Start): test %eax, %eax jnz L(Shl10LoopExit) - palignr $10, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $10, %xmm3, %xmm2 movaps %xmm2, (%edx) lea 22(%ecx), %ecx lea 16(%edx), %edx @@ -1604,11 +1540,9 @@ L(Shl10LoopStart): jmp L(Shl10LoopStart) L(Shl10LoopExit): - movaps (%edx), %xmm6 - psrldq $6, %xmm6 + movlpd -2(%ecx), %xmm0 + movlpd %xmm0, -2(%edx) mov $6, %esi - palignr $10, %xmm1, %xmm6 - movaps %xmm6, (%edx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -1661,7 +1595,6 @@ L(Shl11Start): jnz L(Shl11LoopExit) palignr $11, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 21(%ecx), %xmm2 @@ -1669,7 +1602,6 @@ L(Shl11Start): lea 16(%edx), %edx pmovmskb %xmm0, %eax lea 16(%ecx), %ecx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %ebx jbe L(StrncpyExit11Case2OrCase3) @@ -1677,8 +1609,7 @@ L(Shl11Start): test %eax, %eax jnz L(Shl11LoopExit) - palignr $11, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $11, %xmm3, %xmm2 movaps %xmm2, (%edx) lea 21(%ecx), %ecx lea 16(%edx), %edx @@ -1726,11 +1657,9 @@ L(Shl11LoopStart): jmp L(Shl11LoopStart) L(Shl11LoopExit): - movaps (%edx), %xmm6 - psrldq $5, %xmm6 + movlpd -3(%ecx), %xmm0 + movlpd %xmm0, -3(%edx) mov $5, %esi - palignr $11, %xmm1, %xmm6 - movaps %xmm6, (%edx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -1783,7 +1712,6 @@ L(Shl12Start): jnz L(Shl12LoopExit) palignr $12, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 20(%ecx), %xmm2 @@ -1791,7 +1719,6 @@ L(Shl12Start): lea 16(%edx), %edx pmovmskb %xmm0, %eax lea 16(%ecx), %ecx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %ebx jbe L(StrncpyExit12Case2OrCase3) @@ -1799,8 +1726,7 @@ L(Shl12Start): test %eax, %eax jnz L(Shl12LoopExit) - palignr $12, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $12, %xmm3, %xmm2 movaps %xmm2, (%edx) lea 20(%ecx), %ecx lea 16(%edx), %edx @@ -1848,11 +1774,9 @@ L(Shl12LoopStart): jmp L(Shl12LoopStart) L(Shl12LoopExit): - movaps (%edx), %xmm6 - psrldq $4, %xmm6 + movl (%ecx), %esi + movl %esi, (%edx) mov $4, %esi - palignr $12, %xmm1, %xmm6 - movaps %xmm6, (%edx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -1905,7 +1829,6 @@ L(Shl13Start): jnz L(Shl13LoopExit) palignr $13, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 19(%ecx), %xmm2 @@ -1913,7 +1836,6 @@ L(Shl13Start): lea 16(%edx), %edx pmovmskb %xmm0, %eax lea 16(%ecx), %ecx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %ebx jbe L(StrncpyExit13Case2OrCase3) @@ -1921,8 +1843,7 @@ L(Shl13Start): test %eax, %eax jnz L(Shl13LoopExit) - palignr $13, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $13, %xmm3, %xmm2 movaps %xmm2, (%edx) lea 19(%ecx), %ecx lea 16(%edx), %edx @@ -1970,11 +1891,9 @@ L(Shl13LoopStart): jmp L(Shl13LoopStart) L(Shl13LoopExit): - movaps (%edx), %xmm6 - psrldq $3, %xmm6 + movl -1(%ecx), %esi + movl %esi, -1(%edx) mov $3, %esi - palignr $13, %xmm1, %xmm6 - movaps %xmm6, (%edx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -2027,7 +1946,6 @@ L(Shl14Start): jnz L(Shl14LoopExit) palignr $14, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 18(%ecx), %xmm2 @@ -2035,7 +1953,6 @@ L(Shl14Start): lea 16(%edx), %edx pmovmskb %xmm0, %eax lea 16(%ecx), %ecx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %ebx jbe L(StrncpyExit14Case2OrCase3) @@ -2043,8 +1960,7 @@ L(Shl14Start): test %eax, %eax jnz L(Shl14LoopExit) - palignr $14, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $14, %xmm3, %xmm2 movaps %xmm2, (%edx) lea 18(%ecx), %ecx lea 16(%edx), %edx @@ -2092,11 +2008,9 @@ L(Shl14LoopStart): jmp L(Shl14LoopStart) L(Shl14LoopExit): - movaps (%edx), %xmm6 - psrldq $2, %xmm6 + movl -2(%ecx), %esi + movl %esi, -2(%edx) mov $2, %esi - palignr $14, %xmm1, %xmm6 - movaps %xmm6, (%edx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -2149,7 +2063,6 @@ L(Shl15Start): jnz L(Shl15LoopExit) palignr $15, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 17(%ecx), %xmm2 @@ -2157,7 +2070,6 @@ L(Shl15Start): lea 16(%edx), %edx pmovmskb %xmm0, %eax lea 16(%ecx), %ecx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %ebx jbe L(StrncpyExit15Case2OrCase3) @@ -2165,8 +2077,7 @@ L(Shl15Start): test %eax, %eax jnz L(Shl15LoopExit) - palignr $15, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $15, %xmm3, %xmm2 movaps %xmm2, (%edx) lea 17(%ecx), %ecx lea 16(%edx), %edx @@ -2214,15 +2125,14 @@ L(Shl15LoopStart): jmp L(Shl15LoopStart) L(Shl15LoopExit): - movaps (%edx), %xmm6 - psrldq $1, %xmm6 + movl -3(%ecx), %esi + movl %esi, -3(%edx) mov $1, %esi - palignr $15, %xmm1, %xmm6 - movaps %xmm6, (%edx) # ifdef USE_AS_STRCAT jmp L(CopyFrom1To16Bytes) # endif + # ifndef USE_AS_STRCAT .p2align 4 @@ -2235,15 +2145,38 @@ L(CopyFrom1To16Bytes): POP (%esi) test %al, %al - jz L(ExitHigh) + jz L(ExitHigh8) + +L(CopyFrom1To16BytesLess8): + mov %al, %ah + and $15, %ah + jz L(ExitHigh4) + test $0x01, %al jnz L(Exit1) test $0x02, %al jnz L(Exit2) test $0x04, %al jnz L(Exit3) - test $0x08, %al - jnz L(Exit4) + + .p2align 4 +L(Exit4): + movl (%ecx), %eax + movl %eax, (%edx) + SAVE_RESULT (3) +# ifdef USE_AS_STRNCPY + sub $4, %ebx + lea 4(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(ExitHigh4): test $0x10, %al jnz L(Exit5) test $0x20, %al @@ -2255,11 +2188,7 @@ L(CopyFrom1To16Bytes): L(Exit8): movlpd (%ecx), %xmm0 movlpd %xmm0, (%edx) -# ifdef USE_AS_STPCPY - lea 7(%edx), %eax -# else - movl %edi, %eax -# endif + SAVE_RESULT (7) # ifdef USE_AS_STRNCPY sub $8, %ebx lea 8(%edx), %ecx @@ -2272,15 +2201,38 @@ L(Exit8): RETURN1 .p2align 4 -L(ExitHigh): +L(ExitHigh8): + mov %ah, %al + and $15, %al + jz L(ExitHigh12) + test $0x01, %ah jnz L(Exit9) test $0x02, %ah jnz L(Exit10) test $0x04, %ah jnz L(Exit11) - test $0x08, %ah - jnz L(Exit12) + + .p2align 4 +L(Exit12): + movlpd (%ecx), %xmm0 + movl 8(%ecx), %eax + movlpd %xmm0, (%edx) + movl %eax, 8(%edx) + SAVE_RESULT (11) +# ifdef USE_AS_STRNCPY + sub $12, %ebx + lea 12(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(ExitHigh12): test $0x10, %ah jnz L(Exit13) test $0x20, %ah @@ -2290,15 +2242,9 @@ L(ExitHigh): .p2align 4 L(Exit16): - movlpd (%ecx), %xmm0 - movlpd %xmm0, (%edx) - movlpd 8(%ecx), %xmm0 - movlpd %xmm0, 8(%edx) -# ifdef USE_AS_STPCPY - lea 15(%edx), %eax -# else - movl %edi, %eax -# endif + movdqu (%ecx), %xmm0 + movdqu %xmm0, (%edx) + SAVE_RESULT (15) # ifdef USE_AS_STRNCPY sub $16, %ebx lea 16(%edx), %ecx @@ -2310,7 +2256,7 @@ L(Exit16): # endif RETURN1 -# ifdef USE_AS_STRNCPY +# ifdef USE_AS_STRNCPY CFI_PUSH(%esi) @@ -2318,79 +2264,84 @@ L(Exit16): L(CopyFrom1To16BytesCase2): add $16, %ebx add %esi, %ecx - lea (%esi, %edx), %esi - lea -9(%ebx), %edx - and $1<<7, %dh - or %al, %dh - test %dh, %dh - lea (%esi), %edx + add %esi, %edx + POP (%esi) + + test %al, %al jz L(ExitHighCase2) - cmp $1, %ebx - je L(Exit1) + cmp $8, %ebx + ja L(CopyFrom1To16BytesLess8) + test $0x01, %al jnz L(Exit1) - cmp $2, %ebx - je L(Exit2) + cmp $1, %ebx + je L(Exit1) test $0x02, %al jnz L(Exit2) - cmp $3, %ebx - je L(Exit3) + cmp $2, %ebx + je L(Exit2) test $0x04, %al jnz L(Exit3) - cmp $4, %ebx - je L(Exit4) + cmp $3, %ebx + je L(Exit3) test $0x08, %al jnz L(Exit4) - cmp $5, %ebx - je L(Exit5) + cmp $4, %ebx + je L(Exit4) test $0x10, %al jnz L(Exit5) - cmp $6, %ebx - je L(Exit6) + cmp $5, %ebx + je L(Exit5) test $0x20, %al jnz L(Exit6) - cmp $7, %ebx - je L(Exit7) + cmp $6, %ebx + je L(Exit6) test $0x40, %al jnz L(Exit7) + cmp $7, %ebx + je L(Exit7) jmp L(Exit8) .p2align 4 L(ExitHighCase2): - cmp $9, %ebx - je L(Exit9) + cmp $8, %ebx + jbe L(CopyFrom1To16BytesLess8Case3) + test $0x01, %ah jnz L(Exit9) - cmp $10, %ebx - je L(Exit10) + cmp $9, %ebx + je L(Exit9) test $0x02, %ah jnz L(Exit10) - cmp $11, %ebx - je L(Exit11) + cmp $10, %ebx + je L(Exit10) test $0x04, %ah jnz L(Exit11) - cmp $12, %ebx - je L(Exit12) + cmp $11, %ebx + je L(Exit11) test $0x8, %ah jnz L(Exit12) - cmp $13, %ebx - je L(Exit13) + cmp $12, %ebx + je L(Exit12) test $0x10, %ah jnz L(Exit13) - cmp $14, %ebx - je L(Exit14) + cmp $13, %ebx + je L(Exit13) test $0x20, %ah jnz L(Exit14) - cmp $15, %ebx - je L(Exit15) + cmp $14, %ebx + je L(Exit14) test $0x40, %ah jnz L(Exit15) + cmp $15, %ebx + je L(Exit15) jmp L(Exit16) CFI_PUSH(%esi) + .p2align 4 L(CopyFrom1To16BytesCase2OrCase3): test %eax, %eax jnz L(CopyFrom1To16BytesCase2) @@ -2402,47 +2353,78 @@ L(CopyFrom1To16BytesCase3): add %esi, %ecx POP (%esi) - cmp $16, %ebx - je L(Exit16) + cmp $8, %ebx - je L(Exit8) - jg L(More8Case3) + ja L(ExitHigh8Case3) + +L(CopyFrom1To16BytesLess8Case3): cmp $4, %ebx - je L(Exit4) - jg L(More4Case3) + ja L(ExitHigh4Case3) + + cmp $1, %ebx + je L(Exit1) cmp $2, %ebx - jl L(Exit1) je L(Exit2) - jg L(Exit3) -L(More8Case3): /* but less than 16 */ - cmp $12, %ebx - je L(Exit12) - jl L(Less12Case3) - cmp $14, %ebx - jl L(Exit13) - je L(Exit14) - jg L(Exit15) -L(More4Case3): /* but less than 8 */ + cmp $3, %ebx + je L(Exit3) + movl (%ecx), %eax + movl %eax, (%edx) + SAVE_RESULT (4) + RETURN1 + + .p2align 4 +L(ExitHigh4Case3): + cmp $5, %ebx + je L(Exit5) cmp $6, %ebx - jl L(Exit5) je L(Exit6) - jg L(Exit7) -L(Less12Case3): /* but more than 8 */ + cmp $7, %ebx + je L(Exit7) + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + SAVE_RESULT (8) + RETURN1 + + .p2align 4 +L(ExitHigh8Case3): + cmp $12, %ebx + ja L(ExitHigh12Case3) + + cmp $9, %ebx + je L(Exit9) cmp $10, %ebx - jl L(Exit9) je L(Exit10) - jg L(Exit11) + cmp $11, %ebx + je L(Exit11) + movlpd (%ecx), %xmm0 + movl 8(%ecx), %eax + movlpd %xmm0, (%edx) + movl %eax, 8(%edx) + SAVE_RESULT (12) + RETURN1 + + .p2align 4 +L(ExitHigh12Case3): + cmp $13, %ebx + je L(Exit13) + cmp $14, %ebx + je L(Exit14) + cmp $15, %ebx + je L(Exit15) + movlpd (%ecx), %xmm0 + movlpd 8(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 8(%edx) + SAVE_RESULT (16) + RETURN1 + # endif .p2align 4 L(Exit1): movb (%ecx), %al movb %al, (%edx) -# ifdef USE_AS_STPCPY - lea (%edx), %eax -# else - movl %edi, %eax -# endif + SAVE_RESULT (0) # ifdef USE_AS_STRNCPY sub $1, %ebx lea 1(%edx), %ecx @@ -2458,11 +2440,7 @@ L(Exit1): L(Exit2): movw (%ecx), %ax movw %ax, (%edx) -# ifdef USE_AS_STPCPY - lea 1(%edx), %eax -# else - movl %edi, %eax -# endif + SAVE_RESULT (1) # ifdef USE_AS_STRNCPY sub $2, %ebx lea 2(%edx), %ecx @@ -2480,11 +2458,7 @@ L(Exit3): movw %ax, (%edx) movb 2(%ecx), %al movb %al, 2(%edx) -# ifdef USE_AS_STPCPY - lea 2(%edx), %eax -# else - movl %edi, %eax -# endif + SAVE_RESULT (2) # ifdef USE_AS_STRNCPY sub $3, %ebx lea 3(%edx), %ecx @@ -2497,36 +2471,12 @@ L(Exit3): RETURN1 .p2align 4 -L(Exit4): - movl (%ecx), %eax - movl %eax, (%edx) -# ifdef USE_AS_STPCPY - lea 3(%edx), %eax -# else - movl %edi, %eax -# endif -# ifdef USE_AS_STRNCPY - sub $4, %ebx - lea 4(%edx), %ecx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%eax) - sbb $-1, %eax -# endif -# endif - RETURN1 - - .p2align 4 L(Exit5): movl (%ecx), %eax movl %eax, (%edx) movb 4(%ecx), %al movb %al, 4(%edx) -# ifdef USE_AS_STPCPY - lea 4(%edx), %eax -# else - movl %edi, %eax -# endif + SAVE_RESULT (4) # ifdef USE_AS_STRNCPY sub $5, %ebx lea 5(%edx), %ecx @@ -2544,11 +2494,7 @@ L(Exit6): movl %eax, (%edx) movw 4(%ecx), %ax movw %ax, 4(%edx) -# ifdef USE_AS_STPCPY - lea 5(%edx), %eax -# else - movl %edi, %eax -# endif + SAVE_RESULT (5) # ifdef USE_AS_STRNCPY sub $6, %ebx lea 6(%edx), %ecx @@ -2566,11 +2512,7 @@ L(Exit7): movl %eax, (%edx) movl 3(%ecx), %eax movl %eax, 3(%edx) -# ifdef USE_AS_STPCPY - lea 6(%edx), %eax -# else - movl %edi, %eax -# endif + SAVE_RESULT (6) # ifdef USE_AS_STRNCPY sub $7, %ebx lea 7(%edx), %ecx @@ -2585,14 +2527,10 @@ L(Exit7): .p2align 4 L(Exit9): movlpd (%ecx), %xmm0 - movlpd %xmm0, (%edx) movb 8(%ecx), %al + movlpd %xmm0, (%edx) movb %al, 8(%edx) -# ifdef USE_AS_STPCPY - lea 8(%edx), %eax -# else - movl %edi, %eax -# endif + SAVE_RESULT (8) # ifdef USE_AS_STRNCPY sub $9, %ebx lea 9(%edx), %ecx @@ -2607,14 +2545,10 @@ L(Exit9): .p2align 4 L(Exit10): movlpd (%ecx), %xmm0 - movlpd %xmm0, (%edx) movw 8(%ecx), %ax + movlpd %xmm0, (%edx) movw %ax, 8(%edx) -# ifdef USE_AS_STPCPY - lea 9(%edx), %eax -# else - movl %edi, %eax -# endif + SAVE_RESULT (9) # ifdef USE_AS_STRNCPY sub $10, %ebx lea 10(%edx), %ecx @@ -2629,14 +2563,10 @@ L(Exit10): .p2align 4 L(Exit11): movlpd (%ecx), %xmm0 - movlpd %xmm0, (%edx) movl 7(%ecx), %eax + movlpd %xmm0, (%edx) movl %eax, 7(%edx) -# ifdef USE_AS_STPCPY - lea 10(%edx), %eax -# else - movl %edi, %eax -# endif + SAVE_RESULT (10) # ifdef USE_AS_STRNCPY sub $11, %ebx lea 11(%edx), %ecx @@ -2649,38 +2579,12 @@ L(Exit11): RETURN1 .p2align 4 -L(Exit12): - movlpd (%ecx), %xmm0 - movlpd %xmm0, (%edx) - movl 8(%ecx), %eax - movl %eax, 8(%edx) -# ifdef USE_AS_STPCPY - lea 11(%edx), %eax -# else - movl %edi, %eax -# endif -# ifdef USE_AS_STRNCPY - sub $12, %ebx - lea 12(%edx), %ecx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%eax) - sbb $-1, %eax -# endif -# endif - RETURN1 - - .p2align 4 L(Exit13): movlpd (%ecx), %xmm0 + movlpd 5(%ecx), %xmm1 movlpd %xmm0, (%edx) - movlpd 5(%ecx), %xmm0 - movlpd %xmm0, 5(%edx) -# ifdef USE_AS_STPCPY - lea 12(%edx), %eax -# else - movl %edi, %eax -# endif + movlpd %xmm1, 5(%edx) + SAVE_RESULT (12) # ifdef USE_AS_STRNCPY sub $13, %ebx lea 13(%edx), %ecx @@ -2695,14 +2599,10 @@ L(Exit13): .p2align 4 L(Exit14): movlpd (%ecx), %xmm0 + movlpd 6(%ecx), %xmm1 movlpd %xmm0, (%edx) - movlpd 6(%ecx), %xmm0 - movlpd %xmm0, 6(%edx) -# ifdef USE_AS_STPCPY - lea 13(%edx), %eax -# else - movl %edi, %eax -# endif + movlpd %xmm1, 6(%edx) + SAVE_RESULT (13) # ifdef USE_AS_STRNCPY sub $14, %ebx lea 14(%edx), %ecx @@ -2717,14 +2617,10 @@ L(Exit14): .p2align 4 L(Exit15): movlpd (%ecx), %xmm0 + movlpd 7(%ecx), %xmm1 movlpd %xmm0, (%edx) - movlpd 7(%ecx), %xmm0 - movlpd %xmm0, 7(%edx) -# ifdef USE_AS_STPCPY - lea 14(%edx), %eax -# else - movl %edi, %eax -# endif + movlpd %xmm1, 7(%edx) + SAVE_RESULT (14) # ifdef USE_AS_STRNCPY sub $15, %ebx lea 15(%edx), %ecx @@ -2853,7 +2749,7 @@ L(FillFrom1To16Bytes): jl L(Fill1) je L(Fill2) jg L(Fill3) -L(FillMore8): /* but less than 16 */ +L(FillMore8): /* but less than 16 */ cmp $12, %ebx je L(Fill12) jl L(FillLess12) @@ -2861,18 +2757,18 @@ L(FillMore8): /* but less than 16 */ jl L(Fill13) je L(Fill14) jg L(Fill15) -L(FillMore4): /* but less than 8 */ +L(FillMore4): /* but less than 8 */ cmp $6, %ebx jl L(Fill5) je L(Fill6) jg L(Fill7) -L(FillLess12): /* but more than 8 */ +L(FillLess12): /* but more than 8 */ cmp $10, %ebx jl L(Fill9) je L(Fill10) jmp L(Fill11) - CFI_PUSH (%edi) + CFI_PUSH(%edi) .p2align 4 L(StrncpyFillTailWithZero1): @@ -2929,11 +2825,7 @@ L(StrncpyFillLess32): L(ExitTail1): movb (%ecx), %al movb %al, (%edx) -# ifdef USE_AS_STPCPY - lea (%edx), %eax -# else - movl %edx, %eax -# endif + SAVE_RESULT_TAIL (0) # ifdef USE_AS_STRNCPY sub $1, %ebx lea 1(%edx), %ecx @@ -2949,11 +2841,7 @@ L(ExitTail1): L(ExitTail2): movw (%ecx), %ax movw %ax, (%edx) -# ifdef USE_AS_STPCPY - lea 1(%edx), %eax -# else - movl %edx, %eax -# endif + SAVE_RESULT_TAIL (1) # ifdef USE_AS_STRNCPY sub $2, %ebx lea 2(%edx), %ecx @@ -2971,11 +2859,7 @@ L(ExitTail3): movw %ax, (%edx) movb 2(%ecx), %al movb %al, 2(%edx) -# ifdef USE_AS_STPCPY - lea 2(%edx), %eax -# else - movl %edx, %eax -# endif + SAVE_RESULT_TAIL (2) # ifdef USE_AS_STRNCPY sub $3, %ebx lea 3(%edx), %ecx @@ -2991,11 +2875,7 @@ L(ExitTail3): L(ExitTail4): movl (%ecx), %eax movl %eax, (%edx) -# ifdef USE_AS_STPCPY - lea 3(%edx), %eax -# else - movl %edx, %eax -# endif + SAVE_RESULT_TAIL (3) # ifdef USE_AS_STRNCPY sub $4, %ebx lea 4(%edx), %ecx @@ -3013,11 +2893,7 @@ L(ExitTail5): movl %eax, (%edx) movb 4(%ecx), %al movb %al, 4(%edx) -# ifdef USE_AS_STPCPY - lea 4(%edx), %eax -# else - movl %edx, %eax -# endif + SAVE_RESULT_TAIL (4) # ifdef USE_AS_STRNCPY sub $5, %ebx lea 5(%edx), %ecx @@ -3035,11 +2911,7 @@ L(ExitTail6): movl %eax, (%edx) movw 4(%ecx), %ax movw %ax, 4(%edx) -# ifdef USE_AS_STPCPY - lea 5(%edx), %eax -# else - movl %edx, %eax -# endif + SAVE_RESULT_TAIL (5) # ifdef USE_AS_STRNCPY sub $6, %ebx lea 6(%edx), %ecx @@ -3057,11 +2929,7 @@ L(ExitTail7): movl %eax, (%edx) movl 3(%ecx), %eax movl %eax, 3(%edx) -# ifdef USE_AS_STPCPY - lea 6(%edx), %eax -# else - movl %edx, %eax -# endif + SAVE_RESULT_TAIL (6) # ifdef USE_AS_STRNCPY sub $7, %ebx lea 7(%edx), %ecx @@ -3077,33 +2945,21 @@ L(ExitTail7): L(ExitTail8): movlpd (%ecx), %xmm0 movlpd %xmm0, (%edx) -# ifdef USE_AS_STPCPY - lea 7(%edx), %eax -# else - movl %edx, %eax -# endif + SAVE_RESULT_TAIL (7) # ifdef USE_AS_STRNCPY sub $8, %ebx lea 8(%edx), %ecx jnz L(StrncpyFillTailWithZero) -# ifdef USE_AS_STPCPY - cmpb $1, (%eax) - sbb $-1, %eax -# endif # endif RETURN .p2align 4 L(ExitTail9): movlpd (%ecx), %xmm0 - movlpd %xmm0, (%edx) movb 8(%ecx), %al + movlpd %xmm0, (%edx) movb %al, 8(%edx) -# ifdef USE_AS_STPCPY - lea 8(%edx), %eax -# else - movl %edx, %eax -# endif + SAVE_RESULT_TAIL (8) # ifdef USE_AS_STRNCPY sub $9, %ebx lea 9(%edx), %ecx @@ -3118,14 +2974,10 @@ L(ExitTail9): .p2align 4 L(ExitTail10): movlpd (%ecx), %xmm0 - movlpd %xmm0, (%edx) movw 8(%ecx), %ax + movlpd %xmm0, (%edx) movw %ax, 8(%edx) -# ifdef USE_AS_STPCPY - lea 9(%edx), %eax -# else - movl %edx, %eax -# endif + SAVE_RESULT_TAIL (9) # ifdef USE_AS_STRNCPY sub $10, %ebx lea 10(%edx), %ecx @@ -3140,14 +2992,10 @@ L(ExitTail10): .p2align 4 L(ExitTail11): movlpd (%ecx), %xmm0 - movlpd %xmm0, (%edx) movl 7(%ecx), %eax + movlpd %xmm0, (%edx) movl %eax, 7(%edx) -# ifdef USE_AS_STPCPY - lea 10(%edx), %eax -# else - movl %edx, %eax -# endif + SAVE_RESULT_TAIL (10) # ifdef USE_AS_STRNCPY sub $11, %ebx lea 11(%edx), %ecx @@ -3162,14 +3010,10 @@ L(ExitTail11): .p2align 4 L(ExitTail12): movlpd (%ecx), %xmm0 - movlpd %xmm0, (%edx) movl 8(%ecx), %eax + movlpd %xmm0, (%edx) movl %eax, 8(%edx) -# ifdef USE_AS_STPCPY - lea 11(%edx), %eax -# else - movl %edx, %eax -# endif + SAVE_RESULT_TAIL (11) # ifdef USE_AS_STRNCPY sub $12, %ebx lea 12(%edx), %ecx @@ -3184,14 +3028,10 @@ L(ExitTail12): .p2align 4 L(ExitTail13): movlpd (%ecx), %xmm0 + movlpd 5(%ecx), %xmm1 movlpd %xmm0, (%edx) - movlpd 5(%ecx), %xmm0 - movlpd %xmm0, 5(%edx) -# ifdef USE_AS_STPCPY - lea 12(%edx), %eax -# else - movl %edx, %eax -# endif + movlpd %xmm1, 5(%edx) + SAVE_RESULT_TAIL (12) # ifdef USE_AS_STRNCPY sub $13, %ebx lea 13(%edx), %ecx @@ -3206,19 +3046,15 @@ L(ExitTail13): .p2align 4 L(ExitTail14): movlpd (%ecx), %xmm0 + movlpd 6(%ecx), %xmm1 movlpd %xmm0, (%edx) - movlpd 6(%ecx), %xmm0 - movlpd %xmm0, 6(%edx) -# ifdef USE_AS_STPCPY - lea 13(%edx), %eax -# else - movl %edx, %eax -# endif + movlpd %xmm1, 6(%edx) + SAVE_RESULT_TAIL (13) # ifdef USE_AS_STRNCPY sub $14, %ebx lea 14(%edx), %ecx jnz L(StrncpyFillTailWithZero) -# ifdef USE_AS_STPCPY +# ifdef USE_AS_STPCPY cmpb $1, (%eax) sbb $-1, %eax # endif @@ -3228,36 +3064,22 @@ L(ExitTail14): .p2align 4 L(ExitTail15): movlpd (%ecx), %xmm0 + movlpd 7(%ecx), %xmm1 movlpd %xmm0, (%edx) - movlpd 7(%ecx), %xmm0 - movlpd %xmm0, 7(%edx) -# ifdef USE_AS_STPCPY - lea 14(%edx), %eax -# else - movl %edx, %eax -# endif + movlpd %xmm1, 7(%edx) + SAVE_RESULT_TAIL (14) # ifdef USE_AS_STRNCPY sub $15, %ebx lea 15(%edx), %ecx jnz L(StrncpyFillTailWithZero) -# ifdef USE_AS_STPCPY - cmpb $1, (%eax) - sbb $-1, %eax -# endif # endif RETURN .p2align 4 L(ExitTail16): - movlpd (%ecx), %xmm0 - movlpd %xmm0, (%edx) - movlpd 8(%ecx), %xmm0 - movlpd %xmm0, 8(%edx) -# ifdef USE_AS_STPCPY - lea 15(%edx), %eax -# else - movl %edx, %eax -# endif + movdqu (%ecx), %xmm0 + movdqu %xmm0, (%edx) + SAVE_RESULT_TAIL (15) # ifdef USE_AS_STRNCPY sub $16, %ebx lea 16(%edx), %ecx @@ -3268,13 +3090,14 @@ L(ExitTail16): # endif # endif RETURN -#endif +# endif # ifdef USE_AS_STRNCPY # ifndef USE_AS_STRCAT - CFI_PUSH (%esi) - CFI_PUSH (%edi) + CFI_PUSH (%esi) + CFI_PUSH (%edi) # endif + .p2align 4 L(StrncpyLeaveCase2OrCase3): test %eax, %eax jnz L(Aligned64LeaveCase2) @@ -3327,153 +3150,153 @@ L(Aligned64LeaveCase2): lea 16(%esi), %esi lea -16(%ebx), %ebx jmp L(CopyFrom1To16BytesCase2) -/* -------------------------------------------------- */ + +/*--------------------------------------------------*/ + .p2align 4 L(StrncpyExit1Case2OrCase3): - movaps (%edx), %xmm6 - psrldq $15, %xmm6 + movlpd (%ecx), %xmm0 + movlpd 7(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 7(%edx) mov $15, %esi - palignr $1, %xmm1, %xmm6 - movaps %xmm6, (%edx) test %eax, %eax jnz L(CopyFrom1To16BytesCase2) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyExit2Case2OrCase3): - movaps (%edx), %xmm6 - psrldq $14, %xmm6 + movlpd (%ecx), %xmm0 + movlpd 6(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 6(%edx) mov $14, %esi - palignr $2, %xmm1, %xmm6 - movaps %xmm6, (%edx) test %eax, %eax jnz L(CopyFrom1To16BytesCase2) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyExit3Case2OrCase3): - movaps (%edx), %xmm6 - psrldq $13, %xmm6 + movlpd (%ecx), %xmm0 + movlpd 5(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 5(%edx) mov $13, %esi - palignr $3, %xmm1, %xmm6 - movaps %xmm6, (%edx) test %eax, %eax jnz L(CopyFrom1To16BytesCase2) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyExit4Case2OrCase3): - movaps (%edx), %xmm6 - psrldq $12, %xmm6 + movlpd (%ecx), %xmm0 + movl 8(%ecx), %esi + movlpd %xmm0, (%edx) + movl %esi, 8(%edx) mov $12, %esi - palignr $4, %xmm1, %xmm6 - movaps %xmm6, (%edx) test %eax, %eax jnz L(CopyFrom1To16BytesCase2) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyExit5Case2OrCase3): - movaps (%edx), %xmm6 - psrldq $11, %xmm6 + movlpd (%ecx), %xmm0 + movl 7(%ecx), %esi + movlpd %xmm0, (%edx) + movl %esi, 7(%edx) mov $11, %esi - palignr $5, %xmm1, %xmm6 - movaps %xmm6, (%edx) test %eax, %eax jnz L(CopyFrom1To16BytesCase2) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyExit6Case2OrCase3): - movaps (%edx), %xmm6 - psrldq $10, %xmm6 + movlpd (%ecx), %xmm0 + movl 6(%ecx), %esi + movlpd %xmm0, (%edx) + movl %esi, 6(%edx) mov $10, %esi - palignr $6, %xmm1, %xmm6 - movaps %xmm6, (%edx) test %eax, %eax jnz L(CopyFrom1To16BytesCase2) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyExit7Case2OrCase3): - movaps (%edx), %xmm6 - psrldq $9, %xmm6 + movlpd (%ecx), %xmm0 + movl 5(%ecx), %esi + movlpd %xmm0, (%edx) + movl %esi, 5(%edx) mov $9, %esi - palignr $7, %xmm1, %xmm6 - movaps %xmm6, (%edx) test %eax, %eax jnz L(CopyFrom1To16BytesCase2) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyExit8Case2OrCase3): - movaps (%edx), %xmm6 - psrldq $8, %xmm6 + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) mov $8, %esi - palignr $8, %xmm1, %xmm6 - movaps %xmm6, (%edx) test %eax, %eax jnz L(CopyFrom1To16BytesCase2) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyExit9Case2OrCase3): - movaps (%edx), %xmm6 - psrldq $7, %xmm6 + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) mov $7, %esi - palignr $9, %xmm1, %xmm6 - movaps %xmm6, (%edx) test %eax, %eax jnz L(CopyFrom1To16BytesCase2) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyExit10Case2OrCase3): - movaps (%edx), %xmm6 - psrldq $6, %xmm6 + movlpd -1(%ecx), %xmm0 + movlpd %xmm0, -1(%edx) mov $6, %esi - palignr $10, %xmm1, %xmm6 - movaps %xmm6, (%edx) test %eax, %eax jnz L(CopyFrom1To16BytesCase2) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyExit11Case2OrCase3): - movaps (%edx), %xmm6 - psrldq $5, %xmm6 + movlpd -2(%ecx), %xmm0 + movlpd %xmm0, -2(%edx) mov $5, %esi - palignr $11, %xmm1, %xmm6 - movaps %xmm6, (%edx) test %eax, %eax jnz L(CopyFrom1To16BytesCase2) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyExit12Case2OrCase3): - movaps (%edx), %xmm6 - psrldq $4, %xmm6 + movl (%ecx), %esi + movl %esi, (%edx) mov $4, %esi - palignr $12, %xmm1, %xmm6 - movaps %xmm6, (%edx) test %eax, %eax jnz L(CopyFrom1To16BytesCase2) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyExit13Case2OrCase3): - movaps (%edx), %xmm6 - psrldq $3, %xmm6 + movl -1(%ecx), %esi + movl %esi, -1(%edx) mov $3, %esi - palignr $13, %xmm1, %xmm6 - movaps %xmm6, (%edx) test %eax, %eax jnz L(CopyFrom1To16BytesCase2) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyExit14Case2OrCase3): - movaps (%edx), %xmm6 - psrldq $2, %xmm6 + movl -2(%ecx), %esi + movl %esi, -2(%edx) mov $2, %esi - palignr $14, %xmm1, %xmm6 - movaps %xmm6, (%edx) test %eax, %eax jnz L(CopyFrom1To16BytesCase2) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyExit15Case2OrCase3): - movaps (%edx), %xmm6 - psrldq $1, %xmm6 + movl -3(%ecx), %esi + movl %esi, -3(%edx) mov $1, %esi - palignr $15, %xmm1, %xmm6 - movaps %xmm6, (%edx) test %eax, %eax jnz L(CopyFrom1To16BytesCase2) jmp L(CopyFrom1To16BytesCase3) @@ -3483,36 +3306,29 @@ L(StrncpyLeave1): add $48, %ebx jle L(StrncpyExit1) palignr $1, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 31(%ecx), %xmm2 lea 16(%esi), %esi - movaps %xmm2, %xmm3 sub $16, %ebx jbe L(StrncpyExit1) - palignr $1, %xmm1, %xmm2 + palignr $1, %xmm3, %xmm2 movaps %xmm2, 16(%edx) - movaps 31+16(%ecx), %xmm2 - movaps %xmm3, %xmm1 lea 16(%esi), %esi sub $16, %ebx jbe L(StrncpyExit1) - movaps %xmm2, %xmm1 movaps %xmm4, 32(%edx) lea 16(%esi), %esi sub $16, %ebx jbe L(StrncpyExit1) - movaps %xmm7, %xmm1 movaps %xmm5, 48(%edx) lea 16(%esi), %esi lea -16(%ebx), %ebx - L(StrncpyExit1): - movaps (%edx, %esi), %xmm6 - psrldq $15, %xmm6 - palignr $1, %xmm1, %xmm6 - movaps %xmm6, (%edx, %esi) - lea 15(%esi), %esi + lea 15(%edx, %esi), %edx + lea 15(%ecx, %esi), %ecx + movdqu -16(%ecx), %xmm0 + xor %esi, %esi + movdqu %xmm0, -16(%edx) jmp L(CopyFrom1To16BytesCase3) L(StrncpyLeave2): @@ -3520,36 +3336,29 @@ L(StrncpyLeave2): add $48, %ebx jle L(StrncpyExit2) palignr $2, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 30(%ecx), %xmm2 lea 16(%esi), %esi - movaps %xmm2, %xmm3 sub $16, %ebx jbe L(StrncpyExit2) - palignr $2, %xmm1, %xmm2 + palignr $2, %xmm3, %xmm2 movaps %xmm2, 16(%edx) - movaps 30+16(%ecx), %xmm2 - movaps %xmm3, %xmm1 lea 16(%esi), %esi sub $16, %ebx jbe L(StrncpyExit2) - movaps %xmm2, %xmm1 movaps %xmm4, 32(%edx) lea 16(%esi), %esi sub $16, %ebx jbe L(StrncpyExit2) - movaps %xmm7, %xmm1 movaps %xmm5, 48(%edx) lea 16(%esi), %esi lea -16(%ebx), %ebx - L(StrncpyExit2): - movaps (%edx, %esi), %xmm6 - psrldq $14, %xmm6 - palignr $2, %xmm1, %xmm6 - movaps %xmm6, (%edx, %esi) - lea 14(%esi), %esi + lea 14(%edx, %esi), %edx + lea 14(%ecx, %esi), %ecx + movdqu -16(%ecx), %xmm0 + xor %esi, %esi + movdqu %xmm0, -16(%edx) jmp L(CopyFrom1To16BytesCase3) L(StrncpyLeave3): @@ -3557,36 +3366,29 @@ L(StrncpyLeave3): add $48, %ebx jle L(StrncpyExit3) palignr $3, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 29(%ecx), %xmm2 lea 16(%esi), %esi - movaps %xmm2, %xmm3 sub $16, %ebx jbe L(StrncpyExit3) - palignr $3, %xmm1, %xmm2 + palignr $3, %xmm3, %xmm2 movaps %xmm2, 16(%edx) - movaps 29+16(%ecx), %xmm2 - movaps %xmm3, %xmm1 lea 16(%esi), %esi sub $16, %ebx jbe L(StrncpyExit3) - movaps %xmm2, %xmm1 movaps %xmm4, 32(%edx) lea 16(%esi), %esi sub $16, %ebx jbe L(StrncpyExit3) - movaps %xmm7, %xmm1 movaps %xmm5, 48(%edx) lea 16(%esi), %esi lea -16(%ebx), %ebx - L(StrncpyExit3): - movaps (%edx, %esi), %xmm6 - psrldq $13, %xmm6 - palignr $3, %xmm1, %xmm6 - movaps %xmm6, (%edx, %esi) - lea 13(%esi), %esi + lea 13(%edx, %esi), %edx + lea 13(%ecx, %esi), %ecx + movdqu -16(%ecx), %xmm0 + xor %esi, %esi + movdqu %xmm0, -16(%edx) jmp L(CopyFrom1To16BytesCase3) L(StrncpyLeave4): @@ -3594,36 +3396,31 @@ L(StrncpyLeave4): add $48, %ebx jle L(StrncpyExit4) palignr $4, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 28(%ecx), %xmm2 lea 16(%esi), %esi - movaps %xmm2, %xmm3 sub $16, %ebx jbe L(StrncpyExit4) - palignr $4, %xmm1, %xmm2 + palignr $4, %xmm3, %xmm2 movaps %xmm2, 16(%edx) - movaps 28+16(%ecx), %xmm2 - movaps %xmm3, %xmm1 lea 16(%esi), %esi sub $16, %ebx jbe L(StrncpyExit4) - movaps %xmm2, %xmm1 movaps %xmm4, 32(%edx) lea 16(%esi), %esi sub $16, %ebx jbe L(StrncpyExit4) - movaps %xmm7, %xmm1 movaps %xmm5, 48(%edx) lea 16(%esi), %esi lea -16(%ebx), %ebx - L(StrncpyExit4): - movaps (%edx, %esi), %xmm6 - psrldq $12, %xmm6 - palignr $4, %xmm1, %xmm6 - movaps %xmm6, (%edx, %esi) - lea 12(%esi), %esi + lea 12(%edx, %esi), %edx + lea 12(%ecx, %esi), %ecx + movlpd -12(%ecx), %xmm0 + movl -4(%ecx), %eax + movlpd %xmm0, -12(%edx) + movl %eax, -4(%edx) + xor %esi, %esi jmp L(CopyFrom1To16BytesCase3) L(StrncpyLeave5): @@ -3631,36 +3428,31 @@ L(StrncpyLeave5): add $48, %ebx jle L(StrncpyExit5) palignr $5, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 27(%ecx), %xmm2 lea 16(%esi), %esi - movaps %xmm2, %xmm3 sub $16, %ebx jbe L(StrncpyExit5) - palignr $5, %xmm1, %xmm2 + palignr $5, %xmm3, %xmm2 movaps %xmm2, 16(%edx) - movaps 27+16(%ecx), %xmm2 - movaps %xmm3, %xmm1 lea 16(%esi), %esi sub $16, %ebx jbe L(StrncpyExit5) - movaps %xmm2, %xmm1 movaps %xmm4, 32(%edx) lea 16(%esi), %esi sub $16, %ebx jbe L(StrncpyExit5) - movaps %xmm7, %xmm1 movaps %xmm5, 48(%edx) lea 16(%esi), %esi lea -16(%ebx), %ebx - L(StrncpyExit5): - movaps (%edx, %esi), %xmm6 - psrldq $11, %xmm6 - palignr $5, %xmm1, %xmm6 - movaps %xmm6, (%edx, %esi) - lea 11(%esi), %esi + lea 11(%edx, %esi), %edx + lea 11(%ecx, %esi), %ecx + movlpd -11(%ecx), %xmm0 + movl -4(%ecx), %eax + movlpd %xmm0, -11(%edx) + movl %eax, -4(%edx) + xor %esi, %esi jmp L(CopyFrom1To16BytesCase3) L(StrncpyLeave6): @@ -3668,36 +3460,32 @@ L(StrncpyLeave6): add $48, %ebx jle L(StrncpyExit6) palignr $6, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 26(%ecx), %xmm2 lea 16(%esi), %esi - movaps %xmm2, %xmm3 sub $16, %ebx jbe L(StrncpyExit6) - palignr $6, %xmm1, %xmm2 + palignr $6, %xmm3, %xmm2 movaps %xmm2, 16(%edx) - movaps 26+16(%ecx), %xmm2 - movaps %xmm3, %xmm1 lea 16(%esi), %esi sub $16, %ebx jbe L(StrncpyExit6) - movaps %xmm2, %xmm1 movaps %xmm4, 32(%edx) lea 16(%esi), %esi sub $16, %ebx jbe L(StrncpyExit6) - movaps %xmm7, %xmm1 movaps %xmm5, 48(%edx) lea 16(%esi), %esi lea -16(%ebx), %ebx - L(StrncpyExit6): - movaps (%edx, %esi), %xmm6 - psrldq $10, %xmm6 - palignr $6, %xmm1, %xmm6 - movaps %xmm6, (%edx, %esi) - lea 10(%esi), %esi + lea 10(%edx, %esi), %edx + lea 10(%ecx, %esi), %ecx + + movlpd -10(%ecx), %xmm0 + movw -2(%ecx), %ax + movlpd %xmm0, -10(%edx) + movw %ax, -2(%edx) + xor %esi, %esi jmp L(CopyFrom1To16BytesCase3) L(StrncpyLeave7): @@ -3705,36 +3493,32 @@ L(StrncpyLeave7): add $48, %ebx jle L(StrncpyExit7) palignr $7, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 25(%ecx), %xmm2 lea 16(%esi), %esi - movaps %xmm2, %xmm3 sub $16, %ebx jbe L(StrncpyExit7) - palignr $7, %xmm1, %xmm2 + palignr $7, %xmm3, %xmm2 movaps %xmm2, 16(%edx) - movaps 25+16(%ecx), %xmm2 - movaps %xmm3, %xmm1 lea 16(%esi), %esi sub $16, %ebx jbe L(StrncpyExit7) - movaps %xmm2, %xmm1 movaps %xmm4, 32(%edx) lea 16(%esi), %esi sub $16, %ebx jbe L(StrncpyExit7) - movaps %xmm7, %xmm1 movaps %xmm5, 48(%edx) lea 16(%esi), %esi lea -16(%ebx), %ebx - L(StrncpyExit7): - movaps (%edx, %esi), %xmm6 - psrldq $9, %xmm6 - palignr $7, %xmm1, %xmm6 - movaps %xmm6, (%edx, %esi) - lea 9(%esi), %esi + lea 9(%edx, %esi), %edx + lea 9(%ecx, %esi), %ecx + + movlpd -9(%ecx), %xmm0 + movb -1(%ecx), %ah + movlpd %xmm0, -9(%edx) + movb %ah, -1(%edx) + xor %esi, %esi jmp L(CopyFrom1To16BytesCase3) L(StrncpyLeave8): @@ -3742,36 +3526,29 @@ L(StrncpyLeave8): add $48, %ebx jle L(StrncpyExit8) palignr $8, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 24(%ecx), %xmm2 lea 16(%esi), %esi - movaps %xmm2, %xmm3 sub $16, %ebx jbe L(StrncpyExit8) - palignr $8, %xmm1, %xmm2 + palignr $8, %xmm3, %xmm2 movaps %xmm2, 16(%edx) - movaps 24+16(%ecx), %xmm2 - movaps %xmm3, %xmm1 lea 16(%esi), %esi sub $16, %ebx jbe L(StrncpyExit8) - movaps %xmm2, %xmm1 movaps %xmm4, 32(%edx) lea 16(%esi), %esi sub $16, %ebx jbe L(StrncpyExit8) - movaps %xmm7, %xmm1 movaps %xmm5, 48(%edx) lea 16(%esi), %esi lea -16(%ebx), %ebx - L(StrncpyExit8): - movaps (%edx, %esi), %xmm6 - psrldq $8, %xmm6 - palignr $8, %xmm1, %xmm6 - movaps %xmm6, (%edx, %esi) - lea 8(%esi), %esi + lea 8(%edx, %esi), %edx + lea 8(%ecx, %esi), %ecx + movlpd -8(%ecx), %xmm0 + movlpd %xmm0, -8(%edx) + xor %esi, %esi jmp L(CopyFrom1To16BytesCase3) L(StrncpyLeave9): @@ -3779,36 +3556,30 @@ L(StrncpyLeave9): add $48, %ebx jle L(StrncpyExit9) palignr $9, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 23(%ecx), %xmm2 lea 16(%esi), %esi - movaps %xmm2, %xmm3 sub $16, %ebx jbe L(StrncpyExit9) - palignr $9, %xmm1, %xmm2 + palignr $9, %xmm3, %xmm2 movaps %xmm2, 16(%edx) - movaps 23+16(%ecx), %xmm2 - movaps %xmm3, %xmm1 lea 16(%esi), %esi sub $16, %ebx jbe L(StrncpyExit9) - movaps %xmm2, %xmm1 movaps %xmm4, 32(%edx) lea 16(%esi), %esi sub $16, %ebx jbe L(StrncpyExit9) - movaps %xmm7, %xmm1 movaps %xmm5, 48(%edx) lea 16(%esi), %esi lea -16(%ebx), %ebx - L(StrncpyExit9): - movaps (%edx, %esi), %xmm6 - psrldq $7, %xmm6 - palignr $9, %xmm1, %xmm6 - movaps %xmm6, (%edx, %esi) - lea 7(%esi), %esi + lea 7(%edx, %esi), %edx + lea 7(%ecx, %esi), %ecx + + movlpd -8(%ecx), %xmm0 + movlpd %xmm0, -8(%edx) + xor %esi, %esi jmp L(CopyFrom1To16BytesCase3) L(StrncpyLeave10): @@ -3816,36 +3587,30 @@ L(StrncpyLeave10): add $48, %ebx jle L(StrncpyExit10) palignr $10, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 22(%ecx), %xmm2 lea 16(%esi), %esi - movaps %xmm2, %xmm3 sub $16, %ebx jbe L(StrncpyExit10) - palignr $10, %xmm1, %xmm2 + palignr $10, %xmm3, %xmm2 movaps %xmm2, 16(%edx) - movaps 22+16(%ecx), %xmm2 - movaps %xmm3, %xmm1 lea 16(%esi), %esi sub $16, %ebx jbe L(StrncpyExit10) - movaps %xmm2, %xmm1 movaps %xmm4, 32(%edx) lea 16(%esi), %esi sub $16, %ebx jbe L(StrncpyExit10) - movaps %xmm7, %xmm1 movaps %xmm5, 48(%edx) lea 16(%esi), %esi lea -16(%ebx), %ebx - L(StrncpyExit10): - movaps (%edx, %esi), %xmm6 - psrldq $6, %xmm6 - palignr $10, %xmm1, %xmm6 - movaps %xmm6, (%edx, %esi) - lea 6(%esi), %esi + lea 6(%edx, %esi), %edx + lea 6(%ecx, %esi), %ecx + + movlpd -8(%ecx), %xmm0 + movlpd %xmm0, -8(%edx) + xor %esi, %esi jmp L(CopyFrom1To16BytesCase3) L(StrncpyLeave11): @@ -3853,36 +3618,31 @@ L(StrncpyLeave11): add $48, %ebx jle L(StrncpyExit11) palignr $11, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 21(%ecx), %xmm2 lea 16(%esi), %esi - movaps %xmm2, %xmm3 sub $16, %ebx jbe L(StrncpyExit11) - palignr $11, %xmm1, %xmm2 + palignr $11, %xmm3, %xmm2 movaps %xmm2, 16(%edx) - movaps 21+16(%ecx), %xmm2 - movaps %xmm3, %xmm1 lea 16(%esi), %esi sub $16, %ebx jbe L(StrncpyExit11) - movaps %xmm2, %xmm1 movaps %xmm4, 32(%edx) lea 16(%esi), %esi sub $16, %ebx jbe L(StrncpyExit11) - movaps %xmm7, %xmm1 movaps %xmm5, 48(%edx) lea 16(%esi), %esi lea -16(%ebx), %ebx - L(StrncpyExit11): - movaps (%edx, %esi), %xmm6 - psrldq $5, %xmm6 - palignr $11, %xmm1, %xmm6 - movaps %xmm6, (%edx, %esi) - lea 5(%esi), %esi + lea 5(%edx, %esi), %edx + lea 5(%ecx, %esi), %ecx + movl -5(%ecx), %esi + movb -1(%ecx), %ah + movl %esi, -5(%edx) + movb %ah, -1(%edx) + xor %esi, %esi jmp L(CopyFrom1To16BytesCase3) L(StrncpyLeave12): @@ -3890,36 +3650,29 @@ L(StrncpyLeave12): add $48, %ebx jle L(StrncpyExit12) palignr $12, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 20(%ecx), %xmm2 lea 16(%esi), %esi - movaps %xmm2, %xmm3 sub $16, %ebx jbe L(StrncpyExit12) - palignr $12, %xmm1, %xmm2 + palignr $12, %xmm3, %xmm2 movaps %xmm2, 16(%edx) - movaps 20+16(%ecx), %xmm2 - movaps %xmm3, %xmm1 lea 16(%esi), %esi sub $16, %ebx jbe L(StrncpyExit12) - movaps %xmm2, %xmm1 movaps %xmm4, 32(%edx) lea 16(%esi), %esi sub $16, %ebx jbe L(StrncpyExit12) - movaps %xmm7, %xmm1 movaps %xmm5, 48(%edx) lea 16(%esi), %esi lea -16(%ebx), %ebx - L(StrncpyExit12): - movaps (%edx, %esi), %xmm6 - psrldq $4, %xmm6 - palignr $12, %xmm1, %xmm6 - movaps %xmm6, (%edx, %esi) - lea 4(%esi), %esi + lea 4(%edx, %esi), %edx + lea 4(%ecx, %esi), %ecx + movl -4(%ecx), %eax + movl %eax, -4(%edx) + xor %esi, %esi jmp L(CopyFrom1To16BytesCase3) L(StrncpyLeave13): @@ -3927,36 +3680,30 @@ L(StrncpyLeave13): add $48, %ebx jle L(StrncpyExit13) palignr $13, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 19(%ecx), %xmm2 lea 16(%esi), %esi - movaps %xmm2, %xmm3 sub $16, %ebx jbe L(StrncpyExit13) - palignr $13, %xmm1, %xmm2 + palignr $13, %xmm3, %xmm2 movaps %xmm2, 16(%edx) - movaps 19+16(%ecx), %xmm2 - movaps %xmm3, %xmm1 lea 16(%esi), %esi sub $16, %ebx jbe L(StrncpyExit13) - movaps %xmm2, %xmm1 movaps %xmm4, 32(%edx) lea 16(%esi), %esi sub $16, %ebx jbe L(StrncpyExit13) - movaps %xmm7, %xmm1 movaps %xmm5, 48(%edx) lea 16(%esi), %esi lea -16(%ebx), %ebx - L(StrncpyExit13): - movaps (%edx, %esi), %xmm6 - psrldq $3, %xmm6 - palignr $13, %xmm1, %xmm6 - movaps %xmm6, (%edx, %esi) - lea 3(%esi), %esi + lea 3(%edx, %esi), %edx + lea 3(%ecx, %esi), %ecx + + movl -4(%ecx), %eax + movl %eax, -4(%edx) + xor %esi, %esi jmp L(CopyFrom1To16BytesCase3) L(StrncpyLeave14): @@ -3964,36 +3711,29 @@ L(StrncpyLeave14): add $48, %ebx jle L(StrncpyExit14) palignr $14, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 18(%ecx), %xmm2 lea 16(%esi), %esi - movaps %xmm2, %xmm3 sub $16, %ebx jbe L(StrncpyExit14) - palignr $14, %xmm1, %xmm2 + palignr $14, %xmm3, %xmm2 movaps %xmm2, 16(%edx) - movaps 18+16(%ecx), %xmm2 - movaps %xmm3, %xmm1 lea 16(%esi), %esi sub $16, %ebx jbe L(StrncpyExit14) - movaps %xmm2, %xmm1 movaps %xmm4, 32(%edx) lea 16(%esi), %esi sub $16, %ebx jbe L(StrncpyExit14) - movaps %xmm7, %xmm1 movaps %xmm5, 48(%edx) lea 16(%esi), %esi lea -16(%ebx), %ebx - L(StrncpyExit14): - movaps (%edx, %esi), %xmm6 - psrldq $2, %xmm6 - palignr $14, %xmm1, %xmm6 - movaps %xmm6, (%edx, %esi) - lea 2(%esi), %esi + lea 2(%edx, %esi), %edx + lea 2(%ecx, %esi), %ecx + movw -2(%ecx), %ax + movw %ax, -2(%edx) + xor %esi, %esi jmp L(CopyFrom1To16BytesCase3) L(StrncpyLeave15): @@ -4001,43 +3741,36 @@ L(StrncpyLeave15): add $48, %ebx jle L(StrncpyExit15) palignr $15, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 17(%ecx), %xmm2 lea 16(%esi), %esi - movaps %xmm2, %xmm3 sub $16, %ebx jbe L(StrncpyExit15) - palignr $15, %xmm1, %xmm2 + palignr $15, %xmm3, %xmm2 movaps %xmm2, 16(%edx) - movaps 17+16(%ecx), %xmm2 - movaps %xmm3, %xmm1 lea 16(%esi), %esi sub $16, %ebx jbe L(StrncpyExit15) - movaps %xmm2, %xmm1 movaps %xmm4, 32(%edx) lea 16(%esi), %esi sub $16, %ebx jbe L(StrncpyExit15) - movaps %xmm7, %xmm1 movaps %xmm5, 48(%edx) lea 16(%esi), %esi lea -16(%ebx), %ebx - L(StrncpyExit15): - movaps (%edx, %esi), %xmm6 - psrldq $1, %xmm6 - palignr $15, %xmm1, %xmm6 - movaps %xmm6, (%edx, %esi) - lea 1(%esi), %esi + lea 1(%edx, %esi), %edx + lea 1(%ecx, %esi), %ecx + movb -1(%ecx), %ah + movb %ah, -1(%edx) + xor %esi, %esi jmp L(CopyFrom1To16BytesCase3) # endif # ifndef USE_AS_STRCAT # ifdef USE_AS_STRNCPY - CFI_POP (%esi) - CFI_POP (%edi) + CFI_POP (%esi) + CFI_POP (%edi) .p2align 4 L(ExitTail0): @@ -4046,20 +3779,14 @@ L(ExitTail0): .p2align 4 L(StrncpyExit15Bytes): - cmp $9, %ebx - je L(ExitTail9) + cmp $12, %ebx + jbe L(StrncpyExit12Bytes) cmpb $0, 8(%ecx) jz L(ExitTail9) - cmp $10, %ebx - je L(ExitTail10) cmpb $0, 9(%ecx) jz L(ExitTail10) - cmp $11, %ebx - je L(ExitTail11) cmpb $0, 10(%ecx) jz L(ExitTail11) - cmp $12, %ebx - je L(ExitTail12) cmpb $0, 11(%ecx) jz L(ExitTail12) cmp $13, %ebx @@ -4071,9 +3798,9 @@ L(StrncpyExit15Bytes): cmpb $0, 13(%ecx) jz L(ExitTail14) movlpd (%ecx), %xmm0 + movlpd 7(%ecx), %xmm1 movlpd %xmm0, (%edx) - movlpd 7(%ecx), %xmm0 - movlpd %xmm0, 7(%edx) + movlpd %xmm1, 7(%edx) # ifdef USE_AS_STPCPY lea 14(%edx), %eax cmpb $1, (%eax) @@ -4084,23 +3811,43 @@ L(StrncpyExit15Bytes): RETURN .p2align 4 +L(StrncpyExit12Bytes): + cmp $9, %ebx + je L(ExitTail9) + cmpb $0, 8(%ecx) + jz L(ExitTail9) + cmp $10, %ebx + je L(ExitTail10) + cmpb $0, 9(%ecx) + jz L(ExitTail10) + cmp $11, %ebx + je L(ExitTail11) + cmpb $0, 10(%ecx) + jz L(ExitTail11) + movlpd (%ecx), %xmm0 + movl 8(%ecx), %eax + movlpd %xmm0, (%edx) + movl %eax, 8(%edx) + SAVE_RESULT_TAIL (11) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif + RETURN + + .p2align 4 L(StrncpyExit8Bytes): - cmp $1, %ebx - je L(ExitTail1) + cmp $4, %ebx + jbe L(StrncpyExit4Bytes) cmpb $0, (%ecx) jz L(ExitTail1) - cmp $2, %ebx - je L(ExitTail2) cmpb $0, 1(%ecx) jz L(ExitTail2) - cmp $3, %ebx - je L(ExitTail3) cmpb $0, 2(%ecx) jz L(ExitTail3) - cmp $4, %ebx - je L(ExitTail4) cmpb $0, 3(%ecx) jz L(ExitTail4) + cmp $5, %ebx je L(ExitTail5) cmpb $0, 4(%ecx) @@ -4123,8 +3870,32 @@ L(StrncpyExit8Bytes): movl %edx, %eax # endif RETURN -# endif + .p2align 4 +L(StrncpyExit4Bytes): + test %ebx, %ebx + jz L(ExitTail0) + cmp $1, %ebx + je L(ExitTail1) + cmpb $0, (%ecx) + jz L(ExitTail1) + cmp $2, %ebx + je L(ExitTail2) + cmpb $0, 1(%ecx) + jz L(ExitTail2) + cmp $3, %ebx + je L(ExitTail3) + cmpb $0, 2(%ecx) + jz L(ExitTail3) + movl (%ecx), %eax + movl %eax, (%edx) + SAVE_RESULT_TAIL (3) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif + RETURN +# endif END (STRCPY) # endif diff --git a/libc/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S b/libc/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S index 84d92a8bd..abeea2226 100644 --- a/libc/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S +++ b/libc/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S @@ -54,7 +54,6 @@ ENTRY (__wcscpy_ssse3) PUSH (%edi) mov %edx, %edi - PUSH (%esi) lea 16(%ecx), %esi @@ -220,7 +219,6 @@ L(Shl4Start): jnz L(Shl4LoopExit) palignr $4, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 28(%ecx), %xmm2 @@ -228,15 +226,14 @@ L(Shl4Start): lea 16(%edx), %edx pmovmskb %xmm0, %eax lea 16(%ecx), %ecx - movaps %xmm2, %xmm3 + movaps %xmm2, %xmm1 test %eax, %eax jnz L(Shl4LoopExit) - palignr $4, %xmm1, %xmm2 + palignr $4, %xmm3, %xmm2 movaps %xmm2, (%edx) movaps 28(%ecx), %xmm2 - movaps %xmm3, %xmm1 pcmpeqd %xmm2, %xmm0 lea 16(%edx), %edx @@ -248,7 +245,6 @@ L(Shl4Start): jnz L(Shl4LoopExit) palignr $4, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 28(%ecx), %xmm2 @@ -256,13 +252,11 @@ L(Shl4Start): lea 16(%edx), %edx pmovmskb %xmm0, %eax lea 16(%ecx), %ecx - movaps %xmm2, %xmm3 test %eax, %eax jnz L(Shl4LoopExit) - palignr $4, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $4, %xmm3, %xmm2 movaps %xmm2, (%edx) lea 28(%ecx), %ecx lea 16(%edx), %edx @@ -305,14 +299,13 @@ L(Shl4LoopStart): jmp L(Shl4LoopStart) L(Shl4LoopExit): - movaps (%edx), %xmm6 - psrldq $12, %xmm6 - palignr $4, %xmm1, %xmm6 - movaps %xmm6, (%edx) + movlpd (%ecx), %xmm0 + movl 8(%ecx), %esi + movlpd %xmm0, (%edx) + movl %esi, 8(%edx) + POP (%esi) add $12, %edx add $12, %ecx - - POP (%esi) test %al, %al jz L(ExitHigh) test $0x01, %al @@ -337,7 +330,6 @@ L(Shl8Start): jnz L(Shl8LoopExit) palignr $8, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 24(%ecx), %xmm2 @@ -345,15 +337,14 @@ L(Shl8Start): lea 16(%edx), %edx pmovmskb %xmm0, %eax lea 16(%ecx), %ecx - movaps %xmm2, %xmm3 + movaps %xmm2, %xmm1 test %eax, %eax jnz L(Shl8LoopExit) - palignr $8, %xmm1, %xmm2 + palignr $8, %xmm3, %xmm2 movaps %xmm2, (%edx) movaps 24(%ecx), %xmm2 - movaps %xmm3, %xmm1 pcmpeqd %xmm2, %xmm0 lea 16(%edx), %edx @@ -365,7 +356,6 @@ L(Shl8Start): jnz L(Shl8LoopExit) palignr $8, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 24(%ecx), %xmm2 @@ -373,13 +363,11 @@ L(Shl8Start): lea 16(%edx), %edx pmovmskb %xmm0, %eax lea 16(%ecx), %ecx - movaps %xmm2, %xmm3 test %eax, %eax jnz L(Shl8LoopExit) - palignr $8, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $8, %xmm3, %xmm2 movaps %xmm2, (%edx) lea 24(%ecx), %ecx lea 16(%edx), %edx @@ -422,14 +410,11 @@ L(Shl8LoopStart): jmp L(Shl8LoopStart) L(Shl8LoopExit): - movaps (%edx), %xmm6 - psrldq $8, %xmm6 - palignr $8, %xmm1, %xmm6 - movaps %xmm6, (%edx) + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + POP (%esi) add $8, %edx add $8, %ecx - - POP (%esi) test %al, %al jz L(ExitHigh) test $0x01, %al @@ -454,7 +439,6 @@ L(Shl12Start): jnz L(Shl12LoopExit) palignr $12, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 20(%ecx), %xmm2 @@ -462,15 +446,14 @@ L(Shl12Start): lea 16(%edx), %edx pmovmskb %xmm0, %eax lea 16(%ecx), %ecx - movaps %xmm2, %xmm3 + movaps %xmm2, %xmm1 test %eax, %eax jnz L(Shl12LoopExit) - palignr $12, %xmm1, %xmm2 + palignr $12, %xmm3, %xmm2 movaps %xmm2, (%edx) movaps 20(%ecx), %xmm2 - movaps %xmm3, %xmm1 pcmpeqd %xmm2, %xmm0 lea 16(%edx), %edx @@ -482,7 +465,6 @@ L(Shl12Start): jnz L(Shl12LoopExit) palignr $12, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 20(%ecx), %xmm2 @@ -490,13 +472,11 @@ L(Shl12Start): lea 16(%edx), %edx pmovmskb %xmm0, %eax lea 16(%ecx), %ecx - movaps %xmm2, %xmm3 test %eax, %eax jnz L(Shl12LoopExit) - palignr $12, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $12, %xmm3, %xmm2 movaps %xmm2, (%edx) lea 20(%ecx), %ecx lea 16(%edx), %edx @@ -539,11 +519,9 @@ L(Shl12LoopStart): jmp L(Shl12LoopStart) L(Shl12LoopExit): - movaps (%edx), %xmm6 - psrldq $4, %xmm6 + movl (%ecx), %esi + movl %esi, (%edx) mov $4, %esi - palignr $12, %xmm1, %xmm6 - movaps %xmm6, (%edx) .p2align 4 L(CopyFrom1To16Bytes): @@ -555,6 +533,7 @@ L(CopyFrom1To16Bytes): jz L(ExitHigh) test $0x01, %al jnz L(Exit4) +L(Exit8): movlpd (%ecx), %xmm0 movlpd %xmm0, (%edx) movl %edi, %eax @@ -564,6 +543,7 @@ L(CopyFrom1To16Bytes): L(ExitHigh): test $0x01, %ah jnz L(Exit12) +L(Exit16): movdqu (%ecx), %xmm0 movdqu %xmm0, (%edx) movl %edi, %eax diff --git a/libc/sysdeps/ia64/bits/byteswap.h b/libc/sysdeps/ia64/bits/byteswap.h index d64914f36..29d0e37d1 100644 --- a/libc/sysdeps/ia64/bits/byteswap.h +++ b/libc/sysdeps/ia64/bits/byteswap.h @@ -1,5 +1,6 @@ /* Macros to swap the order of bytes in integer values. - Copyright (C) 1997,1998,2000,2002,2003,2008 Free Software Foundation, Inc. + Copyright (C) 1997,1998,2000,2002,2003,2008,2011 + Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -77,17 +78,17 @@ __bswap_32 (unsigned int __bsx) /* Swap bytes in 64 bit value. */ -#define __bswap_constant_64(x) \ - ((((x) & 0xff00000000000000ul) >> 56) \ - | (((x) & 0x00ff000000000000ul) >> 40) \ - | (((x) & 0x0000ff0000000000ul) >> 24) \ - | (((x) & 0x000000ff00000000ul) >> 8) \ - | (((x) & 0x00000000ff000000ul) << 8) \ - | (((x) & 0x0000000000ff0000ul) << 24) \ - | (((x) & 0x000000000000ff00ul) << 40) \ - | (((x) & 0x00000000000000fful) << 56)) - #if defined __GNUC__ && __GNUC__ >= 2 +# define __bswap_constant_64(x) \ + (__extension__ ((((x) & 0xff00000000000000ul) >> 56) \ + | (((x) & 0x00ff000000000000ul) >> 40) \ + | (((x) & 0x0000ff0000000000ul) >> 24) \ + | (((x) & 0x000000ff00000000ul) >> 8) \ + | (((x) & 0x00000000ff000000ul) << 8) \ + | (((x) & 0x0000000000ff0000ul) << 24) \ + | (((x) & 0x000000000000ff00ul) << 40) \ + | (((x) & 0x00000000000000fful) << 56))) + # define __bswap_64(x) \ (__extension__ \ ({ register unsigned long int __v, __x = (x); \ @@ -97,9 +98,19 @@ __bswap_32 (unsigned int __bsx) __asm__ __volatile__ ("mux1 %0 = %1, @rev ;;" \ : "=r" (__v) \ : "r" ((unsigned long int) (__x))); \ - __v; })) + __v; })) #else +# define __bswap_constant_64(x) \ + ((((x) & 0xff00000000000000ul) >> 56) \ + | (((x) & 0x00ff000000000000ul) >> 40) \ + | (((x) & 0x0000ff0000000000ul) >> 24) \ + | (((x) & 0x000000ff00000000ul) >> 8) \ + | (((x) & 0x00000000ff000000ul) << 8) \ + | (((x) & 0x0000000000ff0000ul) << 24) \ + | (((x) & 0x000000000000ff00ul) << 40) \ + | (((x) & 0x00000000000000fful) << 56)) + static __inline unsigned long int __bswap_64 (unsigned long int __bsx) { diff --git a/libc/sysdeps/s390/bits/byteswap.h b/libc/sysdeps/s390/bits/byteswap.h index 4bfd5fa06..0e0346bba 100644 --- a/libc/sysdeps/s390/bits/byteswap.h +++ b/libc/sysdeps/s390/bits/byteswap.h @@ -1,5 +1,5 @@ /* Macros to swap the order of bytes in integer values. s390 version. - Copyright (C) 2000, 2001, 2002, 2003, 2008 Free Software Foundation, Inc. + Copyright (C) 2000-2003, 2008, 2011 Free Software Foundation, Inc. Contributed by Martin Schwidefsky (schwidefsky@de.ibm.com). This file is part of the GNU C Library. @@ -35,31 +35,31 @@ # if __WORDSIZE == 64 # define __bswap_16(x) \ (__extension__ \ - ({ unsigned short int __v, __x = (x); \ + ({ unsigned short int __v, __x = (x); \ if (__builtin_constant_p (x)) \ __v = __bswap_constant_16 (__x); \ else { \ - unsigned short int __tmp = (unsigned short int) (__x); \ - __asm__ __volatile__ ( \ - "lrvh %0,%1" \ - : "=&d" (__v) : "m" (__tmp) ); \ - } \ + unsigned short int __tmp = (unsigned short int) (__x); \ + __asm__ __volatile__ ( \ + "lrvh %0,%1" \ + : "=&d" (__v) : "m" (__tmp) ); \ + } \ __v; })) # else # define __bswap_16(x) \ (__extension__ \ - ({ unsigned short int __v, __x = (x); \ + ({ unsigned short int __v, __x = (x); \ if (__builtin_constant_p (x)) \ __v = __bswap_constant_16 (__x); \ else { \ - unsigned short int __tmp = (unsigned short int) (__x); \ - __asm__ __volatile__ ( \ - "sr %0,%0\n" \ - "la 1,%1\n" \ - "icm %0,2,1(1)\n" \ - "ic %0,0(1)" \ - : "=&d" (__v) : "m" (__tmp) : "1"); \ - } \ + unsigned short int __tmp = (unsigned short int) (__x); \ + __asm__ __volatile__ ( \ + "sr %0,%0\n" \ + "la 1,%1\n" \ + "icm %0,2,1(1)\n" \ + "ic %0,0(1)" \ + : "=&d" (__v) : "m" (__tmp) : "1"); \ + } \ __v; })) # endif #else @@ -80,32 +80,32 @@ __bswap_16 (unsigned short int __bsx) # if __WORDSIZE == 64 # define __bswap_32(x) \ (__extension__ \ - ({ unsigned int __v, __x = (x); \ + ({ unsigned int __v, __x = (x); \ if (__builtin_constant_p (x)) \ __v = __bswap_constant_32 (__x); \ else { \ - unsigned int __tmp = (unsigned int) (__x); \ - __asm__ __volatile__ ( \ - "lrv %0,%1" \ - : "=&d" (__v) : "m" (__tmp)); \ - } \ + unsigned int __tmp = (unsigned int) (__x); \ + __asm__ __volatile__ ( \ + "lrv %0,%1" \ + : "=&d" (__v) : "m" (__tmp)); \ + } \ __v; })) # else # define __bswap_32(x) \ (__extension__ \ - ({ unsigned int __v, __x = (x); \ + ({ unsigned int __v, __x = (x); \ if (__builtin_constant_p (x)) \ __v = __bswap_constant_32 (__x); \ else { \ - unsigned int __tmp = (unsigned int) (__x); \ - __asm__ __volatile__ ( \ - "la 1,%1\n" \ - "icm %0,8,3(1)\n" \ - "icm %0,4,2(1)\n" \ - "icm %0,2,1(1)\n" \ - "ic %0,0(1)" \ - : "=&d" (__v) : "m" (__tmp) : "1"); \ - } \ + unsigned int __tmp = (unsigned int) (__x); \ + __asm__ __volatile__ ( \ + "la 1,%1\n" \ + "icm %0,8,3(1)\n" \ + "icm %0,4,2(1)\n" \ + "icm %0,2,1(1)\n" \ + "ic %0,0(1)" \ + : "=&d" (__v) : "m" (__tmp) : "1"); \ + } \ __v; })) # endif #else @@ -117,37 +117,51 @@ __bswap_32 (unsigned int __bsx) #endif /* Swap bytes in 64 bit value. */ -#define __bswap_constant_64(x) \ - ((((x)&0xff00000000000000) >> 56) | (((x)&0x00ff000000000000) >> 40) | \ - (((x)&0x0000ff0000000000) >> 24) | (((x)&0x000000ff00000000) >> 8) | \ - (((x)&0x00000000ff000000) << 8) | (((x)&0x0000000000ff0000) << 24) | \ - (((x)&0x000000000000ff00) << 40) | (((x)&0x00000000000000ff) << 56)) - #if defined __GNUC__ && __GNUC__ >= 2 +# define __bswap_constant_64(x) \ + (__extension__ ((((x) & 0xff00000000000000ul) >> 56) \ + | (((x) & 0x00ff000000000000ul) >> 40) \ + | (((x) & 0x0000ff0000000000ul) >> 24) \ + | (((x) & 0x000000ff00000000ul) >> 8) \ + | (((x) & 0x00000000ff000000ul) << 8) \ + | (((x) & 0x0000000000ff0000ul) << 24) \ + | (((x) & 0x000000000000ff00ul) << 40) \ + | (((x) & 0x00000000000000fful) << 56))) + # if __WORDSIZE == 64 # define __bswap_64(x) \ (__extension__ \ - ({ unsigned long __w, __x = (x); \ + ({ unsigned long __w, __x = (x); \ if (__builtin_constant_p (x)) \ __w = __bswap_constant_64 (__x); \ else { \ - unsigned long __tmp = (unsigned long) (__x); \ - __asm__ __volatile__ ( \ - "lrvg %0,%1" \ - : "=&d" (__w) : "m" (__tmp)); \ - } \ + unsigned long __tmp = (unsigned long) (__x); \ + __asm__ __volatile__ ( \ + "lrvg %0,%1" \ + : "=&d" (__w) : "m" (__tmp)); \ + } \ __w; })) # else # define __bswap_64(x) \ __extension__ \ ({ union { unsigned long long int __ll; \ - unsigned long int __l[2]; } __w, __r; \ - __w.__ll = (x); \ - __r.__l[0] = __bswap_32 (__w.__l[1]); \ - __r.__l[1] = __bswap_32 (__w.__l[0]); \ - __r.__ll; }) + unsigned long int __l[2]; } __w, __r; \ + __w.__ll = (x); \ + __r.__l[0] = __bswap_32 (__w.__l[1]); \ + __r.__l[1] = __bswap_32 (__w.__l[0]); \ + __r.__ll; }) # endif #else +# define __bswap_constant_64(x) \ + ((((x) & 0xff00000000000000ul) >> 56) \ + | (((x) & 0x00ff000000000000ul) >> 40) \ + | (((x) & 0x0000ff0000000000ul) >> 24) \ + | (((x) & 0x000000ff00000000ul) >> 8) \ + | (((x) & 0x00000000ff000000ul) << 8) \ + | (((x) & 0x0000000000ff0000ul) << 24) \ + | (((x) & 0x000000000000ff00ul) << 40) \ + | (((x) & 0x00000000000000fful) << 56)) + static __inline unsigned long long int __bswap_64 (unsigned long long int __bsx) { diff --git a/libc/sysdeps/x86_64/bits/byteswap.h b/libc/sysdeps/x86_64/bits/byteswap.h index e350fb806..c6db93c41 100644 --- a/libc/sysdeps/x86_64/bits/byteswap.h +++ b/libc/sysdeps/x86_64/bits/byteswap.h @@ -1,5 +1,5 @@ /* Macros to swap the order of bytes in integer values. - Copyright (C) 1997, 1998, 2000, 2002, 2003, 2007, 2008, 2010 + Copyright (C) 1997, 1998, 2000, 2002, 2003, 2007, 2008, 2010, 2011 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -99,14 +99,14 @@ #if defined __GNUC__ && __GNUC__ >= 2 /* Swap bytes in 64 bit value. */ # define __bswap_constant_64(x) \ - ((((x) & 0xff00000000000000ull) >> 56) \ - | (((x) & 0x00ff000000000000ull) >> 40) \ - | (((x) & 0x0000ff0000000000ull) >> 24) \ - | (((x) & 0x000000ff00000000ull) >> 8) \ - | (((x) & 0x00000000ff000000ull) << 8) \ - | (((x) & 0x0000000000ff0000ull) << 24) \ - | (((x) & 0x000000000000ff00ull) << 40) \ - | (((x) & 0x00000000000000ffull) << 56)) + (__extension__ ((((x) & 0xff00000000000000ull) >> 56) \ + | (((x) & 0x00ff000000000000ull) >> 40) \ + | (((x) & 0x0000ff0000000000ull) >> 24) \ + | (((x) & 0x000000ff00000000ull) >> 8) \ + | (((x) & 0x00000000ff000000ull) << 8) \ + | (((x) & 0x0000000000ff0000ull) << 24) \ + | (((x) & 0x000000000000ff00ull) << 40) \ + | (((x) & 0x00000000000000ffull) << 56))) # if __WORDSIZE == 64 # define __bswap_64(x) \ diff --git a/libc/sysdeps/x86_64/dl-machine.h b/libc/sysdeps/x86_64/dl-machine.h index 1068af6bb..a8fbc1625 100644 --- a/libc/sysdeps/x86_64/dl-machine.h +++ b/libc/sysdeps/x86_64/dl-machine.h @@ -98,7 +98,7 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile) /* The GOT entries for functions in the PLT have not yet been filled in. Their initial contents will arrange when called to push an offset into the .rel.plt section, push _GLOBAL_OFFSET_TABLE_[1], - and then jump to _GLOBAL_OFFSET_TABLE[2]. */ + and then jump to _GLOBAL_OFFSET_TABLE_[2]. */ got = (Elf64_Addr *) D_PTR (l, l_info[DT_PLTGOT]); /* If a library is prelinked but we have to relocate anyway, we have to be able to undo the prelinking of .got.plt. @@ -214,7 +214,7 @@ _dl_start_user:\n\ /* The x86-64 never uses Elf64_Rel relocations. */ #define ELF_MACHINE_NO_REL 1 -/* We define an initialization functions. This is called very early in +/* We define an initialization function. This is called very early in _dl_sysdep_start. */ #define DL_PLATFORM_INIT dl_platform_init () @@ -234,8 +234,8 @@ elf_machine_fixup_plt (struct link_map *map, lookup_t t, return *reloc_addr = value; } -/* Return the final value of a plt relocation. On x86-64 the - JUMP_SLOT relocation ignores the addend. */ +/* Return the final value of a PLT relocation. On x86-64 the + JUMP_SLOT relocation ignores the addend. */ static inline Elf64_Addr elf_machine_plt_value (struct link_map *map, const Elf64_Rela *reloc, Elf64_Addr value) diff --git a/libc/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/libc/sysdeps/x86_64/multiarch/strcpy-ssse3.S index c4ec54cd2..b1047652d 100644 --- a/libc/sysdeps/x86_64/multiarch/strcpy-ssse3.S +++ b/libc/sysdeps/x86_64/multiarch/strcpy-ssse3.S @@ -29,6 +29,7 @@ .section .text.ssse3,"ax",@progbits ENTRY (STRCPY) + mov %rsi, %rcx # ifdef USE_AS_STRNCPY mov %rdx, %r8 @@ -39,7 +40,7 @@ ENTRY (STRCPY) jz L(Exit0) cmp $8, %r8 jbe L(StrncpyExit8Bytes) -# endif +# endif cmpb $0, (%rcx) jz L(Exit1) cmpb $0, 1(%rcx) @@ -56,10 +57,10 @@ ENTRY (STRCPY) jz L(Exit7) cmpb $0, 7(%rcx) jz L(Exit8) -# ifdef USE_AS_STRNCPY +# ifdef USE_AS_STRNCPY cmp $16, %r8 jb L(StrncpyExit15Bytes) -# endif +# endif cmpb $0, 8(%rcx) jz L(Exit9) cmpb $0, 9(%rcx) @@ -74,10 +75,10 @@ ENTRY (STRCPY) jz L(Exit14) cmpb $0, 14(%rcx) jz L(Exit15) -# ifdef USE_AS_STRNCPY +# ifdef USE_AS_STRNCPY cmp $16, %r8 je L(Exit16) -# endif +# endif cmpb $0, 15(%rcx) jz L(Exit16) # endif @@ -87,25 +88,15 @@ ENTRY (STRCPY) sub $16, %r8 and $0xf, %rsi -/* add 16 bytes rcx_shift to r8 */ +/* add 16 bytes rcx_offset to r8 */ + add %rsi, %r8 # endif lea 16(%rcx), %rsi -/* Now: - rsi = alignment_16(rcx) + rcx_shift + 16; - rcx_shift = rcx - alignment_16(rcx) -*/ and $-16, %rsi -/* Now: - rsi = alignment_16(rcx) + 16 -*/ pxor %xmm0, %xmm0 mov (%rcx), %r9 mov %r9, (%rdx) -/* - look if there is zero symbol in next 16 bytes of string - from rsi to rsi + 15 and form mask in xmm0 -*/ pcmpeqb (%rsi), %xmm0 mov 8(%rcx), %r9 mov %r9, 8(%rdx) @@ -115,10 +106,6 @@ ENTRY (STRCPY) pmovmskb %xmm0, %rax sub %rcx, %rsi -/* rsi = 16 - rcx_shift */ - -/* rax = 0: there isn't end of string from position rsi to rsi+15 */ - # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(CopyFrom1To16BytesCase2OrCase3) @@ -128,17 +115,9 @@ ENTRY (STRCPY) mov %rdx, %rax lea 16(%rdx), %rdx -/* Now: - rdx = rdx + 16 = alignment_16(rdx) + rdx_shift + 16 -*/ and $-16, %rdx - -/* Now: rdx = alignment_16(rdx) + 16 */ - sub %rdx, %rax -/* Now: rax = rdx_shift - 16 */ - # ifdef USE_AS_STRNCPY add %rax, %rsi lea -1(%rsi), %rsi @@ -150,22 +129,11 @@ ENTRY (STRCPY) L(ContinueCopy): # endif sub %rax, %rcx -/* Now: - case rcx_shift >= rdx_shift: - rcx = alignment_16(rcx) + (rcx_shift - rdx_shift) + 16 - case rcx_shift < rdx_shift: - rcx = alignment_16(rcx) + (16 + rcx_shift - rdx_shift) -*/ mov %rcx, %rax and $0xf, %rax -/* Now: - case rcx_shift >= rdx_shift: rax = rcx_shift - rdx_shift - case rcx_shift < rdx_shift: rax = (16 + rcx_shift - rdx_shift) - rax can be 0, 1, ..., 15 -*/ mov $0, %rsi -/* case: rcx_shift == rdx_shift */ +/* case: rcx_offset == rdx_offset */ jz L(Align16Both) @@ -282,10 +250,11 @@ L(Align16Both): sub %rcx, %rax sub %rax, %rdx # ifdef USE_AS_STRNCPY - lea 48+64(%r8, %rax), %r8 + lea 112(%r8, %rax), %r8 # endif mov $-0x40, %rsi + .p2align 4 L(Aligned64Loop): movaps (%rcx), %xmm2 movaps %xmm2, %xmm4 @@ -366,7 +335,6 @@ L(Shl1Start): jnz L(Shl1LoopExit) palignr $1, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 31(%rcx), %xmm2 @@ -374,7 +342,7 @@ L(Shl1Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 + movaps %xmm2, %xmm1 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit1Case2OrCase3) @@ -382,10 +350,9 @@ L(Shl1Start): test %rax, %rax jnz L(Shl1LoopExit) - palignr $1, %xmm1, %xmm2 + palignr $1, %xmm3, %xmm2 movaps %xmm2, (%rdx) movaps 31(%rcx), %xmm2 - movaps %xmm3, %xmm1 pcmpeqb %xmm2, %xmm0 lea 16(%rdx), %rdx @@ -400,7 +367,6 @@ L(Shl1Start): jnz L(Shl1LoopExit) palignr $1, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 31(%rcx), %xmm2 @@ -408,7 +374,6 @@ L(Shl1Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit1Case2OrCase3) @@ -416,8 +381,7 @@ L(Shl1Start): test %rax, %rax jnz L(Shl1LoopExit) - palignr $1, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $1, %xmm3, %xmm2 movaps %xmm2, (%rdx) lea 31(%rcx), %rcx lea 16(%rdx), %rdx @@ -432,6 +396,8 @@ L(Shl1Start): # endif movaps -1(%rcx), %xmm1 +/* 64 bytes loop */ + .p2align 4 L(Shl1LoopStart): movaps 15(%rcx), %xmm2 movaps 31(%rcx), %xmm3 @@ -465,11 +431,9 @@ L(Shl1LoopStart): jmp L(Shl1LoopStart) L(Shl1LoopExit): - movaps (%rdx), %xmm6 - psrldq $15, %xmm6 + movdqu -1(%rcx), %xmm1 mov $15, %rsi - palignr $1, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + movdqu %xmm1, -1(%rdx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -488,7 +452,6 @@ L(Shl2Start): jnz L(Shl2LoopExit) palignr $2, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 30(%rcx), %xmm2 @@ -496,7 +459,7 @@ L(Shl2Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 + movaps %xmm2, %xmm1 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit2Case2OrCase3) @@ -504,10 +467,9 @@ L(Shl2Start): test %rax, %rax jnz L(Shl2LoopExit) - palignr $2, %xmm1, %xmm2 + palignr $2, %xmm3, %xmm2 movaps %xmm2, (%rdx) movaps 30(%rcx), %xmm2 - movaps %xmm3, %xmm1 pcmpeqb %xmm2, %xmm0 lea 16(%rdx), %rdx @@ -522,7 +484,6 @@ L(Shl2Start): jnz L(Shl2LoopExit) palignr $2, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 30(%rcx), %xmm2 @@ -530,7 +491,6 @@ L(Shl2Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit2Case2OrCase3) @@ -538,8 +498,7 @@ L(Shl2Start): test %rax, %rax jnz L(Shl2LoopExit) - palignr $2, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $2, %xmm3, %xmm2 movaps %xmm2, (%rdx) lea 30(%rcx), %rcx lea 16(%rdx), %rdx @@ -554,6 +513,8 @@ L(Shl2Start): # endif movaps -2(%rcx), %xmm1 +/* 64 bytes loop */ + .p2align 4 L(Shl2LoopStart): movaps 14(%rcx), %xmm2 movaps 30(%rcx), %xmm3 @@ -587,11 +548,9 @@ L(Shl2LoopStart): jmp L(Shl2LoopStart) L(Shl2LoopExit): - movaps (%rdx), %xmm6 - psrldq $14, %xmm6 + movdqu -2(%rcx), %xmm1 mov $14, %rsi - palignr $2, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + movdqu %xmm1, -2(%rdx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -610,7 +569,6 @@ L(Shl3Start): jnz L(Shl3LoopExit) palignr $3, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 29(%rcx), %xmm2 @@ -618,7 +576,7 @@ L(Shl3Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 + movaps %xmm2, %xmm1 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit3Case2OrCase3) @@ -626,10 +584,9 @@ L(Shl3Start): test %rax, %rax jnz L(Shl3LoopExit) - palignr $3, %xmm1, %xmm2 + palignr $3, %xmm3, %xmm2 movaps %xmm2, (%rdx) movaps 29(%rcx), %xmm2 - movaps %xmm3, %xmm1 pcmpeqb %xmm2, %xmm0 lea 16(%rdx), %rdx @@ -644,7 +601,6 @@ L(Shl3Start): jnz L(Shl3LoopExit) palignr $3, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 29(%rcx), %xmm2 @@ -652,7 +608,6 @@ L(Shl3Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit3Case2OrCase3) @@ -660,8 +615,7 @@ L(Shl3Start): test %rax, %rax jnz L(Shl3LoopExit) - palignr $3, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $3, %xmm3, %xmm2 movaps %xmm2, (%rdx) lea 29(%rcx), %rcx lea 16(%rdx), %rdx @@ -676,6 +630,8 @@ L(Shl3Start): # endif movaps -3(%rcx), %xmm1 +/* 64 bytes loop */ + .p2align 4 L(Shl3LoopStart): movaps 13(%rcx), %xmm2 movaps 29(%rcx), %xmm3 @@ -709,11 +665,9 @@ L(Shl3LoopStart): jmp L(Shl3LoopStart) L(Shl3LoopExit): - movaps (%rdx), %xmm6 - psrldq $13, %xmm6 + movdqu -3(%rcx), %xmm1 mov $13, %rsi - palignr $3, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + movdqu %xmm1, -3(%rdx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -732,7 +686,6 @@ L(Shl4Start): jnz L(Shl4LoopExit) palignr $4, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 28(%rcx), %xmm2 @@ -740,7 +693,7 @@ L(Shl4Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 + movaps %xmm2, %xmm1 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit4Case2OrCase3) @@ -748,10 +701,9 @@ L(Shl4Start): test %rax, %rax jnz L(Shl4LoopExit) - palignr $4, %xmm1, %xmm2 + palignr $4, %xmm3, %xmm2 movaps %xmm2, (%rdx) movaps 28(%rcx), %xmm2 - movaps %xmm3, %xmm1 pcmpeqb %xmm2, %xmm0 lea 16(%rdx), %rdx @@ -766,7 +718,6 @@ L(Shl4Start): jnz L(Shl4LoopExit) palignr $4, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 28(%rcx), %xmm2 @@ -774,7 +725,6 @@ L(Shl4Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit4Case2OrCase3) @@ -782,8 +732,7 @@ L(Shl4Start): test %rax, %rax jnz L(Shl4LoopExit) - palignr $4, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $4, %xmm3, %xmm2 movaps %xmm2, (%rdx) lea 28(%rcx), %rcx lea 16(%rdx), %rdx @@ -798,6 +747,8 @@ L(Shl4Start): # endif movaps -4(%rcx), %xmm1 +/* 64 bytes loop */ + .p2align 4 L(Shl4LoopStart): movaps 12(%rcx), %xmm2 movaps 28(%rcx), %xmm3 @@ -831,11 +782,9 @@ L(Shl4LoopStart): jmp L(Shl4LoopStart) L(Shl4LoopExit): - movaps (%rdx), %xmm6 - psrldq $12, %xmm6 + movdqu -4(%rcx), %xmm1 mov $12, %rsi - palignr $4, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + movdqu %xmm1, -4(%rdx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -854,7 +803,6 @@ L(Shl5Start): jnz L(Shl5LoopExit) palignr $5, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 27(%rcx), %xmm2 @@ -862,7 +810,7 @@ L(Shl5Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 + movaps %xmm2, %xmm1 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit5Case2OrCase3) @@ -870,10 +818,9 @@ L(Shl5Start): test %rax, %rax jnz L(Shl5LoopExit) - palignr $5, %xmm1, %xmm2 + palignr $5, %xmm3, %xmm2 movaps %xmm2, (%rdx) movaps 27(%rcx), %xmm2 - movaps %xmm3, %xmm1 pcmpeqb %xmm2, %xmm0 lea 16(%rdx), %rdx @@ -888,7 +835,6 @@ L(Shl5Start): jnz L(Shl5LoopExit) palignr $5, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 27(%rcx), %xmm2 @@ -896,7 +842,6 @@ L(Shl5Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit5Case2OrCase3) @@ -904,8 +849,7 @@ L(Shl5Start): test %rax, %rax jnz L(Shl5LoopExit) - palignr $5, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $5, %xmm3, %xmm2 movaps %xmm2, (%rdx) lea 27(%rcx), %rcx lea 16(%rdx), %rdx @@ -920,6 +864,8 @@ L(Shl5Start): # endif movaps -5(%rcx), %xmm1 +/* 64 bytes loop */ + .p2align 4 L(Shl5LoopStart): movaps 11(%rcx), %xmm2 movaps 27(%rcx), %xmm3 @@ -953,11 +899,9 @@ L(Shl5LoopStart): jmp L(Shl5LoopStart) L(Shl5LoopExit): - movaps (%rdx), %xmm6 - psrldq $11, %xmm6 + movdqu -5(%rcx), %xmm1 mov $11, %rsi - palignr $5, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + movdqu %xmm1, -5(%rdx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -976,7 +920,6 @@ L(Shl6Start): jnz L(Shl6LoopExit) palignr $6, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 26(%rcx), %xmm2 @@ -984,7 +927,7 @@ L(Shl6Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 + movaps %xmm2, %xmm1 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit6Case2OrCase3) @@ -992,10 +935,9 @@ L(Shl6Start): test %rax, %rax jnz L(Shl6LoopExit) - palignr $6, %xmm1, %xmm2 + palignr $6, %xmm3, %xmm2 movaps %xmm2, (%rdx) movaps 26(%rcx), %xmm2 - movaps %xmm3, %xmm1 pcmpeqb %xmm2, %xmm0 lea 16(%rdx), %rdx @@ -1010,7 +952,6 @@ L(Shl6Start): jnz L(Shl6LoopExit) palignr $6, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 26(%rcx), %xmm2 @@ -1018,7 +959,6 @@ L(Shl6Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit6Case2OrCase3) @@ -1026,8 +966,7 @@ L(Shl6Start): test %rax, %rax jnz L(Shl6LoopExit) - palignr $6, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $6, %xmm3, %xmm2 movaps %xmm2, (%rdx) lea 26(%rcx), %rcx lea 16(%rdx), %rdx @@ -1042,6 +981,8 @@ L(Shl6Start): # endif movaps -6(%rcx), %xmm1 +/* 64 bytes loop */ + .p2align 4 L(Shl6LoopStart): movaps 10(%rcx), %xmm2 movaps 26(%rcx), %xmm3 @@ -1075,11 +1016,11 @@ L(Shl6LoopStart): jmp L(Shl6LoopStart) L(Shl6LoopExit): - movaps (%rdx), %xmm6 - psrldq $10, %xmm6 + mov (%rcx), %r9 + mov 6(%rcx), %esi + mov %r9, (%rdx) + mov %esi, 6(%rdx) mov $10, %rsi - palignr $6, %xmm1, %xmm6 - movaps %xmm6, (%rdx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -1098,7 +1039,6 @@ L(Shl7Start): jnz L(Shl7LoopExit) palignr $7, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 25(%rcx), %xmm2 @@ -1106,7 +1046,7 @@ L(Shl7Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 + movaps %xmm2, %xmm1 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit7Case2OrCase3) @@ -1114,10 +1054,9 @@ L(Shl7Start): test %rax, %rax jnz L(Shl7LoopExit) - palignr $7, %xmm1, %xmm2 + palignr $7, %xmm3, %xmm2 movaps %xmm2, (%rdx) movaps 25(%rcx), %xmm2 - movaps %xmm3, %xmm1 pcmpeqb %xmm2, %xmm0 lea 16(%rdx), %rdx @@ -1132,7 +1071,6 @@ L(Shl7Start): jnz L(Shl7LoopExit) palignr $7, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 25(%rcx), %xmm2 @@ -1140,7 +1078,6 @@ L(Shl7Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit7Case2OrCase3) @@ -1148,8 +1085,7 @@ L(Shl7Start): test %rax, %rax jnz L(Shl7LoopExit) - palignr $7, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $7, %xmm3, %xmm2 movaps %xmm2, (%rdx) lea 25(%rcx), %rcx lea 16(%rdx), %rdx @@ -1164,6 +1100,8 @@ L(Shl7Start): # endif movaps -7(%rcx), %xmm1 +/* 64 bytes loop */ + .p2align 4 L(Shl7LoopStart): movaps 9(%rcx), %xmm2 movaps 25(%rcx), %xmm3 @@ -1197,11 +1135,11 @@ L(Shl7LoopStart): jmp L(Shl7LoopStart) L(Shl7LoopExit): - movaps (%rdx), %xmm6 - psrldq $9, %xmm6 + mov (%rcx), %r9 + mov 5(%rcx), %esi + mov %r9, (%rdx) + mov %esi, 5(%rdx) mov $9, %rsi - palignr $7, %xmm1, %xmm6 - movaps %xmm6, (%rdx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -1220,7 +1158,6 @@ L(Shl8Start): jnz L(Shl8LoopExit) palignr $8, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 24(%rcx), %xmm2 @@ -1228,7 +1165,7 @@ L(Shl8Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 + movaps %xmm2, %xmm1 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit8Case2OrCase3) @@ -1236,10 +1173,9 @@ L(Shl8Start): test %rax, %rax jnz L(Shl8LoopExit) - palignr $8, %xmm1, %xmm2 + palignr $8, %xmm3, %xmm2 movaps %xmm2, (%rdx) movaps 24(%rcx), %xmm2 - movaps %xmm3, %xmm1 pcmpeqb %xmm2, %xmm0 lea 16(%rdx), %rdx @@ -1254,7 +1190,6 @@ L(Shl8Start): jnz L(Shl8LoopExit) palignr $8, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 24(%rcx), %xmm2 @@ -1262,7 +1197,6 @@ L(Shl8Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit8Case2OrCase3) @@ -1270,8 +1204,7 @@ L(Shl8Start): test %rax, %rax jnz L(Shl8LoopExit) - palignr $8, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $8, %xmm3, %xmm2 movaps %xmm2, (%rdx) lea 24(%rcx), %rcx lea 16(%rdx), %rdx @@ -1286,6 +1219,8 @@ L(Shl8Start): # endif movaps -8(%rcx), %xmm1 +/* 64 bytes loop */ + .p2align 4 L(Shl8LoopStart): movaps 8(%rcx), %xmm2 movaps 24(%rcx), %xmm3 @@ -1319,11 +1254,9 @@ L(Shl8LoopStart): jmp L(Shl8LoopStart) L(Shl8LoopExit): - movaps (%rdx), %xmm6 - psrldq $8, %xmm6 + mov (%rcx), %r9 mov $8, %rsi - palignr $8, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + mov %r9, (%rdx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -1342,7 +1275,6 @@ L(Shl9Start): jnz L(Shl9LoopExit) palignr $9, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 23(%rcx), %xmm2 @@ -1350,7 +1282,7 @@ L(Shl9Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 + movaps %xmm2, %xmm1 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit9Case2OrCase3) @@ -1358,10 +1290,9 @@ L(Shl9Start): test %rax, %rax jnz L(Shl9LoopExit) - palignr $9, %xmm1, %xmm2 + palignr $9, %xmm3, %xmm2 movaps %xmm2, (%rdx) movaps 23(%rcx), %xmm2 - movaps %xmm3, %xmm1 pcmpeqb %xmm2, %xmm0 lea 16(%rdx), %rdx @@ -1376,7 +1307,6 @@ L(Shl9Start): jnz L(Shl9LoopExit) palignr $9, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 23(%rcx), %xmm2 @@ -1384,7 +1314,6 @@ L(Shl9Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit9Case2OrCase3) @@ -1392,8 +1321,7 @@ L(Shl9Start): test %rax, %rax jnz L(Shl9LoopExit) - palignr $9, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $9, %xmm3, %xmm2 movaps %xmm2, (%rdx) lea 23(%rcx), %rcx lea 16(%rdx), %rdx @@ -1408,6 +1336,8 @@ L(Shl9Start): # endif movaps -9(%rcx), %xmm1 +/* 64 bytes loop */ + .p2align 4 L(Shl9LoopStart): movaps 7(%rcx), %xmm2 movaps 23(%rcx), %xmm3 @@ -1441,11 +1371,9 @@ L(Shl9LoopStart): jmp L(Shl9LoopStart) L(Shl9LoopExit): - movaps (%rdx), %xmm6 - psrldq $7, %xmm6 + mov -1(%rcx), %r9 mov $7, %rsi - palignr $9, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + mov %r9, -1(%rdx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -1464,7 +1392,6 @@ L(Shl10Start): jnz L(Shl10LoopExit) palignr $10, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 22(%rcx), %xmm2 @@ -1472,7 +1399,7 @@ L(Shl10Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 + movaps %xmm2, %xmm1 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit10Case2OrCase3) @@ -1480,10 +1407,9 @@ L(Shl10Start): test %rax, %rax jnz L(Shl10LoopExit) - palignr $10, %xmm1, %xmm2 + palignr $10, %xmm3, %xmm2 movaps %xmm2, (%rdx) movaps 22(%rcx), %xmm2 - movaps %xmm3, %xmm1 pcmpeqb %xmm2, %xmm0 lea 16(%rdx), %rdx @@ -1498,7 +1424,6 @@ L(Shl10Start): jnz L(Shl10LoopExit) palignr $10, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 22(%rcx), %xmm2 @@ -1506,7 +1431,6 @@ L(Shl10Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit10Case2OrCase3) @@ -1514,8 +1438,7 @@ L(Shl10Start): test %rax, %rax jnz L(Shl10LoopExit) - palignr $10, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $10, %xmm3, %xmm2 movaps %xmm2, (%rdx) lea 22(%rcx), %rcx lea 16(%rdx), %rdx @@ -1530,6 +1453,8 @@ L(Shl10Start): # endif movaps -10(%rcx), %xmm1 +/* 64 bytes loop */ + .p2align 4 L(Shl10LoopStart): movaps 6(%rcx), %xmm2 movaps 22(%rcx), %xmm3 @@ -1563,11 +1488,9 @@ L(Shl10LoopStart): jmp L(Shl10LoopStart) L(Shl10LoopExit): - movaps (%rdx), %xmm6 - psrldq $6, %xmm6 + mov -2(%rcx), %r9 mov $6, %rsi - palignr $10, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + mov %r9, -2(%rdx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -1586,7 +1509,6 @@ L(Shl11Start): jnz L(Shl11LoopExit) palignr $11, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 21(%rcx), %xmm2 @@ -1594,7 +1516,7 @@ L(Shl11Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 + movaps %xmm2, %xmm1 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit11Case2OrCase3) @@ -1602,10 +1524,9 @@ L(Shl11Start): test %rax, %rax jnz L(Shl11LoopExit) - palignr $11, %xmm1, %xmm2 + palignr $11, %xmm3, %xmm2 movaps %xmm2, (%rdx) movaps 21(%rcx), %xmm2 - movaps %xmm3, %xmm1 pcmpeqb %xmm2, %xmm0 lea 16(%rdx), %rdx @@ -1620,7 +1541,6 @@ L(Shl11Start): jnz L(Shl11LoopExit) palignr $11, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 21(%rcx), %xmm2 @@ -1628,7 +1548,6 @@ L(Shl11Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit11Case2OrCase3) @@ -1636,8 +1555,7 @@ L(Shl11Start): test %rax, %rax jnz L(Shl11LoopExit) - palignr $11, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $11, %xmm3, %xmm2 movaps %xmm2, (%rdx) lea 21(%rcx), %rcx lea 16(%rdx), %rdx @@ -1652,6 +1570,8 @@ L(Shl11Start): # endif movaps -11(%rcx), %xmm1 +/* 64 bytes loop */ + .p2align 4 L(Shl11LoopStart): movaps 5(%rcx), %xmm2 movaps 21(%rcx), %xmm3 @@ -1685,11 +1605,9 @@ L(Shl11LoopStart): jmp L(Shl11LoopStart) L(Shl11LoopExit): - movaps (%rdx), %xmm6 - psrldq $5, %xmm6 + mov -3(%rcx), %r9 mov $5, %rsi - palignr $11, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + mov %r9, -3(%rdx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -1708,7 +1626,6 @@ L(Shl12Start): jnz L(Shl12LoopExit) palignr $12, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 20(%rcx), %xmm2 @@ -1716,7 +1633,7 @@ L(Shl12Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 + movaps %xmm2, %xmm1 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit12Case2OrCase3) @@ -1724,10 +1641,9 @@ L(Shl12Start): test %rax, %rax jnz L(Shl12LoopExit) - palignr $12, %xmm1, %xmm2 + palignr $12, %xmm3, %xmm2 movaps %xmm2, (%rdx) movaps 20(%rcx), %xmm2 - movaps %xmm3, %xmm1 pcmpeqb %xmm2, %xmm0 lea 16(%rdx), %rdx @@ -1742,7 +1658,6 @@ L(Shl12Start): jnz L(Shl12LoopExit) palignr $12, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 20(%rcx), %xmm2 @@ -1750,7 +1665,6 @@ L(Shl12Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit12Case2OrCase3) @@ -1758,8 +1672,7 @@ L(Shl12Start): test %rax, %rax jnz L(Shl12LoopExit) - palignr $12, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $12, %xmm3, %xmm2 movaps %xmm2, (%rdx) lea 20(%rcx), %rcx lea 16(%rdx), %rdx @@ -1774,6 +1687,8 @@ L(Shl12Start): # endif movaps -12(%rcx), %xmm1 +/* 64 bytes loop */ + .p2align 4 L(Shl12LoopStart): movaps 4(%rcx), %xmm2 movaps 20(%rcx), %xmm3 @@ -1807,11 +1722,9 @@ L(Shl12LoopStart): jmp L(Shl12LoopStart) L(Shl12LoopExit): - movaps (%rdx), %xmm6 - psrldq $4, %xmm6 + mov (%rcx), %r9d mov $4, %rsi - palignr $12, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + mov %r9d, (%rdx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -1830,7 +1743,6 @@ L(Shl13Start): jnz L(Shl13LoopExit) palignr $13, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 19(%rcx), %xmm2 @@ -1838,7 +1750,7 @@ L(Shl13Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 + movaps %xmm2, %xmm1 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit13Case2OrCase3) @@ -1846,10 +1758,9 @@ L(Shl13Start): test %rax, %rax jnz L(Shl13LoopExit) - palignr $13, %xmm1, %xmm2 + palignr $13, %xmm3, %xmm2 movaps %xmm2, (%rdx) movaps 19(%rcx), %xmm2 - movaps %xmm3, %xmm1 pcmpeqb %xmm2, %xmm0 lea 16(%rdx), %rdx @@ -1864,7 +1775,6 @@ L(Shl13Start): jnz L(Shl13LoopExit) palignr $13, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 19(%rcx), %xmm2 @@ -1872,7 +1782,6 @@ L(Shl13Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit13Case2OrCase3) @@ -1880,8 +1789,7 @@ L(Shl13Start): test %rax, %rax jnz L(Shl13LoopExit) - palignr $13, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $13, %xmm3, %xmm2 movaps %xmm2, (%rdx) lea 19(%rcx), %rcx lea 16(%rdx), %rdx @@ -1896,6 +1804,8 @@ L(Shl13Start): # endif movaps -13(%rcx), %xmm1 +/* 64 bytes loop */ + .p2align 4 L(Shl13LoopStart): movaps 3(%rcx), %xmm2 movaps 19(%rcx), %xmm3 @@ -1929,11 +1839,9 @@ L(Shl13LoopStart): jmp L(Shl13LoopStart) L(Shl13LoopExit): - movaps (%rdx), %xmm6 - psrldq $3, %xmm6 + mov -1(%rcx), %r9d mov $3, %rsi - palignr $13, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + mov %r9d, -1(%rdx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -1952,7 +1860,6 @@ L(Shl14Start): jnz L(Shl14LoopExit) palignr $14, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 18(%rcx), %xmm2 @@ -1960,7 +1867,7 @@ L(Shl14Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 + movaps %xmm2, %xmm1 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit14Case2OrCase3) @@ -1968,10 +1875,9 @@ L(Shl14Start): test %rax, %rax jnz L(Shl14LoopExit) - palignr $14, %xmm1, %xmm2 + palignr $14, %xmm3, %xmm2 movaps %xmm2, (%rdx) movaps 18(%rcx), %xmm2 - movaps %xmm3, %xmm1 pcmpeqb %xmm2, %xmm0 lea 16(%rdx), %rdx @@ -1986,7 +1892,6 @@ L(Shl14Start): jnz L(Shl14LoopExit) palignr $14, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 18(%rcx), %xmm2 @@ -1994,7 +1899,6 @@ L(Shl14Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit14Case2OrCase3) @@ -2002,8 +1906,7 @@ L(Shl14Start): test %rax, %rax jnz L(Shl14LoopExit) - palignr $14, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $14, %xmm3, %xmm2 movaps %xmm2, (%rdx) lea 18(%rcx), %rcx lea 16(%rdx), %rdx @@ -2018,6 +1921,8 @@ L(Shl14Start): # endif movaps -14(%rcx), %xmm1 +/* 64 bytes loop */ + .p2align 4 L(Shl14LoopStart): movaps 2(%rcx), %xmm2 movaps 18(%rcx), %xmm3 @@ -2051,11 +1956,9 @@ L(Shl14LoopStart): jmp L(Shl14LoopStart) L(Shl14LoopExit): - movaps (%rdx), %xmm6 - psrldq $2, %xmm6 + mov -2(%rcx), %r9d mov $2, %rsi - palignr $14, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + mov %r9d, -2(%rdx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -2074,7 +1977,6 @@ L(Shl15Start): jnz L(Shl15LoopExit) palignr $15, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 17(%rcx), %xmm2 @@ -2082,7 +1984,7 @@ L(Shl15Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 + movaps %xmm2, %xmm1 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit15Case2OrCase3) @@ -2090,10 +1992,9 @@ L(Shl15Start): test %rax, %rax jnz L(Shl15LoopExit) - palignr $15, %xmm1, %xmm2 + palignr $15, %xmm3, %xmm2 movaps %xmm2, (%rdx) movaps 17(%rcx), %xmm2 - movaps %xmm3, %xmm1 pcmpeqb %xmm2, %xmm0 lea 16(%rdx), %rdx @@ -2108,7 +2009,6 @@ L(Shl15Start): jnz L(Shl15LoopExit) palignr $15, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 17(%rcx), %xmm2 @@ -2116,7 +2016,6 @@ L(Shl15Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %r8 jbe L(StrncpyExit15Case2OrCase3) @@ -2124,8 +2023,7 @@ L(Shl15Start): test %rax, %rax jnz L(Shl15LoopExit) - palignr $15, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $15, %xmm3, %xmm2 movaps %xmm2, (%rdx) lea 17(%rcx), %rcx lea 16(%rdx), %rdx @@ -2140,6 +2038,8 @@ L(Shl15Start): # endif movaps -15(%rcx), %xmm1 +/* 64 bytes loop */ + .p2align 4 L(Shl15LoopStart): movaps 1(%rcx), %xmm2 movaps 17(%rcx), %xmm3 @@ -2173,16 +2073,15 @@ L(Shl15LoopStart): jmp L(Shl15LoopStart) L(Shl15LoopExit): - movaps (%rdx), %xmm6 - psrldq $1, %xmm6 + mov -3(%rcx), %r9d mov $1, %rsi - palignr $15, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + mov %r9d, -3(%rdx) # ifdef USE_AS_STRCAT jmp L(CopyFrom1To16Bytes) # endif # ifndef USE_AS_STRCAT + .p2align 4 L(CopyFrom1To16Bytes): # ifdef USE_AS_STRNCPY @@ -2463,7 +2362,7 @@ L(Exit4): # ifdef USE_AS_STPCPY cmpb $1, (%rax) sbb $-1, %rax -# endif +# endif # endif ret @@ -2485,7 +2384,7 @@ L(Exit5): # ifdef USE_AS_STPCPY cmpb $1, (%rax) sbb $-1, %rax -# endif +# endif # endif ret @@ -2507,7 +2406,7 @@ L(Exit6): # ifdef USE_AS_STPCPY cmpb $1, (%rax) sbb $-1, %rax -# endif +# endif # endif ret @@ -2617,7 +2516,7 @@ L(Exit12): # ifdef USE_AS_STPCPY cmpb $1, (%rax) sbb $-1, %rax -# endif +# endif # endif ret @@ -2955,11 +2854,10 @@ L(StrncpyExit8Bytes): ret # endif - # endif # ifdef USE_AS_STRNCPY - + .p2align 4 L(StrncpyLeaveCase2OrCase3): test %rax, %rax jnz L(Aligned64LeaveCase2) @@ -3014,710 +2912,639 @@ L(Aligned64LeaveCase2): lea -16(%r8), %r8 jmp L(CopyFrom1To16BytesCase2) /*--------------------------------------------------*/ + .p2align 4 L(StrncpyExit1Case2OrCase3): - movaps (%rdx), %xmm6 - psrldq $15, %xmm6 + movdqu -1(%rcx), %xmm0 + movdqu %xmm0, -1(%rdx) mov $15, %rsi - palignr $1, %xmm1, %xmm6 - movaps %xmm6, (%rdx) test %rax, %rax jnz L(CopyFrom1To16BytesCase2) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyExit2Case2OrCase3): - movaps (%rdx), %xmm6 - psrldq $14, %xmm6 + movdqu -2(%rcx), %xmm0 + movdqu %xmm0, -2(%rdx) mov $14, %rsi - palignr $2, %xmm1, %xmm6 - movaps %xmm6, (%rdx) test %rax, %rax jnz L(CopyFrom1To16BytesCase2) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyExit3Case2OrCase3): - movaps (%rdx), %xmm6 - psrldq $13, %xmm6 + movdqu -3(%rcx), %xmm0 + movdqu %xmm0, -3(%rdx) mov $13, %rsi - palignr $3, %xmm1, %xmm6 - movaps %xmm6, (%rdx) test %rax, %rax jnz L(CopyFrom1To16BytesCase2) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyExit4Case2OrCase3): - movaps (%rdx), %xmm6 - psrldq $12, %xmm6 + movdqu -4(%rcx), %xmm0 + movdqu %xmm0, -4(%rdx) mov $12, %rsi - palignr $4, %xmm1, %xmm6 - movaps %xmm6, (%rdx) test %rax, %rax jnz L(CopyFrom1To16BytesCase2) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyExit5Case2OrCase3): - movaps (%rdx), %xmm6 - psrldq $11, %xmm6 + movdqu -5(%rcx), %xmm0 + movdqu %xmm0, -5(%rdx) mov $11, %rsi - palignr $5, %xmm1, %xmm6 - movaps %xmm6, (%rdx) test %rax, %rax jnz L(CopyFrom1To16BytesCase2) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyExit6Case2OrCase3): - movaps (%rdx), %xmm6 - psrldq $10, %xmm6 - mov $10, %rsi - palignr $6, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + mov (%rcx), %rsi + mov 6(%rcx), %r9d + mov %r9d, 6(%rdx) + mov %rsi, (%rdx) test %rax, %rax + mov $10, %rsi jnz L(CopyFrom1To16BytesCase2) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyExit7Case2OrCase3): - movaps (%rdx), %xmm6 - psrldq $9, %xmm6 - mov $9, %rsi - palignr $7, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + mov (%rcx), %rsi + mov 5(%rcx), %r9d + mov %r9d, 5(%rdx) + mov %rsi, (%rdx) test %rax, %rax + mov $9, %rsi jnz L(CopyFrom1To16BytesCase2) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyExit8Case2OrCase3): - movaps (%rdx), %xmm6 - psrldq $8, %xmm6 + mov (%rcx), %r9 mov $8, %rsi - palignr $8, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + mov %r9, (%rdx) test %rax, %rax jnz L(CopyFrom1To16BytesCase2) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyExit9Case2OrCase3): - movaps (%rdx), %xmm6 - psrldq $7, %xmm6 + mov -1(%rcx), %r9 mov $7, %rsi - palignr $9, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + mov %r9, -1(%rdx) test %rax, %rax jnz L(CopyFrom1To16BytesCase2) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyExit10Case2OrCase3): - movaps (%rdx), %xmm6 - psrldq $6, %xmm6 + mov -2(%rcx), %r9 mov $6, %rsi - palignr $10, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + mov %r9, -2(%rdx) test %rax, %rax jnz L(CopyFrom1To16BytesCase2) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyExit11Case2OrCase3): - movaps (%rdx), %xmm6 - psrldq $5, %xmm6 + mov -3(%rcx), %r9 mov $5, %rsi - palignr $11, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + mov %r9, -3(%rdx) test %rax, %rax jnz L(CopyFrom1To16BytesCase2) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyExit12Case2OrCase3): - movaps (%rdx), %xmm6 - psrldq $4, %xmm6 + mov (%rcx), %r9d mov $4, %rsi - palignr $12, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + mov %r9d, (%rdx) test %rax, %rax jnz L(CopyFrom1To16BytesCase2) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyExit13Case2OrCase3): - movaps (%rdx), %xmm6 - psrldq $3, %xmm6 + mov -1(%rcx), %r9d mov $3, %rsi - palignr $13, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + mov %r9d, -1(%rdx) test %rax, %rax jnz L(CopyFrom1To16BytesCase2) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyExit14Case2OrCase3): - movaps (%rdx), %xmm6 - psrldq $2, %xmm6 + mov -2(%rcx), %r9d mov $2, %rsi - palignr $14, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + mov %r9d, -2(%rdx) test %rax, %rax jnz L(CopyFrom1To16BytesCase2) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyExit15Case2OrCase3): - movaps (%rdx), %xmm6 - psrldq $1, %xmm6 + mov -3(%rcx), %r9d mov $1, %rsi - palignr $15, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + mov %r9d, -3(%rdx) test %rax, %rax jnz L(CopyFrom1To16BytesCase2) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyLeave1): movaps %xmm2, %xmm3 add $48, %r8 jle L(StrncpyExit1) palignr $1, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 31(%rcx), %xmm2 lea 16(%rsi), %rsi - movaps %xmm2, %xmm3 sub $16, %r8 jbe L(StrncpyExit1) - palignr $1, %xmm1, %xmm2 + palignr $1, %xmm3, %xmm2 movaps %xmm2, 16(%rdx) - movaps 31+16(%rcx), %xmm2 - movaps %xmm3, %xmm1 lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit1) - movaps %xmm2, %xmm1 movaps %xmm4, 32(%rdx) lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit1) - movaps %xmm7, %xmm1 movaps %xmm5, 48(%rdx) lea 16(%rsi), %rsi lea -16(%r8), %r8 L(StrncpyExit1): - movaps (%rdx, %rsi), %xmm6 - psrldq $15, %xmm6 - palignr $1, %xmm1, %xmm6 - movaps %xmm6, (%rdx, %rsi) - lea 15(%rsi), %rsi + lea 15(%rdx, %rsi), %rdx + lea 15(%rcx, %rsi), %rcx + mov -15(%rcx), %rsi + mov -8(%rcx), %rax + mov %rsi, -15(%rdx) + mov %rax, -8(%rdx) + xor %rsi, %rsi jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyLeave2): movaps %xmm2, %xmm3 add $48, %r8 jle L(StrncpyExit2) palignr $2, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 30(%rcx), %xmm2 lea 16(%rsi), %rsi - movaps %xmm2, %xmm3 sub $16, %r8 jbe L(StrncpyExit2) - palignr $2, %xmm1, %xmm2 + palignr $2, %xmm3, %xmm2 movaps %xmm2, 16(%rdx) - movaps 30+16(%rcx), %xmm2 - movaps %xmm3, %xmm1 lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit2) - movaps %xmm2, %xmm1 movaps %xmm4, 32(%rdx) lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit2) - movaps %xmm7, %xmm1 movaps %xmm5, 48(%rdx) lea 16(%rsi), %rsi lea -16(%r8), %r8 L(StrncpyExit2): - movaps (%rdx, %rsi), %xmm6 - psrldq $14, %xmm6 - palignr $2, %xmm1, %xmm6 - movaps %xmm6, (%rdx, %rsi) - lea 14(%rsi), %rsi + lea 14(%rdx, %rsi), %rdx + lea 14(%rcx, %rsi), %rcx + mov -14(%rcx), %rsi + mov -8(%rcx), %rax + mov %rsi, -14(%rdx) + mov %rax, -8(%rdx) + xor %rsi, %rsi jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyLeave3): movaps %xmm2, %xmm3 add $48, %r8 jle L(StrncpyExit3) palignr $3, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 29(%rcx), %xmm2 lea 16(%rsi), %rsi - movaps %xmm2, %xmm3 sub $16, %r8 jbe L(StrncpyExit3) - palignr $3, %xmm1, %xmm2 + palignr $3, %xmm3, %xmm2 movaps %xmm2, 16(%rdx) - movaps 29+16(%rcx), %xmm2 - movaps %xmm3, %xmm1 lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit3) - movaps %xmm2, %xmm1 movaps %xmm4, 32(%rdx) lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit3) - movaps %xmm7, %xmm1 movaps %xmm5, 48(%rdx) lea 16(%rsi), %rsi lea -16(%r8), %r8 L(StrncpyExit3): - movaps (%rdx, %rsi), %xmm6 - psrldq $13, %xmm6 - palignr $3, %xmm1, %xmm6 - movaps %xmm6, (%rdx, %rsi) - lea 13(%rsi), %rsi + lea 13(%rdx, %rsi), %rdx + lea 13(%rcx, %rsi), %rcx + mov -13(%rcx), %rsi + mov -8(%rcx), %rax + mov %rsi, -13(%rdx) + mov %rax, -8(%rdx) + xor %rsi, %rsi jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyLeave4): movaps %xmm2, %xmm3 add $48, %r8 jle L(StrncpyExit4) palignr $4, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 28(%rcx), %xmm2 lea 16(%rsi), %rsi - movaps %xmm2, %xmm3 sub $16, %r8 jbe L(StrncpyExit4) - palignr $4, %xmm1, %xmm2 + palignr $4, %xmm3, %xmm2 movaps %xmm2, 16(%rdx) - movaps 28+16(%rcx), %xmm2 - movaps %xmm3, %xmm1 lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit4) - movaps %xmm2, %xmm1 movaps %xmm4, 32(%rdx) lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit4) - movaps %xmm7, %xmm1 movaps %xmm5, 48(%rdx) lea 16(%rsi), %rsi lea -16(%r8), %r8 L(StrncpyExit4): - movaps (%rdx, %rsi), %xmm6 - psrldq $12, %xmm6 - palignr $4, %xmm1, %xmm6 - movaps %xmm6, (%rdx, %rsi) - lea 12(%rsi), %rsi + lea 12(%rdx, %rsi), %rdx + lea 12(%rcx, %rsi), %rcx + mov -12(%rcx), %rsi + mov -4(%rcx), %eax + mov %rsi, -12(%rdx) + mov %eax, -4(%rdx) + xor %rsi, %rsi jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyLeave5): movaps %xmm2, %xmm3 add $48, %r8 jle L(StrncpyExit5) palignr $5, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 27(%rcx), %xmm2 lea 16(%rsi), %rsi - movaps %xmm2, %xmm3 sub $16, %r8 jbe L(StrncpyExit5) - palignr $5, %xmm1, %xmm2 + palignr $5, %xmm3, %xmm2 movaps %xmm2, 16(%rdx) - movaps 27+16(%rcx), %xmm2 - movaps %xmm3, %xmm1 lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit5) - movaps %xmm2, %xmm1 movaps %xmm4, 32(%rdx) lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit5) - movaps %xmm7, %xmm1 movaps %xmm5, 48(%rdx) lea 16(%rsi), %rsi lea -16(%r8), %r8 L(StrncpyExit5): - movaps (%rdx, %rsi), %xmm6 - psrldq $11, %xmm6 - palignr $5, %xmm1, %xmm6 - movaps %xmm6, (%rdx, %rsi) - lea 11(%rsi), %rsi + lea 11(%rdx, %rsi), %rdx + lea 11(%rcx, %rsi), %rcx + mov -11(%rcx), %rsi + mov -4(%rcx), %eax + mov %rsi, -11(%rdx) + mov %eax, -4(%rdx) + xor %rsi, %rsi jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyLeave6): movaps %xmm2, %xmm3 add $48, %r8 jle L(StrncpyExit6) palignr $6, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 26(%rcx), %xmm2 lea 16(%rsi), %rsi - movaps %xmm2, %xmm3 sub $16, %r8 jbe L(StrncpyExit6) - palignr $6, %xmm1, %xmm2 + palignr $6, %xmm3, %xmm2 movaps %xmm2, 16(%rdx) - movaps 26+16(%rcx), %xmm2 - movaps %xmm3, %xmm1 lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit6) - movaps %xmm2, %xmm1 movaps %xmm4, 32(%rdx) lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit6) - movaps %xmm7, %xmm1 movaps %xmm5, 48(%rdx) lea 16(%rsi), %rsi lea -16(%r8), %r8 L(StrncpyExit6): - movaps (%rdx, %rsi), %xmm6 - psrldq $10, %xmm6 - palignr $6, %xmm1, %xmm6 - movaps %xmm6, (%rdx, %rsi) - lea 10(%rsi), %rsi + lea 10(%rdx, %rsi), %rdx + lea 10(%rcx, %rsi), %rcx + mov -10(%rcx), %rsi + movw -2(%rcx), %ax + mov %rsi, -10(%rdx) + movw %ax, -2(%rdx) + xor %rsi, %rsi jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyLeave7): movaps %xmm2, %xmm3 add $48, %r8 jle L(StrncpyExit7) palignr $7, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 25(%rcx), %xmm2 lea 16(%rsi), %rsi - movaps %xmm2, %xmm3 sub $16, %r8 jbe L(StrncpyExit7) - palignr $7, %xmm1, %xmm2 + palignr $7, %xmm3, %xmm2 movaps %xmm2, 16(%rdx) - movaps 25+16(%rcx), %xmm2 - movaps %xmm3, %xmm1 lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit7) - movaps %xmm2, %xmm1 movaps %xmm4, 32(%rdx) lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit7) - movaps %xmm7, %xmm1 movaps %xmm5, 48(%rdx) lea 16(%rsi), %rsi lea -16(%r8), %r8 L(StrncpyExit7): - movaps (%rdx, %rsi), %xmm6 - psrldq $9, %xmm6 - palignr $7, %xmm1, %xmm6 - movaps %xmm6, (%rdx, %rsi) - lea 9(%rsi), %rsi + lea 9(%rdx, %rsi), %rdx + lea 9(%rcx, %rsi), %rcx + mov -9(%rcx), %rsi + movb -1(%rcx), %ah + mov %rsi, -9(%rdx) + movb %ah, -1(%rdx) + xor %rsi, %rsi jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyLeave8): movaps %xmm2, %xmm3 add $48, %r8 jle L(StrncpyExit8) palignr $8, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 24(%rcx), %xmm2 lea 16(%rsi), %rsi - movaps %xmm2, %xmm3 sub $16, %r8 jbe L(StrncpyExit8) - palignr $8, %xmm1, %xmm2 + palignr $8, %xmm3, %xmm2 movaps %xmm2, 16(%rdx) - movaps 24+16(%rcx), %xmm2 - movaps %xmm3, %xmm1 lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit8) - movaps %xmm2, %xmm1 movaps %xmm4, 32(%rdx) lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit8) - movaps %xmm7, %xmm1 movaps %xmm5, 48(%rdx) lea 16(%rsi), %rsi lea -16(%r8), %r8 L(StrncpyExit8): - movaps (%rdx, %rsi), %xmm6 - psrldq $8, %xmm6 - palignr $8, %xmm1, %xmm6 - movaps %xmm6, (%rdx, %rsi) - lea 8(%rsi), %rsi + lea 8(%rdx, %rsi), %rdx + lea 8(%rcx, %rsi), %rcx + mov -8(%rcx), %rax + xor %rsi, %rsi + mov %rax, -8(%rdx) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyLeave9): movaps %xmm2, %xmm3 add $48, %r8 jle L(StrncpyExit9) palignr $9, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 23(%rcx), %xmm2 lea 16(%rsi), %rsi - movaps %xmm2, %xmm3 sub $16, %r8 jbe L(StrncpyExit9) - palignr $9, %xmm1, %xmm2 + palignr $9, %xmm3, %xmm2 movaps %xmm2, 16(%rdx) - movaps 23+16(%rcx), %xmm2 - movaps %xmm3, %xmm1 lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit9) - movaps %xmm2, %xmm1 movaps %xmm4, 32(%rdx) lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit9) - movaps %xmm7, %xmm1 movaps %xmm5, 48(%rdx) lea 16(%rsi), %rsi lea -16(%r8), %r8 L(StrncpyExit9): - movaps (%rdx, %rsi), %xmm6 - psrldq $7, %xmm6 - palignr $9, %xmm1, %xmm6 - movaps %xmm6, (%rdx, %rsi) - lea 7(%rsi), %rsi + lea 7(%rdx, %rsi), %rdx + lea 7(%rcx, %rsi), %rcx + mov -8(%rcx), %rax + xor %rsi, %rsi + mov %rax, -8(%rdx) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyLeave10): movaps %xmm2, %xmm3 add $48, %r8 jle L(StrncpyExit10) palignr $10, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 22(%rcx), %xmm2 lea 16(%rsi), %rsi - movaps %xmm2, %xmm3 sub $16, %r8 jbe L(StrncpyExit10) - palignr $10, %xmm1, %xmm2 + palignr $10, %xmm3, %xmm2 movaps %xmm2, 16(%rdx) - movaps 22+16(%rcx), %xmm2 - movaps %xmm3, %xmm1 lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit10) - movaps %xmm2, %xmm1 movaps %xmm4, 32(%rdx) lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit10) - movaps %xmm7, %xmm1 movaps %xmm5, 48(%rdx) lea 16(%rsi), %rsi lea -16(%r8), %r8 L(StrncpyExit10): - movaps (%rdx, %rsi), %xmm6 - psrldq $6, %xmm6 - palignr $10, %xmm1, %xmm6 - movaps %xmm6, (%rdx, %rsi) - lea 6(%rsi), %rsi + lea 6(%rdx, %rsi), %rdx + lea 6(%rcx, %rsi), %rcx + mov -8(%rcx), %rax + xor %rsi, %rsi + mov %rax, -8(%rdx) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyLeave11): movaps %xmm2, %xmm3 add $48, %r8 jle L(StrncpyExit11) palignr $11, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 21(%rcx), %xmm2 lea 16(%rsi), %rsi - movaps %xmm2, %xmm3 sub $16, %r8 jbe L(StrncpyExit11) - palignr $11, %xmm1, %xmm2 + palignr $11, %xmm3, %xmm2 movaps %xmm2, 16(%rdx) - movaps 21+16(%rcx), %xmm2 - movaps %xmm3, %xmm1 lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit11) - movaps %xmm2, %xmm1 movaps %xmm4, 32(%rdx) lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit11) - movaps %xmm7, %xmm1 movaps %xmm5, 48(%rdx) lea 16(%rsi), %rsi lea -16(%r8), %r8 L(StrncpyExit11): - movaps (%rdx, %rsi), %xmm6 - psrldq $5, %xmm6 - palignr $11, %xmm1, %xmm6 - movaps %xmm6, (%rdx, %rsi) - lea 5(%rsi), %rsi + lea 5(%rdx, %rsi), %rdx + lea 5(%rcx, %rsi), %rcx + mov -8(%rcx), %rax + xor %rsi, %rsi + mov %rax, -8(%rdx) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyLeave12): movaps %xmm2, %xmm3 add $48, %r8 jle L(StrncpyExit12) palignr $12, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 20(%rcx), %xmm2 lea 16(%rsi), %rsi - movaps %xmm2, %xmm3 sub $16, %r8 jbe L(StrncpyExit12) - palignr $12, %xmm1, %xmm2 + palignr $12, %xmm3, %xmm2 movaps %xmm2, 16(%rdx) - movaps 20+16(%rcx), %xmm2 - movaps %xmm3, %xmm1 lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit12) - movaps %xmm2, %xmm1 movaps %xmm4, 32(%rdx) lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit12) - movaps %xmm7, %xmm1 movaps %xmm5, 48(%rdx) lea 16(%rsi), %rsi lea -16(%r8), %r8 L(StrncpyExit12): - movaps (%rdx, %rsi), %xmm6 - psrldq $4, %xmm6 - palignr $12, %xmm1, %xmm6 - movaps %xmm6, (%rdx, %rsi) - lea 4(%rsi), %rsi + lea 4(%rdx, %rsi), %rdx + lea 4(%rcx, %rsi), %rcx + mov -4(%rcx), %eax + xor %rsi, %rsi + mov %eax, -4(%rdx) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyLeave13): movaps %xmm2, %xmm3 add $48, %r8 jle L(StrncpyExit13) palignr $13, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 19(%rcx), %xmm2 lea 16(%rsi), %rsi - movaps %xmm2, %xmm3 sub $16, %r8 jbe L(StrncpyExit13) - palignr $13, %xmm1, %xmm2 + palignr $13, %xmm3, %xmm2 movaps %xmm2, 16(%rdx) - movaps 19+16(%rcx), %xmm2 - movaps %xmm3, %xmm1 lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit13) - movaps %xmm2, %xmm1 movaps %xmm4, 32(%rdx) lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit13) - movaps %xmm7, %xmm1 movaps %xmm5, 48(%rdx) lea 16(%rsi), %rsi lea -16(%r8), %r8 L(StrncpyExit13): - movaps (%rdx, %rsi), %xmm6 - psrldq $3, %xmm6 - palignr $13, %xmm1, %xmm6 - movaps %xmm6, (%rdx, %rsi) - lea 3(%rsi), %rsi + lea 3(%rdx, %rsi), %rdx + lea 3(%rcx, %rsi), %rcx + mov -4(%rcx), %eax + xor %rsi, %rsi + mov %eax, -4(%rdx) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyLeave14): movaps %xmm2, %xmm3 add $48, %r8 jle L(StrncpyExit14) palignr $14, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 18(%rcx), %xmm2 lea 16(%rsi), %rsi - movaps %xmm2, %xmm3 sub $16, %r8 jbe L(StrncpyExit14) - palignr $14, %xmm1, %xmm2 + palignr $14, %xmm3, %xmm2 movaps %xmm2, 16(%rdx) - movaps 18+16(%rcx), %xmm2 - movaps %xmm3, %xmm1 lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit14) - movaps %xmm2, %xmm1 movaps %xmm4, 32(%rdx) lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit14) - movaps %xmm7, %xmm1 movaps %xmm5, 48(%rdx) lea 16(%rsi), %rsi lea -16(%r8), %r8 L(StrncpyExit14): - movaps (%rdx, %rsi), %xmm6 - psrldq $2, %xmm6 - palignr $14, %xmm1, %xmm6 - movaps %xmm6, (%rdx, %rsi) - lea 2(%rsi), %rsi + lea 2(%rdx, %rsi), %rdx + lea 2(%rcx, %rsi), %rcx + movw -2(%rcx), %ax + xor %rsi, %rsi + movw %ax, -2(%rdx) jmp L(CopyFrom1To16BytesCase3) + .p2align 4 L(StrncpyLeave15): movaps %xmm2, %xmm3 add $48, %r8 jle L(StrncpyExit15) palignr $15, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 17(%rcx), %xmm2 lea 16(%rsi), %rsi - movaps %xmm2, %xmm3 sub $16, %r8 jbe L(StrncpyExit15) - palignr $15, %xmm1, %xmm2 + palignr $15, %xmm3, %xmm2 movaps %xmm2, 16(%rdx) - movaps 17+16(%rcx), %xmm2 - movaps %xmm3, %xmm1 lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit15) - movaps %xmm2, %xmm1 movaps %xmm4, 32(%rdx) lea 16(%rsi), %rsi sub $16, %r8 jbe L(StrncpyExit15) - movaps %xmm7, %xmm1 movaps %xmm5, 48(%rdx) lea 16(%rsi), %rsi lea -16(%r8), %r8 L(StrncpyExit15): - movaps (%rdx, %rsi), %xmm6 - psrldq $1, %xmm6 - palignr $15, %xmm1, %xmm6 - movaps %xmm6, (%rdx, %rsi) - lea 1(%rsi), %rsi + lea 1(%rdx, %rsi), %rdx + lea 1(%rcx, %rsi), %rcx + movb -1(%rcx), %ah + xor %rsi, %rsi + movb %ah, -1(%rdx) jmp L(CopyFrom1To16BytesCase3) + # endif # ifndef USE_AS_STRCAT END (STRCPY) diff --git a/libc/sysdeps/x86_64/multiarch/wcscpy-ssse3.S b/libc/sysdeps/x86_64/multiarch/wcscpy-ssse3.S index 4e292f3c2..477b2cb4e 100644 --- a/libc/sysdeps/x86_64/multiarch/wcscpy-ssse3.S +++ b/libc/sysdeps/x86_64/multiarch/wcscpy-ssse3.S @@ -21,8 +21,9 @@ #ifndef NOT_IN_libc # include <sysdep.h> -.text + .section .text.ssse3,"ax",@progbits ENTRY (__wcscpy_ssse3) + mov %rsi, %rcx mov %rdi, %rdx @@ -136,6 +137,7 @@ L(Align16Both): mov $-0x40, %rsi + .p2align 4 L(Aligned64Loop): movaps (%rcx), %xmm2 movaps %xmm2, %xmm4 @@ -205,7 +207,6 @@ L(Shl4Start): jnz L(Shl4LoopExit) palignr $4, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 28(%rcx), %xmm2 @@ -213,15 +214,14 @@ L(Shl4Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 + movaps %xmm2, %xmm1 test %rax, %rax jnz L(Shl4LoopExit) - palignr $4, %xmm1, %xmm2 + palignr $4, %xmm3, %xmm2 movaps %xmm2, (%rdx) movaps 28(%rcx), %xmm2 - movaps %xmm3, %xmm1 pcmpeqd %xmm2, %xmm0 lea 16(%rdx), %rdx @@ -233,7 +233,6 @@ L(Shl4Start): jnz L(Shl4LoopExit) palignr $4, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 28(%rcx), %xmm2 @@ -245,8 +244,7 @@ L(Shl4Start): test %rax, %rax jnz L(Shl4LoopExit) - palignr $4, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $4, %xmm3, %xmm2 movaps %xmm2, (%rdx) lea 28(%rcx), %rcx lea 16(%rdx), %rdx @@ -259,6 +257,7 @@ L(Shl4Start): movaps -4(%rcx), %xmm1 + .p2align 4 L(Shl4LoopStart): movaps 12(%rcx), %xmm2 movaps 28(%rcx), %xmm3 @@ -289,11 +288,9 @@ L(Shl4LoopStart): jmp L(Shl4LoopStart) L(Shl4LoopExit): - movaps (%rdx), %xmm6 - psrldq $12, %xmm6 + movdqu -4(%rcx), %xmm1 mov $12, %rsi - palignr $4, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + movdqu %xmm1, -4(%rdx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -309,7 +306,6 @@ L(Shl8Start): jnz L(Shl8LoopExit) palignr $8, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 24(%rcx), %xmm2 @@ -317,15 +313,14 @@ L(Shl8Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 + movaps %xmm2, %xmm1 test %rax, %rax jnz L(Shl8LoopExit) - palignr $8, %xmm1, %xmm2 + palignr $8, %xmm3, %xmm2 movaps %xmm2, (%rdx) movaps 24(%rcx), %xmm2 - movaps %xmm3, %xmm1 pcmpeqd %xmm2, %xmm0 lea 16(%rdx), %rdx @@ -337,7 +332,6 @@ L(Shl8Start): jnz L(Shl8LoopExit) palignr $8, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 24(%rcx), %xmm2 @@ -345,13 +339,11 @@ L(Shl8Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 test %rax, %rax jnz L(Shl8LoopExit) - palignr $8, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $8, %xmm3, %xmm2 movaps %xmm2, (%rdx) lea 24(%rcx), %rcx lea 16(%rdx), %rdx @@ -364,6 +356,7 @@ L(Shl8Start): movaps -8(%rcx), %xmm1 + .p2align 4 L(Shl8LoopStart): movaps 8(%rcx), %xmm2 movaps 24(%rcx), %xmm3 @@ -394,11 +387,9 @@ L(Shl8LoopStart): jmp L(Shl8LoopStart) L(Shl8LoopExit): - movaps (%rdx), %xmm6 - psrldq $8, %xmm6 + mov (%rcx), %r9 mov $8, %rsi - palignr $8, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + mov %r9, (%rdx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -414,7 +405,6 @@ L(Shl12Start): jnz L(Shl12LoopExit) palignr $12, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 20(%rcx), %xmm2 @@ -422,15 +412,14 @@ L(Shl12Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 + movaps %xmm2, %xmm1 test %rax, %rax jnz L(Shl12LoopExit) - palignr $12, %xmm1, %xmm2 + palignr $12, %xmm3, %xmm2 movaps %xmm2, (%rdx) movaps 20(%rcx), %xmm2 - movaps %xmm3, %xmm1 pcmpeqd %xmm2, %xmm0 lea 16(%rdx), %rdx @@ -442,7 +431,6 @@ L(Shl12Start): jnz L(Shl12LoopExit) palignr $12, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 20(%rcx), %xmm2 @@ -450,13 +438,11 @@ L(Shl12Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 test %rax, %rax jnz L(Shl12LoopExit) - palignr $12, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $12, %xmm3, %xmm2 movaps %xmm2, (%rdx) lea 20(%rcx), %rcx lea 16(%rdx), %rdx @@ -469,6 +455,7 @@ L(Shl12Start): movaps -12(%rcx), %xmm1 + .p2align 4 L(Shl12LoopStart): movaps 4(%rcx), %xmm2 movaps 20(%rcx), %xmm3 @@ -498,11 +485,10 @@ L(Shl12LoopStart): jmp L(Shl12LoopStart) L(Shl12LoopExit): - movaps (%rdx), %xmm6 - psrldq $4, %xmm6 + mov (%rcx), %r9d mov $4, %rsi - palignr $12, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + mov %r9d, (%rdx) + jmp L(CopyFrom1To16Bytes) .p2align 4 L(CopyFrom1To16Bytes): @@ -556,8 +542,10 @@ L(Exit12): .p2align 4 L(Exit16): - movdqu (%rcx), %xmm0 - movdqu %xmm0, (%rdx) + mov (%rcx), %rax + mov %rax, (%rdx) + mov 8(%rcx), %rax + mov %rax, 8(%rdx) mov %rdi, %rax ret diff --git a/libc/version.h b/libc/version.h index a37c4c77d..1dac42529 100644 --- a/libc/version.h +++ b/libc/version.h @@ -1,4 +1,4 @@ /* This file just defines the current version number of libc. */ -#define RELEASE "development" -#define VERSION "2.14.90" +#define RELEASE "stable" +#define VERSION "2.15" diff --git a/ports/ChangeLog.m68k b/ports/ChangeLog.m68k index bd8631763..a234cfae6 100644 --- a/ports/ChangeLog.m68k +++ b/ports/ChangeLog.m68k @@ -1,3 +1,8 @@ +2011-12-23 Andreas Schwab <schwab@linux-m68k.org> + + * sysdeps/m68k/bits/byteswap.h (__bswap_constant_64): Protect long + long constant with __extension__. + 2011-12-04 Thorsten Glaser <tg@mirbsd.de> * sysdeps/unix/sysv/linux/m68k/syscall.S: Allow six arguments. diff --git a/ports/sysdeps/m68k/bits/byteswap.h b/ports/sysdeps/m68k/bits/byteswap.h index 4f31d95bb..5e08805c5 100644 --- a/ports/sysdeps/m68k/bits/byteswap.h +++ b/ports/sysdeps/m68k/bits/byteswap.h @@ -28,7 +28,7 @@ because GCC is smart enough to generate optimal assembler output, and this allows for better cse. */ #define __bswap_constant_16(x) \ - ((((x) >> 8) & 0xffu) | (((x) & 0xffu) << 8)) + ((((x) >> 8) & 0xffu) | (((x) & 0xffu) << 8)) static __inline unsigned short int __bswap_16 (unsigned short int __bsx) @@ -38,8 +38,8 @@ __bswap_16 (unsigned short int __bsx) /* Swap bytes in 32 bit value. */ #define __bswap_constant_32(x) \ - ((((x) & 0xff000000u) >> 24) | (((x) & 0x00ff0000u) >> 8) | \ - (((x) & 0x0000ff00u) << 8) | (((x) & 0x000000ffu) << 24)) + ((((x) & 0xff000000u) >> 24) | (((x) & 0x00ff0000u) >> 8) | \ + (((x) & 0x0000ff00u) << 8) | (((x) & 0x000000ffu) << 24)) #if !defined(__mcoldfire__) static __inline unsigned int @@ -64,14 +64,15 @@ __bswap_32 (unsigned int __bsx) #if defined __GNUC__ && __GNUC__ >= 2 /* Swap bytes in 64 bit value. */ # define __bswap_constant_64(x) \ - ((((x) & 0xff00000000000000ull) >> 56) \ - | (((x) & 0x00ff000000000000ull) >> 40) \ - | (((x) & 0x0000ff0000000000ull) >> 24) \ - | (((x) & 0x000000ff00000000ull) >> 8) \ - | (((x) & 0x00000000ff000000ull) << 8) \ - | (((x) & 0x0000000000ff0000ull) << 24) \ - | (((x) & 0x000000000000ff00ull) << 40) \ - | (((x) & 0x00000000000000ffull) << 56)) + __extension__ \ + ((((x) & 0xff00000000000000ull) >> 56) \ + | (((x) & 0x00ff000000000000ull) >> 40) \ + | (((x) & 0x0000ff0000000000ull) >> 24) \ + | (((x) & 0x000000ff00000000ull) >> 8) \ + | (((x) & 0x00000000ff000000ull) << 8) \ + | (((x) & 0x0000000000ff0000ull) << 24) \ + | (((x) & 0x000000000000ff00ull) << 40) \ + | (((x) & 0x00000000000000ffull) << 56)) /* Swap bytes in 64 bit value. */ static __inline unsigned long long |