summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAdhemerval Zanella <azanella@linux.vnet.ibm.com>2011-12-29 19:43:33 -0500
committerAdhemerval Zanella <azanella@linux.vnet.ibm.com>2011-12-29 19:43:33 -0500
commitb20eec6a2b7056d4a1737a92574cab6d15225796 (patch)
treefb4b8e1422fa11219e79370fc5718534fb5b313b
parenta960be3fad233bd721f64dfe62d3f0ee798d26e3 (diff)
downloadglibc-b20eec6a2b7056d4a1737a92574cab6d15225796.tar.gz
PowerPC - Optimization for str[n]casecmp functions
This patch provides throughput boost for the strcasecmp function (25% on ppc32 and 40% on ppc64) and strncasecmp (15% on both ppc32 and ppc64) for POWER7. The optimization is done by manually (strcasecmp) or automatically (strncasecmp) unrolling the test loop to avoid CPU stalls caused by a test followed by a load.
-rw-r--r--ChangeLog18
-rw-r--r--sysdeps/powerpc/Makefile2
-rw-r--r--sysdeps/powerpc/locale-defines.sym5
-rw-r--r--sysdeps/powerpc/powerpc32/power7/Makefile4
-rw-r--r--sysdeps/powerpc/powerpc32/power7/strcasecmp.S132
-rw-r--r--sysdeps/powerpc/powerpc32/power7/strcasecmp_l.S5
-rw-r--r--sysdeps/powerpc/powerpc64/power7/Makefile5
-rw-r--r--sysdeps/powerpc/powerpc64/power7/strcasecmp.S125
-rw-r--r--sysdeps/powerpc/powerpc64/power7/strcasecmp_l.S5
9 files changed, 301 insertions, 0 deletions
diff --git a/ChangeLog b/ChangeLog
index ce94df9271..c70747e20e 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,21 @@
+2011-11-16 Adhemerval Zanella <azanella@linux.vnet.ibm.com>
+
+ * sysdeps/powerpc/Makefile: Added locale-defines.sym generation.
+ * sysdeps/powerpc/locale-defines.sym: Locale definitions for strcasecmp
+ optimized code.
+ * sysdeps/powerpc/powerpc32/power7/Makefile: New file: added unroll-loop
+ option for strncasecmp/strncasecmp_l compilation.
+ * sysdeps/powerpc/powerpc32/power7/strcasecmp.S: New file: strcasecmp
+ optimization for PPC32.
+ * sysdeps/powerpc/powerpc32/power7/strcasecmp_l.S: New file: strcasecmp_l
+ optimization for PPC32.
+ * sysdeps/powerpc/powerpc64/power7/Makefile: Added unroll-loop option for
+ strncasecmp/strncasecmp_l compilation.
+ * sysdeps/powerpc/powerpc64/power7/strcasecmp.S: New file: strcasecmp
+ optimization for PPC64.
+ * sysdeps/powerpc/powerpc64/power7/strcasecmp_l.S: New file: strcasecmp_l
+ optimization for PPC64.
+
2011-11-18 Adhemerval Zanella <azanella@linux.vnet.ibm.com>
* math/libm-test.inc: Added more nerabyint tests.
diff --git a/sysdeps/powerpc/Makefile b/sysdeps/powerpc/Makefile
index e43ca704f0..23a9a16730 100644
--- a/sysdeps/powerpc/Makefile
+++ b/sysdeps/powerpc/Makefile
@@ -23,4 +23,6 @@ endif
ifeq ($(subdir),csu)
# get offset to rtld_global._dl_hwcap
gen-as-const-headers += rtld-global-offsets.sym
+# get offset to __locale_struct.__ctype_tolower
+gen-as-const-headers += locale-defines.sym
endif
diff --git a/sysdeps/powerpc/locale-defines.sym b/sysdeps/powerpc/locale-defines.sym
new file mode 100644
index 0000000000..af64b920a4
--- /dev/null
+++ b/sysdeps/powerpc/locale-defines.sym
@@ -0,0 +1,5 @@
+#include <locale/localeinfo.h>
+
+--
+
+LOCALE_CTYPE_TOLOWER offsetof (struct __locale_struct, __ctype_tolower)
diff --git a/sysdeps/powerpc/powerpc32/power7/Makefile b/sysdeps/powerpc/powerpc32/power7/Makefile
new file mode 100644
index 0000000000..5e8f4a28ba
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32/power7/Makefile
@@ -0,0 +1,4 @@
+ifeq ($(subdir),string)
+CFLAGS-strncase.c += -funroll-loops
+CFLAGS-strncase_l.c += -funroll-loops
+endif
diff --git a/sysdeps/powerpc/powerpc32/power7/strcasecmp.S b/sysdeps/powerpc/powerpc32/power7/strcasecmp.S
new file mode 100644
index 0000000000..5d84fce470
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32/power7/strcasecmp.S
@@ -0,0 +1,132 @@
+/* Optimized strcasecmp implementation for PowerPC32.
+ Copyright (C) 2011 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+#include <bp-sym.h>
+#include <bp-asm.h>
+#include <locale-defines.h>
+
+/* int [r3] strcasecmp (const char *s1 [r3], const char *s2 [r4] )
+
+ or if defined USE_IN_EXTENDED_LOCALE_MODEL:
+
+ int [r3] strcasecmp_l (const char *s1 [r3], const char *s2 [r4],
+ __locale_t loc [r5]) */
+
+#ifndef STRCMP
+# define __STRCMP __strcasecmp
+# define STRCMP strcasecmp
+#endif
+
+ENTRY (BP_SYM (__STRCMP))
+
+#define rRTN r3 /* Return value */
+#define rSTR1 r5 /* 1st string */
+#define rSTR2 r4 /* 2nd string */
+#define rLOCARG r5 /* 3rd argument: locale_t */
+#define rCHAR1 r6 /* Byte readed from 1st string */
+#define rCHAR2 r7 /* Byte readed from 2nd string */
+#define rADDR1 r8 /* Address of tolower(rCHAR1) */
+#define rADDR2 r12 /* Address of tolower(rCHAR2) */
+#define rLWR1 r8 /* Byte tolower(rCHAR1) */
+#define rLWR2 r12 /* Byte tolower(rCHAR2) */
+#define rTMP r0
+#define rGOT r9 /* Address of the Global Offset Table */
+#define rLOC r11 /* Default locale address */
+
+ cmpw cr7, r3, r4
+#ifndef USE_IN_EXTENDED_LOCALE_MODEL
+# ifdef SHARED
+ mflr rTMP
+ bcl 20,31,.L1
+.L1: mflr rGOT
+ addis rGOT, rGOT, _GLOBAL_OFFSET_TABLE_-.L1@ha
+ addi rGOT, rGOT, _GLOBAL_OFFSET_TABLE_-.L1@l
+ lwz rLOC, __libc_tsd_LOCALE@got@tprel(rGOT)
+ add rLOC, rLOC, __libc_tsd_LOCALE@tls
+ lwz rLOC, 0(rLOC)
+ mtlr rTMP
+# else
+ lis rTMP,_GLOBAL_OFFSET_TABLE_@ha
+ la rLOC,_GLOBAL_OFFSET_TABLE_@l(rTMP)
+ lwz rLOC, __libc_tsd_LOCALE@got@tprel(rGOT)
+ add rLOC, rLOC, __libc_tsd_LOCALE@tls
+ lwz rLOC, 0(rLOC)
+# endif /* SHARED */
+#else
+ mr rLOC, rLOCARG
+#endif
+ mr rSTR1, rRTN
+ lwz rLOC, LOCALE_CTYPE_TOLOWER(rLOC)
+ li rRTN, 0
+ beqlr cr7
+
+ /* Unrolling loop for POWER: loads are done with 'lbz' plus
+ offset and string descriptors are only updated in the end
+ of loop unrolling. */
+
+L(loop):
+ lbz rCHAR1, 0(rSTR1) /* Load char from s1 */
+ lbz rCHAR2, 0(rSTR2) /* Load char from s2 */
+ sldi rADDR1, rCHAR1, 2 /* Calculate address for tolower(*s1) */
+ sldi rADDR2, rCHAR2, 2 /* Calculate address for tolower(*s2) */
+ lwzx rLWR1, rLOC, rADDR1 /* Load tolower(*s1) */
+ lwzx rLWR2, rLOC, rADDR2 /* Load tolower(*s2) */
+ cmpwi cr7, rCHAR1, 0 /* *s1 == '\0' ? */
+ subf. r3, rLWR2, rLWR1
+ bnelr
+ beqlr cr7
+ lbz rCHAR1, 1(rSTR1)
+ lbz rCHAR2, 1(rSTR2)
+ sldi rADDR1, rCHAR1, 2
+ sldi rADDR2, rCHAR2, 2
+ lwzx rLWR1, rLOC, rADDR1
+ lwzx rLWR2, rLOC, rADDR2
+ cmpwi cr7, rCHAR1, 0
+ subf. r3, rLWR2, rLWR1
+ bnelr
+ beqlr cr7
+ lbz rCHAR1, 2(rSTR1)
+ lbz rCHAR2, 2(rSTR2)
+ sldi rADDR1, rCHAR1, 2
+ sldi rADDR2, rCHAR2, 2
+ lwzx rLWR1, rLOC, rADDR1
+ lwzx rLWR2, rLOC, rADDR2
+ cmpwi cr7, rCHAR1, 0
+ subf. r3, rLWR2, rLWR1
+ bnelr
+ beqlr cr7
+ lbz rCHAR1, 3(rSTR1)
+ lbz rCHAR2, 3(rSTR2)
+ /* Increment both string descriptors */
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+ sldi rADDR1, rCHAR1, 2
+ sldi rADDR2, rCHAR2, 2
+ lwzx rLWR1, rLOC, rADDR1
+ lwzx rLWR2, rLOC, rADDR2
+ cmpwi cr7, rCHAR1, 0
+ subf. r3, rLWR2, rLWR1
+ bnelr
+ bne cr7,L(loop)
+ blr
+END (BP_SYM (__STRCMP))
+
+weak_alias (BP_SYM (__STRCMP), BP_SYM (STRCMP))
+libc_hidden_builtin_def (__STRCMP)
diff --git a/sysdeps/powerpc/powerpc32/power7/strcasecmp_l.S b/sysdeps/powerpc/powerpc32/power7/strcasecmp_l.S
new file mode 100644
index 0000000000..c13c4ebcb8
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32/power7/strcasecmp_l.S
@@ -0,0 +1,5 @@
+#define USE_IN_EXTENDED_LOCALE_MODEL
+#define STRCMP strcasecmp_l
+#define __STRCMP __strcasecmp_l
+
+#include "strcasecmp.S"
diff --git a/sysdeps/powerpc/powerpc64/power7/Makefile b/sysdeps/powerpc/powerpc64/power7/Makefile
index b0f45205b9..40aacfa15a 100644
--- a/sysdeps/powerpc/powerpc64/power7/Makefile
+++ b/sysdeps/powerpc/powerpc64/power7/Makefile
@@ -3,3 +3,8 @@ ifeq ($(subdir),elf)
# optimization may require a TOC reference before relocations are resolved.
CFLAGS-rtld.c += -mno-vsx
endif
+
+ifeq ($(subdir),string)
+CFLAGS-strncase.c += -funroll-loops
+CFLAGS-strncase_l.c += -funroll-loops
+endif
diff --git a/sysdeps/powerpc/powerpc64/power7/strcasecmp.S b/sysdeps/powerpc/powerpc64/power7/strcasecmp.S
new file mode 100644
index 0000000000..1477b2e17e
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/strcasecmp.S
@@ -0,0 +1,125 @@
+/* Optimized strcasecmp implementation for PowerPC64.
+ Copyright (C) 2011 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+#include <bp-sym.h>
+#include <bp-asm.h>
+#include <locale-defines.h>
+
+/* int [r3] strcasecmp (const char *s1 [r3], const char *s2 [r4] )
+
+ or if defined USE_IN_EXTENDED_LOCALE_MODEL:
+
+ int [r3] strcasecmp_l (const char *s1 [r3], const char *s2 [r4],
+ __locale_t loc [r5]) */
+
+#ifndef STRCMP
+# define __STRCMP __strcasecmp
+# define STRCMP strcasecmp
+#endif
+
+ENTRY (BP_SYM (__STRCMP))
+ CALL_MCOUNT 2
+
+#define rRTN r3 /* Return value */
+#define rSTR1 r5 /* 1st string */
+#define rSTR2 r4 /* 2nd string */
+#define rLOCARG r5 /* 3rd argument: locale_t */
+#define rCHAR1 r6 /* Byte readed from 1st string */
+#define rCHAR2 r7 /* Byte readed from 2nd string */
+#define rADDR1 r8 /* Address of tolower(rCHAR1) */
+#define rADDR2 r12 /* Address of tolower(rCHAR2) */
+#define rLWR1 r8 /* Word tolower(rCHAR1) */
+#define rLWR2 r12 /* Word tolower(rCHAR2) */
+#define rTMP r9
+#define rLOC r11 /* Default locale address */
+
+ cmpd cr7, r3, r4
+#ifndef USE_IN_EXTENDED_LOCALE_MODEL
+ ld rTMP, __libc_tsd_LOCALE@got@tprel(r2)
+ add rLOC, rTMP, __libc_tsd_LOCALE@tls
+ ld rLOC, 0(rLOC)
+#else
+ mr rLOC, rLOCARG
+#endif
+ ld rLOC, LOCALE_CTYPE_TOLOWER(rLOC)
+ mr rSTR1, rRTN
+ li rRTN, 0
+ beqlr cr7
+
+
+ /* Unrolling loop for POWER: loads are done with 'lbz' plus
+ offset and string descriptors are only updated in the end
+ of loop unrolling. */
+
+ lbz rCHAR1, 0(rSTR1) /* Load char from s1 */
+ lbz rCHAR2, 0(rSTR2) /* Load char from s2 */
+L(loop):
+ cmpdi rCHAR1, 0 /* *s1 == '\0' ? */
+ sldi rADDR1, rCHAR1, 2 /* Calculate address for tolower(*s1) */
+ sldi rADDR2, rCHAR2, 2 /* Calculate address for tolower(*s2) */
+ lwzx rLWR1, rLOC, rADDR1 /* Load tolower(*s1) */
+ lwzx rLWR2, rLOC, rADDR2 /* Load tolower(*s2) */
+ cmpw cr1, rLWR1, rLWR2 /* r = tolower(*s1) == tolower(*s2) ? */
+ crorc 4*cr1+eq,eq,4*cr1+eq /* (*s1 != '\0') || (r == 1) */
+ beq cr1, L(done)
+ lbz rCHAR1, 1(rSTR1)
+ lbz rCHAR2, 1(rSTR2)
+ cmpdi rCHAR1, 0
+ sldi rADDR1, rCHAR1, 2
+ sldi rADDR2, rCHAR2, 2
+ lwzx rLWR1, rLOC, rADDR1
+ lwzx rLWR2, rLOC, rADDR2
+ cmpw cr1, rLWR1, rLWR2
+ crorc 4*cr1+eq,eq,4*cr1+eq
+ beq cr1, L(done)
+ lbz rCHAR1, 2(rSTR1)
+ lbz rCHAR2, 2(rSTR2)
+ cmpdi rCHAR1, 0
+ sldi rADDR1, rCHAR1, 2
+ sldi rADDR2, rCHAR2, 2
+ lwzx rLWR1, rLOC, rADDR1
+ lwzx rLWR2, rLOC, rADDR2
+ cmpw cr1, rLWR1, rLWR2
+ crorc 4*cr1+eq,eq,4*cr1+eq
+ beq cr1, L(done)
+ lbz rCHAR1, 3(rSTR1)
+ lbz rCHAR2, 3(rSTR2)
+ cmpdi rCHAR1, 0
+ /* Increment both string descriptors */
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+ sldi rADDR1, rCHAR1, 2
+ sldi rADDR2, rCHAR2, 2
+ lwzx rLWR1, rLOC, rADDR1
+ lwzx rLWR2, rLOC, rADDR2
+ cmpw cr1, rLWR1, rLWR2
+ crorc 4*cr1+eq,eq,4*cr1+eq
+ beq cr1,L(done)
+ lbz rCHAR1, 0(rSTR1) /* Load char from s1 */
+ lbz rCHAR2, 0(rSTR2) /* Load char from s2 */
+ b L(loop)
+L(done):
+ subf r0, rLWR2, rLWR1
+ extsw rRTN, r0
+ blr
+END (BP_SYM (__STRCMP))
+
+weak_alias (BP_SYM (__STRCMP), BP_SYM (STRCMP))
+libc_hidden_builtin_def (__STRCMP)
diff --git a/sysdeps/powerpc/powerpc64/power7/strcasecmp_l.S b/sysdeps/powerpc/powerpc64/power7/strcasecmp_l.S
new file mode 100644
index 0000000000..c13c4ebcb8
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/strcasecmp_l.S
@@ -0,0 +1,5 @@
+#define USE_IN_EXTENDED_LOCALE_MODEL
+#define STRCMP strcasecmp_l
+#define __STRCMP __strcasecmp_l
+
+#include "strcasecmp.S"