From 41e8926aa4b7f17bc95984737ee82a254ad0911c Mon Sep 17 00:00:00 2001
From: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date: Mon, 25 Nov 2013 06:12:30 -0600
Subject: PowerPC: Set/restore rounding mode only when needed

This patch improves the performance of some math functions by adding libc_fexxx
variants of the inline functions that handle both FPU rounding-mode and
exception set/restore, and by using them in the libc_fexxx_ctx functions. It is
based on the already-implemented fexxx family of functions for PPC with FPU.

Here is a summary of the performance improvements due to this patch (measured
on a POWER7 machine):

Before:

cos(): ITERS:9.5895e+07: TOTAL:5116.03Mcy, MAX:77.6cy, MIN:49.792cy, 18744 calls/Mcy
exp(): ITERS:2.827e+07: TOTAL:5187.15Mcy, MAX:494.018cy, MIN:38.422cy, 5450.01 calls/Mcy
pow(): ITERS:6.1705e+07: TOTAL:5144.26Mcy, MAX:171.95cy, MIN:29.935cy, 11994.9 calls/Mcy
sin(): ITERS:8.6898e+07: TOTAL:5117.06Mcy, MAX:83.841cy, MIN:46.582cy, 16982 calls/Mcy
tan(): ITERS:2.9473e+07: TOTAL:5115.39Mcy, MAX:191.017cy, MIN:172.352cy, 5761.63 calls/Mcy

After:

cos(): ITERS:2.05265e+08: TOTAL:5111.37Mcy, MAX:78.754cy, MIN:24.196cy, 40158.5 calls/Mcy
exp(): ITERS:3.341e+07: TOTAL:5170.84Mcy, MAX:476.317cy, MIN:15.574cy, 6461.23 calls/Mcy
pow(): ITERS:7.6153e+07: TOTAL:5129.1Mcy, MAX:147.5cy, MIN:30.916cy, 14847.2 calls/Mcy
sin(): ITERS:1.58816e+08: TOTAL:5115.11Mcy, MAX:1490.39cy, MIN:22.341cy, 31048.4 calls/Mcy
tan(): ITERS:3.4964e+07: TOTAL:5114.18Mcy, MAX:177.422cy, MIN:146.115cy, 6836.68 calls/Mcy
---
 sysdeps/ieee754/ldbl-128ibm/e_expl.c    | 31 +++++++++++++------------------
 sysdeps/ieee754/ldbl-128ibm/s_llrintl.c |  2 +-
 sysdeps/ieee754/ldbl-128ibm/s_lrintl.c  |  2 +-
 sysdeps/ieee754/ldbl-128ibm/s_rintl.c   |  2 +-
 4 files changed, 16 insertions(+), 21 deletions(-)

(limited to 'sysdeps/ieee754/ldbl-128ibm')

diff --git a/sysdeps/ieee754/ldbl-128ibm/e_expl.c b/sysdeps/ieee754/ldbl-128ibm/e_expl.c
index f7c50bfd3c..65ef18532d 100644
--- a/sysdeps/ieee754/ldbl-128ibm/e_expl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/e_expl.c
@@ -134,18 +134,17 @@ static const long double C[] = {
 long double
 __ieee754_expl (long double x)
 {
+  long double result, x22;
+  union ibm_extended_long_double ex2_u, scale_u;
+  int unsafe;
+
   /* Check for usual case.  */
   if (isless (x, himark) && isgreater (x, lomark))
     {
-      int tval1, tval2, unsafe, n_i, exponent2;
-      long double x22, n, result, xl;
-      union ibm_extended_long_double ex2_u, scale_u;
-      fenv_t oldenv;
-
-      feholdexcept (&oldenv);
-#ifdef FE_TONEAREST
-      fesetround (FE_TONEAREST);
-#endif
+      int tval1, tval2, n_i, exponent2;
+      long double n, xl;
+
+      SET_RESTORE_ROUND (FE_TONEAREST);
 
       n = __roundl (x*M_1_LN2);
       x = x-n*M_LN2_0;
@@ -201,11 +200,6 @@ __ieee754_expl (long double x)
 	 less than 4.8e-39.  */
       x22 = x + x*x*(P1+x*(P2+x*(P3+x*(P4+x*(P5+x*P6)))));
 
-      /* Return result.  */
-      fesetenv (&oldenv);
-
-      result = x22 * ex2_u.ld + ex2_u.ld;
-
       /* Now we can test whether the result is ultimate or if we are unsure.
 	 In the later case we should probably call a mpn based routine to give
 	 the ultimate result.
@@ -235,10 +229,6 @@ __ieee754_expl (long double x)
 	    return __ieee754_expl_proc2 (origx);
 	  }
        */
-      if (!unsafe)
-	return result;
-      else
-	return result * scale_u.ld;
     }
   /* Exceptional cases:  */
   else if (isless (x, himark))
@@ -253,5 +243,10 @@ __ieee754_expl (long double x)
   else
     /* Return x, if x is a NaN or Inf; or overflow, otherwise.  */
     return TWO1023*x;
+
+  result = x22 * ex2_u.ld + ex2_u.ld;
+  if (!unsafe)
+    return result;
+  return result * scale_u.ld;
 }
 strong_alias (__ieee754_expl, __expl_finite)
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_llrintl.c b/sysdeps/ieee754/ldbl-128ibm/s_llrintl.c
index 8560349631..35039737bf 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_llrintl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_llrintl.c
@@ -43,7 +43,7 @@ __llrintl (long double x)
 #endif
     )
     {
-      save_round = fegetround ();
+      save_round = __fegetround ();
 
       if (__builtin_expect ((xh == -(double) (-__LONG_LONG_MAX__ - 1)), 0))
 	{
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_lrintl.c b/sysdeps/ieee754/ldbl-128ibm/s_lrintl.c
index 588098d090..49dbd42f5b 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_lrintl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_lrintl.c
@@ -49,7 +49,7 @@ __lrintl (long double x)
 #endif
     )
     {
-      save_round = fegetround ();
+      save_round = __fegetround ();
 
 #if __LONG_MAX__ == 2147483647
       long long llhi = (long long) xh;
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_rintl.c b/sysdeps/ieee754/ldbl-128ibm/s_rintl.c
index 48dbe8569c..5fd6bb8702 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_rintl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_rintl.c
@@ -40,7 +40,7 @@ __rintl (long double x)
 					     __builtin_inf ()), 1))
     {
       double orig_xh;
-      int save_round = fegetround ();
+      int save_round = __fegetround ();
 
       /* Long double arithmetic, including the canonicalisation below,
 	 only works in round-to-nearest mode.  */
-- 
cgit v1.2.1