diff options
Diffstat (limited to 'sysdeps/ia64/fpu/s_expm1l.S')
-rw-r--r-- | sysdeps/ia64/fpu/s_expm1l.S | 1947 |
1 files changed, 1068 insertions, 879 deletions
diff --git a/sysdeps/ia64/fpu/s_expm1l.S b/sysdeps/ia64/fpu/s_expm1l.S index 5f135faf67..e53d3c8d7c 100644 --- a/sysdeps/ia64/fpu/s_expm1l.S +++ b/sysdeps/ia64/fpu/s_expm1l.S @@ -1,10 +1,10 @@ -.file "expl_m1.s" +.file "exp_m1l.s" - -// Copyright (c) 2000 - 2003, Intel Corporation +// Copyright (C) 2000, 2001, Intel Corporation // All rights reserved. -// -// Contributed 2000 by the Intel Numerics Group, Intel Corporation +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,7 +20,7 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. - +// // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -35,23 +35,15 @@ // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://www.intel.com/software/products/opensource/libraries/num.htm. +// http://developer.intel.com/opensource. // // History //============================================================== -// 02/02/00 Initial Version -// 04/04/00 Unwind support added -// 08/15/00 Bundle added after call to __libm_error_support to properly +// 4/04/00 Unwind support added +// 8/15/00 Bundle added after call to __libm_error_support to properly // set [the previously overwritten] GR_Parameter_RESULT. -// 07/07/01 Improved speed of all paths -// 05/20/02 Cleaned up namespace and sf0 syntax -// 02/10/03 Reordered header: .section, .global, .proc, .align; -// used data8 for long double table values -// 03/11/03 Improved accuracy and performance, corrected missing inexact flags -// 04/17/03 Eliminated misplaced and unused data label -// 12/15/03 Eliminated call to error support on expm1l underflow // -//********************************************************************* +// ********************************************************************* // // Function: Combined expl(x) and expm1l(x), where // x @@ -59,20 +51,20 @@ // x // expm1l(x) = e - 1 for double-extended precision x values // -//********************************************************************* +// ********************************************************************* // // Resources Used: // // Floating-Point Registers: f8 (Input and Return Value) -// f9-f15,f32-f77 +// f9,f32-f61, f99-f102 // // General Purpose Registers: -// r14-r38 -// r35-r38 (Used to pass arguments to error handling routine) +// r32-r61 +// r62-r65 (Used to pass arguments to error handling routine) // // Predicate Registers: p6-p15 // -//********************************************************************* +// ********************************************************************* // // IEEE Special Conditions: // @@ -82,37 +74,39 @@ // (Error Handling Routine called for overflow and Underflow) // Inexact raised when appropriate by algorithm // -// exp(inf) = inf -// exp(-inf) = +0 -// exp(SNaN) = QNaN -// exp(QNaN) = QNaN -// exp(0) = 1 -// exp(EM_special Values) = QNaN -// exp(inf) = inf -// expm1(-inf) = -1 -// expm1(SNaN) = QNaN -// expm1(QNaN) = QNaN -// expm1(0) = 0 -// expm1(EM_special Values) = QNaN +// expl(inf) = inf +// expl(-inf) = +0 +// expl(SNaN) = QNaN +// expl(QNaN) = QNaN +// expl(0) = 1 +// expl(EM_special Values) = QNaN +// expl(inf) = inf +// expm1l(-inf) = -1 +// expm1l(SNaN) = QNaN +// expm1l(QNaN) = QNaN +// expm1l(0) = 0 +// expm1l(EM_special Values) = QNaN // -//********************************************************************* +// ********************************************************************* // // Implementation and Algorithm Notes: // // ker_exp_64( in_FR : X, +// in_GR : Flag, +// in_GR : Expo_Range // out_FR : Y_hi, // out_FR : Y_lo, // out_FR : scale, // out_PR : Safe ) // -// On input, X is in register format -// p6 for exp, -// p7 for expm1, +// On input, X is in register format and +// Flag = 0 for exp, +// Flag = 1 for expm1, // -// On output, +// On output, provided X and X_cor are real numbers, then // -// scale*(Y_hi + Y_lo) approximates exp(X) if exp -// scale*(Y_hi + Y_lo) approximates exp(X)-1 if expm1 +// scale*(Y_hi + Y_lo) approximates expl(X) if Flag is 0 +// scale*(Y_hi + Y_lo) approximates expl(X)-1 if Flag is 1 // // The accuracy is sufficient for a highly accurate 64 sig. // bit implementation. Safe is set if there is no danger of @@ -128,36 +122,36 @@ // The method consists of three cases. // // If |X| < Tiny use case exp_tiny; -// else if |X| < 2^(-m) use case exp_small; m=12 for exp, m=7 for expm1 +// else if |X| < 2^(-6) use case exp_small; // else use case exp_regular; // // Case exp_tiny: // -// 1 + X can be used to approximate exp(X) -// X + X^2/2 can be used to approximate exp(X) - 1 +// 1 + X can be used to approximate expl(X) or expl(X+X_cor); +// X + X^2/2 can be used to approximate expl(X) - 1 // // Case exp_small: // -// Here, exp(X) and exp(X) - 1 can all be +// Here, expl(X), expl(X+X_cor), and expl(X) - 1 can all be // appproximated by a relatively simple polynomial. // // This polynomial resembles the truncated Taylor series // -// exp(w) = 1 + w + w^2/2! + w^3/3! + ... + w^n/n! +// expl(w) = 1 + w + w^2/2! + w^3/3! + ... + w^n/n! // // Case exp_regular: // // Here we use a table lookup method. The basic idea is that in -// order to compute exp(X), we accurately decompose X into +// order to compute expl(X), we accurately decompose X into // // X = N * log(2)/(2^12) + r, |r| <= log(2)/2^13. // // Hence // -// exp(X) = 2^( N / 2^12 ) * exp(r). +// expl(X) = 2^( N / 2^12 ) * expl(r). // // The value 2^( N / 2^12 ) is obtained by simple combinations -// of values calculated beforehand and stored in table; exp(r) +// of values calculated beforehand and stored in table; expl(r) // is approximated by a short polynomial because |r| is small. // // We elaborate this method in 4 steps. @@ -184,9 +178,13 @@ // as a double-precision number; L_lo has 64 significant bits and // stored as a double-extended number. // +// In the case Flag = 2, we further modify r by +// +// r := r + X_cor. +// // Step 2: Approximation // -// exp(r) - 1 is approximated by a short polynomial of the form +// expl(r) - 1 is approximated by a short polynomial of the form // // r + A_1 r^2 + A_2 r^3 + A_3 r^4 . // @@ -215,19 +213,19 @@ // Define two mathematical values, delta_1 and delta_2, implicitly // such that // -// T_1 = exp( [M_1 log(2)/2^6] - delta_1 ) -// T_2 = exp( [M_2 log(2)/2^12] - delta_2 ) +// T_1 = expl( [M_1 log(2)/2^6] - delta_1 ) +// T_2 = expl( [M_2 log(2)/2^12] - delta_2 ) // // are representable as 24 significant bits. To illustrate the idea, // we show how we define delta_1: // -// T_1 := round_to_24_bits( exp( M_1 log(2)/2^6 ) ) +// T_1 := round_to_24_bits( expl( M_1 log(2)/2^6 ) ) // delta_1 = (M_1 log(2)/2^6) - log( T_1 ) // // The last equality means mathematical equality. We then tabulate // -// W_1 := exp(delta_1) - 1 -// W_2 := exp(delta_2) - 1 +// W_1 := expl(delta_1) - 1 +// W_2 := expl(delta_2) - 1 // // Both in double precision. // @@ -237,13 +235,13 @@ // T := T_1 * T_2 ...exactly // W := W_1 + (1 + W_1)*W_2 // -// W approximates exp( delta ) - 1 where delta = delta_1 + delta_2. +// W approximates expl( delta ) - 1 where delta = delta_1 + delta_2. // The mathematical product of T and (W+1) is an accurate representation // of 2^(M_1/2^6) * 2^(M_2/2^12). // // Step 4. Reconstruction // -// Finally, we can reconstruct exp(X), exp(X) - 1. +// Finally, we can reconstruct expl(X), expl(X) - 1. // Because // // X = K * log(2) + (M_1*log(2)/2^6 - delta_1) @@ -251,18 +249,18 @@ // + delta_1 + delta_2 + r ...accurately // We have // -// exp(X) ~=~ 2^K * ( T + T*[exp(delta_1+delta_2+r) - 1] ) -// ~=~ 2^K * ( T + T*[exp(delta + r) - 1] ) -// ~=~ 2^K * ( T + T*[(exp(delta)-1) -// + exp(delta)*(exp(r)-1)] ) +// expl(X) ~=~ 2^K * ( T + T*[expl(delta_1+delta_2+r) - 1] ) +// ~=~ 2^K * ( T + T*[expl(delta + r) - 1] ) +// ~=~ 2^K * ( T + T*[(expl(delta)-1) +// + expl(delta)*(expl(r)-1)] ) // ~=~ 2^K * ( T + T*( W + (1+W)*poly(r) ) ) // ~=~ 2^K * ( Y_hi + Y_lo ) // // where Y_hi = T and Y_lo = T*(W + (1+W)*poly(r)) // -// For exp(X)-1, we have +// For expl(X)-1, we have // -// exp(X)-1 ~=~ 2^K * ( Y_hi + Y_lo ) - 1 +// expl(X)-1 ~=~ 2^K * ( Y_hi + Y_lo ) - 1 // ~=~ 2^K * ( Y_hi + Y_lo - 2^(-K) ) // // and we combine Y_hi + Y_lo - 2^(-N) into the form of two @@ -280,7 +278,7 @@ // different rounding directions and a correct setting of the SAFE // flag. // -// If expm1 is 1, then +// If Flag is 1, then // SAFE := False ...possibility of underflow // Scale := 1.0 // Y_hi := X @@ -298,25 +296,26 @@ // // Let r = X // -// If exp ...i.e. exp( argument ) +// If Flag is not 1 ...i.e. expl( argument ) // // rsq := r * r; // r4 := rsq*rsq // poly_lo := P_3 + r*(P_4 + r*(P_5 + r*P_6)) // poly_hi := r + rsq*(P_1 + r*P_2) // Y_lo := poly_hi + r4 * poly_lo +// set lsb(Y_lo) to 1 // Y_hi := 1.0 // Scale := 1.0 // -// Else ...i.e. exp( argument ) - 1 +// Else ...i.e. expl( argument ) - 1 // // rsq := r * r // r4 := rsq * rsq -// poly_lo := Q_7 + r*(Q_8 + r*Q_9)) -// poly_med:= Q_3 + r*Q_4 + rsq*(Q_5 + r*Q_6) -// poly_med:= poly_med + r4*poly_lo -// poly_hi := Q_1 + r*Q_2 -// Y_lo := rsq*(poly_hi + rsq*poly_lo) +// r6 := rsq * r4 +// poly_lo := r6*(Q_5 + r*(Q_6 + r*Q_7)) +// poly_hi := Q_1 + r*(Q_2 + r*(Q_3 + r*Q_4)) +// Y_lo := rsq*poly_hi + poly_lo +// set lsb(Y_lo) to 1 // Y_hi := X // Scale := 1.0 // @@ -326,14 +325,14 @@ // // The previous description contain enough information except the // computation of poly and the final Y_hi and Y_lo in the case for -// exp(X)-1. +// expl(X)-1. // // The computation of poly for Step 2: // // rsq := r*r // poly := r + rsq*(A_1 + r*(A_2 + r*A_3)) // -// For the case exp(X) - 1, we need to incorporate 2^(-K) into +// For the case expl(X) - 1, we need to incorporate 2^(-K) into // Y_hi and Y_lo at the end of Step 4. // // If K > 10 then @@ -347,197 +346,72 @@ // End If // End If // -//======================================================= -// General Purpose Registers -// -GR_ad_Arg = r14 -GR_ad_A = r15 -GR_sig_inv_ln2 = r15 -GR_rshf_2to51 = r16 -GR_ad_PQ = r16 -GR_ad_Q = r16 -GR_signexp_x = r17 -GR_exp_x = r17 -GR_small_exp = r18 -GR_rshf = r18 -GR_exp_mask = r19 -GR_ad_W1 = r20 -GR_exp_2tom51 = r20 -GR_ad_W2 = r21 -GR_exp_underflow = r21 -GR_M2 = r22 -GR_huge_exp = r22 -GR_M1 = r23 -GR_huge_signif = r23 -GR_K = r24 -GR_one = r24 -GR_minus_one = r24 -GR_exp_bias = r25 -GR_ad_Limits = r26 -GR_N_fix = r26 -GR_exp_2_mk = r26 -GR_ad_P = r27 -GR_exp_2_k = r27 -GR_big_expo_neg = r28 -GR_very_small_exp = r29 -GR_exp_half = r29 -GR_ad_T1 = r30 -GR_ad_T2 = r31 -GR_SAVE_PFS = r32 -GR_SAVE_B0 = r33 -GR_SAVE_GP = r34 -GR_Parameter_X = r35 -GR_Parameter_Y = r36 -GR_Parameter_RESULT = r37 -GR_Parameter_TAG = r38 +#include "libm_support.h" -// Floating Point Registers -// -FR_norm_x = f9 -FR_RSHF_2TO51 = f10 -FR_INV_LN2_2TO63 = f11 -FR_W_2TO51_RSH = f12 -FR_2TOM51 = f13 -FR_RSHF = f14 -FR_Y_hi = f34 -FR_Y_lo = f35 -FR_scale = f36 -FR_tmp = f37 -FR_float_N = f38 -FR_N_signif = f39 -FR_L_hi = f40 -FR_L_lo = f41 -FR_r = f42 -FR_W1 = f43 -FR_T1 = f44 -FR_W2 = f45 -FR_T2 = f46 -FR_W1_p1 = f47 -FR_rsq = f48 -FR_A2 = f49 -FR_r4 = f50 -FR_A3 = f51 -FR_poly = f52 -FR_T = f53 -FR_W = f54 -FR_Wp1 = f55 -FR_p21 = f59 -FR_p210 = f59 -FR_p65 = f60 -FR_p654 = f60 -FR_p6543 = f60 -FR_2_mk = f61 -FR_P4Q7 = f61 -FR_P4 = f61 -FR_Q7 = f61 -FR_P3Q6 = f62 -FR_P3 = f62 -FR_Q6 = f62 -FR_q65 = f62 -FR_q6543 = f62 -FR_P2Q5 = f63 -FR_P2 = f63 -FR_Q5 = f63 -FR_P1Q4 = f64 -FR_P1 = f64 -FR_Q4 = f64 -FR_q43 = f64 -FR_Q3 = f65 -FR_Q2 = f66 -FR_q21 = f66 -FR_Q1 = f67 -FR_A1 = f68 -FR_P6Q9 = f68 -FR_P6 = f68 -FR_Q9 = f68 -FR_P5Q8 = f69 -FR_P5 = f69 -FR_Q8 = f69 -FR_q987 = f69 -FR_q98 = f69 -FR_q9876543 = f69 -FR_min_oflow_x = f70 -FR_huge_exp = f70 -FR_zero_uflow_x = f71 -FR_huge_signif = f71 -FR_huge = f72 -FR_small = f72 -FR_half = f73 -FR_T_scale = f74 -FR_result_lo = f75 -FR_W_T_scale = f76 -FR_Wp1_T_scale = f77 -FR_ftz = f77 -FR_half_x = f77 -// - -FR_X = f9 -FR_Y = f0 -FR_RESULT = f15 - -// ************* DO NOT CHANGE ORDER OF THESE TABLES ******************** +#ifdef _LIBC +.rodata +#else +.data +#endif -// double-extended 1/ln(2) -// 3fff b8aa 3b29 5c17 f0bb be87fed0691d3e88 -// 3fff b8aa 3b29 5c17 f0bc -// For speed the significand will be loaded directly with a movl and setf.sig -// and the exponent will be bias+63 instead of bias+0. Thus subsequent -// computations need to scale appropriately. -// The constant 2^12/ln(2) is needed for the computation of N. This is also -// obtained by scaling the computations. -// -// Two shifting constants are loaded directly with movl and setf.d. -// 1. RSHF_2TO51 = 1.1000..00 * 2^(63-12) -// This constant is added to x*1/ln2 to shift the integer part of -// x*2^12/ln2 into the rightmost bits of the significand. -// The result of this fma is N_signif. -// 2. RSHF = 1.1000..00 * 2^(63) -// This constant is subtracted from N_signif * 2^(-51) to give -// the integer part of N, N_fix, as a floating-point number. -// The result of this fms is float_N. - -RODATA .align 64 -LOCAL_OBJECT_START(Constants_exp_64_Arg) -//data8 0xB8AA3B295C17F0BC,0x0000400B // Inv_L = 2^12/log(2) -data8 0xB17217F400000000,0x00003FF2 // L_hi = hi part log(2)/2^12 -data8 0xF473DE6AF278ECE6,0x00003FD4 // L_lo = lo part log(2)/2^12 -LOCAL_OBJECT_END(Constants_exp_64_Arg) +Constants_exp_64_Arg: +ASM_TYPE_DIRECTIVE(Constants_exp_64_Arg,@object) +data4 0x5C17F0BC,0xB8AA3B29,0x0000400B,0x00000000 +data4 0x00000000,0xB17217F4,0x00003FF2,0x00000000 +data4 0xF278ECE6,0xF473DE6A,0x00003FD4,0x00000000 +// /* Inv_L, L_hi, L_lo */ +ASM_SIZE_DIRECTIVE(Constants_exp_64_Arg) -LOCAL_OBJECT_START(Constants_exp_64_Limits) -data8 0xb17217f7d1cf79ac,0x0000400c // Smallest long dbl oflow x -data8 0xb220000000000000,0x0000c00c // Small long dbl uflow zero x -LOCAL_OBJECT_END(Constants_exp_64_Limits) +.align 64 +Constants_exp_64_Exponents: +ASM_TYPE_DIRECTIVE(Constants_exp_64_Exponents,@object) +data4 0x0000007E,0x00000000,0xFFFFFF83,0xFFFFFFFF +data4 0x000003FE,0x00000000,0xFFFFFC03,0xFFFFFFFF +data4 0x00003FFE,0x00000000,0xFFFFC003,0xFFFFFFFF +data4 0x00003FFE,0x00000000,0xFFFFC003,0xFFFFFFFF +data4 0xFFFFFFE2,0xFFFFFFFF,0xFFFFFFC4,0xFFFFFFFF +data4 0xFFFFFFBA,0xFFFFFFFF,0xFFFFFFBA,0xFFFFFFFF +ASM_SIZE_DIRECTIVE(Constants_exp_64_Exponents) -LOCAL_OBJECT_START(Constants_exp_64_A) -data8 0xAAAAAAABB1B736A0,0x00003FFA // A3 -data8 0xAAAAAAAB90CD6327,0x00003FFC // A2 -data8 0xFFFFFFFFFFFFFFFF,0x00003FFD // A1 -LOCAL_OBJECT_END(Constants_exp_64_A) +.align 64 +Constants_exp_64_A: +ASM_TYPE_DIRECTIVE(Constants_exp_64_A,@object) +data4 0xB1B736A0,0xAAAAAAAB,0x00003FFA,0x00000000 +data4 0x90CD6327,0xAAAAAAAB,0x00003FFC,0x00000000 +data4 0xFFFFFFFF,0xFFFFFFFF,0x00003FFD,0x00000000 +// /* Reversed */ +ASM_SIZE_DIRECTIVE(Constants_exp_64_A) -LOCAL_OBJECT_START(Constants_exp_64_P) -data8 0xD00D6C8143914A8A,0x00003FF2 // P6 -data8 0xB60BC4AC30304B30,0x00003FF5 // P5 -data8 0x888888887474C518,0x00003FF8 // P4 -data8 0xAAAAAAAA8DAE729D,0x00003FFA // P3 -data8 0xAAAAAAAAAAAAAF61,0x00003FFC // P2 -data8 0x80000000000004C7,0x00003FFE // P1 -LOCAL_OBJECT_END(Constants_exp_64_P) +.align 64 +Constants_exp_64_P: +ASM_TYPE_DIRECTIVE(Constants_exp_64_P,@object) +data4 0x43914A8A,0xD00D6C81,0x00003FF2,0x00000000 +data4 0x30304B30,0xB60BC4AC,0x00003FF5,0x00000000 +data4 0x7474C518,0x88888888,0x00003FF8,0x00000000 +data4 0x8DAE729D,0xAAAAAAAA,0x00003FFA,0x00000000 +data4 0xAAAAAF61,0xAAAAAAAA,0x00003FFC,0x00000000 +data4 0x000004C7,0x80000000,0x00003FFE,0x00000000 +// /* Reversed */ +ASM_SIZE_DIRECTIVE(Constants_exp_64_P) -LOCAL_OBJECT_START(Constants_exp_64_Q) -data8 0x93F2AC5F7471F32E, 0x00003FE9 // Q9 -data8 0xB8DA0F3550B3E764, 0x00003FEC // Q8 -data8 0xD00D00D0028E89C4, 0x00003FEF // Q7 -data8 0xD00D00DAEB8C4E91, 0x00003FF2 // Q6 -data8 0xB60B60B60B60B6F5, 0x00003FF5 // Q5 -data8 0x888888888886CC23, 0x00003FF8 // Q4 -data8 0xAAAAAAAAAAAAAAAB, 0x00003FFA // Q3 -data8 0xAAAAAAAAAAAAAAAB, 0x00003FFC // Q2 -data8 0x8000000000000000, 0x00003FFE // Q1 -LOCAL_OBJECT_END(Constants_exp_64_Q) +.align 64 +Constants_exp_64_Q: +ASM_TYPE_DIRECTIVE(Constants_exp_64_Q,@object) +data4 0xA49EF6CA,0xD00D56F7,0x00003FEF,0x00000000 +data4 0x1C63493D,0xD00D59AB,0x00003FF2,0x00000000 +data4 0xFB50CDD2,0xB60B60B5,0x00003FF5,0x00000000 +data4 0x7BA68DC8,0x88888888,0x00003FF8,0x00000000 +data4 0xAAAAAC8D,0xAAAAAAAA,0x00003FFA,0x00000000 +data4 0xAAAAACCA,0xAAAAAAAA,0x00003FFC,0x00000000 +data4 0x00000000,0x80000000,0x00003FFE,0x00000000 +// /* Reversed */ +ASM_SIZE_DIRECTIVE(Constants_exp_64_Q) -LOCAL_OBJECT_START(Constants_exp_64_T1) +.align 64 +Constants_exp_64_T1: +ASM_TYPE_DIRECTIVE(Constants_exp_64_T1,@object) data4 0x3F800000,0x3F8164D2,0x3F82CD87,0x3F843A29 data4 0x3F85AAC3,0x3F871F62,0x3F88980F,0x3F8A14D5 data4 0x3F8B95C2,0x3F8D1ADF,0x3F8EA43A,0x3F9031DC @@ -554,9 +428,11 @@ data4 0x3FD744FD,0x3FD99D16,0x3FDBFBB8,0x3FDE60F5 data4 0x3FE0CCDF,0x3FE33F89,0x3FE5B907,0x3FE8396A data4 0x3FEAC0C7,0x3FED4F30,0x3FEFE4BA,0x3FF28177 data4 0x3FF5257D,0x3FF7D0DF,0x3FFA83B3,0x3FFD3E0C -LOCAL_OBJECT_END(Constants_exp_64_T1) +ASM_SIZE_DIRECTIVE(Constants_exp_64_T1) -LOCAL_OBJECT_START(Constants_exp_64_T2) +.align 64 +Constants_exp_64_T2: +ASM_TYPE_DIRECTIVE(Constants_exp_64_T2,@object) data4 0x3F800000,0x3F80058C,0x3F800B18,0x3F8010A4 data4 0x3F801630,0x3F801BBD,0x3F80214A,0x3F8026D7 data4 0x3F802C64,0x3F8031F2,0x3F803780,0x3F803D0E @@ -573,812 +449,1124 @@ data4 0x3F810B41,0x3F8110D8,0x3F81166F,0x3F811C07 data4 0x3F81219F,0x3F812737,0x3F812CD0,0x3F813269 data4 0x3F813802,0x3F813D9B,0x3F814334,0x3F8148CE data4 0x3F814E68,0x3F815402,0x3F81599C,0x3F815F37 -LOCAL_OBJECT_END(Constants_exp_64_T2) +ASM_SIZE_DIRECTIVE(Constants_exp_64_T2) -LOCAL_OBJECT_START(Constants_exp_64_W1) -data8 0x0000000000000000, 0xBE384454171EC4B4 -data8 0xBE6947414AA72766, 0xBE5D32B6D42518F8 -data8 0x3E68D96D3A319149, 0xBE68F4DA62415F36 -data8 0xBE6DDA2FC9C86A3B, 0x3E6B2E50F49228FE -data8 0xBE49C0C21188B886, 0x3E64BFC21A4C2F1F -data8 0xBE6A2FBB2CB98B54, 0x3E5DC5DE9A55D329 -data8 0x3E69649039A7AACE, 0x3E54728B5C66DBA5 -data8 0xBE62B0DBBA1C7D7D, 0x3E576E0409F1AF5F -data8 0x3E6125001A0DD6A1, 0xBE66A419795FBDEF -data8 0xBE5CDE8CE1BD41FC, 0xBE621376EA54964F -data8 0x3E6370BE476E76EE, 0x3E390D1A3427EB92 -data8 0x3E1336DE2BF82BF8, 0xBE5FF1CBD0F7BD9E -data8 0xBE60A3550CEB09DD, 0xBE5CA37E0980F30D -data8 0xBE5C541B4C082D25, 0xBE5BBECA3B467D29 -data8 0xBE400D8AB9D946C5, 0xBE5E2A0807ED374A -data8 0xBE66CB28365C8B0A, 0x3E3AAD5BD3403BCA -data8 0x3E526055C7EA21E0, 0xBE442C75E72880D6 -data8 0x3E58B2BB85222A43, 0xBE5AAB79522C42BF -data8 0xBE605CB4469DC2BC, 0xBE589FA7A48C40DC -data8 0xBE51C2141AA42614, 0xBE48D087C37293F4 -data8 0x3E367A1CA2D673E0, 0xBE51BEBB114F7A38 -data8 0xBE6348E5661A4B48, 0xBDF526431D3B9962 -data8 0x3E3A3B5E35A78A53, 0xBE46C46C1CECD788 -data8 0xBE60B7EC7857D689, 0xBE594D3DD14F1AD7 -data8 0xBE4F9C304C9A8F60, 0xBE52187302DFF9D2 -data8 0xBE5E4C8855E6D68F, 0xBE62140F667F3DC4 -data8 0xBE36961B3BF88747, 0x3E602861C96EC6AA -data8 0xBE3B5151D57FD718, 0x3E561CD0FC4A627B -data8 0xBE3A5217CA913FEA, 0x3E40A3CC9A5D193A -data8 0xBE5AB71310A9C312, 0x3E4FDADBC5F57719 -data8 0x3E361428DBDF59D5, 0x3E5DB5DB61B4180D -data8 0xBE42AD5F7408D856, 0x3E2A314831B2B707 -LOCAL_OBJECT_END(Constants_exp_64_W1) +.align 64 +Constants_exp_64_W1: +ASM_TYPE_DIRECTIVE(Constants_exp_64_W1,@object) +data4 0x00000000,0x00000000,0x171EC4B4,0xBE384454 +data4 0x4AA72766,0xBE694741,0xD42518F8,0xBE5D32B6 +data4 0x3A319149,0x3E68D96D,0x62415F36,0xBE68F4DA +data4 0xC9C86A3B,0xBE6DDA2F,0xF49228FE,0x3E6B2E50 +data4 0x1188B886,0xBE49C0C2,0x1A4C2F1F,0x3E64BFC2 +data4 0x2CB98B54,0xBE6A2FBB,0x9A55D329,0x3E5DC5DE +data4 0x39A7AACE,0x3E696490,0x5C66DBA5,0x3E54728B +data4 0xBA1C7D7D,0xBE62B0DB,0x09F1AF5F,0x3E576E04 +data4 0x1A0DD6A1,0x3E612500,0x795FBDEF,0xBE66A419 +data4 0xE1BD41FC,0xBE5CDE8C,0xEA54964F,0xBE621376 +data4 0x476E76EE,0x3E6370BE,0x3427EB92,0x3E390D1A +data4 0x2BF82BF8,0x3E1336DE,0xD0F7BD9E,0xBE5FF1CB +data4 0x0CEB09DD,0xBE60A355,0x0980F30D,0xBE5CA37E +data4 0x4C082D25,0xBE5C541B,0x3B467D29,0xBE5BBECA +data4 0xB9D946C5,0xBE400D8A,0x07ED374A,0xBE5E2A08 +data4 0x365C8B0A,0xBE66CB28,0xD3403BCA,0x3E3AAD5B +data4 0xC7EA21E0,0x3E526055,0xE72880D6,0xBE442C75 +data4 0x85222A43,0x3E58B2BB,0x522C42BF,0xBE5AAB79 +data4 0x469DC2BC,0xBE605CB4,0xA48C40DC,0xBE589FA7 +data4 0x1AA42614,0xBE51C214,0xC37293F4,0xBE48D087 +data4 0xA2D673E0,0x3E367A1C,0x114F7A38,0xBE51BEBB +data4 0x661A4B48,0xBE6348E5,0x1D3B9962,0xBDF52643 +data4 0x35A78A53,0x3E3A3B5E,0x1CECD788,0xBE46C46C +data4 0x7857D689,0xBE60B7EC,0xD14F1AD7,0xBE594D3D +data4 0x4C9A8F60,0xBE4F9C30,0x02DFF9D2,0xBE521873 +data4 0x55E6D68F,0xBE5E4C88,0x667F3DC4,0xBE62140F +data4 0x3BF88747,0xBE36961B,0xC96EC6AA,0x3E602861 +data4 0xD57FD718,0xBE3B5151,0xFC4A627B,0x3E561CD0 +data4 0xCA913FEA,0xBE3A5217,0x9A5D193A,0x3E40A3CC +data4 0x10A9C312,0xBE5AB713,0xC5F57719,0x3E4FDADB +data4 0xDBDF59D5,0x3E361428,0x61B4180D,0x3E5DB5DB +data4 0x7408D856,0xBE42AD5F,0x31B2B707,0x3E2A3148 +ASM_SIZE_DIRECTIVE(Constants_exp_64_W1) -LOCAL_OBJECT_START(Constants_exp_64_W2) -data8 0x0000000000000000, 0xBE641F2537A3D7A2 -data8 0xBE68DD57AD028C40, 0xBE5C77D8F212B1B6 -data8 0x3E57878F1BA5B070, 0xBE55A36A2ECAE6FE -data8 0xBE620608569DFA3B, 0xBE53B50EA6D300A3 -data8 0x3E5B5EF2223F8F2C, 0xBE56A0D9D6DE0DF4 -data8 0xBE64EEF3EAE28F51, 0xBE5E5AE2367EA80B -data8 0x3E47CB1A5FCBC02D, 0xBE656BA09BDAFEB7 -data8 0x3E6E70C6805AFEE7, 0xBE6E0509A3415EBA -data8 0xBE56856B49BFF529, 0x3E66DD3300508651 -data8 0x3E51165FC114BC13, 0x3E53333DC453290F -data8 0x3E6A072B05539FDA, 0xBE47CD877C0A7696 -data8 0xBE668BF4EB05C6D9, 0xBE67C3E36AE86C93 -data8 0xBE533904D0B3E84B, 0x3E63E8D9556B53CE -data8 0x3E212C8963A98DC8, 0xBE33138F032A7A22 -data8 0x3E530FA9BC584008, 0xBE6ADF82CCB93C97 -data8 0x3E5F91138370EA39, 0x3E5443A4FB6A05D8 -data8 0x3E63DACD181FEE7A, 0xBE62B29DF0F67DEC -data8 0x3E65C4833DDE6307, 0x3E5BF030D40A24C1 -data8 0x3E658B8F14E437BE, 0xBE631C29ED98B6C7 -data8 0x3E6335D204CF7C71, 0x3E529EEDE954A79D -data8 0x3E5D9257F64A2FB8, 0xBE6BED1B854ED06C -data8 0x3E5096F6D71405CB, 0xBE3D4893ACB9FDF5 -data8 0xBDFEB15801B68349, 0x3E628D35C6A463B9 -data8 0xBE559725ADE45917, 0xBE68C29C042FC476 -data8 0xBE67593B01E511FA, 0xBE4A4313398801ED -data8 0x3E699571DA7C3300, 0x3E5349BE08062A9E -data8 0x3E5229C4755BB28E, 0x3E67E42677A1F80D -data8 0xBE52B33F6B69C352, 0xBE6B3550084DA57F -data8 0xBE6DB03FD1D09A20, 0xBE60CBC42161B2C1 -data8 0x3E56ED9C78A2B771, 0xBE508E319D0FA795 -data8 0xBE59482AFD1A54E9, 0xBE2A17CEB07FD23E -data8 0x3E68BF5C17365712, 0x3E3956F9B3785569 -LOCAL_OBJECT_END(Constants_exp_64_W2) +.align 64 +Constants_exp_64_W2: +ASM_TYPE_DIRECTIVE(Constants_exp_64_W2,@object) +data4 0x00000000,0x00000000,0x37A3D7A2,0xBE641F25 +data4 0xAD028C40,0xBE68DD57,0xF212B1B6,0xBE5C77D8 +data4 0x1BA5B070,0x3E57878F,0x2ECAE6FE,0xBE55A36A +data4 0x569DFA3B,0xBE620608,0xA6D300A3,0xBE53B50E +data4 0x223F8F2C,0x3E5B5EF2,0xD6DE0DF4,0xBE56A0D9 +data4 0xEAE28F51,0xBE64EEF3,0x367EA80B,0xBE5E5AE2 +data4 0x5FCBC02D,0x3E47CB1A,0x9BDAFEB7,0xBE656BA0 +data4 0x805AFEE7,0x3E6E70C6,0xA3415EBA,0xBE6E0509 +data4 0x49BFF529,0xBE56856B,0x00508651,0x3E66DD33 +data4 0xC114BC13,0x3E51165F,0xC453290F,0x3E53333D +data4 0x05539FDA,0x3E6A072B,0x7C0A7696,0xBE47CD87 +data4 0xEB05C6D9,0xBE668BF4,0x6AE86C93,0xBE67C3E3 +data4 0xD0B3E84B,0xBE533904,0x556B53CE,0x3E63E8D9 +data4 0x63A98DC8,0x3E212C89,0x032A7A22,0xBE33138F +data4 0xBC584008,0x3E530FA9,0xCCB93C97,0xBE6ADF82 +data4 0x8370EA39,0x3E5F9113,0xFB6A05D8,0x3E5443A4 +data4 0x181FEE7A,0x3E63DACD,0xF0F67DEC,0xBE62B29D +data4 0x3DDE6307,0x3E65C483,0xD40A24C1,0x3E5BF030 +data4 0x14E437BE,0x3E658B8F,0xED98B6C7,0xBE631C29 +data4 0x04CF7C71,0x3E6335D2,0xE954A79D,0x3E529EED +data4 0xF64A2FB8,0x3E5D9257,0x854ED06C,0xBE6BED1B +data4 0xD71405CB,0x3E5096F6,0xACB9FDF5,0xBE3D4893 +data4 0x01B68349,0xBDFEB158,0xC6A463B9,0x3E628D35 +data4 0xADE45917,0xBE559725,0x042FC476,0xBE68C29C +data4 0x01E511FA,0xBE67593B,0x398801ED,0xBE4A4313 +data4 0xDA7C3300,0x3E699571,0x08062A9E,0x3E5349BE +data4 0x755BB28E,0x3E5229C4,0x77A1F80D,0x3E67E426 +data4 0x6B69C352,0xBE52B33F,0x084DA57F,0xBE6B3550 +data4 0xD1D09A20,0xBE6DB03F,0x2161B2C1,0xBE60CBC4 +data4 0x78A2B771,0x3E56ED9C,0x9D0FA795,0xBE508E31 +data4 0xFD1A54E9,0xBE59482A,0xB07FD23E,0xBE2A17CE +data4 0x17365712,0x3E68BF5C,0xB3785569,0x3E3956F9 +ASM_SIZE_DIRECTIVE(Constants_exp_64_W2) + +GR_SAVE_PFS = r59 +GR_SAVE_B0 = r60 +GR_SAVE_GP = r61 +GR_Parameter_X = r62 +GR_Parameter_Y = r63 +GR_Parameter_RESULT = r64 +GR_Parameter_TAG = r65 +FR_X = f9 +FR_Y = f9 +FR_RESULT = f99 .section .text - -GLOBAL_IEEE754_ENTRY(expm1l) - -// -// Set p7 true for expm1, p6 false -// - -{ .mlx - getf.exp GR_signexp_x = f8 // Get sign and exponent of x, redo if unorm - movl GR_sig_inv_ln2 = 0xb8aa3b295c17f0bc // significand of 1/ln2 +.proc expm1l# +.global expm1l# +.align 64 +expm1l: +#ifdef _LIBC +.global __expm1l# +__expm1l: +#endif +{ .mii +alloc r32 = ar.pfs,0,30,4,0 +(p0) add r33 = 1, r0 +(p0) cmp.eq.unc p7, p0 = r0, r0 } -{ .mlx - addl GR_ad_Arg = @ltoff(Constants_exp_64_Arg#),gp - movl GR_rshf_2to51 = 0x4718000000000000 // 1.10000 2^(63+51) +{ .mbb + nop.m 999 +(p0) br.cond.sptk exp_continue + nop.b 999 ;; } -;; -{ .mfi - ld8 GR_ad_Arg = [GR_ad_Arg] // Point to Arg table - fclass.m p8, p0 = f8, 0x1E7 // Test x for natval, nan, inf, zero - cmp.eq p7, p6 = r0, r0 -} -{ .mfb - mov GR_exp_half = 0x0FFFE // Exponent of 0.5, for very small path - fnorm.s1 FR_norm_x = f8 // Normalize x - br.cond.sptk exp_continue -} -;; +// +// Set p7 true for expm1 +// Set Flag = r33 = 1 for expm1 +// -GLOBAL_IEEE754_END(expm1l) +.endp expm1l +ASM_SIZE_DIRECTIVE(expm1l) +#ifdef _LIBC +libm_hidden_def (__expm1l) +#endif -GLOBAL_IEEE754_ENTRY(expl) -// -// Set p7 false for exp, p6 true -// -{ .mlx - getf.exp GR_signexp_x = f8 // Get sign and exponent of x, redo if unorm - movl GR_sig_inv_ln2 = 0xb8aa3b295c17f0bc // significand of 1/ln2 -} -{ .mlx - addl GR_ad_Arg = @ltoff(Constants_exp_64_Arg#),gp - movl GR_rshf_2to51 = 0x4718000000000000 // 1.10000 2^(63+51) +.section .text +.proc expl# +.global expl# +.align 64 +expl: +#ifdef _LIBC +.global __ieee754_expl# +__ieee754_expl: +#endif +{ .mii +alloc r32 = ar.pfs,0,30,4,0 +(p0) add r33 = r0, r0 +(p0) cmp.eq.unc p0, p7 = r0, r0 ;; } -;; - +exp_continue: { .mfi - ld8 GR_ad_Arg = [GR_ad_Arg] // Point to Arg table - fclass.m p8, p0 = f8, 0x1E7 // Test x for natval, nan, inf, zero - cmp.eq p6, p7 = r0, r0 +(p0) add r32 = 2,r0 +(p0) fnorm.s1 f9 = f8 + nop.i 0 } { .mfi - mov GR_exp_half = 0x0FFFE // Exponent of 0.5, for very small path - fnorm.s1 FR_norm_x = f8 // Normalize x - nop.i 999 +(p0) nop.m 0 +// +// Set p7 false for exp +// Set Flag = r33 = 0 for exp +// +(p0) fclass.m.unc p6, p8 = f8, 0x1E7 + nop.i 0;; } -;; - -exp_continue: -// Form two constants we need -// 1/ln2 * 2^63 to compute w = x * 1/ln2 * 128 -// 1.1000..000 * 2^(63+63-12) to right shift int(N) into the significand - { .mfi - setf.sig FR_INV_LN2_2TO63 = GR_sig_inv_ln2 // form 1/ln2 * 2^63 - fclass.nm.unc p9, p0 = f8, 0x1FF // Test x for unsupported - mov GR_exp_2tom51 = 0xffff-51 -} -{ .mlx - setf.d FR_RSHF_2TO51 = GR_rshf_2to51 // Form const 1.1000 * 2^(63+51) - movl GR_rshf = 0x43e8000000000000 // 1.10000 2^63 for right shift + nop.m 999 +(p0) fclass.nm.unc p9, p0 = f8, 0x1FF + nop.i 0 } -;; - { .mfi - setf.exp FR_half = GR_exp_half // Form 0.5 for very small path - fma.s1 FR_scale = f1,f1,f0 // Scale = 1.0 - mov GR_exp_bias = 0x0FFFF // Set exponent bias + nop.m 999 +(p0) mov f36 = f1 + nop.i 999 ;; } -{ .mib - add GR_ad_Limits = 0x20, GR_ad_Arg // Point to Limits table - mov GR_exp_mask = 0x1FFFF // Form exponent mask -(p8) br.cond.spnt EXP_64_SPECIAL // Branch if natval, nan, inf, zero -} -;; - -{ .mfi - setf.exp FR_2TOM51 = GR_exp_2tom51 // Form 2^-51 for scaling float_N - nop.f 999 - add GR_ad_A = 0x40, GR_ad_Arg // Point to A table +{ .mfb + nop.m 999 +// +// Identify NatVals, NaNs, Infs, and Zeros. +// Identify EM unsupporteds. +// Save special input registers +(p0) mov f32 = f0 +// +// Create FR_X_cor = 0.0 +// GR_Flag = 0 +// GR_Expo_Range = 2 (r32) for double-extended precision +// FR_Scale = 1.0 +// +(p6) br.cond.spnt EXPL_64_SPECIAL ;; } { .mib - setf.d FR_RSHF = GR_rshf // Form right shift const 1.1000 * 2^63 - add GR_ad_T1 = 0x160, GR_ad_Arg // Point to T1 table -(p9) br.cond.spnt EXP_64_UNSUPPORTED // Branch if unsupported + nop.m 999 + nop.i 999 +(p9) br.cond.spnt EXPL_64_UNSUPPORTED ;; } -;; - -.pred.rel "mutex",p6,p7 { .mfi - ldfe FR_L_hi = [GR_ad_Arg],16 // Get L_hi - fcmp.eq.s0 p9,p0 = f8, f0 // Dummy op to flag denormals -(p6) add GR_ad_PQ = 0x30, GR_ad_A // Point to P table for exp +(p0) cmp.ne.unc p12, p13 = 0x01, r33 +// +// Branch out for special input values +// +(p0) fcmp.lt.unc.s0 p9,p0 = f8, f0 +(p0) cmp.eq.unc p15, p0 = r0, r0 } -{ .mfi - ldfe FR_min_oflow_x = [GR_ad_Limits],16 // Get min x to cause overflow - fmpy.s1 FR_rsq = f8, f8 // rsq = x * x for small path -(p7) add GR_ad_PQ = 0x90, GR_ad_A // Point to Q table for expm1 +{ .mmi + nop.m 999 +// +// Raise possible denormal operand exception +// Normalize x +// +// This function computes expl( x + x_cor) +// Input FR 1: FR_X +// Input FR 2: FR_X_cor +// Input GR 1: GR_Flag +// Input GR 2: GR_Expo_Range +// Output FR 3: FR_Y_hi +// Output FR 4: FR_Y_lo +// Output FR 5: FR_Scale +// Output PR 1: PR_Safe +(p0) addl r34 = @ltoff(Constants_exp_64_Arg#),gp +(p0) addl r40 = @ltoff(Constants_exp_64_W1#),gp };; +// +// Prepare to load constants +// Set Safe = True +// { .mmi - ldfe FR_L_lo = [GR_ad_Arg],16 // Get L_lo - ldfe FR_zero_uflow_x = [GR_ad_Limits],16 // Get x for zero uflow result - add GR_ad_W1 = 0x200, GR_ad_T1 // Point to W1 table -} -;; + ld8 r34 = [r34] + ld8 r40 = [r40] +(p0) addl r41 = @ltoff(Constants_exp_64_W2#),gp +};; -{ .mfi - ldfe FR_P6Q9 = [GR_ad_PQ],16 // P6(exp) or Q9(expm1) for small path - mov FR_r = FR_norm_x // r = X for small path - mov GR_very_small_exp = -60 // Exponent of x for very small path -} -{ .mfi - add GR_ad_W2 = 0x400, GR_ad_T1 // Point to W2 table - nop.f 999 -(p7) mov GR_small_exp = -7 // Exponent of x for small path expm1 +{ .mmi +(p0) ldfe f37 = [r34],16 +(p0) ld8 r41 = [r41] ;; } -;; +// +// N = fcvt.fx(float_N) +// Set p14 if -6 > expo_X +// +// +// Bias = 0x0FFFF +// expo_X = expo_X and Mask +// + +{ .mmi +(p0) ldfe f40 = [r34],16 + nop.m 999 +// +// Load L_lo +// Set p10 if 14 < expo_X +// +(p0) addl r50 = @ltoff(Constants_exp_64_T1#),gp +} { .mmi - ldfe FR_P5Q8 = [GR_ad_PQ],16 // P5(exp) or Q8(expm1) for small path - and GR_exp_x = GR_signexp_x, GR_exp_mask -(p6) mov GR_small_exp = -12 // Exponent of x for small path exp + nop.m 999 + nop.m 999 +(p0) addl r51 = @ltoff(Constants_exp_64_T2#),gp ;; } -;; +// +// Load W2_ptr +// Branch to SMALL is expo_X < -6 +// -// N_signif = X * Inv_log2_by_2^12 -// By adding 1.10...0*2^63 we shift and get round_int(N_signif) in significand. -// We actually add 1.10...0*2^51 to X * Inv_log2 to do the same thing. -{ .mfi - ldfe FR_P4Q7 = [GR_ad_PQ],16 // P4(exp) or Q7(expm1) for small path - fma.s1 FR_N_signif = FR_norm_x, FR_INV_LN2_2TO63, FR_RSHF_2TO51 - nop.i 999 +{.mmi +(p0) ld8 r50 = [r50] +(p0) ld8 r51 = [r51] +};; + +{ .mlx +(p0) ldfe f41 = [r34],16 +// +// float_N = X * L_Inv +// expo_X = exponent of X +// Mask = 0x1FFFF +// +(p0) movl r58 = 0x0FFFF } -{ .mfi - sub GR_exp_x = GR_exp_x, GR_exp_bias // Get exponent - fmpy.s1 FR_r4 = FR_rsq, FR_rsq // Form r4 for small path - cmp.eq.unc p15, p0 = r0, r0 // Set Safe as default +{ .mlx + nop.m 999 +(p0) movl r39 = 0x1FFFF ;; } -;; - { .mmi - ldfe FR_P3Q6 = [GR_ad_PQ],16 // P3(exp) or Q6(expm1) for small path - cmp.lt p14, p0 = GR_exp_x, GR_very_small_exp // Is |x| < 2^-60? - nop.i 999 +(p0) getf.exp r37 = f9 + nop.m 999 +(p0) addl r34 = @ltoff(Constants_exp_64_Exponents#),gp ;; } -;; - -{ .mfi - ldfe FR_P2Q5 = [GR_ad_PQ],16 // P2(exp) or Q5(expm1) for small path - fmpy.s1 FR_half_x = FR_half, FR_norm_x // 0.5 * x for very small path - cmp.lt p13, p0 = GR_exp_x, GR_small_exp // Is |x| < 2^-m? +{ .mii +(p0) ld8 r34 = [r34] + nop.i 999 +(p0) and r37 = r37, r39 ;; } -{ .mib - nop.m 999 - nop.i 999 -(p14) br.cond.spnt EXP_VERY_SMALL // Branch if |x| < 2^-60 +{ .mmi +(p0) sub r37 = r37, r58 ;; +(p0) cmp.gt.unc p14, p0 = -6, r37 +(p0) cmp.lt.unc p10, p0 = 14, r37 ;; } -;; - { .mfi - ldfe FR_A3 = [GR_ad_A],16 // Get A3 for normal path - fcmp.ge.s1 p10,p0 = FR_norm_x, FR_min_oflow_x // Will result overflow? - mov GR_big_expo_neg = -16381 // -0x3ffd +(p0) nop.m 0 +// +// Load L_inv +// Set p12 true for Flag = 0 (exp) +// Set p13 true for Flag = 1 (expm1) +// +(p0) fmpy.s1 f38 = f9, f37 + nop.i 999 ;; } { .mfb - ldfe FR_P1Q4 = [GR_ad_PQ],16 // P1(exp) or Q4(expm1) for small path - nop.f 999 -(p13) br.cond.spnt EXP_SMALL // Branch if |x| < 2^-m - // m=12 for exp, m=7 for expm1 -} -;; - -// Now we are on the main path for |x| >= 2^-m, m=12 for exp, m=7 for expm1 + nop.m 999 // -// float_N = round_int(N_signif) -// The signficand of N_signif contains the rounded integer part of X * 2^12/ln2, -// as a twos complement number in the lower bits (that is, it may be negative). -// That twos complement number (called N) is put into GR_N. - -// Since N_signif is scaled by 2^51, it must be multiplied by 2^-51 -// before the shift constant 1.10000 * 2^63 is subtracted to yield float_N. -// Thus, float_N contains the floating point version of N - - -{ .mfi - ldfe FR_A2 = [GR_ad_A],16 // Get A2 for main path - fcmp.lt.s1 p11,p0 = FR_norm_x, FR_zero_uflow_x // Certain zero, uflow? - add GR_ad_T2 = 0x100, GR_ad_T1 // Point to T2 table +// Load L_hi +// expo_X = expo_X - Bias +// get W1_ptr +// +(p0) fcvt.fx.s1 f39 = f38 +(p14) br.cond.spnt EXPL_SMALL ;; } -{ .mfi +{ .mib + nop.m 999 + nop.i 999 +(p10) br.cond.spnt EXPL_HUGE ;; +} +{ .mmi +(p0) shladd r34 = r32,4,r34 nop.m 999 - fms.s1 FR_float_N = FR_N_signif, FR_2TOM51, FR_RSHF // Form float_N - nop.i 999 +(p0) addl r35 = @ltoff(Constants_exp_64_A#),gp ;; } -;; - -{ .mbb - getf.sig GR_N_fix = FR_N_signif // Get N from significand -(p10) br.cond.spnt EXP_OVERFLOW // Branch if result will overflow -(p11) br.cond.spnt EXP_CERTAIN_UNDERFLOW_ZERO // Branch if certain zero, uflow +// +// Load T_1,T_2 +// +{ .mmi + nop.m 999 + ld8 r35 =[r35] + nop.i 99 +};; +{ .mmb +(p0) ldfe f51 = [r35],16 +(p0) ld8 r45 = [r34],8 + nop.b 999 ;; } -;; - -{ .mfi - ldfe FR_A1 = [GR_ad_A],16 // Get A1 for main path - fnma.s1 FR_r = FR_L_hi, FR_float_N, FR_norm_x // r = -L_hi * float_N + x - extr.u GR_M1 = GR_N_fix, 6, 6 // Extract index M_1 +// +// Set Safe = True if k >= big_expo_neg +// Set Safe = False if k < big_expo_neg +// +{ .mmb +(p0) ldfe f49 = [r35],16 +(p0) ld8 r48 = [r34],0 + nop.b 999 ;; } { .mfi - and GR_M2 = 0x3f, GR_N_fix // Extract index M_2 - nop.f 999 - nop.i 999 + nop.m 999 +// +// Branch to HUGE is expo_X > 14 +// +(p0) fcvt.xf f38 = f39 + nop.i 999 ;; } -;; - -// N_fix is only correct up to 50 bits because of our right shift technique. -// Actually in the normal path we will have restricted K to about 14 bits. -// Somewhat arbitrarily we extract 32 bits. { .mfi - shladd GR_ad_W1 = GR_M1,3,GR_ad_W1 // Point to W1 - nop.f 999 - extr GR_K = GR_N_fix, 12, 32 // Extract limited range K +(p0) getf.sig r52 = f39 + nop.f 999 + nop.i 999 ;; +} +{ .mii + nop.m 999 +(p0) extr.u r43 = r52, 6, 6 ;; +// +// r = r - float_N * L_lo +// K = extr(N_fix,12,52) +// +(p0) shladd r40 = r43,3,r40 ;; } { .mfi - shladd GR_ad_T1 = GR_M1,2,GR_ad_T1 // Point to T1 - nop.f 999 - shladd GR_ad_T2 = GR_M2,2,GR_ad_T2 // Point to T2 +(p0) shladd r50 = r43,2,r50 +(p0) fnma.s1 f42 = f40, f38, f9 +// +// float_N = float(N) +// N_fix = signficand N +// +(p0) extr.u r42 = r52, 0, 6 } -;; - { .mmi - ldfs FR_T1 = [GR_ad_T1],0 // Get T1 - ldfd FR_W1 = [GR_ad_W1],0 // Get W1 - add GR_exp_2_k = GR_exp_bias, GR_K // Form exponent of 2^k +(p0) ldfd f43 = [r40],0 ;; +(p0) shladd r41 = r42,3,r41 +(p0) shladd r51 = r42,2,r51 +} +// +// W_1_p1 = 1 + W_1 +// +{ .mmi +(p0) ldfs f44 = [r50],0 ;; +(p0) ldfd f45 = [r41],0 +// +// M_2 = extr(N_fix,0,6) +// M_1 = extr(N_fix,6,6) +// r = X - float_N * L_hi +// +(p0) extr r44 = r52, 12, 52 } -;; - { .mmi - ldfs FR_T2 = [GR_ad_T2],0 // Get T2 - shladd GR_ad_W2 = GR_M2,3,GR_ad_W2 // Point to W2 - sub GR_exp_2_mk = GR_exp_bias, GR_K // Form exponent of 2^-k +(p0) ldfs f46 = [r51],0 ;; +(p0) sub r46 = r58, r44 +(p0) cmp.gt.unc p8, p15 = r44, r45 +} +// +// W = W_1 + W_1_p1*W_2 +// Load A_2 +// Bias_m_K = Bias - K +// +{ .mii +(p0) ldfe f40 = [r35],16 +// +// load A_1 +// poly = A_2 + r*A_3 +// rsq = r * r +// neg_2_mK = exponent of Bias_m_k +// +(p0) add r47 = r58, r44 ;; +// +// Set Safe = True if k <= big_expo_pos +// Set Safe = False if k > big_expo_pos +// Load A_3 +// +(p15) cmp.lt p8,p15 = r44,r48 ;; } -;; - { .mmf - ldfd FR_W2 = [GR_ad_W2],0 // Get W2 - setf.exp FR_scale = GR_exp_2_k // Set scale = 2^k - fnma.s1 FR_r = FR_L_lo, FR_float_N, FR_r // r = -L_lo * float_N + r +(p0) setf.exp f61 = r46 +// +// Bias_p + K = Bias + K +// T = T_1 * T_2 +// +(p0) setf.exp f36 = r47 +(p0) fnma.s1 f42 = f41, f38, f42 ;; } -;; - { .mfi - setf.exp FR_2_mk = GR_exp_2_mk // Form 2^-k - fma.s1 FR_poly = FR_r, FR_A3, FR_A2 // poly = r * A3 + A2 - cmp.lt p8,p15 = GR_K,GR_big_expo_neg // Set Safe if K > big_expo_neg + nop.m 999 +// +// Load W_1,W_2 +// Load big_exp_pos, load big_exp_neg +// +(p0) fadd.s1 f47 = f43, f1 + nop.i 999 ;; } { .mfi - nop.m 999 - fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r - nop.i 999 + nop.m 999 +(p0) fma.s1 f52 = f42, f51, f49 + nop.i 999 } -;; - { .mfi - nop.m 999 - fmpy.s1 FR_T = FR_T1, FR_T2 // T = T1 * T2 - nop.i 999 + nop.m 999 +(p0) fmpy.s1 f48 = f42, f42 + nop.i 999 ;; } { .mfi - nop.m 999 - fadd.s1 FR_W1_p1 = FR_W1, f1 // W1_p1 = W1 + 1.0 - nop.i 999 + nop.m 999 +(p0) fmpy.s1 f53 = f44, f46 + nop.i 999 ;; } -;; - { .mfi -(p7) cmp.lt.unc p8, p9 = 10, GR_K // If expm1, set p8 if K > 10 - fma.s1 FR_poly = FR_r, FR_poly, FR_A1 // poly = r * poly + A1 - nop.i 999 + nop.m 999 +(p0) fma.s1 f54 = f45, f47, f43 + nop.i 999 } -;; - { .mfi -(p7) cmp.eq p15, p0 = r0, r0 // If expm1, set Safe flag - fma.s1 FR_T_scale = FR_T, FR_scale, f0 // T_scale = T * scale -(p9) cmp.gt.unc p9, p10 = -10, GR_K // If expm1, set p9 if K < -10 - // If expm1, set p10 if -10<=K<=10 + nop.m 999 +(p0) fneg f61 = f61 + nop.i 999 ;; } { .mfi - nop.m 999 - fma.s1 FR_W = FR_W2, FR_W1_p1, FR_W1 // W = W2 * (W1+1.0) + W1 - nop.i 999 + nop.m 999 +(p0) fma.s1 f52 = f42, f52, f40 + nop.i 999 ;; } -;; - { .mfi - nop.m 999 - mov FR_Y_hi = FR_T // Assume Y_hi = T - nop.i 999 + nop.m 999 +(p0) fadd.s1 f55 = f54, f1 + nop.i 999 } -;; - { .mfi - nop.m 999 - fma.s1 FR_poly = FR_rsq, FR_poly, FR_r // poly = rsq * poly + r - nop.i 999 + nop.m 999 +// +// W + Wp1 * poly +// +(p0) mov f34 = f53 + nop.i 999 ;; } -;; - { .mfi - nop.m 999 - fma.s1 FR_Wp1_T_scale = FR_W, FR_T_scale, FR_T_scale // (W+1)*T*scale - nop.i 999 + nop.m 999 +// +// A_1 + r * poly +// Scale = setf_expl(Bias_p_k) +// +(p0) fma.s1 f52 = f48, f52, f42 + nop.i 999 ;; } { .mfi - nop.m 999 - fma.s1 FR_W_T_scale = FR_W, FR_T_scale, f0 // W*T*scale - nop.i 999 + nop.m 999 +// +// poly = r + rsq(A_1 + r*poly) +// Wp1 = 1 + W +// neg_2_mK = -neg_2_mK +// +(p0) fma.s1 f35 = f55, f52, f54 + nop.i 999 ;; } -;; - +{ .mfb + nop.m 999 +(p0) fmpy.s1 f35 = f35, f53 +// +// Y_hi = T +// Y_lo = T * (W + Wp1*poly) +// +(p12) br.cond.sptk EXPL_MAIN ;; +} +// +// Branch if expl(x) +// Continue for expl(x-1) +// +{ .mii +(p0) cmp.lt.unc p12, p13 = 10, r44 + nop.i 999 ;; +// +// Set p12 if 10 < K, Else p13 +// +(p13) cmp.gt.unc p13, p14 = -10, r44 ;; +} +// +// K > 10: Y_lo = Y_lo + neg_2_mK +// K <=10: Set p13 if -10 > K, Else set p14 +// { .mfi - nop.m 999 -(p9) fsub.s1 FR_Y_hi = f0, FR_2_mk // If expm1, if K < -10 set Y_hi - nop.i 999 +(p13) cmp.eq p15, p0 = r0, r0 +(p14) fadd.s1 f34 = f61, f34 + nop.i 999 ;; } { .mfi - nop.m 999 -(p10) fsub.s1 FR_Y_hi = FR_T, FR_2_mk // If expm1, if |K|<=10 set Y_hi - nop.i 999 + nop.m 999 +(p12) fadd.s1 f35 = f35, f61 + nop.i 999 ;; } -;; - { .mfi + nop.m 999 +(p13) fadd.s1 f35 = f35, f34 + nop.i 999 +} +{ .mfb + nop.m 999 +// +// K <= 10 and K < -10, Set Safe = True +// K <= 10 and K < 10, Y_lo = Y_hi + Y_lo +// K <= 10 and K > =-10, Y_hi = Y_hi + neg_2_mk +// +(p13) mov f34 = f61 +(p0) br.cond.sptk EXPL_MAIN ;; +} +EXPL_SMALL: +{ .mmi nop.m 999 - fma.s1 FR_result_lo = FR_Wp1_T_scale, FR_poly, FR_W_T_scale - nop.i 999 +(p0) addl r34 = @ltoff(Constants_exp_64_Exponents#),gp +(p12) addl r35 = @ltoff(Constants_exp_64_P#),gp ;; } -;; - -.pred.rel "mutex",p8,p9 -// If K > 10 adjust result_lo = result_lo - scale * 2^-k -// If |K| <= 10 adjust result_lo = result_lo + scale * T +.pred.rel "mutex",p12,p13 +{ .mmi +(p12) ld8 r35=[r35] +nop.m 999 +(p13) addl r35 = @ltoff(Constants_exp_64_Q#),gp +};; +{ .mmi +(p13) ld8 r35=[r35] +(p0) ld8 r34=[r34] +nop.i 999 +};; { .mfi - nop.m 999 -(p8) fnma.s1 FR_result_lo = FR_scale, FR_2_mk, FR_result_lo // If K > 10 - nop.i 999 +(p0) add r34 = 0x48,r34 +// +// Return +// K <= 10 and K < 10, Y_hi = neg_2_mk +// +// /*******************************************************/ +// /*********** Branch EXPL_SMALL ************************/ +// /*******************************************************/ +(p0) mov f42 = f9 + nop.i 999 ;; } +// +// Flag = 0 +// r4 = rsq * rsq +// { .mfi - nop.m 999 -(p9) fma.s1 FR_result_lo = FR_T_scale, f1, FR_result_lo // If |K| <= 10 - nop.i 999 +(p0) ld8 r49 =[r34],0 + nop.f 999 + nop.i 999 ;; +} +{ .mii + nop.m 999 + nop.i 999 ;; +// +// Flag = 1 +// +(p0) cmp.lt.unc p14, p0 = r37, r49 ;; } -;; - { .mfi - nop.m 999 - fmpy.s0 FR_tmp = FR_A1, FR_A1 // Dummy op to set inexact - nop.i 999 + nop.m 999 +// +// r = X +// +(p0) fmpy.s1 f48 = f42, f42 + nop.i 999 ;; } { .mfb - nop.m 999 -(p15) fma.s0 f8 = FR_Y_hi, FR_scale, FR_result_lo // Safe result -(p15) br.ret.sptk b0 // Safe exit for normal path + nop.m 999 +// +// rsq = r * r +// +(p0) fmpy.s1 f50 = f48, f48 +// +// Is input very small? +// +(p14) br.cond.spnt EXPL_VERY_SMALL ;; } -;; - -// Here if unsafe, will only be here for exp with K < big_expo_neg -{ .mfb - nop.m 999 - fma.s0 FR_RESULT = FR_Y_hi, FR_scale, FR_result_lo // Prelim result - br.cond.sptk EXP_POSSIBLE_UNDERFLOW // Branch to unsafe code +// +// Flag_not1: Y_hi = 1.0 +// Flag is 1: r6 = rsq * r4 +// +{ .mfi +(p12) ldfe f52 = [r35],16 +(p12) mov f34 = f1 +(p0) add r53 = 0x1,r0 ;; } -;; - - -EXP_SMALL: -// Here if 2^-60 < |x| < 2^-m, m=12 for exp, m=7 for expm1 { .mfi -(p7) ldfe FR_Q3 = [GR_ad_Q],16 // Get Q3 for small path, if expm1 -(p6) fma.s1 FR_p65 = FR_P6, FR_r, FR_P5 // If exp, p65 = P6 * r + P5 - nop.i 999 +(p13) ldfe f51 = [r35],16 +// +// Flag_not_1: Y_lo = poly_hi + r4 * poly_lo +// +(p13) mov f34 = f9 + nop.i 999 ;; +} +{ .mmf +(p12) ldfe f53 = [r35],16 +// +// For Flag_not_1, Y_hi = X +// Scale = 1 +// Create 0x000...01 +// +(p0) setf.sig f37 = r53 +(p0) mov f36 = f1 ;; +} +{ .mmi +(p13) ldfe f52 = [r35],16 ;; +(p12) ldfe f54 = [r35],16 + nop.i 999 ;; } { .mfi - mov GR_minus_one = -1 -(p7) fma.s1 FR_q98 = FR_Q9, FR_r, FR_Q8 // If expm1, q98 = Q9 * r + Q8 - nop.i 999 +(p13) ldfe f53 = [r35],16 +(p13) fmpy.s1 f58 = f48, f50 + nop.i 999 ;; +} +// +// Flag_not1: poly_lo = P_5 + r*P_6 +// Flag_1: poly_lo = Q_6 + r*Q_7 +// +{ .mmi +(p13) ldfe f54 = [r35],16 ;; +(p12) ldfe f55 = [r35],16 + nop.i 999 ;; +} +{ .mmi +(p12) ldfe f56 = [r35],16 ;; +(p13) ldfe f55 = [r35],16 + nop.i 999 ;; +} +{ .mmi +(p12) ldfe f57 = [r35],0 ;; +(p13) ldfe f56 = [r35],16 + nop.i 999 ;; } -;; - { .mfi -(p7) ldfe FR_Q2 = [GR_ad_Q],16 // Get Q2 for small path, if expm1 -(p7) fma.s1 FR_q65 = FR_Q6, FR_r, FR_Q5 // If expm1, q65 = Q6 * r + Q5 - nop.i 999 +(p13) ldfe f57 = [r35],0 + nop.f 999 + nop.i 999 ;; } -;; - { .mfi - setf.sig FR_tmp = GR_minus_one // Create value to force inexact -(p6) fma.s1 FR_p21 = FR_P2, FR_r, FR_P1 // If exp, p21 = P2 * r + P1 - nop.i 999 + nop.m 999 +// +// For Flag_not_1, load p5,p6,p1,p2 +// Else load p5,p6,p1,p2 +// +(p12) fma.s1 f60 = f52, f42, f53 + nop.i 999 ;; } { .mfi -(p7) ldfe FR_Q1 = [GR_ad_Q],16 // Get Q1 for small path, if expm1 -(p7) fma.s1 FR_q43 = FR_Q4, FR_r, FR_Q3 // If expm1, q43 = Q4 * r + Q3 - nop.i 999 + nop.m 999 +(p13) fma.s1 f60 = f51, f42, f52 + nop.i 999 ;; } -;; - { .mfi - nop.m 999 -(p6) fma.s1 FR_p654 = FR_p65, FR_r, FR_P4 // If exp, p654 = p65 * r + P4 - nop.i 999 + nop.m 999 +(p12) fma.s1 f60 = f60, f42, f54 + nop.i 999 ;; } { .mfi - nop.m 999 -(p7) fma.s1 FR_q987 = FR_q98, FR_r, FR_Q7 // If expm1, q987 = q98 * r + Q7 - nop.i 999 + nop.m 999 +(p12) fma.s1 f59 = f56, f42, f57 + nop.i 999 ;; } -;; - { .mfi - nop.m 999 -(p7) fma.s1 FR_q21 = FR_Q2, FR_r, FR_Q1 // If expm1, q21 = Q2 * r + Q1 - nop.i 999 + nop.m 999 +(p13) fma.s1 f60 = f42, f60, f53 + nop.i 999 ;; } -;; - { .mfi - nop.m 999 -(p6) fma.s1 FR_p210 = FR_p21, FR_rsq, FR_r // If exp, p210 = p21 * r + P0 - nop.i 999 + nop.m 999 +(p12) fma.s1 f59 = f59, f48, f42 + nop.i 999 ;; } { .mfi - nop.m 999 -(p7) fma.s1 FR_q6543 = FR_q65, FR_rsq, FR_q43 // If expm1, q6543 = q65*r2+q43 - nop.i 999 + nop.m 999 +// +// Flag_1: poly_lo = Q_5 + r*(Q_6 + r*Q_7) +// Flag_not1: poly_lo = P_4 + r*(P_5 + r*P_6) +// Flag_not1: poly_hi = (P_1 + r*P_2) +// +(p13) fmpy.s1 f60 = f60, f58 + nop.i 999 ;; } -;; - { .mfi - nop.m 999 -(p6) fma.s1 FR_p6543 = FR_p654, FR_r, FR_P3 // If exp, p6543 = p654 * r + P3 - nop.i 999 + nop.m 999 +(p12) fma.s1 f60 = f60, f42, f55 + nop.i 999 ;; } { .mfi - nop.m 999 -(p7) fma.s1 FR_q9876543 = FR_q987, FR_r4, FR_q6543 // If expm1, q9876543 = ... - nop.i 999 + nop.m 999 +// +// Flag_1: poly_lo = r6 *(Q_5 + ....) +// Flag_not1: poly_hi = r + rsq *(P_1 + r*P_2) +// +(p12) fma.s1 f35 = f60, f50, f59 + nop.i 999 } -;; - { .mfi - nop.m 999 -(p6) fma.s1 FR_Y_lo = FR_p6543, FR_r4, FR_p210 // If exp, form Y_lo - nop.i 999 + nop.m 999 +(p13) fma.s1 f59 = f54, f42, f55 + nop.i 999 ;; } -;; - { .mfi - nop.m 999 -(p7) fma.s1 FR_Y_lo = FR_q9876543, FR_rsq, FR_q21 // If expm1, form Y_lo - nop.i 999 + nop.m 999 +// +// Flag_not1: Y_lo = rsq* poly_hi + poly_lo +// Flag_1: poly_lo = rsq* poly_hi + poly_lo +// +(p13) fma.s1 f59 = f59, f42, f56 + nop.i 999 ;; } -;; - { .mfi - nop.m 999 - fmpy.s0 FR_tmp = FR_tmp, FR_tmp // Dummy op to set inexact - nop.i 999 + nop.m 999 +// +// Flag_not_1: (P_1 + r*P_2) +// +(p13) fma.s1 f59 = f59, f42, f57 + nop.i 999 ;; } -;; - -.pred.rel "mutex",p6,p7 { .mfi - nop.m 999 -(p6) fma.s0 f8 = FR_Y_lo, f1, f1 // If exp, result = 1 + Y_lo - nop.i 999 + nop.m 999 +// +// Flag_not_1: poly_hi = r + rsq * (P_1 + r*P_2) +// +(p13) fma.s1 f35 = f59, f48, f60 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Create 0.000...01 +// +(p0) for f37 = f35, f37 + nop.i 999 ;; } { .mfb - nop.m 999 -(p7) fma.s0 f8 = FR_Y_lo, FR_rsq, FR_norm_x // If expm1, result = Y_lo*r2+x - br.ret.sptk b0 // Exit for 2^-60 <= |x| < 2^-m - // m=12 for exp, m=7 for expm1 + nop.m 999 +// +// Set lsb of Y_lo to 1 +// +(p0) fmerge.se f35 = f35,f37 +(p0) br.cond.sptk EXPL_MAIN ;; +} +EXPL_VERY_SMALL: +{ .mmi + nop.m 999 + nop.m 999 +(p13) addl r34 = @ltoff(Constants_exp_64_Exponents#),gp +} +{ .mfi + nop.m 999 +(p12) mov f35 = f9 + nop.i 999 ;; +} +{ .mfb +(p13) ld8 r34 = [r34] +(p12) mov f34 = f1 +(p12) br.cond.sptk EXPL_MAIN ;; +} +{ .mlx +(p13) add r34 = 8,r34 +(p13) movl r39 = 0x0FFFE ;; } -;; - - -EXP_VERY_SMALL: // -// Here if 0 < |x| < 2^-60 -// If exp, result = 1.0 + x -// If expm1, result = x +x*x/2, but have to check for possible underflow +// Load big_exp_neg +// Create 1/2's exponent +// +{ .mii +(p13) setf.exp f56 = r39 +(p13) shladd r34 = r32,4,r34 ;; + nop.i 999 +} +// +// Negative exponents are stored after positive // - { .mfi -(p7) mov GR_exp_underflow = -16381 // Exponent for possible underflow -(p6) fadd.s0 f8 = f1, FR_norm_x // If exp, result = 1+x - nop.i 999 +(p13) ld8 r45 = [r34],0 +// +// Y_hi = x +// Scale = 1 +// +(p13) fmpy.s1 f35 = f9, f9 + nop.i 999 ;; } { .mfi - nop.m 999 -(p7) fmpy.s1 FR_result_lo = FR_half_x, FR_norm_x // If expm1 result_lo = x*x/2 - nop.i 999 + nop.m 999 +// +// Reset Safe if necessary +// Create 1/2 +// +(p13) mov f34 = f9 + nop.i 999 ;; } -;; - { .mfi -(p7) cmp.lt.unc p0, p8 = GR_exp_x, GR_exp_underflow // Unsafe if expm1 x small -(p7) mov FR_Y_hi = FR_norm_x // If expm1, Y_hi = x -(p7) cmp.lt p0, p15 = GR_exp_x, GR_exp_underflow // Unsafe if expm1 x small +(p13) cmp.lt.unc p0, p15 = r37, r45 +(p13) mov f36 = f1 + nop.i 999 ;; } -;; - { .mfb - nop.m 999 -(p8) fma.s0 f8 = FR_norm_x, f1, FR_result_lo // If expm1, result=x+x*x/2 -(p15) br.ret.sptk b0 // If Safe, exit + nop.m 999 +// +// Y_lo = x * x +// +(p13) fmpy.s1 f35 = f35, f56 +// +// Y_lo = x*x/2 +// +(p13) br.cond.sptk EXPL_MAIN ;; +} +EXPL_HUGE: +{ .mfi + nop.m 999 +(p0) fcmp.gt.unc.s1 p14, p0 = f9, f0 + nop.i 999 +} +{ .mlx + nop.m 999 +(p0) movl r39 = 0x15DC0 ;; +} +{ .mfi +(p14) setf.exp f34 = r39 +(p14) mov f35 = f1 +(p14) cmp.eq p0, p15 = r0, r0 ;; } -;; - -// Here if expm1 and 0 < |x| < 2^-16381; may be possible underflow { .mfb - nop.m 999 - fma.s0 FR_RESULT = FR_Y_hi, FR_scale, FR_result_lo // Prelim result - br.cond.sptk EXP_POSSIBLE_UNDERFLOW // Branch to unsafe code + nop.m 999 +(p14) mov f36 = f34 +// +// If x > 0, Set Safe = False +// If x > 0, Y_hi = 2**(24,000) +// If x > 0, Y_lo = 1.0 +// If x > 0, Scale = 2**(24,000) +// +(p14) br.cond.sptk EXPL_MAIN ;; } -;; - -EXP_CERTAIN_UNDERFLOW_ZERO: -// Here if x < zero_uflow_x -// For exp, set result to tiny+0.0 and set I, U, and branch to error handling -// For expm1, set result to tiny-1.0 and set I, and exit -{ .mmi - alloc GR_SAVE_PFS = ar.pfs,0,3,4,0 - nop.m 999 - mov GR_one = 1 +{ .mlx + nop.m 999 +(p12) movl r39 = 0xA240 } -;; - -{ .mmi - setf.exp FR_small = GR_one // Form small value - nop.m 999 -(p6) mov GR_Parameter_TAG = 13 // Error tag for exp underflow +{ .mlx + nop.m 999 +(p12) movl r38 = 0xA1DC ;; +} +{ .mmb +(p13) cmp.eq p15, p14 = r0, r0 +(p12) setf.exp f34 = r39 + nop.b 999 ;; +} +{ .mlx +(p12) setf.exp f35 = r38 +(p13) movl r39 = 0xFF9C } -;; - { .mfi - nop.m 999 - fmerge.s FR_X = f8,f8 // Save x for error call - nop.i 999 + nop.m 999 +(p13) fsub.s1 f34 = f0, f1 + nop.i 999 ;; } -;; - -.pred.rel "mutex",p6,p7 -{ .mfb - nop.m 999 -(p6) fma.s0 FR_RESULT = FR_small, FR_small, f0 // If exp, set I,U, tiny result -(p6) br.cond.sptk __libm_error_region // If exp, go to error handling +{ .mfi + nop.m 999 +(p12) mov f36 = f34 +(p12) cmp.eq p0, p15 = r0, r0 ;; +} +{ .mfi +(p13) setf.exp f35 = r39 +(p13) mov f36 = f1 + nop.i 999 ;; +} +EXPL_MAIN: +{ .mfi +(p0) cmp.ne.unc p12, p0 = 0x01, r33 +(p0) fmpy.s1 f101 = f36, f35 + nop.i 999 ;; } { .mfb - nop.m 999 -(p7) fms.s0 f8 = FR_small, FR_small, f1 // If expm1, set I, result -1.0 -(p7) br.ret.sptk b0 // If expm1, exit -} -;; - - -EXP_OVERFLOW: -// Here if x >= min_oflow_x -{ .mmi - alloc GR_SAVE_PFS = ar.pfs,0,3,4,0 - mov GR_huge_exp = 0x1fffe - nop.i 999 + nop.m 999 +(p0) fma.s0 f99 = f34, f36, f101 +(p15) br.cond.sptk EXPL_64_RETURN ;; } { .mfi - mov GR_huge_signif = -0x1 - nop.f 999 -(p6) mov GR_Parameter_TAG = 12 // Error tag for exp overflow + nop.m 999 +(p0) fsetc.s3 0x7F,0x01 + nop.i 999 } -;; - -{ .mmf - setf.exp FR_huge_exp = GR_huge_exp // Create huge value - setf.sig FR_huge_signif = GR_huge_signif // Create huge value - fmerge.s FR_X = f8,f8 // Save x for error call +{ .mlx + nop.m 999 +(p0) movl r50 = 0x00000000013FFF ;; } -;; - +// +// S0 user supplied status +// S2 user supplied status + WRE + TD (Overflows) +// S3 user supplied status + RZ + TD (Underflows) +// +// +// If (Safe) is true, then +// Compute result using user supplied status field. +// No overflow or underflow here, but perhaps inexact. +// Return +// Else +// Determine if overflow or underflow was raised. +// Fetch +/- overflow threshold for IEEE single, double, +// double extended +// { .mfi - nop.m 999 - fmerge.se FR_huge = FR_huge_exp, FR_huge_signif -(p7) mov GR_Parameter_TAG = 39 // Error tag for expm1 overflow +(p0) setf.exp f60 = r50 +(p0) fma.s3 f102 = f34, f36, f101 + nop.i 999 } -;; - -{ .mfb - nop.m 999 - fma.s0 FR_RESULT = FR_huge, FR_huge, FR_huge // Force I, O, and Inf - br.cond.sptk __libm_error_region // Branch to error handling +{ .mfi + nop.m 999 +(p0) fsetc.s3 0x7F,0x40 + nop.i 999 ;; } -;; - - - -EXP_POSSIBLE_UNDERFLOW: -// Here if exp and zero_uflow_x < x < about -11356 [where k < -16381] -// Here if expm1 and |x| < 2^-16381 { .mfi - alloc GR_SAVE_PFS = ar.pfs,0,3,4,0 - fsetc.s2 0x7F,0x41 // Set FTZ and disable traps - nop.i 999 + nop.m 999 +// +// For Safe, no need to check for over/under. +// For expm1, handle errors like exp. +// +(p0) fsetc.s2 0x7F,0x42 + nop.i 999;; } -;; - { .mfi - nop.m 999 - fma.s2 FR_ftz = FR_Y_hi, FR_scale, FR_result_lo // Result with FTZ - nop.i 999 + nop.m 999 +(p0) fma.s2 f100 = f34, f36, f101 + nop.i 999 ;; } -;; - { .mfi - nop.m 999 - fsetc.s2 0x7F,0x40 // Disable traps (set s2 default) - nop.i 999 + nop.m 999 +(p0) fsetc.s2 0x7F,0x40 + nop.i 999 ;; } -;; - { .mfi - nop.m 999 -(p6) fclass.m.unc p11, p0 = FR_ftz, 0x00F // If exp, FTZ result denorm or zero? - nop.i 999 + nop.m 999 +(p7) fclass.m.unc p12, p0 = f102, 0x00F + nop.i 999 } -;; - -{ .mfb -(p11) mov GR_Parameter_TAG = 13 // exp underflow - fmerge.s FR_X = f8,f8 // Save x for error call -(p11) br.cond.spnt __libm_error_region // Branch on exp underflow +{ .mfi + nop.m 999 +(p0) fclass.m.unc p11, p0 = f102, 0x00F + nop.i 999 ;; } -;; - -{ .mfb - nop.m 999 - mov f8 = FR_RESULT // Was safe after all - br.ret.sptk b0 +{ .mfi + nop.m 999 +(p7) fcmp.ge.unc.s1 p10, p0 = f100, f60 + nop.i 999 } -;; - - -EXP_64_SPECIAL: -// Here if x natval, nan, inf, zero -// If x natval, +inf, or if expm1 and x zero, just return x. -// The other cases must be tested for, and results set. -// These cases do not generate exceptions. { .mfi - nop.m 999 - fclass.m p8, p0 = f8, 0x0c3 // Is x nan? - nop.i 999 + nop.m 999 +// +// Create largest double exponent + 1. +// Create smallest double exponent - 1. +// +(p0) fcmp.ge.unc.s1 p8, p0 = f100, f60 + nop.i 999 ;; } -;; - +// +// fcmp: resultS2 >= + overflow threshold -> set (a) if true +// fcmp: resultS2 <= - overflow threshold -> set (b) if true +// fclass: resultS3 is denorm/unorm/0 -> set (d) if true +// +{ .mib +(p10) mov GR_Parameter_TAG = 39 + nop.i 999 +(p10) br.cond.sptk __libm_error_region ;; +} +{ .mib +(p8) mov GR_Parameter_TAG = 12 + nop.i 999 +(p8) br.cond.sptk __libm_error_region ;; +} +// +// Report that exp overflowed +// +{ .mib +(p12) mov GR_Parameter_TAG = 40 + nop.i 999 +(p12) br.cond.sptk __libm_error_region ;; +} +{ .mib +(p11) mov GR_Parameter_TAG = 13 + nop.i 999 +(p11) br.cond.sptk __libm_error_region ;; +} +{ .mib + nop.m 999 + nop.i 999 +// +// Report that exp underflowed +// +(p0) br.cond.sptk EXPL_64_RETURN ;; +} +EXPL_64_SPECIAL: { .mfi - nop.m 999 -(p6) fclass.m.unc p13, p0 = f8, 0x007 // If exp, is x zero? - nop.i 999 + nop.m 999 +(p0) fclass.m.unc p6, p0 = f8, 0x0c3 + nop.i 999 } -;; - { .mfi - nop.m 999 -(p6) fclass.m.unc p11, p0 = f8, 0x022 // If exp, is x -inf? - nop.i 999 + nop.m 999 +(p0) fclass.m.unc p13, p8 = f8, 0x007 + nop.i 999 ;; } { .mfi - nop.m 999 -(p8) fadd.s0 f8 = f8, f1 // If x nan, result quietized x - nop.i 999 + nop.m 999 +(p7) fclass.m.unc p14, p0 = f8, 0x007 + nop.i 999 } -;; - { .mfi - nop.m 999 -(p7) fclass.m.unc p10, p0 = f8, 0x022 // If expm1, is x -inf? - nop.i 999 + nop.m 999 +(p0) fclass.m.unc p12, p9 = f8, 0x021 + nop.i 999 ;; } { .mfi - nop.m 999 -(p13) fadd.s0 f8 = f0, f1 // If exp and x zero, result 1.0 - nop.i 999 + nop.m 999 +(p0) fclass.m.unc p11, p0 = f8, 0x022 + nop.i 999 } -;; - { .mfi - nop.m 999 -(p11) mov f8 = f0 // If exp and x -inf, result 0 - nop.i 999 + nop.m 999 +(p7) fclass.m.unc p10, p0 = f8, 0x022 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +// +// Identify +/- 0, Inf, or -Inf +// Generate the right kind of NaN. +// +(p13) fadd.s0 f99 = f0, f1 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p14) mov f99 = f8 + nop.i 999 ;; } -;; - { .mfb - nop.m 999 -(p10) fsub.s1 f8 = f0, f1 // If expm1, x -inf, result -1.0 - br.ret.sptk b0 // Exit special cases + nop.m 999 +(p6) fadd.s0 f99 = f8, f1 +// +// expl(+/-0) = 1 +// expm1l(+/-0) = +/-0 +// No exceptions raised +// +(p6) br.cond.sptk EXPL_64_RETURN ;; } -;; - - -EXP_64_UNSUPPORTED: -// Here if x unsupported type +{ .mib + nop.m 999 + nop.i 999 +(p14) br.cond.sptk EXPL_64_RETURN ;; +} +{ .mfi + nop.m 999 +(p11) mov f99 = f0 + nop.i 999 ;; +} +{ .mfb + nop.m 999 +(p10) fsub.s1 f99 = f0, f1 +// +// expl(-Inf) = 0 +// expm1l(-Inf) = -1 +// No exceptions raised. +// +(p10) br.cond.sptk EXPL_64_RETURN ;; +} +{ .mfb + nop.m 999 +(p12) fmpy.s1 f99 = f8, f1 +// +// expl(+Inf) = Inf +// No exceptions raised. +// +(p0) br.cond.sptk EXPL_64_RETURN ;; +} +EXPL_64_UNSUPPORTED: +{ .mfb + nop.m 999 +(p0) fmpy.s0 f99 = f8, f0 +(p0) br.cond.sptk EXPL_64_RETURN ;; +} +EXPL_64_RETURN: { .mfb nop.m 999 - fmpy.s0 f8 = f8, f0 // Return nan - br.ret.sptk b0 +(p0) mov f8 = f99 +(p0) br.ret.sptk b0 } -;; - -GLOBAL_IEEE754_END(expl) +.endp +ASM_SIZE_DIRECTIVE(expl) -LOCAL_LIBM_ENTRY(__libm_error_region) +.proc __libm_error_region +__libm_error_region: .prologue { .mfi add GR_Parameter_Y=-32,sp // Parameter 2 value @@ -1410,9 +1598,9 @@ LOCAL_LIBM_ENTRY(__libm_error_region) br.call.sptk b0=__libm_error_support# // Call error handling function };; { .mmi - add GR_Parameter_RESULT = 48,sp nop.m 0 - nop.i 0 + nop.m 0 + add GR_Parameter_RESULT = 48,sp };; { .mmi ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack @@ -1425,7 +1613,8 @@ LOCAL_LIBM_ENTRY(__libm_error_region) mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs br.ret.sptk b0 // Return };; -LOCAL_LIBM_END(__libm_error_region#) +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) .type __libm_error_support#,@function .global __libm_error_support# |