summaryrefslogtreecommitdiff
path: root/backend
diff options
context:
space:
mode:
authorrander.wang <rander.wang@intel.com>2017-05-15 14:32:48 +0800
committerYang Rong <rong.r.yang@intel.com>2017-05-17 18:10:06 +0800
commita476a849e726968dd76771da867f18f79fbf4ee9 (patch)
treea26e0c18e136319df4cab84aa89a71634cc7c0ae /backend
parentacd53cffa43da9f92b7e656f1314edc0b4044c0b (diff)
downloadbeignet-a476a849e726968dd76771da867f18f79fbf4ee9.tar.gz
backend: refine pow function
remove private array and some unnecessary if check. convert some if to select. improve about 50% performance Signed-off-by: rander.wang <rander.wang@intel.com> Tested-by: Yang Rong <rong.r.yang@intel.com>
Diffstat (limited to 'backend')
-rw-r--r--backend/src/libocl/tmpl/ocl_math_common.tmpl.cl275
1 files changed, 141 insertions, 134 deletions
diff --git a/backend/src/libocl/tmpl/ocl_math_common.tmpl.cl b/backend/src/libocl/tmpl/ocl_math_common.tmpl.cl
index e82b7f55..f860f8f2 100644
--- a/backend/src/libocl/tmpl/ocl_math_common.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_math_common.tmpl.cl
@@ -2313,7 +2313,7 @@ OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) {
float y1,t1,t2,r,s,sn,t,u,v,w;
int i,j,k,yisint,n;
int hx,hy,ix,iy,is;
- float bp[2],dp_h[2],dp_l[2],
+ float bp,dp_h,dp_l,
zero = 0.0,
one = 1.0,
two = 2.0,
@@ -2335,25 +2335,29 @@ OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) {
ivln2 = 1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
ivln2_h = 1.4426879883e+00, /* 0x3fb8aa00 =16b 1/ln2*/
ivln2_l = 7.0526075433e-06; /* 0x36eca570 =1/ln2 tail*/
- bp[0] = 1.0,bp[1] = 1.5,
- dp_h[0] = 0.0,dp_h[1] = 5.84960938e-01,
- dp_l[0] = 0.0,dp_l[1] = 1.56322085e-06;
+ bp = 1.0;//,bp1 = 1.5,
+ dp_h = 0.0;//,dp_h[1] = 5.84960938e-01,
+ dp_l = 0.0;//,dp_l[1] = 1.56322085e-06;
+
+ float retVal = 0.0f;
+ bool bRet = false;
+
GEN_OCL_GET_FLOAT_WORD(hx,x);
GEN_OCL_GET_FLOAT_WORD(hy,y);
- ix = hx&0x7fffffff; iy = hy&0x7fffffff;
- if (ix < 0x00800000) { /* x < 2**-126 */
- ix = 0;/* Gen does not support subnormal number now */
+ ax = __gen_ocl_fabs(x);
+ ix = as_int(ax); iy = as_int(fabs(y));
+
+ if(iy < 0x00800000 || hx==0x3f800000)
+ {
+ bRet = true;
+ retVal = one;
}
- if (iy < 0x00800000) { /* y < 2**-126 */
- iy = 0;/* Gen does not support subnormal number now */
+ else if (ix > 0x7f800000 || iy > 0x7f800000)
+ {
+ bRet = true;
+ retVal = NAN;
}
- /* y==zero: x**0 = 1 */
- if(iy==0) return one;
- /* pow(+1, y) returns 1 for any y, even a NAN */
- if(hx==0x3f800000) return one;
- /* +-NaN return x+y */
- if(ix > 0x7f800000 || iy > 0x7f800000)
- return (x+0.0f)+y+(0.0f);
+
/* determine if y is an odd int when x < 0
* yisint = 0 ... y is not an integer
* yisint = 1 ... y is an odd int
@@ -2361,138 +2365,135 @@ OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) {
*/
yisint = 0;
if(hx<0) {
- if(iy>=0x4b800000) yisint = 2; /* even integer y */
- else if(iy>=0x3f800000) {
- k = (iy>>23)-0x7f; /* exponent */
- j = iy>>(23-k);
- if((j<<(23-k))==iy) yisint = 2-(j&1);
- }
- }
- /* special value of y */
- if (iy==0x7f800000) { /* y is +-inf */
- if (ix==0x3f800000)
- //return y - y; /* inf**+-1 is NaN */
- return one;
- else if (ix > 0x3f800000)/* (|x|>1)**+-inf = inf,0 */
- return (hy>=0)? y: zero;
- else /* (|x|<1)**-,+inf = inf,0 */
- return (hy<0)?-y: zero;
- }
- if(iy==0x3f800000) { /* y is +-1 */
- if(hy<0) return one/x; else return x;
- }
- if(hy==0x40000000) return x*x; /* y is 2 */
- if(hy==0x3f000000) { /* y is 0.5 */
- if(hx>=0)return __gen_ocl_sqrt(x);
+ k = (iy>>23)-0x7f; /* exponent */
+ j = iy>>(23-k);
+ yisint = (iy>=0x3f800000 && (j<<(23-k))==iy)? 2-(j&1):yisint;
+ yisint = (iy>=0x4b800000) ? 2:yisint;
}
- ax = __gen_ocl_fabs(x);
/* special value of x */
if(ix==0x7f800000||ix==0||ix==0x3f800000){
z = ax; /*x is +-0,+-inf,+-1*/
- if(hy<0) z = one/z; /* z = (1/|x|) */
- if(hx<0) {
- if(((ix-0x3f800000)|yisint)==0) {
- z = (z-z)/(z-z); /* (-1)**non-int is NaN */
- } else if(yisint==1)
- z = -z; /* (x<0)**odd = -(|x|**odd) */
- }
- return z;
+
+ z = (hy < 0)? one/z:z;
+ z = ((hx<0) && (((ix-0x3f800000)|yisint)==0))? NAN:z;
+ z = ((hx<0) && (yisint==1))? -z:z;
+
+ retVal = (bRet)? retVal:z;
+ bRet = true;
}
+
n = ((uint)hx>>31)-1;
/* (x<0)**(non-int) is NaN */
- if((n|yisint)==0) return (x-x)/(x-x);
+ if(!bRet && (n|yisint)==0)
+ {
+ bRet= true;
+ retVal = NAN;
+ }
sn = one; /* s (sign of result -ve**odd) = -1 else = 1 */
if((n|(yisint-1))==0) sn = -one;/* (-ve)**(odd int) */
/* |y| is huge */
- if(iy>0x4d000000) { /* if |y| > 2**27 */
- /* over/underflow if x is not close to one */
- if(ix<0x3f7ffff8) return (hy<0)? sn*huge*huge:sn*tiny*tiny;
- if(ix>0x3f800007) return (hy>0)? sn*huge*huge:sn*tiny*tiny;
- /* now |1-x| is tiny <= 2**-20, suffice to compute
- log(x) by x-x^2/2+x^3/3-x^4/4 */
- t = ax-1; /* t has 20 trailing zeros */
- w = (t*t)*((float)0.5-t*(0.333333333333f-t*0.25f));
- u = ivln2_h*t; /* ivln2_h has 16 sig. bits */
- v = t*ivln2_l-w*ivln2;
- t1 = u+v;
- GEN_OCL_GET_FLOAT_WORD(is,t1);
- GEN_OCL_SET_FLOAT_WORD(t1,is&0xfffff000);
- t2 = v-(t1-u);
- } else {
- float s2,s_h,s_l,t_h,t_l;
- n = 0;
- /* take care subnormal number */
- //if(ix<0x00800000)
- //{ax *= two24; n -= 24; GEN_OCL_GET_FLOAT_WORD(ix,ax); }
- n += ((ix)>>23)-0x7f;
- j = ix&0x007fffff;
+ if(iy>0x4d000000)
+ { /* if |y| > 2**27 */
+ /* over/underflow if x is not close to one */
+ /* special value of y */
+ float b1 = (hy>=0)? y: zero;
+ float b2 = (hy<0)?-y: zero;
+ b1 = (ix > 0x3f800000)? b1:b2;
+ retVal = (iy==0x7f800000 && !bRet)? b1:retVal;
+ bRet = (iy==0x7f800000 && !bRet)? true: bRet;
+
+
+ b1 = (hy>0)? sn*huge*huge:0;
+ retVal = (ix>0x3f800007 && !bRet)? b1:retVal;
+ bRet = (ix>0x3f800007 && !bRet)? true:bRet;
+
+ /* now |1-x| is tiny <= 2**-20, suffice to compute
+ log(x) by x-x^2/2+x^3/3-x^4/4 */
+ t = ax-1; /* t has 20 trailing zeros */
+ w = (t*t)*((float)0.5-t*(0.333333333333f-t*0.25f));
+ u = ivln2_h*t; /* ivln2_h has 16 sig. bits */
+ v = t*ivln2_l-w*ivln2;
+ t1 = u+v;
+ GEN_OCL_GET_FLOAT_WORD(is,t1);
+ GEN_OCL_SET_FLOAT_WORD(t1,is&0xfffff000);
+ t2 = v-(t1-u);
+ }
+ else
+ {
+ float s2,s_h,s_l,t_h,t_l;
+ n = 0;
+ n += ((ix)>>23)-0x7f;
+ j = ix&0x007fffff;
/* determine interval */
- ix = j|0x3f800000; /* normalize ix */
- if(j<=0x1cc471) k=0; /* |x|<sqrt(3/2) */
- else if(j<0x5db3d7) k=1; /* |x|<sqrt(3) */
- else {k=0;n+=1;ix -= 0x00800000;}
- GEN_OCL_SET_FLOAT_WORD(ax,ix);
+ ix = j|0x3f800000; /* normalize ix */
+
+ n = (j >= 0x5db3d7)?n+1:n;
+ ix = (j >= 0x5db3d7)? ix - 0x00800000:ix;
+ k = (j<=0x1cc471 || j >= 0x5db3d7)? 0:1;
+
+ GEN_OCL_SET_FLOAT_WORD(ax,ix);
+
+ bp = k? 1.5:bp;
+ dp_h = k? 5.84960938e-01:dp_h;
+ dp_l = k? 1.56322085e-06:dp_l;
/* compute s = s_h+s_l = (x-1)/(x+1) or (x-1.5)/(x+1.5) */
- u = ax-bp[k]; /* bp[0]=1.0, bp[1]=1.5 */
- v = one/(ax+bp[k]);
- s = u*v;
- s_h = s;
- GEN_OCL_GET_FLOAT_WORD(is,s_h);
- GEN_OCL_SET_FLOAT_WORD(s_h,is&0xfffff000);
- /* t_h=ax+bp[k] High */
- is = ((ix>>1)&0xfffff000)|0x20000000;
- GEN_OCL_SET_FLOAT_WORD(t_h,is+0x00400000+(k<<21));
- t_l = ax - (t_h-bp[k]);
- s_l = v*((u-s_h*t_h)-s_h*t_l);
+ u = ax-bp; /* bp[0]=1.0, bp[1]=1.5 */
+ v = one/(ax+bp);
+ s = u*v;
+ s_h = s;
+ GEN_OCL_GET_FLOAT_WORD(is,s_h);
+ GEN_OCL_SET_FLOAT_WORD(s_h,is&0xfffff000);
+ /* t_h=ax+bp[k] High */
+ is = ((ix>>1)&0xfffff000)|0x20000000;
+ GEN_OCL_SET_FLOAT_WORD(t_h,is+0x00400000+(k<<21));
+ t_l = ax - (t_h-bp);
+ s_l = v*(mad(-s_h, t_l, mad(-s_h, t_h, u)));
- /* compute log(ax) */
- s2 = s*s;
- r = s2*s2*(L1+s2*L2);
- r += s_l*(s_h+s);
- s2 = s_h*s_h;
- t_h = 3.0f+s2+r;
- GEN_OCL_GET_FLOAT_WORD(is,t_h);
- GEN_OCL_SET_FLOAT_WORD(t_h,is&0xffffe000);
- t_l = r-((t_h-3.0f)-s2);
- /* u+v = s*(1+...) */
- u = s_h*t_h;
- v = s_l*t_h+t_l*s;
- /* 2/(3log2)*(s+...) */
- p_h = u+v;
- GEN_OCL_GET_FLOAT_WORD(is,p_h);
- GEN_OCL_SET_FLOAT_WORD(p_h,is&0xffffe000);
- p_l = v-(p_h-u);
- z_h = cp_h*p_h; /* cp_h+cp_l = 2/(3*log2) */
- z_l = cp_l*p_h+p_l*cp+dp_l[k];
- /* log2(ax) = (s+..)*2/(3*log2) = n + dp_h + z_h + z_l */
- t = (float)n;
- t1 = (((z_h+z_l)+dp_h[k])+t);
- GEN_OCL_GET_FLOAT_WORD(is,t1);
- GEN_OCL_SET_FLOAT_WORD(t1,is&0xffffe000);
- t2 = z_l-(((t1-t)-dp_h[k])-z_h);
+ /* compute log(ax) */
+ s2 = s*s;
+ r = s2*s2*(mad(s2, L2, L1));
+ r = mad(s_l, (s_h+s), r);
+ t_h = mad(s_h, s_h, 3.0f)+r;
+ GEN_OCL_GET_FLOAT_WORD(is,t_h);
+ GEN_OCL_SET_FLOAT_WORD(t_h,is&0xffffe000);
+ t_l = r-(mad(-s_h, s_h, (t_h-3.0f)));
+ /* u+v = s*(1+...) */
+ u = s_h*t_h;
+ v = mad(s_l, t_h, t_l*s);
+ /* 2/(3log2)*(s+...) */
+ p_h = mad(s_h, t_h, v);
+ GEN_OCL_GET_FLOAT_WORD(is,p_h);
+ GEN_OCL_SET_FLOAT_WORD(p_h,is&0xffffe000);
+ p_l = v-(mad(-s_h, t_h, p_h));
+ z_l = mad(cp_l, p_h, mad(p_l, cp, dp_l));
+ /* log2(ax) = (s+..)*2/(3*log2) = n + dp_h + z_h + z_l */
+ t = (float)n;
+ t1 = ((mad(cp_h, p_h, z_l)+dp_h)+t);
+ GEN_OCL_GET_FLOAT_WORD(is,t1);
+ GEN_OCL_SET_FLOAT_WORD(t1,is&0xffffe000);
+ t2 = z_l-mad(-cp_h, p_h, ((t1-t)-dp_h));
}
/* split up y into y1+y2 and compute (y1+y2)*(t1+t2) */
GEN_OCL_GET_FLOAT_WORD(is,y);
GEN_OCL_SET_FLOAT_WORD(y1,is&0xffffe000);
- p_l = (y-y1)*t1+y*t2;
+ p_l = mad((y-y1), t1, y*t2);
p_h = y1*t1;
- z = p_l+p_h;
+ z = mad(y1, t1, p_l);
+
GEN_OCL_GET_FLOAT_WORD(j,z);
- if (j>0x43000000) /* if z > 128 */
- return sn*huge*huge; /* overflow */
- else if (j==0x43000000) { /* if z == 128 */
- if(p_l+ovt>z-p_h) return sn*huge*huge; /* overflow */
- }
- else if ((j&0x7fffffff)>0x43160000) /* z <= -150 */
- return sn*tiny*tiny; /* underflow */
- else if (j==0xc3160000){ /* z == -150 */
- if(p_l<=z-p_h) return sn*tiny*tiny; /* underflow */
+ if((j&0x7fffffff) >= 0x43000000 && !bRet)
+ {
+ retVal = ((j&0x7fffffff)>0x43160000)? 0.0:retVal;
+ retVal = (j > 0x43000000) ? sn*huge*huge:retVal;
+ retVal = (j == 0x43000000 && p_l+ovt>z-p_h)? sn*huge*huge:retVal;
+ retVal = (j == 0xc3160000 && p_l<=z-p_h)? 0.0:retVal;
+ bRet = (((j&0x7fffffff)>0x43160000) || (j == 0xc3160000 && p_l<=z-p_h) || retVal == sn*huge*huge)? true:false;
}
/*
@@ -2506,25 +2507,31 @@ OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) {
k = ((n&0x7fffffff)>>23)-0x7f; /* new k for n */
GEN_OCL_SET_FLOAT_WORD(t,n&~(0x007fffff>>k));
n = ((n&0x007fffff)|0x00800000)>>(23-k);
- if(j<0) n = -n;
- p_h -= t;
+ n = (j<0)?-n:n;
+ p_h = mad(y1, t1, -t);
}
+
t = p_l+p_h;
GEN_OCL_GET_FLOAT_WORD(is,t);
GEN_OCL_SET_FLOAT_WORD(t,is&0xffff8000);
- u = t*lg2_h;
- v = (p_l-(t-p_h))*lg2+t*lg2_l;
- z = u+v;
- w = v-(z-u);
+
+ v = mad((p_l-(t-p_h)), lg2, t*lg2_l);
+ z = mad(t, lg2_h, v);
+ w = v-mad(-t, lg2_h, z);
t = z*z;
- t1 = z - t*(P1+t*P2);
- r = (z*t1)/(t1-two)-(w+z*w);
+ t1 = mad(-t, (mad(t, P2, P1)), z);
+ r = (z*t1)/(t1-two)-(mad(z, w, w));
z = one-(r-z);
+
GEN_OCL_GET_FLOAT_WORD(j,z);
j += (n<<23);
- if((j>>23)<=0) z = __gen_ocl_scalbnf(z,n); /* subnormal output */
- else GEN_OCL_SET_FLOAT_WORD(z,j);
- return sn*z;
+
+ if((j>>23)<=0)
+ z = 0;//__gen_ocl_scalbnf(z,n); /* subnormal output */
+ else
+ GEN_OCL_SET_FLOAT_WORD(z,j);
+
+ return bRet ? retVal:sn*z;
}
#define BODY \