diff options
Diffstat (limited to 'libgcc/config/arc/ieee-754/arc600-mul64')
-rw-r--r-- | libgcc/config/arc/ieee-754/arc600-mul64/divdf3.S | 410 | ||||
-rw-r--r-- | libgcc/config/arc/ieee-754/arc600-mul64/divsf3.S | 274 | ||||
-rw-r--r-- | libgcc/config/arc/ieee-754/arc600-mul64/muldf3.S | 234 | ||||
-rw-r--r-- | libgcc/config/arc/ieee-754/arc600-mul64/mulsf3.S | 180 |
4 files changed, 1098 insertions, 0 deletions
diff --git a/libgcc/config/arc/ieee-754/arc600-mul64/divdf3.S b/libgcc/config/arc/ieee-754/arc600-mul64/divdf3.S new file mode 100644 index 00000000000..0584b53f77b --- /dev/null +++ b/libgcc/config/arc/ieee-754/arc600-mul64/divdf3.S @@ -0,0 +1,410 @@ +/* Copyright (C) 2008-2013 Free Software Foundation, Inc. + Contributor: Joern Rennecke <joern.rennecke@embecosm.com> + on behalf of Synopsys Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. + +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +<http://www.gnu.org/licenses/>. */ + +/* + to calculate a := b/x as b*y, with y := 1/x: + - x is in the range [1..2) + - calculate 15..18 bit inverse y0 using a table of approximating polynoms. + Precision is higher for polynoms used to evaluate input with larger + value. + - Do one newton-raphson iteration step to double the precision, + then multiply this with the divisor + -> more time to decide if dividend is subnormal + - the worst error propagation is on the side of the value range + with the least initial defect, thus giving us about 30 bits precision. + The truncation error for the either is less than 1 + x/2 ulp. + A 31 bit inverse can be simply calculated by using x with implicit 1 + and chaining the multiplies. For a 32 bit inverse, we multiply y0^2 + with the bare fraction part of x, then add in y0^2 for the implicit + 1 of x. + - If calculating a 31 bit inverse, the systematic error is less than + -1 ulp; likewise, for 32 bit, it is less than -2 ulp. + - If we calculate our seed with a 32 bit fraction, we can archive a + tentative result strictly better than -2 / +2.5 (1) ulp/128, i.e. we + only need to take the step to calculate the 2nd stage rest and + rounding adjust 1/32th of the time. However, if we use a 20 bit + fraction for the seed, the negative error can exceed -2 ulp/128, (2) + thus for a simple add / tst check, we need to do the 2nd stage + rest calculation/ rounding adjust 1/16th of the time. + (1): The inexactness of the 32 bit inverse contributes an error in the + range of (-1 .. +(1+x/2) ) ulp/128. Leaving out the low word of the + rest contributes an error < +1/x ulp/128 . In the interval [1,2), + x/2 + 1/x <= 1.5 . + (2): Unless proven otherwise. I have not actually looked for an + example where -2 ulp/128 is exceeded, and my calculations indicate + that the excess, if existent, is less than -1/512 ulp. + ??? The algorithm is still based on the ARC700 optimized code. + Maybe we could make better use of 64 bit multiply results and/or mmed . + */ +#include "../arc-ieee-754.h" + +/* N.B. fp-bit.c does double rounding on denormal numbers. */ +#if 0 /* DEBUG */ + .global __divdf3 + FUNC(__divdf3) + .balign 4 +__divdf3: + push_s blink + push_s r2 + push_s r3 + push_s r0 + bl.d __divdf3_c + push_s r1 + ld_s r2,[sp,12] + ld_s r3,[sp,8] + st_s r0,[sp,12] + st_s r1,[sp,8] + pop_s r1 + bl.d __divdf3_asm + pop_s r0 + pop_s r3 + pop_s r2 + pop_s blink + cmp r0,r2 + cmp.eq r1,r3 + jeq_s [blink] + and r12,DBL0H,DBL1H + bic.f 0,0x7ff80000,r12 ; both NaN -> OK + jeq_s [blink] + bl abort + ENDFUNC(__divdf3) +#define __divdf3 __divdf3_asm +#endif /* DEBUG */ + + FUNC(__divdf3) + .balign 4 +.L7ff00000: + .long 0x7ff00000 +.Ldivtab: + .long 0xfc0fffe1 + .long 0xf46ffdfb + .long 0xed1ffa54 + .long 0xe61ff515 + .long 0xdf7fee75 + .long 0xd91fe680 + .long 0xd2ffdd52 + .long 0xcd1fd30c + .long 0xc77fc7cd + .long 0xc21fbbb6 + .long 0xbcefaec0 + .long 0xb7efa100 + .long 0xb32f92bf + .long 0xae8f83b7 + .long 0xaa2f7467 + .long 0xa5ef6479 + .long 0xa1cf53fa + .long 0x9ddf433e + .long 0x9a0f3216 + .long 0x965f2091 + .long 0x92df0f11 + .long 0x8f6efd05 + .long 0x8c1eeacc + .long 0x88eed876 + .long 0x85dec615 + .long 0x82eeb3b9 + .long 0x800ea10b + .long 0x7d3e8e0f + .long 0x7a8e7b3f + .long 0x77ee6836 + .long 0x756e5576 + .long 0x72fe4293 + .long 0x709e2f93 + .long 0x6e4e1c7f + .long 0x6c0e095e + .long 0x69edf6c5 + .long 0x67cde3a5 + .long 0x65cdd125 + .long 0x63cdbe25 + .long 0x61ddab3f + .long 0x600d991f + .long 0x5e3d868c + .long 0x5c6d7384 + .long 0x5abd615f + .long 0x590d4ecd + .long 0x576d3c83 + .long 0x55dd2a89 + .long 0x545d18e9 + .long 0x52dd06e9 + .long 0x516cf54e + .long 0x4ffce356 + .long 0x4e9cd1ce + .long 0x4d3cbfec + .long 0x4becae86 + .long 0x4aac9da4 + .long 0x496c8c73 + .long 0x483c7bd3 + .long 0x470c6ae8 + .long 0x45dc59af + .long 0x44bc4915 + .long 0x43ac3924 + .long 0x428c27fb + .long 0x418c187a + .long 0x407c07bd + +__divdf3_support: /* This label makes debugger output saner. */ + .balign 4 +.Ldenorm_dbl1: + brge r6, \ + 0x43500000,.Linf_NaN ; large number / denorm -> Inf + bmsk.f r12,DBL1H,19 + mov.eq r12,DBL1L + mov.eq DBL1L,0 + sub.eq r7,r7,32 + norm.f r11,r12 ; flag for x/0 -> Inf check + beq_s .Linf_NaN + mov.mi r11,0 + add.pl r11,r11,1 + add_s r12,r12,r12 + asl r8,r12,r11 + rsub r12,r11,31 + lsr r12,DBL1L,r12 + tst_s DBL1H,DBL1H + or r8,r8,r12 + lsr r4,r8,26 + lsr DBL1H,r8,12 + ld.as r4,[r10,r4] + bxor.mi DBL1H,DBL1H,31 + sub r11,r11,11 + asl DBL1L,DBL1L,r11 + sub r11,r11,1 + mulu64 r4,r8 + sub r7,r7,r11 + b.d .Lpast_denorm_dbl1 + asl r7,r7,20 + + .balign 4 +.Ldenorm_dbl0: + bmsk.f r12,DBL0H,19 + ; wb stall + mov.eq r12,DBL0L + sub.eq r6,r6,32 + norm.f r11,r12 ; flag for 0/x -> 0 check + brge r7, \ + 0x43500000, .Lret0_2 ; denorm/large number -> 0 + beq_s .Lret0_2 + mov.mi r11,0 + add.pl r11,r11,1 + asl r12,r12,r11 + sub r6,r6,r11 + add.f 0,r6,31 + lsr r10,DBL0L,r6 + mov.mi r10,0 + add r6,r6,11+32 + neg.f r11,r6 + asl DBL0L,DBL0L,r11 + mov.pl DBL0L,0 + sub r6,r6,32-1 + b.d .Lpast_denorm_dbl0 + asl r6,r6,20 + +.Linf_NaN: + tst_s DBL0L,DBL0L ; 0/0 -> NaN + xor_s DBL1H,DBL1H,DBL0H + bclr.eq.f DBL0H,DBL0H,31 + bmsk DBL0H,DBL1H,30 + xor_s DBL0H,DBL0H,DBL1H + sub.eq DBL0H,DBL0H,1 + mov_s DBL0L,0 + j_s.d [blink] + or DBL0H,DBL0H,r9 + .balign 4 +.Lret0_2: + xor_s DBL1H,DBL1H,DBL0H + mov_s DBL0L,0 + bmsk DBL0H,DBL1H,30 + j_s.d [blink] + xor_s DBL0H,DBL0H,DBL1H + .balign 4 + .global __divdf3 +/* N.B. the spacing between divtab and the sub3 to get its address must + be a multiple of 8. */ +__divdf3: + asl r8,DBL1H,12 + lsr r4,r8,26 + sub3 r10,pcl,61; (.-.Ldivtab) >> 3 + ld.as r9,[pcl,-124]; [pcl,(-((.-.L7ff00000) >> 2))] ; 0x7ff00000 + ld.as r4,[r10,r4] + lsr r12,DBL1L,20 + and.f r7,DBL1H,r9 + or r8,r8,r12 + mulu64 r4,r8 + beq.d .Ldenorm_dbl1 +.Lpast_denorm_dbl1: + and.f r6,DBL0H,r9 + breq.d r7,r9,.Linf_nan_dbl1 + asl r4,r4,12 + sub r4,r4,mhi + mulu64 r4,r4 + beq.d .Ldenorm_dbl0 + lsr r8,r8,1 + breq.d r6,r9,.Linf_nan_dbl0 + asl r12,DBL0H,11 + lsr r10,DBL0L,21 +.Lpast_denorm_dbl0: + bset r8,r8,31 + mulu64 mhi,r8 + add_s r12,r12,r10 + bset r5,r12,31 + cmp r5,r8 + cmp.eq DBL0L,DBL1L + lsr.cc r5,r5,1 + sub r4,r4,mhi ; u1.31 inverse, about 30 bit + mulu64 r5,r4 ; result fraction highpart + lsr r8,r8,2 ; u3.29 + add r5,r6, /* wait for immediate */ \ + 0x3fe00000 + mov r11,mhi ; result fraction highpart + mulu64 r11,r8 ; u-28.31 + asl_s DBL1L,DBL1L,9 ; u-29.23:9 + sbc r6,r5,r7 + mov r12,mlo ; u-28.31 + mulu64 r11,DBL1L ; mhi: u-28.23:9 + add.cs DBL0L,DBL0L,DBL0L + asl_s DBL0L,DBL0L,6 ; u-26.25:7 + asl r10,r11,23 + sub_l DBL0L,DBL0L,r12 + lsr r7,r11,9 + sub r5,DBL0L,mhi ; rest msw ; u-26.31:0 + mul64 r5,r4 ; mhi: result fraction lowpart + xor.f 0,DBL0H,DBL1H + and DBL0H,r6,r9 + add_s DBL0H,DBL0H,r7 + bclr r12,r9,20 ; 0x7fe00000 + brhs.d r6,r12,.Linf_denorm + bxor.mi DBL0H,DBL0H,31 + add.f r12,mhi,0x11 + asr r9,r12,5 + sub.mi DBL0H,DBL0H,1 + add.f DBL0L,r9,r10 + tst r12,0x1c + jne.d [blink] + add.cs DBL0H,DBL0H,1 + /* work out exact rounding if we fall through here. */ + /* We know that the exact result cannot be represented in double + precision. Find the mid-point between the two nearest + representable values, multiply with the divisor, and check if + the result is larger than the dividend. Since we want to know + only the sign bit, it is sufficient to calculate only the + highpart of the lower 64 bits. */ + mulu64 r11,DBL1L ; rest before considering r12 in r5 : -mlo + sub.f DBL0L,DBL0L,1 + asl r12,r9,2 ; u-22.30:2 + sub.cs DBL0H,DBL0H,1 + sub.f r12,r12,2 + mov r10,mlo ; rest before considering r12 in r5 : -r10 + mulu64 r12,DBL1L ; mhi: u-51.32 + asl r5,r5,25 ; s-51.7:25 + lsr r10,r10,7 ; u-51.30:2 + mov r7,mhi ; u-51.32 + mulu64 r12,r8 ; mlo: u-51.31:1 + sub r5,r5,r10 + add.mi r5,r5,DBL1L ; signed multiply adjust for r12*DBL1L + bset r7,r7,0 ; make sure that the result is not zero, and that + sub r5,r5,r7 ; a highpart zero appears negative + sub.f r5,r5,mlo ; rest msw + add.pl.f DBL0L,DBL0L,1 + j_s.d [blink] + add.eq DBL0H,DBL0H,1 + +.Linf_nan_dbl1: ; 0/Inf -> NaN Inf/Inf -> NaN x/Inf-> 0 x/NaN -> NaN + or.f 0,r6,DBL0L + cmp.ne r6,r9 + not_s DBL0L,DBL1H + sub_s.ne DBL0L,DBL0L,DBL0L + tst_s DBL0H,DBL0H + add_s DBL0H,DBL1H,DBL0L + j_s.d [blink] + bxor.mi DBL0H,DBL0H,31 +.Linf_nan_dbl0: + tst_s DBL1H,DBL1H + j_s.d [blink] + bxor.mi DBL0H,DBL0H,31 + .balign 4 +.Linf_denorm: + lsr r12,r6,28 + brlo.d r12,0xc,.Linf +.Ldenorm: + asr r6,r6,20 + neg r9,r6 + mov_s DBL0H,0 + brhs.d r9,54,.Lret0 + bxor.mi DBL0H,DBL0H,31 + add r12,mhi,1 + and r12,r12,-4 + rsub r7,r6,5 + asr r10,r12,28 + bmsk r4,r12,27 + min r7,r7,31 + asr DBL0L,r4,r7 + add DBL1H,r11,r10 + abs.f r10,r4 + sub.mi r10,r10,1 + add.f r7,r6,32-5 + asl r4,r4,r7 + mov.mi r4,r10 + add.f r10,r6,23 + rsub r7,r6,9 + lsr r7,DBL1H,r7 + asl r10,DBL1H,r10 + or.pnz DBL0H,DBL0H,r7 + or.mi r4,r4,r10 + mov.mi r10,r7 + add.f DBL0L,r10,DBL0L + add.cs.f DBL0H,DBL0H,1 ; carry clear after this point + bxor.f 0,r4,31 + add.pnz.f DBL0L,DBL0L,1 + add.cs.f DBL0H,DBL0H,1 + jne_s [blink] + /* Calculation so far was not conclusive; calculate further rest. */ + mulu64 r11,DBL1L ; rest before considering r12 in r5 : -mlo + asr.f r12,r12,3 + asl r5,r5,25 ; s-51.7:25 + mov r11,mlo ; rest before considering r12 in r5 : -r11 + mulu64 r12,r8 ; u-51.31:1 + and r9,DBL0L,1 ; tie-breaker: round to even + lsr r11,r11,7 ; u-51.30:2 + mov DBL1H,mlo ; u-51.31:1 + mulu64 r12,DBL1L ; u-51.62:2 + sub.mi r11,r11,DBL1L ; signed multiply adjust for r12*DBL1L + add_s DBL1H,DBL1H,r11 + sub DBL1H,DBL1H,r5 ; -rest msw + add_s DBL1H,DBL1H,mhi ; -rest msw + add.f 0,DBL1H,DBL1H ; can't ror.f by 32 :-( + tst_s DBL1H,DBL1H + cmp.eq mlo,r9 + add.cs.f DBL0L,DBL0L,1 + j_s.d [blink] + add.cs DBL0H,DBL0H,1 + +.Lret0: + /* return +- 0 */ + j_s.d [blink] + mov_s DBL0L,0 +.Linf: + mov_s DBL0H,r9 + mov_s DBL0L,0 + j_s.d [blink] + bxor.mi DBL0H,DBL0H,31 + ENDFUNC(__divdf3) diff --git a/libgcc/config/arc/ieee-754/arc600-mul64/divsf3.S b/libgcc/config/arc/ieee-754/arc600-mul64/divsf3.S new file mode 100644 index 00000000000..a9934f0ad94 --- /dev/null +++ b/libgcc/config/arc/ieee-754/arc600-mul64/divsf3.S @@ -0,0 +1,274 @@ +/* Copyright (C) 2008-2013 Free Software Foundation, Inc. + Contributor: Joern Rennecke <joern.rennecke@embecosm.com> + on behalf of Synopsys Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. + +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +<http://www.gnu.org/licenses/>. */ + +/* + - calculate 15..18 bit inverse using a table of approximating polynoms. + precision is higher for polynoms used to evaluate input with larger + value. + - do one newton-raphson iteration step to double the precision, + then multiply this with the divisor + -> more time to decide if dividend is subnormal + - the worst error propagation is on the side of the value range + with the least initial defect, thus giving us about 30 bits precision. + */ +#include "../arc-ieee-754.h" + +#if 0 /* DEBUG */ + .global __divsf3 + FUNC(__divsf3) + .balign 4 +__divsf3: + push_s blink + push_s r1 + bl.d __divsf3_c + push_s r0 + ld_s r1,[sp,4] + st_s r0,[sp,4] + bl.d __divsf3_asm + pop_s r0 + pop_s r1 + pop_s blink + cmp r0,r1 +#if 1 + bne abort + jeq_s [blink] + b abort +#else + bne abort + j_s [blink] +#endif + ENDFUNC(__divsf3) +#define __divsf3 __divsf3_asm +#endif /* DEBUG */ + + FUNC(__divsf3) + .balign 4 +.Ldivtab: + .long 0xfc0ffff0 + .long 0xf46ffefd + .long 0xed1ffd2a + .long 0xe627fa8e + .long 0xdf7ff73b + .long 0xd917f33b + .long 0xd2f7eea3 + .long 0xcd1fe986 + .long 0xc77fe3e7 + .long 0xc21fdddb + .long 0xbcefd760 + .long 0xb7f7d08c + .long 0xb32fc960 + .long 0xae97c1ea + .long 0xaa27ba26 + .long 0xa5e7b22e + .long 0xa1cfa9fe + .long 0x9ddfa1a0 + .long 0x9a0f990c + .long 0x9667905d + .long 0x92df878a + .long 0x8f6f7e84 + .long 0x8c27757e + .long 0x88f76c54 + .long 0x85df630c + .long 0x82e759c5 + .long 0x8007506d + .long 0x7d3f470a + .long 0x7a8f3da2 + .long 0x77ef341e + .long 0x756f2abe + .long 0x72f7212d + .long 0x709717ad + .long 0x6e4f0e44 + .long 0x6c1704d6 + .long 0x69e6fb44 + .long 0x67cef1d7 + .long 0x65c6e872 + .long 0x63cedf18 + .long 0x61e6d5cd + .long 0x6006cc6d + .long 0x5e36c323 + .long 0x5c76b9f3 + .long 0x5abeb0b7 + .long 0x5916a79b + .long 0x57769e77 + .long 0x55de954d + .long 0x54568c4e + .long 0x52d6834d + .long 0x51667a7f + .long 0x4ffe71b5 + .long 0x4e9e68f1 + .long 0x4d466035 + .long 0x4bf65784 + .long 0x4aae4ede + .long 0x496e4646 + .long 0x48363dbd + .long 0x47063547 + .long 0x45de2ce5 + .long 0x44be2498 + .long 0x43a61c64 + .long 0x4296144a + .long 0x41860c0e + .long 0x407e03ee +.L7f800000: + .long 0x7f800000 + .balign 4 + .global __divsf3_support +__divsf3_support: +.Linf_NaN: + bclr.f 0,r0,31 ; 0/0 -> NaN + xor_s r0,r0,r1 + bmsk r1,r0,30 + bic_s r0,r0,r1 + sub.eq r0,r0,1 + j_s.d [blink] + or r0,r0,r9 +.Lret0: + xor_s r0,r0,r1 + bmsk r1,r0,30 + j_s.d [blink] + bic_s r0,r0,r1 +/* N.B. the spacing between divtab and the sub3 to get its address must + be a multiple of 8. */ +__divsf3: + lsr r2,r1,17 + sub3 r3,pcl,37 ; (.-.Ldivtab) >> 3 + bmsk_s r2,r2,5 + ld.as r5,[r3,r2] + asl r4,r1,9 + ld.as r9,[pcl,-13]; [pcl,(-((.-.L7f800000) >> 2))] ; 0x7f800000 + mulu64 r5,r4 + and.f r11,r1,r9 + asl r6,r1,8 + bset r6,r6,31 + beq.d .Ldenorm_fp1 + asl r5,r5,13 + breq.d r11,r9,.Linf_nan_fp1 + and.f r2,r0,r9 + sub r7,r5,mhi + mulu64 r7,r6 + beq.d .Ldenorm_fp0 + asl r12,r0,8 + breq.d r2,r9,.Linf_nan_fp0 + mulu64 mhi,r7 +.Lpast_denorm_fp1: + bset r3,r12,31 +.Lpast_denorm_fp0: + cmp_s r3,r6 + lsr.cc r3,r3,1 + add_s r2,r2, /* wait for immediate */ \ + 0x3f000000 + sub r7,r7,mhi ; u1.31 inverse, about 30 bit + mulu64 r3,r7 + sbc r2,r2,r11 + xor.f 0,r0,r1 + and r0,r2,r9 + bclr r3,r9,23 ; 0x7f000000 + brhs.d r2,r3,.Linf_denorm + bxor.mi r0,r0,31 +.Lpast_denorm: + add r3,mhi,0x22 ; round to nearest or higher + tst r3,0x3c ; check if rounding was unsafe + lsr r3,r3,6 + jne.d [blink] ; return if rounding was safe. + add_s r0,r0,r3 + /* work out exact rounding if we fall through here. */ + /* We know that the exact result cannot be represented in single + precision. Find the mid-point between the two nearest + representable values, multiply with the divisor, and check if + the result is larger than the dividend. */ + add_s r3,r3,r3 + sub_s r3,r3,1 + mulu64 r3,r6 + asr.f 0,r0,1 ; for round-to-even in case this is a denorm + rsub r2,r9,25 + asl_s r12,r12,r2 + sub.f 0,r12,mlo + j_s.d [blink] + sub.mi r0,r0,1 +.Linf_nan_fp1: + lsr_s r0,r0,31 + bmsk.f 0,r1,22 + asl_s r0,r0,31 + bne_s 0f ; inf/inf -> nan + brne r2,r9,.Lsigned0 ; x/inf -> 0, but x/nan -> nan +0: j_s.d [blink] + mov r0,-1 +.Lsigned0: +.Linf_nan_fp0: + tst_s r1,r1 + j_s.d [blink] + bxor.mi r0,r0,31 + .balign 4 + .global __divsf3 +/* For denormal results, it is possible that an exact result needs + rounding, and thus the round-to-even rule has to come into play. */ +.Linf_denorm: + brlo r2,0xc0000000,.Linf +.Ldenorm: + asr_s r2,r2,23 + bic r0,r0,r9 + neg r9,r2 + brlo.d r9,25,.Lpast_denorm + lsr r3,mlo,r9 + /* Fall through: return +- 0 */ + j_s [blink] +.Linf: + j_s.d [blink] + or r0,r0,r9 + .balign 4 +.Ldenorm_fp1: + bclr r6,r6,31 + norm.f r12,r6 ; flag for x/0 -> Inf check + add r6,r6,r6 + rsub r5,r12,16 + ror r5,r1,r5 + asl r6,r6,r12 + bmsk r5,r5,5 + ld.as r5,[r3,r5] + add r4,r6,r6 + ; load latency + mulu64 r5,r4 + bic.ne.f 0, \ + 0x60000000,r0 ; large number / denorm -> Inf + asl r5,r5,13 + sub r7,r5,mhi + beq.d .Linf_NaN + mulu64 r7,r6 + asl_s r12,r12,23 + and.f r2,r0,r9 + add_s r2,r2,r12 + asl r12,r0,8 + bne.d .Lpast_denorm_fp1 +.Ldenorm_fp0: mulu64 mhi,r7 + bclr r12,r12,31 + norm.f r3,r12 ; flag for 0/x -> 0 check + bic.ne.f 0,0x60000000,r1 ; denorm/large number -> 0 + beq_s .Lret0 + asl_s r12,r12,r3 + asl_s r3,r3,23 + add_s r12,r12,r12 + add r11,r11,r3 + b.d .Lpast_denorm_fp0 + mov_s r3,r12 + ENDFUNC(__divsf3) diff --git a/libgcc/config/arc/ieee-754/arc600-mul64/muldf3.S b/libgcc/config/arc/ieee-754/arc600-mul64/muldf3.S new file mode 100644 index 00000000000..1e18999e49a --- /dev/null +++ b/libgcc/config/arc/ieee-754/arc600-mul64/muldf3.S @@ -0,0 +1,234 @@ +/* Copyright (C) 2008-2013 Free Software Foundation, Inc. + Contributor: Joern Rennecke <joern.rennecke@embecosm.com> + on behalf of Synopsys Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. + +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +<http://www.gnu.org/licenses/>. */ + +#include "../arc-ieee-754.h" + +#if 0 /* DEBUG */ + .global __muldf3 + .balign 4 +__muldf3: + push_s blink + push_s r2 + push_s r3 + push_s r0 + bl.d __muldf3_c + push_s r1 + ld_s r2,[sp,12] + ld_s r3,[sp,8] + st_s r0,[sp,12] + st_s r1,[sp,8] + pop_s r1 + bl.d __muldf3_asm + pop_s r0 + pop_s r3 + pop_s r2 + pop_s blink + cmp r0,r2 + cmp.eq r1,r3 + jeq_s [blink] + and r12,DBL0H,DBL1H + bic.f 0,0x7ff80000,r12 ; both NaN -> OK + jeq_s [blink] + b abort +#define __muldf3 __muldf3_asm +#endif /* DEBUG */ + +__muldf3_support: /* This label makes debugger output saner. */ + .balign 4 + FUNC(__muldf3) +.Ldenorm_2: + breq.d DBL1L,0,.Lret0_2 ; 0 input -> 0 output + norm.f r12,DBL1L + mov.mi r12,21 + add.pl r12,r12,22 + neg r11,r12 + asl_s r12,r12,20 + lsr.f DBL1H,DBL1L,r11 + ror DBL1L,DBL1L,r11 + sub_s DBL0H,DBL0H,r12 + mov.eq DBL1H,DBL1L + sub_l DBL1L,DBL1L,DBL1H + /* Fall through. */ + .global __muldf3 + .balign 4 +__muldf3: + mulu64 DBL0L,DBL1L + ld.as r9,[pcl,0x68] ; ((.L7ff00000-.+2)/4)] + bmsk r6,DBL0H,19 + bset r6,r6,20 + and r11,DBL0H,r9 + breq.d r11,0,.Ldenorm_dbl0 + and r12,DBL1H,r9 + breq.d r12,0,.Ldenorm_dbl1 + mov r8,mlo + mov r4,mhi + mulu64 r6,DBL1L + breq.d r11,r9,.Linf_nan + bmsk r10,DBL1H,19 + breq.d r12,r9,.Linf_nan + bset r10,r10,20 + add.f r4,r4,mlo + adc r5,mhi,0 + mulu64 r10,DBL0L + add_s r12,r12,r11 ; add exponents + add.f r4,r4,mlo + adc r5,r5,mhi + mulu64 r6,r10 + tst r8,r8 + bclr r8,r9,30 ; 0x3ff00000 + bset.ne r4,r4,0 ; put least significant word into sticky bit + bclr r6,r9,20 ; 0x7fe00000 + add.f r5,r5,mlo + adc r7,mhi,0 ; fraction product in r7:r5:r4 + lsr.f r10,r7,9 + rsub.eq r8,r8,r9 ; 0x40000000 + sub r12,r12,r8 ; subtract bias + implicit 1 + brhs.d r12,r6,.Linf_denorm + rsub r10,r10,12 +.Lshift_frac: + neg r8,r10 + asl r6,r4,r10 + lsr DBL0L,r4,r8 + add.f 0,r6,r6 + btst.eq DBL0L,0 + cmp.eq r4,r4 ; round to nearest / round to even + asl r4,r5,r10 + lsr r5,r5,r8 + adc.f DBL0L,DBL0L,r4 + xor.f 0,DBL0H,DBL1H + asl r7,r7,r10 + add_s r12,r12,r5 + adc DBL0H,r12,r7 + j_s.d [blink] + bset.mi DBL0H,DBL0H,31 + +/* N.B. This is optimized for ARC700. + ARC600 has very different scheduling / instruction selection criteria. */ + +/* If one number is denormal, subtract some from the exponent of the other + one (if the other exponent is too small, return 0), and normalize the + denormal. Then re-run the computation. */ +.Lret0_2: + lsr_s DBL0H,DBL0H,31 + asl_s DBL0H,DBL0H,31 + j_s.d [blink] + mov_s DBL0L,0 + .balign 4 +.Ldenorm_dbl0: + mov_s r12,DBL0L + mov_s DBL0L,DBL1L + mov_s DBL1L,r12 + mov_s r12,DBL0H + mov_s DBL0H,DBL1H + mov_s DBL1H,r12 + and r11,DBL0H,r9 +.Ldenorm_dbl1: + brhs r11,r9,.Linf_nan + brhs 0x3ca00001,r11,.Lret0 + sub_s DBL0H,DBL0H,DBL1H + bmsk.f DBL1H,DBL1H,30 + add_s DBL0H,DBL0H,DBL1H + beq.d .Ldenorm_2 + norm r12,DBL1H + sub_s r12,r12,10 + asl r5,r12,20 + asl_s DBL1H,DBL1H,r12 + sub DBL0H,DBL0H,r5 + neg r5,r12 + lsr r6,DBL1L,r5 + asl_s DBL1L,DBL1L,r12 + b.d __muldf3 + add_s DBL1H,DBL1H,r6 + +.Lret0: xor_s DBL0H,DBL0H,DBL1H + bclr DBL1H,DBL0H,31 + xor_s DBL0H,DBL0H,DBL1H + j_s.d [blink] + mov_s DBL0L,0 + + .balign 4 +.Linf_nan: + bclr r12,DBL1H,31 + xor_s DBL1H,DBL1H,DBL0H + bclr_s DBL0H,DBL0H,31 + max r8,DBL0H,r12 ; either NaN -> NaN ; otherwise inf + or.f 0,DBL0H,DBL0L + mov_s DBL0L,0 + or.ne.f DBL1L,DBL1L,r12 + not_s DBL0H,DBL0L ; inf * 0 -> NaN + mov.ne DBL0H,r8 + tst_s DBL1H,DBL1H + j_s.d [blink] + bset.mi DBL0H,DBL0H,31 + +/* We have checked for infinity / NaN input before, and transformed + denormalized inputs into normalized inputs. Thus, the worst case + exponent overflows are: + 1 + 1 - 0x400 == 0xc02 : maximum underflow + 0x7fe + 0x7fe - 0x3ff == 0xbfd ; maximum overflow + N.B. 0x7e and 0x7f are also values for overflow. + + If (r12 <= -54), we have an underflow to zero. */ + .balign 4 +.Linf_denorm: + lsr r6,r12,28 + brlo.d r6,0xc,.Linf + asr r6,r12,20 + add.f r10,r10,r6 + brgt.d r10,0,.Lshift_frac + mov_s r12,0 + beq.d .Lround_frac + add r10,r10,32 +.Lshift32_frac: + tst r4,r4 + mov r4,r5 + bset.ne r4,r4,1 + mov r5,r7 + brge.d r10,1,.Lshift_frac + mov r7,0 + breq.d r10,0,.Lround_frac + add r10,r10,32 + brgt r10,21,.Lshift32_frac + b_s .Lret0 + +.Lround_frac: + add.f 0,r4,r4 + btst.eq r5,0 + mov_s DBL0L,r5 + mov_s DBL0H,r7 + adc.eq.f DBL0L,DBL0L,0 + j_s.d [blink] + adc.eq DBL0H,DBL0H,0 + +.Linf: mov_s DBL0L,0 + xor.f DBL1H,DBL1H,DBL0H + mov_s DBL0H,r9 + j_s.d [blink] + bset.mi DBL0H,DBL0H,31 + ENDFUNC(__muldf3) + + .balign 4 +.L7ff00000: + .long 0x7ff00000 diff --git a/libgcc/config/arc/ieee-754/arc600-mul64/mulsf3.S b/libgcc/config/arc/ieee-754/arc600-mul64/mulsf3.S new file mode 100644 index 00000000000..619662491b2 --- /dev/null +++ b/libgcc/config/arc/ieee-754/arc600-mul64/mulsf3.S @@ -0,0 +1,180 @@ +/* Copyright (C) 2008-2013 Free Software Foundation, Inc. + Contributor: Joern Rennecke <joern.rennecke@embecosm.com> + on behalf of Synopsys Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. + +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +<http://www.gnu.org/licenses/>. */ + +#include "../arc-ieee-754.h" + +#if 0 /* DEBUG */ + .global __mulsf3 + FUNC(__mulsf3) + .balign 4 +__mulsf3: + push_s blink + push_s r1 + bl.d __mulsf3_c + push_s r0 + ld_s r1,[sp,4] + st_s r0,[sp,4] + bl.d __mulsf3_asm + pop_s r0 + pop_s r1 + pop_s blink + cmp r0,r1 + jeq_s [blink] + and r12,r0,r1 + bic.f 0,0x7f800000,r12 + bne 0f + bmsk.f 0,r0,22 + bmsk.ne.f r1,r1,22 + jne_s [blink] ; both NaN -> OK +0: bl abort + ENDFUNC(__mulsf3) +#define __mulsf3 __mulsf3_asm +#endif /* DEBUG */ + + .balign 4 + .global __mulsf3 + FUNC(__mulsf3) +__mulsf3: + ld.as r9,[pcl,80]; [pcl,((.L7f800000-.+2)/4)] + bmsk r4,r1,22 + bset r2,r0,23 + asl_s r2,r2,8 + bset r3,r4,23 + mulu64 r2,r3 + and r11,r0,r9 + breq.d r11,0,.Ldenorm_dbl0 + and r12,r1,r9 + breq.d r12,0,.Ldenorm_dbl1 + xor_s r0,r0,r1 + breq.d r11,r9,.Linf_nan_dbl0 + ld.as r4,[pcl,70]; [pcl,((.L7fffffff-.+2)/4)] + breq.d r12,r9,.Linf_nan_dbl1 +.Lpast_denorm: + asl.f 0,mhi,8 + mov r6,mhi + mov r7,mlo + add.pl r6,r6,r6 + bclr.pl r6,r6,23 + add.pl.f r7,r7,r7 + add.cs r6,r6,1 + lsr.f 0,r6,1 + add_s r12,r12,r11 + adc.f 0,r7,r4 + add_s r12,r12, \ + -0x3f800000 + adc.f r8,r6,r12 + tst.pl r8,r9 + bic r0,r0,r4 + min r3,r8,r9 + jpnz.d [blink] + add.pnz r0,r0,r3 +; infinity or denormal number + add.ne.f r3,r3,r3 + asr_s r3,r3,23+1 + bset r6,r6,23 + bpnz.d .Linfinity + sub_s r3,r3,1 + neg_s r2,r3 + brhi.d r2,24,.Lret_r0 ; right shift shift > 24 -> return +-0 + lsr r2,r6,r2 + asl r9,r6,r3 + lsr.f 0,r2,1 + tst r7,r7 + add_s r0,r0,r2 + bset.ne r9,r9,0 + adc.f 0,r9,r4 + j_s.d [blink] + add.cs r0,r0,1 +.Linfinity: + j_s.d [blink] + add_s r0,r0,r9 + +.Lret_r0: j_s [blink] + + .balign 4 +.Ldenorm_dbl0: + bclr_s r2,r2,31 + norm.f r4,r2 + add_s r2,r2,r2 + asl r2,r2,r4 + mulu64 r2,r3 + breq.d r12,r9,.Ldenorm_dbl0_inf_nan_dbl1 + asl r4,r4,23 + sub.ne.f r12,r12,r4 + ld.as r4,[pcl,29]; [pcl,((.L7fffffff-.+2)/4)] + bhi.d .Lpast_denorm + xor_s r0,r0,r1 + bmsk r1,r0,30 + j_s.d [blink] + bic_s r0,r0,r1 + + .balign 4 +.Ldenorm_dbl0_inf_nan_dbl1: + bmsk.f 0,r0,30 + beq_s .Lretnan + xor_s r0,r0,r1 +.Linf_nan_dbl1: + xor_s r1,r1,r0 +.Linf_nan_dbl0: + bclr_s r1,r1,31 + cmp_s r1,r9 + jls.d [blink] + xor_s r0,r0,r1 +; r1 NaN -> result NaN +.Lretnan: + j_s.d [blink] + mov r0,-1 + + .balign 4 +.Ldenorm_dbl1: + breq.d r11,r9,.Linf_nan_dbl0_2 + norm.f r3,r4 + sub_s r3,r3,7 + asl r4,r4,r3 + mulu64 r2,r4 + sub_s r3,r3,1 + asl_s r3,r3,23 + sub.ne.f r11,r11,r3 + ld.as r4,[pcl,11]; [pcl,((.L7fffffff-.+2)/4)] + bhi.d .Lpast_denorm + bmsk r8,r0,30 + j_s.d [blink] + bic r0,r0,r8 + + .balign 4 +.Linf_nan_dbl0_2: + bclr_s r1,r1,31 + xor_s r0,r0,r1 + sub.eq r1,r1,1 ; inf/nan * 0 -> nan + bic.f 0,r9,r1 + j_s.d [blink] + or.eq r0,r0,r1 ; r1 nan -> result nan + + .balign 4 +.L7f800000: + .long 0x7f800000 +.L7fffffff: + .long 0x7fffffff + ENDFUNC(__mulsf3) |