diff options
Diffstat (limited to 'sysdeps/alpha')
38 files changed, 1447 insertions, 231 deletions
diff --git a/sysdeps/alpha/Makefile b/sysdeps/alpha/Makefile index 45babb6c1c..6d4fbbb18b 100644 --- a/sysdeps/alpha/Makefile +++ b/sysdeps/alpha/Makefile @@ -42,4 +42,4 @@ ifeq ($(subdir),elf) sysdep-CFLAGS += -mno-fp-regs endif -divrem := divl divlu divq divqu reml remlu remq remqu +divrem := divl divq reml remq diff --git a/sysdeps/alpha/bsd-_setjmp.S b/sysdeps/alpha/bsd-_setjmp.S index a7bdbb5a61..be7f6dd05f 100644 --- a/sysdeps/alpha/bsd-_setjmp.S +++ b/sysdeps/alpha/bsd-_setjmp.S @@ -25,6 +25,10 @@ Cambridge, MA 02139, USA. */ ENTRY(_setjmp) ldgp $29,0($27) +#ifdef PROF + lda AT, _mcount + jsr AT, (AT), _mcount +#endif .prologue 1 bis $31, $31, $17 /* Pass a second argument of zero. */ jmp $31, __sigsetjmp /* Call __sigsetjmp. */ diff --git a/sysdeps/alpha/bsd-setjmp.S b/sysdeps/alpha/bsd-setjmp.S index c0ed691f40..2b799613d7 100644 --- a/sysdeps/alpha/bsd-setjmp.S +++ b/sysdeps/alpha/bsd-setjmp.S @@ -25,6 +25,10 @@ Cambridge, MA 02139, USA. */ ENTRY(setjmp) ldgp $29, 0($27) +#ifdef PROF + lda AT, _mcount + jsr AT, (AT), _mcount +#endif .prologue 1 bis $31, 1, $17 /* Pass a second argument of one. */ jmp $31, __sigsetjmp /* Call __sigsetjmp. */ diff --git a/sysdeps/alpha/bzero.S b/sysdeps/alpha/bzero.S index fffa53d7f1..c614fc1a6e 100644 --- a/sysdeps/alpha/bzero.S +++ b/sysdeps/alpha/bzero.S @@ -80,7 +80,14 @@ $tail: bne t4, 1f # is there a tail to do? .end bzero_loop ENTRY(bzero) +#ifdef PROF + ldgp gp, 0(pv) + lda AT, _mcount + jsr AT, (AT), _mcount + .prologue 1 +#else .prologue 0 +#endif mov a0, v0 # e0 : move return value in place beq a1, $done # .. e1 : early exit for zero-length store diff --git a/sysdeps/alpha/div.S b/sysdeps/alpha/div.S new file mode 100644 index 0000000000..6c461c40d4 --- /dev/null +++ b/sysdeps/alpha/div.S @@ -0,0 +1,110 @@ +/* Copyright (C) 1996 Free Software Foundation, Inc. + Contributed by Richard Henderson (rth@tamu.edu) + + This file is part of the GNU C Library. 
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Library General Public License as
+   published by the Free Software Foundation; either version 2 of the
+   License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Library General Public License for more details.
+
+   You should have received a copy of the GNU Library General Public
+   License along with the GNU C Library; see the file COPYING.LIB. If
+   not, write to the Free Software Foundation, Inc., 675 Mass Ave,
+   Cambridge, MA 02139, USA. */
+
+
+#include <sysdep.h>
+
+#ifdef __linux__
+# include <asm/gentrap.h>
+# include <asm/pal.h>
+#else
+# include <machine/pal.h>
+#endif
+
+        .set noat
+
+/* div: signed 32-bit divide producing quotient and remainder.
+
+   On entry:
+        a0 == address of the two-longword result
+              (quotient at offset 0, remainder at offset 4)
+        a1 == dividend
+        a2 == divisor
+   On exit:
+        v0 == a0
+
+   The work is done by an unsigned shift-and-subtract loop on the
+   magnitudes of the operands; the signs are patched up afterwards.
+   A zero divisor raises GEN_INTDIV through PAL_gentrap and, if the
+   trap returns, stores a zero quotient and remainder. */
+
+        .align 4
+        .globl div
+        .ent div
+div:
+        .frame sp, 0, ra
+#ifdef PROF
+        ldgp    gp, 0(pv)
+        lda     AT, _mcount
+        jsr     AT, (AT), _mcount
+        .prologue 1
+#else
+        .prologue 0
+#endif
+
+#define dividend t0
+#define divisor t1
+#define mask t2
+#define quotient t3
+#define modulus t4
+#define tmp1 t5
+#define tmp2 t6
+#define compare t7
+
+        /* find correct sign for input to unsigned divide loop.
+
+           The loops below read modulus, quotient and mask, so all three
+           must be seeded here: modulus starts as the magnitude of the
+           dividend (it is reduced to the remainder), quotient starts at
+           zero and mask starts at one (it follows the divisor as it is
+           shifted left and back right).
+
+           negl sign-extends its 32-bit result, so the magnitude of the
+           most negative longword would still look negative to the 64-bit
+           unsigned compares in the loops and the left-shift loop would
+           never terminate. Zero-extending both magnitudes avoids that. */
+        sextl   a1, a1                          # e0 :
+        sextl   a2, a2                          # .. e1 :
+        negl    a1, modulus
+        negl    a2, divisor
+        clr     quotient
+        ldiq    mask, 1
+        cmovge  a1, a1, modulus
+        cmovge  a2, a2, divisor
+        zapnot  modulus, 15, modulus
+        zapnot  divisor, 15, divisor
+        beq     a2, $divbyzero                  # e1 :
+        unop                                    # :
+
+        /* shift divisor left, using 3-bit shifts for 32-bit divides as we
+           can't overflow. Three-bit shifts will result in looping three
+           times less here, but can result in two loops more later. Thus
+           using a large shift isn't worth it (and s8addq pairs better than
+           a shift). */
+
+1:      cmpult  divisor, modulus, compare       # e0 :
+        s8addq  divisor, zero, divisor          # .. e1 :
+        s8addq  mask, zero, mask                # e0 :
+        bne     compare, 1b                     # .. e1 :
+
+        /* start to go right again. */
+2:      addq    quotient, mask, tmp2            # e1 :
+        srl     mask, 1, mask                   # .. e0 :
+        cmpule  divisor, modulus, compare       # e0 :
+        subq    modulus, divisor, tmp1          # .. e1 :
+        cmovne  compare, tmp2, quotient         # e1 :
+        srl     divisor, 1, divisor             # .. e0 :
+        cmovne  compare, tmp1, modulus          # e0 :
+        bne     mask, 2b                        # .. e1 :
+
+        /* find correct sign for result. */
+        xor     a1, a2, compare                 # e0 :
+        negl    quotient, tmp1                  # .. e1 :
+        negl    modulus, tmp2                   # e0 :
+        cmovlt  compare, tmp1, quotient         # .. e1 :
+        cmovlt  a1, tmp2, modulus               # e1 :
+
+        /* and store it away in the structure. */
+        stl     quotient, 0(a0)                 # .. e0 :
+        mov     a0, v0                          # e1 :
+        stl     modulus, 4(a0)                  # .. e0 :
+        ret                                     # e1 :
+
+$divbyzero:
+        mov     a0, v0
+        ldiq    a0, GEN_INTDIV
+        call_pal PAL_gentrap
+
+        /* if trap returns, return zero. */
+        stl     zero, 0(v0)
+        stl     zero, 4(v0)
+        ret
+
+        .end div
diff --git a/sysdeps/alpha/divl.S b/sysdeps/alpha/divl.S
index 6990665937..fdf053fc25 100644
--- a/sysdeps/alpha/divl.S
+++ b/sysdeps/alpha/divl.S
@@ -1,6 +1,6 @@
 #define IS_REM 0
 #define SIZE 4
-#define SIGNED 1
-#define FUNC_NAME __divl
+#define UFUNC_NAME __divlu
+#define SFUNC_NAME __divl
 
 #include "divrem.h"
diff --git a/sysdeps/alpha/divlu.S b/sysdeps/alpha/divlu.S
deleted file mode 100644
index ee96c95008..0000000000
--- a/sysdeps/alpha/divlu.S
+++ /dev/null
@@ -1,6 +0,0 @@
-#define IS_REM 0
-#define SIZE 4
-#define SIGNED 0
-#define FUNC_NAME __divlu
-
-#include "divrem.h"
diff --git a/sysdeps/alpha/divq.S b/sysdeps/alpha/divq.S
index bde3425f37..8c88af9736 100644
--- a/sysdeps/alpha/divq.S
+++ b/sysdeps/alpha/divq.S
@@ -1,6 +1,6 @@
 #define IS_REM 0
 #define SIZE 8
-#define SIGNED 1
-#define FUNC_NAME __divq
+#define UFUNC_NAME __divqu
+#define SFUNC_NAME __divq
 
 #include "divrem.h"
diff --git a/sysdeps/alpha/divqu.S b/sysdeps/alpha/divqu.S
deleted file mode 100644
index 72dcf971dd..0000000000
--- a/sysdeps/alpha/divqu.S
+++ /dev/null
@@ -1,6 +0,0 @@
-#define IS_REM 0
-#define SIZE 8
-#define SIGNED 0
-#define FUNC_NAME __divqu
-
-#include "divrem.h"
diff --git
a/sysdeps/alpha/divrem.h b/sysdeps/alpha/divrem.h index eaf892b3c6..b83908dc9f 100644 --- a/sysdeps/alpha/divrem.h +++ b/sysdeps/alpha/divrem.h @@ -1,25 +1,25 @@ /* Copyright (C) 1996 Free Software Foundation, Inc. Contributed by David Mosberger (davidm@cs.arizona.edu). -This file is part of the GNU C Library. + This file is part of the GNU C Library. -The GNU C Library is free software; you can redistribute it and/or -modify it under the terms of the GNU Library General Public License as -published by the Free Software Foundation; either version 2 of the -License, or (at your option) any later version. + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. -The GNU C Library is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -Library General Public License for more details. + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. -You should have received a copy of the GNU Library General Public -License along with the GNU C Library; see the file COPYING.LIB. If -not, write to the Free Software Foundation, Inc., 675 Mass Ave, -Cambridge, MA 02139, USA. */ + You should have received a copy of the GNU Library General Public + License along with the GNU C Library; see the file COPYING.LIB. If + not, write to the Free Software Foundation, Inc., 675 Mass Ave, + Cambridge, MA 02139, USA. */ /* The current Alpha chips don't provide hardware for integer -division. The C compiler expects the functions + division. 
The C compiler expects the functions __divqu: 64-bit unsigned long divide __remqu: 64-bit unsigned long remainder @@ -27,10 +27,10 @@ division. The C compiler expects the functions __divlu/__remlu: unsigned 32-bit __divls/__remls: signed 32-bit -These are not normal C functions: instead of the normal calling -sequence, these expect their arguments in registers t10 and t11, and -return the result in t12 (aka pv). Register AT may be clobbered -(assembly temporary), anything else must be saved. */ + These are not normal C functions: instead of the normal calling + sequence, these expect their arguments in registers t10 and t11, and + return the result in t12 (aka pv). Register AT may be clobbered + (assembly temporary), anything else must be saved. */ #include <sysdep.h> @@ -41,147 +41,185 @@ return the result in t12 (aka pv). Register AT may be clobbered # include <machine/pal.h> #endif -#ifdef DEBUG -# define arg1 a0 -# define arg2 a1 -# define result v0 -# define mask t0 -# define tmp0 t1 -# define tmp1 t2 -# define sign t3 -# define retaddr ra -#else -# define arg1 t10 -# define arg2 t11 -# define result t12 -# define mask v0 -# define tmp0 t0 -# define tmp1 t1 -# define sign t2 -# define retaddr t9 -#endif +#define mask v0 +#define divisor t0 +#define compare AT +#define tmp1 t2 +#define tmp2 t3 +#define retaddr t9 +#define arg1 t10 +#define arg2 t11 +#define result t12 -# define divisor arg2 #if IS_REM -# define dividend result -# define quotient arg1 -# define GETDIVIDEND bis arg1,zero,dividend +# define DIV_ONLY(x,y...) +# define REM_ONLY(x,y...) x,##y +# define modulus result +# define quotient t1 +# define GETSIGN(x) mov arg1, x +# define STACK 32 #else -# define dividend arg1 -# define quotient result -# define GETDIVIDEND +# define DIV_ONLY(x,y...) x,##y +# define REM_ONLY(x,y...) 
+# define modulus t1 +# define quotient result +# define GETSIGN(x) xor arg1, arg2, x +# define STACK 48 #endif #if SIZE == 8 -# define LONGIFYarg1 GETDIVIDEND -# define LONGIFYarg2 -#else -# if SIGNED -# define LONGIFYarg1 addl arg1,zero,dividend -# define LONGIFYarg2 addl arg2,zero,divisor -# else -# define LONGIFYarg1 zapnot arg1,0x0f,dividend -# define LONGIFYarg2 zapnot arg2,0x0f,divisor -# endif -#endif - -#if SIGNED -# define SETSIGN(sign,reg,tmp) subq zero,reg,tmp; cmovlt sign,tmp,reg -# if IS_REM -# define GETSIGN(x,y,s) bis x,zero,s -# else -# define GETSIGN(x,y,s) xor x,y,s -# endif +# define LONGIFY(x,y) mov x,y +# define SLONGIFY(x,y) mov x,y +# define _SLONGIFY(x) +# define NEG(x,y) negq x,y #else -# define SETSIGN(sign,reg,tmp) -# define GETSIGN(x,y,s) +# define LONGIFY(x,y) zapnot x,15,y +# define SLONGIFY(x,y) sextl x,y +# define _SLONGIFY(x) sextl x,x +# define NEG(x,y) negl x,y #endif .set noreorder .set noat - .ent FUNC_NAME - .globl FUNC_NAME - -#define FRAME_SIZE 0x30 + .ent UFUNC_NAME + .globl UFUNC_NAME - .align 5 -FUNC_NAME: + .align 3 +UFUNC_NAME: + lda sp, -STACK(sp) + .frame sp, STACK, retaddr, 0 #ifdef PROF - lda sp, -0x18(sp) - stq ra, 0x00(sp) - stq pv, 0x08(sp) - stq gp, 0x10(sp) + stq ra, 0(sp) + stq pv, 8(sp) + stq gp, 16(sp) br AT, 1f 1: ldgp gp, 0(AT) mov retaddr, ra - jsr AT, _mcount + lda AT, _mcount + jsr AT, (AT), _mcount - ldq ra, 0x00(sp) - ldq pv, 0x08(sp) - ldq gp, 0x10(sp) - lda sp, 0x18(sp) + ldq ra, 0(sp) + ldq pv, 8(sp) + ldq gp, 16(sp) #endif - .frame sp, FRAME_SIZE, retaddr, 0 - lda sp,-FRAME_SIZE(sp) - .prologue 1 - stq arg1,0x00(sp) - LONGIFYarg1 - stq arg2,0x08(sp) - LONGIFYarg2 - stq mask,0x10(sp) - bis zero,1,mask - stq tmp0,0x18(sp) - bis zero,zero,quotient - stq tmp1,0x20(sp) - beq divisor,$divbyzero - stq sign,0x28(sp) - GETSIGN(dividend,divisor,sign) -#if SIGNED - subq zero,dividend,tmp0 - subq zero,divisor,tmp1 - cmovlt dividend,tmp0,dividend - cmovlt divisor,tmp1,divisor + .prologue 0 + +$udiv: + stq t0, 
0(sp) + LONGIFY (arg2, divisor) + stq t1, 8(sp) + LONGIFY (arg1, modulus) + stq v0, 16(sp) + clr quotient + stq tmp1, 24(sp) + ldiq mask, 1 + DIV_ONLY(stq tmp2,32(sp)) + + beq divisor, $divbyzero + + .align 3 +#if SIZE == 8 + /* Shift divisor left. */ +1: cmpult divisor, modulus, compare + blt divisor, 2f + addq divisor, divisor, divisor + addq mask, mask, mask + bne compare, 1b + unop +2: +#else + /* Shift divisor left using 3-bit shifts as we can't overflow. + This results in looping three times less here, but up to + two more times later. Thus using a large shift isn't worth it. */ +1: cmpult divisor, modulus, compare + s8addq divisor, zero, divisor + s8addq mask, zero, mask + bne compare, 1b #endif - /* - * Shift divisor left until either bit 63 is set or until it - * is at least as big as the dividend: - */ - .align 3 -1: cmpule dividend,divisor,AT - blt divisor,2f - blbs AT,2f - addq mask,mask,mask - addq divisor,divisor,divisor - br 1b - - .align 3 -2: addq mask,quotient,tmp0 - cmpule divisor,dividend,AT - subq dividend,divisor,tmp1 - srl divisor,1,divisor - srl mask,1,mask - cmovlbs AT,tmp0,quotient - cmovlbs AT,tmp1,dividend - bne mask,2b - - ldq arg1,0x00(sp) - SETSIGN(sign,result,tmp0) -$done: ldq arg2,0x08(sp) - ldq mask,0x10(sp) - ldq tmp0,0x18(sp) - ldq tmp1,0x20(sp) - ldq sign,0x28(sp) - lda sp,FRAME_SIZE(sp) - ret zero,(retaddr),0 + + /* Now go back to the right. 
*/ +3: DIV_ONLY(addq quotient, mask, tmp2) + srl mask, 1, mask + cmpule divisor, modulus, compare + subq modulus, divisor, tmp1 + DIV_ONLY(cmovne compare, tmp2, quotient) + srl divisor, 1, divisor + cmovne compare, tmp1, modulus + bne mask, 3b + +$done: ldq t0, 0(sp) + ldq t1, 8(sp) + ldq v0, 16(sp) + ldq tmp1, 24(sp) + DIV_ONLY(ldq tmp2, 32(sp)) + lda sp, STACK(sp) + ret zero, (retaddr), 1 $divbyzero: - lda a0,GEN_INTDIV(zero) + mov a0, tmp1 + ldiq a0, GEN_INTDIV call_pal PAL_gentrap - bis zero,zero,result /* if trap returns, return 0 */ - ldq arg1,0x00(sp) + mov tmp1, a0 + clr result /* If trap returns, return zero. */ br $done - END(FUNC_NAME) + .end UFUNC_NAME + + .ent SFUNC_NAME + .globl SFUNC_NAME + + .align 3 +SFUNC_NAME: + lda sp, -STACK(sp) + .frame sp, STACK, retaddr, 0 +#ifdef PROF + stq ra, 0(sp) + stq pv, 8(sp) + stq gp, 16(sp) + + br AT, 1f +1: ldgp gp, 0(AT) + + mov retaddr, ra + jsr AT, _mcount + + ldq ra, 0(sp) + ldq pv, 8(sp) + ldq gp, 16(sp) +#endif + .prologue 0 + + or arg1, arg2, AT + _SLONGIFY(AT) + bge AT, $udiv /* don't need to mess with signs */ + + /* Save originals and find absolute values. */ + stq arg1, 0(sp) + NEG (arg1, AT) + stq arg2, 8(sp) + cmovge AT, AT, arg1 + stq retaddr, 16(sp) + NEG (arg2, AT) + stq tmp1, 24(sp) + cmovge AT, AT, arg2 + + /* Do the unsigned division. */ + bsr retaddr, UFUNC_NAME + + /* Restore originals and adjust the sign of the result. */ + ldq arg1, 0(sp) + ldq arg2, 8(sp) + GETSIGN (AT) + NEG (result, tmp1) + _SLONGIFY(AT) + ldq retaddr, 16(sp) + cmovlt AT, tmp1, result + ldq tmp1, 24(sp) + + lda sp, STACK(sp) + ret zero, (retaddr), 1 + + .end SFUNC_NAME diff --git a/sysdeps/alpha/ffs.S b/sysdeps/alpha/ffs.S index b84a51d326..959d1046d4 100644 --- a/sysdeps/alpha/ffs.S +++ b/sysdeps/alpha/ffs.S @@ -27,7 +27,14 @@ architecture. 
*/ .set noat ENTRY(ffs) +#ifdef PROF + ldgp gp, 0(pv) + lda AT, _mcount + jsr AT, (AT), _mcount + .prologue 1 +#else .prologue 0 +#endif ldq_u zero, 0(sp) # on the 21064, this helps dual-issuing addl a0, zero, a0 # the last insn and reduces the stall diff --git a/sysdeps/alpha/htonl.S b/sysdeps/alpha/htonl.S index 9777e461cc..55d4f62bc7 100644 --- a/sysdeps/alpha/htonl.S +++ b/sysdeps/alpha/htonl.S @@ -19,7 +19,15 @@ Cambridge, MA 02139, USA. */ #include <sysdep.h> ENTRY(__htonl) +#ifdef PROF + ldgp gp, 0(pv) + lda AT, _mcount + jsr AT, (AT), _mcount + .prologue 1 +#else .prologue 0 +#endif + extlh a0, 5, t1 # t1 = dd000000 zap a0, 0xfd, t2 # t2 = 0000cc00 sll t2, 5, t2 # t2 = 00198000 diff --git a/sysdeps/alpha/htons.S b/sysdeps/alpha/htons.S index 7717636782..743d3e2474 100644 --- a/sysdeps/alpha/htons.S +++ b/sysdeps/alpha/htons.S @@ -19,7 +19,15 @@ Cambridge, MA 02139, USA. */ #include <sysdep.h> ENTRY(__htons) +#ifdef PROF + ldgp gp, 0(pv) + lda AT, _mcount + jsr AT, (AT), _mcount + .prologue 1 +#else .prologue 0 +#endif + extwh a0, 7, t1 # t1 = bb00 extbl a0, 1, v0 # v0 = 00aa bis v0, t1, v0 # v0 = bbaa diff --git a/sysdeps/alpha/ldiv.S b/sysdeps/alpha/ldiv.S new file mode 100644 index 0000000000..ebbe055870 --- /dev/null +++ b/sysdeps/alpha/ldiv.S @@ -0,0 +1,109 @@ +/* Copyright (C) 1996 Free Software Foundation, Inc. + Contributed by Richard Henderson (rth@tamu.edu) + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. 
+
+   You should have received a copy of the GNU Library General Public
+   License along with the GNU C Library; see the file COPYING.LIB. If
+   not, write to the Free Software Foundation, Inc., 675 Mass Ave,
+   Cambridge, MA 02139, USA. */
+
+
+#include <sysdep.h>
+
+#ifdef __linux__
+# include <asm/gentrap.h>
+# include <asm/pal.h>
+#else
+# include <machine/pal.h>
+#endif
+
+        .set noat
+
+/* ldiv: signed 64-bit divide producing quotient and remainder.
+
+   On entry:
+        a0 == address of the two-quadword result
+              (quotient at offset 0, remainder at offset 8)
+        a1 == dividend
+        a2 == divisor
+   On exit:
+        v0 == a0
+
+   The work is done by an unsigned shift-and-subtract loop on the
+   magnitudes of the operands; the signs are patched up afterwards.
+   A zero divisor raises GEN_INTDIV through PAL_gentrap and, if the
+   trap returns, stores a zero quotient and remainder. */
+
+        .align 4
+        .globl ldiv
+        .ent ldiv
+ldiv:
+        .frame sp, 0, ra
+#ifdef PROF
+        ldgp    gp, 0(pv)
+        lda     AT, _mcount
+        jsr     AT, (AT), _mcount
+        .prologue 1
+#else
+        .prologue 0
+#endif
+
+#define dividend t0
+#define divisor t1
+#define mask t2
+#define quotient t3
+#define modulus t4
+#define tmp1 t5
+#define tmp2 t6
+#define compare t7
+
+        /* find correct sign for input to unsigned divide loop.
+
+           The loops below read modulus, quotient and mask, so all three
+           must be seeded here: modulus starts as the magnitude of the
+           dividend (it is reduced to the remainder), quotient starts at
+           zero and mask starts at one (it follows the divisor as it is
+           shifted left and back right). */
+        mov     a1, modulus
+        mov     a2, divisor
+        negq    a1, tmp1
+        negq    a2, tmp2
+        clr     quotient
+        ldiq    mask, 1
+        cmovlt  a1, tmp1, modulus
+        cmovlt  a2, tmp2, divisor
+        beq     a2, $divbyzero                  # e1 :
+        unop                                    # :
+
+        /* shift divisor left. */
+1:      cmpult  divisor, modulus, compare       # e0 :
+        blt     divisor, 2f                     # .. e1 :
+        addq    divisor, divisor, divisor       # e0 :
+        addq    mask, mask, mask                # .. e1 :
+        bne     compare, 1b                     # e1 :
+        unop                                    # :
+
+        /* start to go right again. */
+2:      addq    quotient, mask, tmp2            # e1 :
+        srl     mask, 1, mask                   # .. e0 :
+        cmpule  divisor, modulus, compare       # e0 :
+        subq    modulus, divisor, tmp1          # .. e1 :
+        cmovne  compare, tmp2, quotient         # e1 :
+        srl     divisor, 1, divisor             # .. e0 :
+        cmovne  compare, tmp1, modulus          # e0 :
+        bne     mask, 2b                        # .. e1 :
+
+        /* find correct sign for result. */
+        xor     a1, a2, compare                 # e0 :
+        negq    quotient, tmp1                  # .. e1 :
+        negq    modulus, tmp2                   # e0 :
+        cmovlt  compare, tmp1, quotient         # .. e1 :
+        cmovlt  a1, tmp2, modulus               # e1 :
+
+        /* and store it away in the structure. */
+9:      stq     quotient, 0(a0)                 # .. e0 :
+        mov     a0, v0                          # e1 :
+        stq     modulus, 8(a0)                  # .. e0 :
+        ret                                     # e1 :
+
+$divbyzero:
+        mov     a0, v0
+        lda     a0, GEN_INTDIV
+        call_pal PAL_gentrap
+
+        /* if trap returns, return zero. */
+        stq     zero, 0(v0)
+        stq     zero, 8(v0)
+        ret
+
+        .end ldiv
+
+weak_alias(ldiv, lldiv)
diff --git a/sysdeps/alpha/lldiv.S b/sysdeps/alpha/lldiv.S
new file mode 100644
index 0000000000..80c450a3fc
--- /dev/null
+++ b/sysdeps/alpha/lldiv.S
@@ -0,0 +1 @@
+/* lldiv is the same as ldiv on the Alpha. */
diff --git a/sysdeps/alpha/memchr.S b/sysdeps/alpha/memchr.S
index a47ac96e01..ecd26e8d6f 100644
--- a/sysdeps/alpha/memchr.S
+++ b/sysdeps/alpha/memchr.S
@@ -40,7 +40,14 @@ For correctness consider that:
         .set noat
 
 ENTRY(memchr)
+#ifdef PROF
+        ldgp    gp, 0(pv)
+        lda     AT, _mcount
+        jsr     AT, (AT), _mcount
+        .prologue 1
+#else
         .prologue 0
+#endif
 
         beq     a2, $not_found
         ldq_u   t0, 0(a0)       # load first quadword (a0 may be misaligned)
diff --git a/sysdeps/alpha/memcpy.S b/sysdeps/alpha/memcpy.S
new file mode 100644
index 0000000000..4ee9c115db
--- /dev/null
+++ b/sysdeps/alpha/memcpy.S
@@ -0,0 +1,276 @@
+/* Copyright (C) 1996 Free Software Foundation, Inc.
+   Contributed by Richard Henderson (rth@tamu.edu)
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Library General Public License as
+   published by the Free Software Foundation; either version 2 of the
+   License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Library General Public License for more details.
+
+   You should have received a copy of the GNU Library General Public
+   License along with the GNU C Library; see the file COPYING.LIB. If
+   not, write to the Free Software Foundation, Inc., 675 Mass Ave,
+   Cambridge, MA 02139, USA. */
+
+
+/* This is the child of the C-with-inline-assembly memcpy posted by
+   Martin Ostermann (ost@comnets.rwth-aachen.de).
+ + This is generally scheduled for the EV5, but whenever necessary and + possible, the autoswap slotting feature of the EV5 is used so that the + code lays out nicely for the EV4 as well. */ + +#include <alpha/regdef.h> + + .set noreorder + + .text + + .ent copy_fwd_aligned +copy_fwd_aligned: + .frame sp, 0, ra, 0 + .prologue 0 + + /* Aligned forward copy main loop. On entry to this basic block: + t0 == source word waiting to be stored + t2 == loop counter + a0 == destination pointer + a1 == source pointer + a2 mod 8 == byte count in final word */ + .align 4 +$fa_loop: + and t2, 7, t1 # e0 : + beq t1, 1f # .. e1 : + +0: stq_u t0, 0(a0) # e0 : + subq t1, 1, t1 # .. e1 : + ldq_u t0, 8(a1) # e0 : copy up to seven words + addq a0, 8, a0 # .. e1 : + addq a1, 8, a1 # e0 : + bne t1, 0b # .. e1 : + +1: bic t2, 7, t2 # e0 : + beq t2, $fa_tail # .. e1 : + +2: stq_u t0, 0(a0) # e0 : + addq a0, 64, a0 # .. e1 : + ldq_u t3, 8(a1) # e0 : copy eight words as fast as we can + ldq_u t4, 16(a1) # .. e1 : + ldq_u t5, 24(a1) # e0 : + ldq_u t6, 32(a1) # .. e1 : + ldq_u t7, 40(a1) # e0 : + ldq_u t8, 48(a1) # .. e1 : + ldq_u t9, 56(a1) # e0 : + ldq_u t0, 64(a1) # .. e1 : + stq_u t3, -56(a0) # e0 : + subq t2, 8, t2 # .. e1 : + stq_u t4, -48(a0) # e0 : + addq a1, 64, a1 # .. e1 : + stq_u t5, -40(a0) # e0 : + stq_u t6, -32(a0) # e0 : + stq_u t7, -24(a0) # e0 : + stq_u t8, -16(a0) # e0 : + stq_u t9, -8(a0) # e0 : + bne t2, 2b # .. e1 : + + /* Take care of a partial word tail. */ +$fa_tail: + and a2, 7, t3 # e0 : + bne t3, 1f # .. e1 (zdb) + + /* Aligned copy, aligned tail, final store. */ + stq_u t0, 0(a0) + ret + +1: ldq_u t1, 0(a0) # e1 : + mskql t0, a2, t0 # .. e1 : + mskqh t1, a2, t1 # e0 (stall) + bis t0, t1, t0 # e1 : + stq_u t0, 0(a0) # e0 : + ret # .. e1 : + + /* This is the actual entry point to this function. */ + .align 3 +$fwd_aligned: + ldq_u t0, 0(a1) # e0 : + and a0, 7, t3 # .. 
e1 :
+        addq    a2, t3, a2      # e0 :
+        subq    a2, 1, t2       # e1 :
+        sra     t2, 3, t2       # e0 :
+        beq     t3, $fa_loop    # .. e1 :
+
+        ldq_u   t1, 0(a0)       # e0 :
+        beq     t2, $fa_small   # .. e1 :
+        mskqh   t0, a0, t0      # e0 :
+        mskql   t1, a0, t3      # e0 :
+        bis     t0, t3, t0      # e0 :
+        br      $fa_loop        # .. e1 :
+
+        /* The move affects exactly one destination word. */
+$fa_small:
+        mskqh   t0, a0, t0      # e0 :
+        and     a2, 7, t4       # .. e1 :
+        mskql   t1, a0, t3      # e0 :
+        bne     t4, 1f          # .. e1 :
+
+        or      t0, t3, t0      # e0 :
+        unop                    # :
+        stq_u   t0, 0(a0)       # e0 :
+        ret                     # .. e1 :
+
+1:      mskql   t0, a2, t0      # e0 :
+        mskqh   t1, a2, t1      # e0 :
+        or      t0, t3, t0      # e0 :
+        or      t0, t1, t0      # e1 :
+        stq_u   t0, 0(a0)       # e0 :
+        ret                     # .. e1 :
+
+        .end copy_fwd_aligned
+
+/* memcpy entry point.
+
+   On entry:
+        a0 == destination pointer
+        a1 == source pointer
+        a2 == byte count
+   On exit:
+        v0 == the original destination pointer
+
+   A zero count returns at once. Otherwise control passes to
+   $fwd_aligned when source and destination agree in their low three
+   address bits, and to $fwd_unaligned when they do not. */
+
+        .ent memcpy
+        .globl memcpy
+        .align 3
+memcpy:
+        .frame sp, 0, ra, 0
+#ifdef PROF
+        /* At procedure entry it is pv, not ra, that holds the address
+           ldgp needs as its base; ra still points into the caller. */
+        ldgp    gp, 0(pv)
+        lda     AT, _mcount
+        jsr     AT, (AT), _mcount
+        .prologue 1
+#else
+        .prologue 0
+#endif
+
+        mov     a0, v0
+        beq     a2, $zero_length
+
+        /* Are source and destination co-aligned? */
+        xor     a0, a1, t0
+        unop
+        and     t0, 7, t0
+        beq     t0, $fwd_aligned
+        br      $fwd_unaligned
+
+        .end memcpy
+
+        .ent copy_fwd_unaligned
+copy_fwd_unaligned:
+        .frame sp, 0, ra, 0
+        .prologue 0
+
+        /* Unaligned forward copy main loop. On entry to this basic block:
+           t0 == source low word, unshifted
+           t2 == loop counter
+           t7 == last source byte + 1
+           a0 == destination pointer
+           a1 == source pointer
+           a2 mod 8 == byte count in final word */
+        .align 4
+$fu_loop:
+        beq     t2, $fu_tail    # e1 :
+        blbc    t2, 0f          # e1 :
+
+        ldq_u   t1, 8(a1)       # e1 : copy one unaligned word
+        extql   t0, a1, t3      # .. e0 :
+        addq    a1, 8, a1       # e0 :
+        addq    a0, 8, a0       # .. e1 :
+        extqh   t1, a1, t4      # e0 :
+        subq    t2, 1, t2       # .. e1 :
+        mov     t1, t0          # e0 :
+        or      t3, t4, t3      # .. e1 :
+        stq_u   t3, -8(a0)      # e0 :
+        beq     t2, $fu_tail    # .. e1 :
+
+0:      ldq_u   t1, 8(a1)       # e1 : copy two unaligned words
+        extql   t0, a1, t3      # .. e0 :
+        ldq_u   t0, 16(a1)      # e0 :
+        subq    t2, 2, t2       # .. e1 :
+        extqh   t1, a1, t4      # e0 :
+        addq    a0, 16, a0      # .. e1 :
+        extql   t1, a1, t5      # e0 :
+        or      t3, t4, t3      # ..
e1 : + extqh t0, a1, t6 # e0 : + addq a1, 16, a1 # .. e1 : + stq_u t3, -16(a0) # e0 : + or t5, t6, t5 # .. e1 : + stq_u t5, -8(a0) # e0 : + bne t2, 0b # .. e1 : + + /* Take care of a partial words tail. */ +$fu_tail: + ldq_u t4, -1(t7) # e1 : + extql t0, a1, t3 # .. e0 : + extqh t4, a1, t4 # e0 (stall) + and a2, 7, t5 # .. e1 : + or t3, t4, t3 # e0 : + beq t5, 1f # .. e1 : + + ldq_u t1, 0(a0) # e1 : + mskql t3, a2, t3 # .. e0 : + mskqh t1, a2, t1 # e0 (stall) + or t1, t3, t3 # e1 : + +1: stq_u t3, 0(a0) # e0 : + ret # .. e1 : + + /* The entry point to the unaligned forward copy. */ + .align 3 +$fwd_unaligned: + ldq_u t0, 0(a1) # e0 : load initial bits of src + addq a1, a2, t7 # .. e1 : record last byte + 1 of src + and a0, 7, t3 # e0 : find dst misalignment + addq a2, t3, a2 # e1 : find number of words affected + subq a2, 1, t2 # e0 : + cmple a2, 8, t4 # .. e1 : are we dealing with a small block? + subq a1, t3, a1 # e0 : + bne t4, $fu_small # .. e1 : + srl t2, 3, t2 # e0 : + beq t3, $fu_loop # .. e1 : + + /* Take care of an unaligned dst head. */ + ldq_u t5, 0(a0) # e0 : + ldq_u t1, 8(a1) # .. e1 : + extql t0, a1, t3 # e0 : + addq a0, 8, a0 # .. e1 : + extqh t1, a1, t4 # e0 : + addq a1, 8, a1 # .. e1 : + mskql t5, a0, t5 # e0 : + or t3, t4, t3 # .. e1 : + mskqh t3, a0, t3 # e0 : + subq t2, 1, t2 # .. e1 : + or t3, t5, t3 # e0 : + mov t1, t0 # .. e1 : + stq_u t3, -8(a0) # e0 : + br $fu_loop # .. e1 : + + /* The move affects exactly one destination word. */ + .align 3 +$fu_small: + ldq_u t2, 0(a0) # e1 : + extql t0, a1, t3 # .. e0 : + ldq_u t1, -1(t7) # e0 : + and a2, 7, t8 # .. e1 : + mskqh t2, a2, t6 # e0 : + mskql t2, a0, t5 # e0 : + extqh t1, a1, t4 # e0 : + cmovne t8, t6, t8 # .. e1 : + or t3, t4, t3 # e0 : + or t5, t8, t5 # .. e1 : + mskqh t3, a0, t3 # e0 : + and a2, 7, t8 # .. e1 : + mskql t3, a2, t6 # e0 : + cmovne t8, t6, t8 # e1 : + or t3, t5, t3 # e0 : + unop # : + stq_u t3, 0(a0) # e0 : + +$zero_length: + ret # .. 
e1 : + + .end copy_fwd_unaligned diff --git a/sysdeps/alpha/memset.S b/sysdeps/alpha/memset.S index 55271f00ea..2b29357c6e 100644 --- a/sysdeps/alpha/memset.S +++ b/sysdeps/alpha/memset.S @@ -85,7 +85,14 @@ $tail: bne t4, 1f # is there a tail to do? .end memset_loop ENTRY(memset) +#ifdef PROF + ldgp gp, 0(pv) + lda AT, _mcount + jsr AT, (AT), _mcount + .prologue 1 +#else .prologue 0 +#endif zapnot a1, 1, a1 # e0 : zero extend input character mov a0, v0 # .. e1 : move return value in place diff --git a/sysdeps/alpha/reml.S b/sysdeps/alpha/reml.S index b631a02c31..8c00365ee3 100644 --- a/sysdeps/alpha/reml.S +++ b/sysdeps/alpha/reml.S @@ -1,6 +1,6 @@ #define IS_REM 1 #define SIZE 4 -#define SIGNED 1 -#define FUNC_NAME __reml +#define UFUNC_NAME __remlu +#define SFUNC_NAME __reml #include "divrem.h" diff --git a/sysdeps/alpha/remlu.S b/sysdeps/alpha/remlu.S deleted file mode 100644 index 8d527e4678..0000000000 --- a/sysdeps/alpha/remlu.S +++ /dev/null @@ -1,6 +0,0 @@ -#define IS_REM 1 -#define SIZE 4 -#define SIGNED 0 -#define FUNC_NAME __remlu - -#include "divrem.h" diff --git a/sysdeps/alpha/remq.S b/sysdeps/alpha/remq.S index 8bd9f334f4..cd1064af4e 100644 --- a/sysdeps/alpha/remq.S +++ b/sysdeps/alpha/remq.S @@ -1,6 +1,6 @@ #define IS_REM 1 #define SIZE 8 -#define SIGNED 1 -#define FUNC_NAME __remq +#define UFUNC_NAME __remqu +#define SFUNC_NAME __remq #include "divrem.h" diff --git a/sysdeps/alpha/remqu.S b/sysdeps/alpha/remqu.S deleted file mode 100644 index 14a7486148..0000000000 --- a/sysdeps/alpha/remqu.S +++ /dev/null @@ -1,6 +0,0 @@ -#define IS_REM 1 -#define SIZE 8 -#define SIGNED 0 -#define FUNC_NAME __remqu - -#include "divrem.h" diff --git a/sysdeps/alpha/s_copysign.S b/sysdeps/alpha/s_copysign.S index 95eb608666..739d3deb79 100644 --- a/sysdeps/alpha/s_copysign.S +++ b/sysdeps/alpha/s_copysign.S @@ -20,7 +20,15 @@ Cambridge, MA 02139, USA. 
*/ #include <sysdep.h> ENTRY(__copysign) +#ifdef PROF + ldgp gp, 0(pv) + lda AT, _mcount + jsr AT, (AT), _mcount + .prologue 1 +#else .prologue 0 +#endif + cpys $f17,$f16,$f0 ret diff --git a/sysdeps/alpha/s_fabs.S b/sysdeps/alpha/s_fabs.S index 12c0abdf75..75976333f9 100644 --- a/sysdeps/alpha/s_fabs.S +++ b/sysdeps/alpha/s_fabs.S @@ -20,7 +20,15 @@ Cambridge, MA 02139, USA. */ #include <sysdep.h> ENTRY(__fabs) +#ifdef PROF + ldgp gp, 0(pv) + lda AT, _mcount + jsr AT, (AT), _mcount + .prologue 1 +#else .prologue 0 +#endif + cpys $f31,$f16,$f0 ret diff --git a/sysdeps/alpha/setjmp.S b/sysdeps/alpha/setjmp.S index 59929a0234..f57d49017e 100644 --- a/sysdeps/alpha/setjmp.S +++ b/sysdeps/alpha/setjmp.S @@ -23,6 +23,10 @@ Cambridge, MA 02139, USA. */ extra arguments. */ ENTRY (__sigsetjmp) ldgp $29, 0($27) +#ifdef PROF + lda AT, _mcount + jsr AT, (AT), _mcount +#endif .prologue 1 bis $30, $30, $18 /* Pass SP as 3rd arg. */ diff --git a/sysdeps/alpha/stpcpy.S b/sysdeps/alpha/stpcpy.S index 0dc44d353a..9c2668b535 100644 --- a/sysdeps/alpha/stpcpy.S +++ b/sysdeps/alpha/stpcpy.S @@ -27,6 +27,10 @@ Cambridge, MA 02139, USA. */ ENTRY(__stpcpy) ldgp gp, 0(pv) +#ifdef PROF + lda AT, _mcount + jsr AT, (AT), _mcount +#endif .prologue 1 jsr t9, __stxcpy # do the work of the copy diff --git a/sysdeps/alpha/stpncpy.S b/sysdeps/alpha/stpncpy.S index 50cda2672e..90470cf95c 100644 --- a/sysdeps/alpha/stpncpy.S +++ b/sysdeps/alpha/stpncpy.S @@ -1,24 +1,23 @@ /* Copyright (C) 1996 Free Software Foundation, Inc. + This file is part of the GNU C Library. Contributed by Richard Henderson (rth@tamu.edu) -This file is part of the GNU C Library. + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. 
-The GNU C Library is free software; you can redistribute it and/or -modify it under the terms of the GNU Library General Public License as -published by the Free Software Foundation; either version 2 of the -License, or (at your option) any later version. + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. -The GNU C Library is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -Library General Public License for more details. + You should have received a copy of the GNU Library General Public + License along with the GNU C Library; see the file COPYING.LIB. If not, + write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. */ -You should have received a copy of the GNU Library General Public -License along with the GNU C Library; see the file COPYING.LIB. If -not, write to the Free Software Foundation, Inc., 675 Mass Ave, -Cambridge, MA 02139, USA. */ - -/* Copy no more than COUNT bytes of the null-terminated string from +/* Copy no more than COUNT bytes of the null-terminated string from SRC to DST. If SRC does not cover all of COUNT, the balance is zeroed. Return the address of the terminating null in DEST, if any, else DEST + COUNT. */ @@ -32,8 +31,12 @@ Cambridge, MA 02139, USA. */ ENTRY(__stpncpy) ldgp gp, 0(pv) +#ifdef PROF + lda AT, _mcount + jsr AT, (AT), _mcount +#endif .prologue 1 - + beq a2, $zerocount jsr t9, __stxncpy # do the work of the copy diff --git a/sysdeps/alpha/strcat.S b/sysdeps/alpha/strcat.S index d3afff3c5f..e57259f51d 100644 --- a/sysdeps/alpha/strcat.S +++ b/sysdeps/alpha/strcat.S @@ -1,22 +1,21 @@ /* Copyright (C) 1996 Free Software Foundation, Inc. 
+ This file is part of the GNU C Library. Contributed by Richard Henderson (rth@tamu.edu) -This file is part of the GNU C Library. + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. -The GNU C Library is free software; you can redistribute it and/or -modify it under the terms of the GNU Library General Public License as -published by the Free Software Foundation; either version 2 of the -License, or (at your option) any later version. + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. -The GNU C Library is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -Library General Public License for more details. - -You should have received a copy of the GNU Library General Public -License along with the GNU C Library; see the file COPYING.LIB. If -not, write to the Free Software Foundation, Inc., 675 Mass Ave, -Cambridge, MA 02139, USA. */ + You should have received a copy of the GNU Library General Public + License along with the GNU C Library; see the file COPYING.LIB. If not, + write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. */ /* Append a null-terminated string from SRC to DST. */ @@ -26,6 +25,10 @@ Cambridge, MA 02139, USA. */ ENTRY(strcat) ldgp gp, 0(pv) +#ifdef PROF + lda AT, _mcount + jsr AT, (AT), _mcount +#endif .prologue 1 mov a0, v0 # set up return value @@ -59,7 +62,7 @@ $found: negq t1, t2 # clear all but least set bit addq a0, t2, a0 /* Now do the append. 
*/ - + jsr t9, __stxcpy ret diff --git a/sysdeps/alpha/strchr.S b/sysdeps/alpha/strchr.S index c26a8431d2..e35b44ad6c 100644 --- a/sysdeps/alpha/strchr.S +++ b/sysdeps/alpha/strchr.S @@ -1,25 +1,24 @@ /* Copyright (C) 1996 Free Software Foundation, Inc. + This file is part of the GNU C Library. Contributed by Richard Henderson (rth@tamu.edu) -This file is part of the GNU C Library. + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. -The GNU C Library is free software; you can redistribute it and/or -modify it under the terms of the GNU Library General Public License as -published by the Free Software Foundation; either version 2 of the -License, or (at your option) any later version. + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. -The GNU C Library is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -Library General Public License for more details. - -You should have received a copy of the GNU Library General Public -License along with the GNU C Library; see the file COPYING.LIB. If -not, write to the Free Software Foundation, Inc., 675 Mass Ave, -Cambridge, MA 02139, USA. */ + You should have received a copy of the GNU Library General Public + License along with the GNU C Library; see the file COPYING.LIB. If not, + write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. */ /* Return the address of a given character within a null-terminated - string, or null if it is not found. 
+ string, or null if it is not found. This is generally scheduled for the EV5 (got to look out for my own interests :-), but with EV4 needs in mind. There *should* be no more @@ -32,7 +31,14 @@ Cambridge, MA 02139, USA. */ .set noat ENTRY(strchr) +#ifdef PROF + ldgp gp, 0(pv) + lda AT, _mcount + jsr AT, (AT), _mcount + .prologue 1 +#else .prologue 0 +#endif zapnot a1, 1, a1 # e0 : zero extend the search character ldq_u t0, 0(a0) # .. e1 : load first quadword diff --git a/sysdeps/alpha/strcmp.S b/sysdeps/alpha/strcmp.S new file mode 100644 index 0000000000..7dcae04ea4 --- /dev/null +++ b/sysdeps/alpha/strcmp.S @@ -0,0 +1,195 @@ +/* Copyright (C) 1996 Free Software Foundation, Inc. + Contributed by Richard Henderson (rth@tamu.edu) + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public + License along with the GNU C Library; see the file COPYING.LIB. If + not, write to the Free Software Foundation, Inc., 675 Mass Ave, + Cambridge, MA 02139, USA. */ + +/* Bytewise compare two null-terminated strings. */ + +#include <sysdep.h> + + .set noat + .set noreorder + + .text + +ENTRY(strcmp) +#ifdef PROF + ldgp gp, 0(pv) + lda AT, _mcount + jsr AT, (AT), _mcount # call _mcount with jsr, as the other PROF prologues do + .prologue 1 +#else + .prologue 0 +#endif + + ldq_u t0, 0(a0) # e0 : give cache time to catch up + xor a0, a1, t2 # .. e1 : are s1 and s2 co-aligned? + ldq_u t1, 0(a1) # e0 : + and t2, 7, t2 # .. e1 : + lda t3, -1 # e0 : + bne t2, $unaligned # ..
e1 : + + /* On entry to this basic block: + t0 == the first word of s1. + t1 == the first word of s2. + t3 == -1. */ + +$aligned: + mskqh t3, a0, t3 # e0 : + nop # .. e1 : + ornot t1, t3, t1 # e0 : + ornot t0, t3, t0 # .. e1 : + cmpbge zero, t1, t7 # e0 : bits set iff null found + bne t7, $eos # e1 (zdb) + + /* Aligned compare main loop. + On entry to this basic block: + t0 == an s1 word. + t1 == an s2 word not containing a null. */ + +$a_loop: + xor t0, t1, t2 # e0 : + bne t2, $wordcmp # .. e1 (zdb) + ldq_u t1, 8(a1) # e0 : + ldq_u t0, 8(a0) # .. e1 : + addq a1, 8, a1 # e0 : + addq a0, 8, a0 # .. e1 : + cmpbge zero, t1, t7 # e0 : + beq t7, $a_loop # .. e1 (zdb) + br $eos # e1 : + + /* The two strings are not co-aligned. Align s1 and cope. */ + +$unaligned: + and a0, 7, t4 # e0 : find s1 misalignment + and a1, 7, t5 # .. e1 : find s2 misalignment + subq a1, t4, a1 # e0 : + + /* If s2 misalignment is larger than s1 misalignment, we need + extra startup checks to avoid SEGV. */ + + cmplt t4, t5, t8 # .. e1 : + beq t8, $u_head # e1 : + + mskqh t3, t5, t3 # e0 : + ornot t1, t3, t3 # e0 : + cmpbge zero, t3, t7 # e1 : is there a zero? + beq t7, $u_head # e1 : + + /* We've found a zero in the first partial word of s2. Align + our current s1 and s2 words and compare what we've got. */ + + extql t1, t5, t1 # e0 : + extql t0, a0, t0 # e0 : + cmpbge zero, t1, t7 # .. e1 : find that zero again + br $eos # e1 : and finish up + + .align 3 +$u_head: + /* We know just enough now to be able to assemble the first + full word of s2. We can still find a zero at the end of it. + + On entry to this basic block: + t0 == first word of s1 + t1 == first partial word of s2. */ + + ldq_u t2, 8(a1) # e0 : load second partial s2 word + lda t3, -1 # .. e1 : create leading garbage mask + extql t1, a1, t1 # e0 : create first s2 word + mskqh t3, a0, t3 # e0 : + extqh t2, a1, t4 # e0 : + ornot t0, t3, t0 # ..
e1 : kill s1 garbage + or t1, t4, t1 # e0 : s2 word now complete + cmpbge zero, t0, t7 # .. e1 : find zero in first s1 word + ornot t1, t3, t1 # e0 : kill s2 garbage + lda t3, -1 # .. e1 : + mskql t3, a1, t3 # e0 : mask for s2[1] bits we have seen + bne t7, $eos # .. e1 : + xor t0, t1, t4 # e0 : compare aligned words + bne t4, $wordcmp # .. e1 (zdb) + or t2, t3, t3 # e0 : + cmpbge zero, t3, t7 # e1 : + bne t7, $u_final # e1 : + + /* Unaligned copy main loop. In order to avoid reading too much, + the loop is structured to detect zeros in aligned words from s2. + This has, unfortunately, effectively pulled half of a loop + iteration out into the head and half into the tail, but it does + prevent nastiness from accumulating in the very thing we want + to run as fast as possible. + + On entry to this basic block: + t2 == the unshifted low-bits from the next s2 word. */ + + .align 3 +$u_loop: + extql t2, a1, t3 # e0 : + ldq_u t2, 16(a1) # .. e1 : load next s2 high bits + ldq_u t0, 8(a0) # e0 : load next s1 word + addq a1, 8, a1 # .. e1 : + addq a0, 8, a0 # e0 : + nop # .. e1 : + extqh t2, a1, t1 # e0 : + cmpbge zero, t0, t7 # .. e1 : find zero in current s1 word + or t1, t3, t1 # e0 : + bne t7, $eos # .. e1 : + xor t0, t1, t4 # e0 : compare the words + bne t4, $wordcmp # .. e1 (zdb) + cmpbge zero, t2, t4 # e0 : find zero in next low bits + beq t4, $u_loop # .. e1 (zdb) + + /* We've found a zero in the low bits of the last s2 word. Get + the next s1 word and align them. */ +$u_final: + ldq_u t0, 8(a0) # e1 : + extql t2, a1, t1 # .. e0 : + cmpbge zero, t1, t7 # e0 : + + /* We've found a zero somewhere in a word we just read. + On entry to this basic block: + t0 == s1 word + t1 == s2 word + t7 == cmpbge mask containing the zero. */ + + .align 3 +$eos: + negq t7, t6 # e0 : create bytemask of valid data + and t6, t7, t8 # e1 : + subq t8, 1, t6 # e0 : + or t6, t8, t7 # e1 : + zapnot t0, t7, t0 # e0 : kill the garbage + zapnot t1, t7, t1 # .. 
e1 : + xor t0, t1, v0 # e0 : and compare + beq v0, $done # .. e1 : + + /* Here we have two differing co-aligned words in t0 & t1. + Bytewise compare them and return (t0 > t1 ? 1 : -1). */ +$wordcmp: + cmpbge t0, t1, t2 # e0 : comparison yields bit mask of ge + cmpbge t1, t0, t3 # .. e1 : + xor t2, t3, t0 # e0 : bits set iff t0/t1 bytes differ + negq t0, t1 # e1 : clear all but least bit + and t0, t1, t0 # e0 : + lda v0, -1 # .. e1 : + and t0, t2, t1 # e0 : was bit set in t0 > t1? + cmovne t1, 1, v0 # .. e1 (zdb) + +$done: + ret # e1 : + + END(strcmp) diff --git a/sysdeps/alpha/strcpy.S b/sysdeps/alpha/strcpy.S index 2975181919..823476f750 100644 --- a/sysdeps/alpha/strcpy.S +++ b/sysdeps/alpha/strcpy.S @@ -27,6 +27,10 @@ Cambridge, MA 02139, USA. */ ENTRY(strcpy) ldgp gp, 0(pv) +#ifdef PROF + lda AT, _mcount + jsr AT, (AT), _mcount +#endif .prologue 1 mov a0, v0 # set up return value diff --git a/sysdeps/alpha/strlen.S b/sysdeps/alpha/strlen.S index 9eab707388..026c8addc2 100644 --- a/sysdeps/alpha/strlen.S +++ b/sysdeps/alpha/strlen.S @@ -34,6 +34,15 @@ Cambridge, MA 02139, USA. */ .set noat ENTRY(strlen) +#ifdef PROF + ldgp gp, 0(pv) + lda AT, _mcount + jsr AT, (AT), _mcount + .prologue 1 +#else + .prologue 0 +#endif + ldq_u t0, 0(a0) # load first quadword (a0 may be misaligned) lda t1, -1(zero) insqh t1, a0, t1 diff --git a/sysdeps/alpha/strncat.S b/sysdeps/alpha/strncat.S index d502037ace..089fba34d9 100644 --- a/sysdeps/alpha/strncat.S +++ b/sysdeps/alpha/strncat.S @@ -27,6 +27,10 @@ Cambridge, MA 02139, USA. */ ENTRY(strncat) ldgp gp, 0(pv) +#ifdef PROF + lda AT, _mcount + jsr AT, (AT), _mcount +#endif .prologue 1 mov a0, v0 # set up return value diff --git a/sysdeps/alpha/strncmp.S b/sysdeps/alpha/strncmp.S new file mode 100644 index 0000000000..682759042f --- /dev/null +++ b/sysdeps/alpha/strncmp.S @@ -0,0 +1,224 @@ +/* Copyright (C) 1996 Free Software Foundation, Inc. 
+ Contributed by Richard Henderson (rth@tamu.edu) + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public + License along with the GNU C Library; see the file COPYING.LIB. If + not, write to the Free Software Foundation, Inc., 675 Mass Ave, + Cambridge, MA 02139, USA. */ + +/* Bytewise compare two null-terminated strings of length no longer than N. */ + +#include <sysdep.h> + + .set noat + .set noreorder + + .text + +ENTRY(strncmp) +#ifdef PROF + ldgp gp, 0(pv) + lda AT, _mcount + jsr AT, (AT), _mcount + .prologue 1 +#else + .prologue 0 +#endif + + xor a0, a1, t2 # e0 : are s1 and s2 co-aligned? + beq a2, $zerolength # .. e1 : + ldq_u t0, 0(a0) # e0 : give cache time to catch up + ldq_u t1, 0(a1) # .. e1 : + and t2, 7, t2 # e0 : + and a0, 7, t4 # .. e1 : find s1 misalignment + lda t3, -1 # e0 : + addq a2, t4, a2 # .. e1 : bias count by s1 misalignment + and a2, 7, t10 # e1 : ofs of last byte in last word + srl a2, 3, a2 # .. e0 : remaining full words in count + and a1, 7, t5 # e0 : find s2 misalignment + bne t2, $unaligned # .. e1 : + + /* On entry to this basic block: + t0 == the first word of s1. + t1 == the first word of s2. + t3 == -1. */ + +$aligned: + mskqh t3, a1, t3 # e0 : mask off leading garbage + nop # .. e1 : + ornot t1, t3, t1 # e0 : + ornot t0, t3, t0 # .. e1 : + cmpbge zero, t1, t7 # e0 : bits set iff null found + beq a2, $eoc # .. 
e1 : check end of count + unop # : + bne t7, $eos # e1 : + + /* Aligned compare main loop. + On entry to this basic block: + t0 == an s1 word. + t1 == an s2 word not containing a null. */ + +$a_loop: + xor t0, t1, t2 # e0 : + bne t2, $wordcmp # .. e1 (zdb) + ldq_u t1, 0(a1) # e0 : + ldq_u t0, 0(a0) # .. e1 : + addq a1, 8, a1 # e0 : + addq a0, 8, a0 # .. e1 : + cmpbge zero, t1, t7 # e0 : + beq a2, $eoc # .. e1 : + subq a2, 1, a2 # e0 : + beq t7, $a_loop # .. e1 : + br $eos # e1 : + + /* The two strings are not co-aligned. Align s1 and cope. */ +$unaligned: + subq a1, t4, a1 # e0 : + unop # : + + /* If s2 misalignment is larger than s1 misalignment, we need + extra startup checks to avoid SEGV. */ + + cmplt t4, t5, t8 # .. e1 : + beq t8, $u_head # e1 : + + mskqh t3, t5, t3 # e0 : + ornot t1, t3, t3 # e0 : + cmpbge zero, t3, t7 # e1 : is there a zero? + beq t7, $u_head # e1 : + + /* We've found a zero in the first partial word of s2. Align + our current s1 and s2 words and compare what we've got. */ + + extql t1, t5, t1 # e0 : + lda t3, -1 # .. e1 : + insql t1, a0, t1 # e0 : + mskqh t3, a0, t3 # e0 : + ornot t1, t3, t1 # e0 : + ornot t0, t3, t0 # .. e1 : + cmpbge zero, t1, t7 # e0 : find that zero again + beq a2, $eoc # .. e1 : and finish up + br $eos # e1 : + + .align 3 +$u_head: + /* We know just enough now to be able to assemble the first + full word of s2. We can still find a zero at the end of it. + + On entry to this basic block: + t0 == first word of s1 + t1 == first partial word of s2. */ + + ldq_u t2, 8(a1) # e0 : load second partial s2 word + lda t3, -1 # .. e1 : create leading garbage mask + extql t1, a1, t1 # e0 : create first s2 word + mskqh t3, a0, t3 # e0 : + extqh t2, a1, t4 # e0 : + ornot t0, t3, t0 # .. e1 : kill s1 garbage + or t1, t4, t1 # e0 : s2 word now complete + ornot t1, t3, t1 # e1 : kill s2 garbage + cmpbge zero, t0, t7 # e0 : find zero in first s1 word + beq a2, $eoc # .. e1 : + lda t3, -1 # e0 : + bne t7, $eos # ..
e1 : + subq a2, 1, a2 # e0 : + xor t0, t1, t4 # .. e1 : compare aligned words + mskql t3, a1, t3 # e0 : mask out s2[1] bits we have seen + bne t4, $wordcmp # .. e1 : + or t2, t3, t3 # e0 : + cmpbge zero, t3, t7 # e1 : find zero in high bits of s2[1] + bne t7, $u_final # e1 : + + /* Unaligned copy main loop. In order to avoid reading too much, + the loop is structured to detect zeros in aligned words from s2. + This has, unfortunately, effectively pulled half of a loop + iteration out into the head and half into the tail, but it does + prevent nastiness from accumulating in the very thing we want + to run as fast as possible. + + On entry to this basic block: + t2 == the unshifted low-bits from the next s2 word. */ + + .align 3 +$u_loop: + extql t2, a1, t3 # e0 : + ldq_u t2, 16(a1) # .. e1 : load next s2 high bits + ldq_u t0, 8(a0) # e0 : load next s1 word + addq a1, 8, a1 # .. e1 : + addq a0, 8, a0 # e0 : + nop # .. e1 : + extqh t2, a1, t1 # e0 : + cmpbge zero, t0, t7 # .. e1 : find zero in current s1 word + or t1, t3, t1 # e0 : + beq a2, $eoc # .. e1 : check for end of count + subq a2, 1, a2 # e0 : + bne t7, $eos # .. e1 : + xor t0, t1, t4 # e0 : compare the words + bne t4, $wordcmp # .. e1 (zdb) + cmpbge zero, t2, t4 # e0 : find zero in next low bits + beq t4, $u_loop # .. e1 (zdb) + + /* We've found a zero in the low bits of the last s2 word. Get + the next s1 word and align them. */ +$u_final: + ldq_u t0, 8(a0) # e1 : + extql t2, a1, t1 # .. e0 : + cmpbge zero, t1, t7 # e0 : + bne a2, $eos # .. e1 : + + /* We've hit end of count. Zero everything after the count + and compare whats left. */ + + .align 3 +$eoc: + mskql t0, t10, t0 + mskql t1, t10, t1 + + /* We've found a zero somewhere in a word we just read. + On entry to this basic block: + t0 == s1 word + t1 == s2 word + t7 == cmpbge mask containing the zero. 
*/ + +$eos: + negq t7, t6 # e0 : create bytemask of valid data + and t6, t7, t8 # e1 : + subq t8, 1, t6 # e0 : + or t6, t8, t7 # e1 : + zapnot t0, t7, t0 # e0 : kill the garbage + zapnot t1, t7, t1 # .. e1 : + xor t0, t1, v0 # e0 : and compare + beq v0, $done # .. e1 : + + /* Here we have two differing co-aligned words in t0 & t1. + Bytewise compare them and return (t0 > t1 ? 1 : -1). */ +$wordcmp: + cmpbge t0, t1, t2 # e0 : comparison yields bit mask of ge + cmpbge t1, t0, t3 # .. e1 : + xor t2, t3, t0 # e0 : bits set iff t0/t1 bytes differ + negq t0, t1 # e1 : clear all but least bit + and t0, t1, t0 # e0 : + lda v0, -1 # .. e1 : + and t0, t2, t1 # e0 : was bit set in t0 > t1? + cmovne t1, 1, v0 # .. e1 (zdb) + +$done: + ret # e1 : + +$zerolength: + clr v0 + ret + + END(strncmp) diff --git a/sysdeps/alpha/strncpy.S b/sysdeps/alpha/strncpy.S index e13769c5c3..c077ab35b7 100644 --- a/sysdeps/alpha/strncpy.S +++ b/sysdeps/alpha/strncpy.S @@ -31,6 +31,10 @@ Cambridge, MA 02139, USA. */ ENTRY(strncpy) ldgp gp, 0(pv) +#ifdef PROF + lda AT, _mcount + jsr AT, (AT), _mcount +#endif .prologue 1 mov a0, v0 # set return value now diff --git a/sysdeps/alpha/strrchr.S b/sysdeps/alpha/strrchr.S index 464f754b20..02f37f50be 100644 --- a/sysdeps/alpha/strrchr.S +++ b/sysdeps/alpha/strrchr.S @@ -31,7 +31,14 @@ Cambridge, MA 02139, USA. */ .set noat ENTRY(strrchr) +#ifdef PROF + ldgp gp, 0(pv) + lda AT, _mcount + jsr AT, (AT), _mcount + .prologue 1 +#else .prologue 0 +#endif zapnot a1, 1, a1 # e0 : zero extend our test character mov zero, t6 # .. 
e1 : t6 is last match aligned addr diff --git a/sysdeps/alpha/udiv_qrnnd.S b/sysdeps/alpha/udiv_qrnnd.S index eb134f25c7..75d11827f9 100644 --- a/sysdeps/alpha/udiv_qrnnd.S +++ b/sysdeps/alpha/udiv_qrnnd.S @@ -27,8 +27,15 @@ .text LEAF(__udiv_qrnnd, 0) - +#ifdef PROF + ldgp gp, 0(pv) + lda AT, _mcount + jsr AT, (AT), _mcount + .prologue 1 +#else .prologue 0 +#endif + #define cnt $2 #define tmp $3 #define rem_ptr $16 @@ -38,9 +45,9 @@ LEAF(__udiv_qrnnd, 0) #define qb $20 ldiq cnt,16 - blt d,.Largedivisor + blt d,$largedivisor -.Loop1: cmplt n0,0,tmp +$loop1: cmplt n0,0,tmp addq n1,n1,n1 bis n1,tmp,n1 addq n0,n0,n0 @@ -73,12 +80,12 @@ LEAF(__udiv_qrnnd, 0) cmovne qb,tmp,n1 bis n0,qb,n0 subq cnt,1,cnt - bgt cnt,.Loop1 + bgt cnt,$loop1 stq n1,0(rem_ptr) bis $31,n0,$0 ret $31,($26),1 -.Largedivisor: +$largedivisor: and n0,1,$4 srl n0,1,n0 @@ -90,7 +97,7 @@ LEAF(__udiv_qrnnd, 0) srl d,1,$5 addq $5,$6,$5 -.Loop2: cmplt n0,0,tmp +$loop2: cmplt n0,0,tmp addq n1,n1,n1 bis n1,tmp,n1 addq n0,n0,n0 @@ -123,27 +130,30 @@ LEAF(__udiv_qrnnd, 0) cmovne qb,tmp,n1 bis n0,qb,n0 subq cnt,1,cnt - bgt cnt,.Loop2 + bgt cnt,$loop2 addq n1,n1,n1 addq $4,n1,n1 - bne $6,.LOdd + bne $6,$Odd stq n1,0(rem_ptr) bis $31,n0,$0 ret $31,($26),1 -.LOdd: +$Odd: /* q' in n0. r' in n1 */ addq n1,n0,n1 + cmpult n1,n0,tmp # tmp := carry from addq - beq tmp,.LLp6 - addq n0,1,n0 - subq n1,d,n1 -.LLp6: cmpult n1,d,tmp - bne tmp,.LLp7 - addq n0,1,n0 - subq n1,d,n1 -.LLp7: + subq n1,d,AT + addq n0,tmp,n0 + cmovne tmp,AT,n1 + + cmpult n1,d,tmp + addq n0,1,AT + cmoveq tmp,AT,n0 + subq n1,d,AT + cmoveq tmp,AT,n1 + stq n1,0(rem_ptr) bis $31,n0,$0 ret $31,($26),1 diff --git a/sysdeps/alpha/w_sqrt.S b/sysdeps/alpha/w_sqrt.S new file mode 100644 index 0000000000..b5c980e557 --- /dev/null +++ b/sysdeps/alpha/w_sqrt.S @@ -0,0 +1,161 @@ +/* Copyright (C) 1996 Free Software Foundation, Inc. + Contributed by David Mosberger (davidm@cs.arizona.edu). + Based on public-domain C source by Linus Torvalds. 
+ + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public + License along with the GNU C Library; see the file COPYING.LIB. If + not, write to the Free Software Foundation, Inc., 675 Mass Ave, + Cambridge, MA 02139, USA. */ + +/* This version is much faster than generic sqrt implementation, but + it doesn't handle exceptional values or the inexact flag. Don't use + this if _IEEE_FP or _IEEE_FP_INEXACT is in effect. */ + +#ifndef _IEEE_FP + +#include <errnos.h> +#include <sysdep.h> + + .set noreorder + +#ifdef __ELF__ + .section .rodata +#else + .rdata +#endif + .align 5 # align to cache line + + /* Do all memory accesses relative to sqrtdata. 
*/ +sqrtdata: + +#define DN 0x00 +#define UP 0x08 +#define HALF 0x10 +#define ALMOST_THREE_HALF 0x18 +#define T2 0x20 + + .quad 0x3fefffffffffffff /* DN = prev(1.0) */ + .quad 0x3ff0000000000001 /* UP = next(1.0) */ + .quad 0x3fe0000000000000 /* HALF = 0.5 */ + .quad 0x3ff7ffffffc00000 /* ALMOST_THREE_HALF = 1.5-2^-30 */ + +/* table T2: */ +.long 0x1500, 0x2ef8, 0x4d67, 0x6b02, 0x87be, 0xa395, 0xbe7a, 0xd866 +.long 0xf14a, 0x1091b, 0x11fcd, 0x13552, 0x14999, 0x15c98, 0x16e34, 0x17e5f +.long 0x18d03, 0x19a01, 0x1a545, 0x1ae8a, 0x1b5c4, 0x1bb01, 0x1bfde, 0x1c28d +.long 0x1c2de, 0x1c0db, 0x1ba73, 0x1b11c, 0x1a4b5, 0x1953d, 0x18266, 0x16be0 +.long 0x1683e, 0x179d8, 0x18a4d, 0x19992, 0x1a789, 0x1b445, 0x1bf61, 0x1c989 +.long 0x1d16d, 0x1d77b, 0x1dddf, 0x1e2ad, 0x1e5bf, 0x1e6e8, 0x1e654, 0x1e3cd +.long 0x1df2a, 0x1d635, 0x1cb16, 0x1be2c, 0x1ae4e, 0x19bde, 0x1868e, 0x16e2e +.long 0x1527f, 0x1334a, 0x11051, 0xe951, 0xbe01, 0x8e0d, 0x5924, 0x1edd + +/* + * Stack variables: + */ +#define K 16(sp) +#define Y 24(sp) +#define FSIZE 32 + + .text + +LEAF(__sqrt, FSIZE) + lda sp, -FSIZE(sp) + ldgp gp, .-__sqrt(pv) + stq ra, 0(sp) +#ifdef PROF + lda AT, _mcount + jsr AT, (AT), _mcount +#endif + .prologue 1 + + stt $f16, K + lda t3, sqrtdata # load base address into t3 + + fblt $f16, $negative + + /* Compute initial guess. */ + + .align 3 + + ldah t1, 0x5fe8 # e0 : + ldq t2, K # .. e1 : + ldt $f12, HALF(t3) # e0 : + ldt $f18, ALMOST_THREE_HALF(t3) # .. e1 : + srl t2, 33, t0 # e0 : + mult $f16, $f12, $f11 # .. fm : $f11 = x * 0.5 + subl t1, t0, t1 # e0 : + addt $f12, $f12, $f17 # .. fa : $f17 = 1.0 + srl t1, 12, t0 # e0 : + and t0, 0xfc, t0 # .. e1 : + addq t0, t3, t0 # e0 : + ldl t0, T2(t0) # .. e1 : + addt $f12, $f17, $f15 # fa : $f15 = 1.5 + subl t1, t0, t1 # .. e1 : + sll t1, 32, t1 # e0 : + ldt $f14, DN(t3) # ..
e1 : + stq t1, Y # e0 : + ldt $f13, Y # e1 : + addq sp, FSIZE, sp # e0 : + + mult $f11, $f13, $f10 # fm : $f10 = (x * 0.5) * y + mult $f10, $f13, $f10 # fm : $f10 = ((x * 0.5) * y) * y + subt $f15, $f10, $f1 # fa : $f1 = (1.5 - 0.5*x*y*y) + mult $f13, $f1, $f13 # fm : yp = y*(1.5 - 0.5*x*y*y) + mult $f11, $f13, $f11 # fm : $f11 = x * 0.5 * yp + mult $f11, $f13, $f11 # fm : $f11 = (x * 0.5 * yp) * yp + subt $f18, $f11, $f1 # fa : $f1= (1.5-2^-30) - 0.5*x*yp*yp + mult $f13, $f1, $f13 # fm : ypp = $f13 = yp*$f1 + subt $f15, $f12, $f1 # fa : $f1 = (1.5 - 0.5) + ldt $f15, UP(t3) # .. e1 : + mult $f16, $f13, $f10 # fm : z = $f10 = x * ypp + mult $f10, $f13, $f11 # fm : $f11 = z*ypp + mult $f10, $f12, $f12 # fm : $f12 = z*0.5 + subt $f1, $f11, $f1 # .. fa : $f1 = 1 - z*ypp + mult $f12, $f1, $f12 # fm : $f12 = z*0.5*(1 - z*ypp) + addt $f10, $f12, $f0 # fa : zp=res=$f0= z + z*0.5*(1 - z*ypp) + + mult/c $f0, $f14, $f12 # fm : zmi = zp * DN + mult/c $f0, $f15, $f11 # fm : zpl = zp * UP + mult/c $f0, $f12, $f1 # fm : $f1 = zp * zmi + mult/c $f0, $f11, $f15 # fm : $f15 = zp * zpl + + subt $f1, $f16, $f13 # fa : y1 = zp*zmi - x + subt $f15, $f16, $f15 # fa : y2 = zp*zpl - x + + fcmovge $f13, $f12, $f0 # res = (y1 >= 0) ? zmi : res + fcmovlt $f15, $f11, $f0 # res = (y2 < 0) ? zpl : res + + ret + +$negative: + lda t1, -1 + stq t1, K + lda t1, EDOM + stl t1, errno +#ifdef _LIBC_REENTRANT + jsr ra, __errno_location + lda t1, EDOM # reload EDOM (not -1) for the store through v0 + ldq ra, 0(sp) + stl t1, 0(v0) +#endif + ldt $f0, K # res = (double) 0xffffffffffffffff + addq sp, FSIZE, sp + ret + + END(__sqrt) + +weak_alias(__sqrt, sqrt) + +#endif /* !_IEEE_FP */ |