Diffstat (limited to 'libgcc/config/bfin/lib1funcs.S')
-rw-r--r-- | libgcc/config/bfin/lib1funcs.S | 211 |
1 files changed, 211 insertions, 0 deletions
diff --git a/libgcc/config/bfin/lib1funcs.S b/libgcc/config/bfin/lib1funcs.S
new file mode 100644
index 00000000000..c7bf4f3f05c
--- /dev/null
+++ b/libgcc/config/bfin/lib1funcs.S
@@ -0,0 +1,211 @@
+/* libgcc functions for Blackfin.
+   Copyright (C) 2005, 2009 Free Software Foundation, Inc.
+   Contributed by Analog Devices.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+#ifdef L_divsi3
+.text
+.align 2
+.global ___divsi3;
+.type ___divsi3, STT_FUNC;
+
+___divsi3:
+	[--SP]= RETS;
+	[--SP] = R7;
+
+	R2 = -R0;
+	CC = R0 < 0;
+	IF CC R0 = R2;
+	R7 = CC;
+
+	R2 = -R1;
+	CC = R1 < 0;
+	IF CC R1 = R2;
+	R2 = CC;
+	R7 = R7 ^ R2;
+
+	CALL ___udivsi3;
+
+	CC = R7;
+	R1 = -R0;
+	IF CC R0 = R1;
+
+	R7 = [SP++];
+	RETS = [SP++];
+	RTS;
+#endif
+
+#ifdef L_modsi3
+.align 2
+.global ___modsi3;
+.type ___modsi3, STT_FUNC;
+
+___modsi3:
+	[--SP] = RETS;
+	[--SP] = R0;
+	[--SP] = R1;
+	CALL ___divsi3;
+	R2 = [SP++];
+	R1 = [SP++];
+	R2 *= R0;
+	R0 = R1 - R2;
+	RETS = [SP++];
+	RTS;
+#endif
+
+#ifdef L_udivsi3
+.align 2
+.global ___udivsi3;
+.type ___udivsi3, STT_FUNC;
+
+___udivsi3:
+	P0 = 32;
+	LSETUP (0f, 1f) LC0 = P0;
+	/* upper half of dividend */
+	R3 = 0;
+0:
+	/* The first time round in the loop we shift in garbage, but since we
+	   perform 33 shifts, it doesn't matter.  */
+	R0 = ROT R0 BY 1;
+	R3 = ROT R3 BY 1;
+	R2 = R3 - R1;
+	CC = R3 < R1 (IU);
+1:
+	/* Last instruction of the loop.  */
+	IF ! CC R3 = R2;
+
+	/* Shift in the last bit.  */
+	R0 = ROT R0 BY 1;
+	/* R0 is the result, R3 contains the remainder.  */
+	R0 = ~ R0;
+	RTS;
+#endif
+
+#ifdef L_umodsi3
+.align 2
+.global ___umodsi3;
+.type ___umodsi3, STT_FUNC;
+
+___umodsi3:
+	[--SP] = RETS;
+	CALL ___udivsi3;
+	R0 = R3;
+	RETS = [SP++];
+	RTS;
+#endif
+
+#ifdef L_umulsi3_highpart
+.align 2
+.global ___umulsi3_highpart;
+.type ___umulsi3_highpart, STT_FUNC;
+
+___umulsi3_highpart:
+	A1 = R1.L * R0.L (FU);
+	A1 = A1 >> 16;
+	A0 = R1.H * R0.H, A1 += R1.L * R0.H (FU);
+	A1 += R0.L * R1.H (FU);
+	A1 = A1 >> 16;
+	A0 += A1;
+	R0 = A0 (FU);
+	RTS;
+#endif
+
+#ifdef L_smulsi3_highpart
+.align 2
+.global ___smulsi3_highpart;
+.type ___smulsi3_highpart, STT_FUNC;
+
+___smulsi3_highpart:
+	A1 = R1.L * R0.L (FU);
+	A1 = A1 >> 16;
+	A0 = R0.H * R1.H, A1 += R0.H * R1.L (IS,M);
+	A1 += R1.H * R0.L (IS,M);
+	A1 = A1 >>> 16;
+	R0 = (A0 += A1);
+	RTS;
+#endif
+
+#ifdef L_muldi3
+.align 2
+.global ___muldi3;
+.type ___muldi3, STT_FUNC;
+
+/*
+   R1:R0 * R3:R2
+   = R1.h:R1.l:R0.h:R0.l * R3.h:R3.l:R2.h:R2.l
+[X] = (R1.h * R3.h) * 2^96
+[X] + (R1.h * R3.l + R1.l * R3.h) * 2^80
+[X] + (R1.h * R2.h + R1.l * R3.l + R3.h * R0.h) * 2^64
+[T1] + (R1.h * R2.l + R3.h * R0.l + R1.l * R2.h + R3.l * R0.h) * 2^48
+[T2] + (R1.l * R2.l + R3.l * R0.l + R0.h * R2.h) * 2^32
+[T3] + (R0.l * R2.h + R2.l * R0.h) * 2^16
+[T4] + (R0.l * R2.l)
+
+   We can discard the first three lines marked "X" since we produce
+   only a 64 bit result.  So, we need ten 16-bit multiplies.
+
+   Individual mul-acc results:
+[E1] = R1.h * R2.l + R3.h * R0.l + R1.l * R2.h + R3.l * R0.h
+[E2] = R1.l * R2.l + R3.l * R0.l + R0.h * R2.h
+[E3] = R0.l * R2.h + R2.l * R0.h
+[E4] = R0.l * R2.l
+
+   We also need to add high parts from lower-level results to higher ones:
+	E[n]c = E[n] + (E[n+1]c >> 16), where E4c := E4
+
+   One interesting property is that all parts of the result that depend
+   on the sign of the multiplication are discarded.  Those would be the
+   multiplications involving R1.h and R3.h, but only the top 16 bit of
+   the 32 bit result depend on the sign, and since R1.h and R3.h only
+   occur in E1, the top half of these results is cut off.
+   So, we can just use FU mode for all of the 16-bit multiplies, and
+   ignore questions of when to use mixed mode.  */
+
+___muldi3:
+	/* [SP] technically is part of the caller's frame, but we can
+	   use it as scratch space.  */
+	A0 = R2.H * R1.L, A1 = R2.L * R1.H (FU) || R3 = [SP + 12];	/* E1 */
+	A0 += R3.H * R0.L, A1 += R3.L * R0.H (FU) || [SP] = R4;	/* E1 */
+	A0 += A1;					/* E1 */
+	R4 = A0.w;
+	A0 = R0.l * R3.l (FU);				/* E2 */
+	A0 += R2.l * R1.l (FU);				/* E2 */
+
+	A1 = R2.L * R0.L (FU);				/* E4 */
+	R3 = A1.w;
+	A1 = A1 >> 16;					/* E3c */
+	A0 += R2.H * R0.H, A1 += R2.L * R0.H (FU);	/* E2, E3c */
+	A1 += R0.L * R2.H (FU);				/* E3c */
+	R0 = A1.w;
+	A1 = A1 >> 16;					/* E2c */
+	A0 += A1;					/* E2c */
+	R1 = A0.w;
+
+	/* low(result) = low(E3c):low(E4) */
+	R0 = PACK (R0.l, R3.l);
+	/* high(result) = E2c + (E1 << 16) */
+	R1.h = R1.h + R4.l (NS) || R4 = [SP];
+	RTS;
+
+.size ___muldi3, .-___muldi3
+#endif
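
Reader's note: ___divsi3 reduces signed division to the unsigned routine. Both operands are replaced by their absolute values, the XOR of the two sign flags (kept live in R7 across the call) decides whether the quotient is negated afterwards, and ___modsi3 then recovers the remainder as a - (a/b)*b. A rough C model of that control flow; this code is not part of the commit and the model_* names are made up for illustration only.

#include <stdint.h>

/* Rough C model of ___divsi3: divide magnitudes, fix up the sign.  */
static int32_t model_divsi3(int32_t a, int32_t b)
{
	uint32_t ua = a < 0 ? -(uint32_t)a : (uint32_t)a;  /* |a|: R2 = -R0; IF CC R0 = R2 */
	uint32_t ub = b < 0 ? -(uint32_t)b : (uint32_t)b;  /* |b| */
	int negate = (a < 0) ^ (b < 0);                    /* R7 = sign(a) ^ sign(b) */
	uint32_t q = ua / ub;                              /* CALL ___udivsi3 */
	return (int32_t)(negate ? 0u - q : q);             /* IF CC negate the quotient */
}

/* Rough C model of ___modsi3: remainder from the quotient, r = a - (a/b)*b.  */
static int32_t model_modsi3(int32_t a, int32_t b)
{
	return a - model_divsi3(a, b) * b;
}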
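
Reader's note: ___udivsi3 is a restoring shift-and-subtract divider. R3:R0 acts as a 64-bit shift register, each ROT moves the next dividend bit into the partial remainder, and the CC flag from the R3 < R1 compare is rotated back into R0. Because CC is 1 exactly when the subtraction is not taken, R0 accumulates the complement of the quotient, which is why the routine ends with R0 = ~R0; the extra 33rd rotation only flushes the initial garbage bit. A plain C sketch of the same bit flow (not part of the commit, hardware loop written out explicitly, model_udivsi3 is a made-up name):

#include <stdint.h>

/* Rough C model of the ___udivsi3 loop: one quotient bit per iteration.
   'rem' stands in for R3, 'q' for R0; quotient bits are collected inverted
   and flipped at the end, as in the assembly.  */
static uint32_t model_udivsi3(uint32_t n, uint32_t d, uint32_t *rem_out)
{
	uint32_t rem = 0, q = 0;
	for (int i = 31; i >= 0; i--) {
		rem = (rem << 1) | ((n >> i) & 1); /* shift R3:R0 left by one bit */
		uint32_t cc = rem < d;             /* CC = R3 < R1 (IU) */
		if (!cc)
			rem -= d;                  /* IF ! CC R3 = R2 (restoring step) */
		q = (q << 1) | cc;                 /* CC rotated into the quotient */
	}
	if (rem_out)
		*rem_out = rem;                    /* ___umodsi3 simply returns this (R3) */
	return ~q;                                 /* R0 = ~ R0 */
}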
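
Reader's note: ___umulsi3_highpart obtains the upper 32 bits of an unsigned 32x32 product from 16-bit halves: the low product contributes only its top half, the two cross products are accumulated on top of it, and the shifted sum is added to the product of the high halves. ___smulsi3_highpart has the same shape but uses mixed-mode (IS,M) multiplies for the cross terms and an arithmetic shift. A C sketch of the unsigned case, using a 64-bit intermediate where the assembly relies on the 40-bit accumulator; again not part of the commit, and the function name is hypothetical.

#include <stdint.h>

/* Rough C model of ___umulsi3_highpart: high 32 bits of a * b, with a in R0
   and b in R1 in the assembly.  */
static uint32_t model_umulsi3_highpart(uint32_t a, uint32_t b)
{
	uint32_t al = a & 0xffff, ah = a >> 16;
	uint32_t bl = b & 0xffff, bh = b >> 16;

	uint64_t acc = ((uint64_t)al * bl) >> 16;           /* A1 = R1.L * R0.L; A1 >>= 16 */
	acc += (uint64_t)bl * ah;                           /* A1 += R1.L * R0.H */
	acc += (uint64_t)al * bh;                           /* A1 += R0.L * R1.H */
	return (uint32_t)((uint64_t)ah * bh + (acc >> 16)); /* A0 = R1.H * R0.H; A0 += A1 >> 16 */
}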
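
Reader's note: ___muldi3 implements exactly the E1..E4 decomposition spelled out in its comment: only the ten 16x16 partial products that survive truncation to 64 bits are formed, carries are propagated as E[n]c = E[n] + (E[n+1]c >> 16), and the result is reassembled as low(E3c):low(E4) and E2c + (E1 << 16). The same arithmetic in C, with 64-bit sums standing in for the 40-bit accumulators; not part of the commit, names invented for this sketch.

#include <stdint.h>

/* Rough C model of the ___muldi3 partial-product scheme.  a corresponds to
   R1:R0 and b to R3:R2; a0..a3 and b0..b3 are the 16-bit halves called
   R0.l, R0.h, R1.l, R1.h (and R2/R3 likewise) in the comment.  */
static uint64_t model_muldi3(uint64_t a, uint64_t b)
{
	uint32_t a0 = a & 0xffff, a1 = (a >> 16) & 0xffff;
	uint32_t a2 = (a >> 32) & 0xffff, a3 = (a >> 48) & 0xffff;
	uint32_t b0 = b & 0xffff, b1 = (b >> 16) & 0xffff;
	uint32_t b2 = (b >> 32) & 0xffff, b3 = (b >> 48) & 0xffff;

	/* The ten multiplies that matter (E1..E4 from the comment).  */
	uint64_t e4 = (uint64_t)a0 * b0;
	uint64_t e3 = (uint64_t)a0 * b1 + (uint64_t)a1 * b0;
	uint64_t e2 = (uint64_t)a2 * b0 + (uint64_t)b2 * a0 + (uint64_t)a1 * b1;
	uint64_t e1 = (uint64_t)a3 * b0 + (uint64_t)b3 * a0
		      + (uint64_t)a2 * b1 + (uint64_t)b2 * a1;

	/* Carry propagation: E[n]c = E[n] + (E[n+1]c >> 16), with E4c = E4.  */
	uint64_t e3c = e3 + (e4 >> 16);
	uint64_t e2c = e2 + (e3c >> 16);

	uint32_t lo = ((uint32_t)e3c << 16) | ((uint32_t)e4 & 0xffff); /* low(E3c):low(E4) */
	uint32_t hi = (uint32_t)e2c + ((uint32_t)e1 << 16);            /* E2c + (E1 << 16) */
	return ((uint64_t)hi << 32) | lo;
}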