diff options
Diffstat (limited to 'gcc')
-rw-r--r-- | gcc/ChangeLog | 178 | ||||
-rw-r--r-- | gcc/config/float-sh.h | 2 | ||||
-rw-r--r-- | gcc/config/sh/lib1funcs.asm | 220 | ||||
-rw-r--r-- | gcc/config/sh/sh.c | 749 | ||||
-rw-r--r-- | gcc/config/sh/sh.h | 387 | ||||
-rw-r--r-- | gcc/config/sh/sh.md | 1444 | ||||
-rw-r--r-- | gcc/config/sh/t-sh | 4 | ||||
-rw-r--r-- | gcc/ginclude/va-sh.h | 99 |
8 files changed, 2805 insertions, 278 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 252cd911af0..89127e96afe 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,181 @@ +Mon Nov 23 16:46:46 1998 J"orn Rennecke <amylaar@cygnus.co.uk> + + Add SH4 support: + + * config/sh/lib1funcs.asm (___movstr_i4_even, ___movstr_i4_odd): Define. + (___movstrSI12_i4, ___sdivsi3_i4, ___udivsi3_i4): Define. + * sh.c (reg_class_from_letter, regno_reg_class): Add DF_REGS. + (fp_reg_names, assembler_dialect): New variables. + (print_operand_address): Handle SUBREGs. + (print_operand): Added 'o' case. + Don't use adj_offsettable_operand on PRE_DEC / POST_INC. + Name of FP registers depends on mode. + (expand_block_move): Emit different code for SH4 hardware. + (prepare_scc_operands): Use emit_sf_insn / emit_df_insn as appropriate. + (from_compare): Likewise. + (add_constant): New argument last_value. Changed all callers. + (find_barrier): Don't try HImode load for FPUL_REG. + (machine_dependent_reorg): Likewise. + (sfunc_uses_reg): A CLOBBER cannot be the address register use. + (gen_far_branch): Emit a barrier after the new jump. + (barrier_align): Don't trust instruction lengths before + fixing up pcloads. + (machine_dependent_reorg): Add support for FIRST_XD_REG .. LAST_XD_REG. + Use auto-inc addressing for fp registers if doubles need to + be loaded in two steps. + Set sh_flag_remove_dead_before_cse. + (push): Support for TARGET_FMOVD. Use gen_push_fpul for fpul. + (pop): Support for TARGET_FMOVD. Use gen_pop_fpul for fpul. + (calc_live_regs): Support for TARGET_FMOVD. Don't save FPSCR. + Support for FIRST_XD_REG .. LAST_XD_REG. + (sh_expand_prologue): Support for FIRST_XD_REG .. LAST_XD_REG. + (sh_expand_epilogue): Likewise. + (sh_builtin_saveregs): Use DFmode moves for fp regs on SH4. + (initial_elimination_offset): Take TARGET_ALIGN_DOUBLE into account. + (arith_reg_operand): FPUL_REG is OK for SH4. + (fp_arith_reg_operand, fp_extended_operand) New functions. 
+ (tertiary_reload_operand, fpscr_operand): Likewise. + (commutative_float_operator, noncommutative_float_operator): Likewise. + (binary_float_operator, get_fpscr_rtx, emit_sf_insn): Likewise. + (emit_df_insn, expand_sf_unop, expand_sf_binop): Likewise. + (expand_df_unop, expand_df_binop, expand_fp_branch): Likewise. + (emit_fpscr_use, mark_use, remove_dead_before_cse): Likewise. + * sh.h (CPP_SPEC): Add support for -m4, m4-single, m4-single-only. + (CONDITIONAL_REGISTER_USAGE): Likewise. + (HARD_SH4_BIT, FPU_SINGLE_BIT, SH4_BIT, FMOVD_BIT): Define. + (TARGET_CACHE32, TARGET_SUPERSCALAR, TARGET_HARWARD): Define. + (TARGET_HARD_SH4, TARGET_FPU_SINGLE, TARGET_SH4, TARGET_FMOVD): Define. + (target_flag): Add -m4, m4-single, m4-single-only, -mfmovd. + (OPTIMIZATION_OPTIONS): If optimizing, set flag_omit_frame_pointer + to -1 and sh_flag_remove_dead_before_cse to 1. + (ASSEMBLER_DIALECT): Define to assembler_dialect. + (assembler_dialect, fp_reg_names): Declare. + (OVERRIDE_OPTIONS): Add code for TARGET_SH4. + Hide names of registers that are not accessible. + (CACHE_LOG): Take TARGET_CACHE32 into account. + (LOOP_ALIGN): Take TARGET_HARWARD into account. + (FIRST_XD_REG, LAST_XD_REG, FPSCR_REG): Define. + (FIRST_PSEUDO_REGISTER): Now 49. + (FIXED_REGISTERS, CALL_USED_REGISTERS): Include values for registers. + (HARD_REGNO_NREGS): Special treatment of FIRST_XD_REG .. LAST_XD_REG. + (HARD_REGNO_MODE_OK): Update. + (enum reg_class): Add DF_REGS and FPSCR_REGS. + (REG_CLASS_NAMES, REG_CLASS_CONTENTS, REG_ALLOC_ORDER): Likewise. + (SECONDARY_OUTPUT_RELOAD_CLASS, SECONDARY_INPUT_RELOAD_CLASS): Update. + (CLASS_CANNOT_CHANGE_SIZE, DEBUG_REGISTER_NAMES): Define. + (NPARM_REGS): Eight floating point parameter registers on SH4. + (BASE_RETURN_VALUE_REG): SH4 also passes double values + in floating point registers. + (GET_SH_ARG_CLASS): Likewise. + Complex float types are also returned in float registers. + (BASE_ARG_REG): Complex float types are also passed in float registers. 
+ (FUNCTION_VALUE): Change mode like PROMOTE_MODE does. + (LIBCALL_VALUE): Remove trailing semicolon. + (ROUND_REG): Round when double precision value is passed in floating + point register(s). + (FUNCTION_ARG_ADVANCE): No change wanted for SH4 when things are + passed on the stack. + (FUNCTION_ARG): Little endian adjustment for SH4 SFmode. + (FUNCTION_ARG_PARTIAL_NREGS): Zero for SH4. + (TRAMPOLINE_ALIGNMENT): Take TARGET_HARWARD into account. + (INITIALIZE_TRAMPOLINE): Emit ic_invalidate_line for TARGET_HARWARD. + (MODE_DISP_OK_8): Not for SH4 DFmode. + (GO_IF_LEGITIMATE_ADDRESS): No base reg + index reg for SH4 DFmode. + Allow indexed addressing for PSImode after reload. + (LEGITIMIZE_ADDRESS): Not for SH4 DFmode. + (LEGITIMIZE_RELOAD_ADDRESS): Handle SH3E SFmode. + Don't change SH4 DFmode nor PSImode RELOAD_FOR_INPUT_ADDRESS. + (DOUBLE_TYPE_SIZE): 64 for SH4. + (RTX_COSTS): Add PLUS case. + Increase cost of ASHIFT, ASHIFTRT, LSHIFTRT case. + (REGISTER_MOVE_COST): Add handling of R0_REGS, FPUL_REGS, T_REGS, + MAC_REGS, PR_REGS, DF_REGS. + (REGISTER_NAMES): Use fp_reg_names. + (enum processor_type): Add PROCESSOR_SH4. + (sh_flag_remove_dead_before_cse): Declare. + (rtx_equal_function_value_matters, fpscr_rtx, get_fpscr_rtx): Declare. + (PREDICATE_CODES): Add binary_float_operator, + commutative_float_operator, fp_arith_reg_operand, fp_extended_operand, + fpscr_operand, noncommutative_float_operator. + (ADJUST_COST): Use different scale for TARGET_SUPERSCALAR. + (SH_DYNAMIC_SHIFT_COST): Cheaper for SH4. + * sh.md (attribute cpu): Add value sh4. + (attributes fmovd, issues): Define. + (attribute type): Add values dfp_arith, dfp_cmp, dfp_conv, dfdiv. + (function units memory, int, mpy, fp): Make dependent on issue rate. + (function units issue, single_issue, load_si, load): Define. + (function units load_store, fdiv, gp_fpul): Define. + (attribute hit_stack): Provide proper default. + (use_sfunc_addr+1, udivsi3): Predicated on ! TARGET_SH4. 
+ (udivsi3_i4, udivsi3_i4_single, divsi3_i4, divsi3_i4_single): New insns. + (udivsi3, divsi3): Emit special patterns for SH4 hardware, + (mulsi3_call): Now uses match_operand for function address. + (mulsi3): Also emit code for SH1 case. Wrap result in REG_LIBCALL / + REG_RETVAL notes. + (push, pop, push_e, pop_e): Now define_expands. + (push_fpul, push_4, pop_fpul, pop_4, ic_invalidate_line): New expanders. + (movsi_ie): Added y/i alternative. + (ic_invalidate_line_i, movdf_i4): New insns. + (movdf_i4+[123], reload_outdf+[12345], movsi_y+[12]): New splitters. + (reload_indf, reload_outdf, reload_outsf, reload_insi): New expanders. + (movdf): Add special code for SH4. + (movsf_ie, movsf_ie+1, reload_insf, calli): Make use of fpscr visible. + (call_valuei, calli, call_value): Likewise. + (movsf): Emit no-op move. + (mov_nop, movsi_y): New insns. + (blt, sge): generalize to handle DFmode. + (return predicate): Call emit_fpscr_use and remove_dead_before_cse. + (block_move_real, block_lump_real): Predicate on ! TARGET_HARD_SH4. + (block_move_real_i4, block_lump_real_i4, fpu_switch): New insns. + (fpu_switch0, fpu_switch1, movpsi): New expanders. + (fpu_switch+[12], fix_truncsfsi2_i4_2+1): New splitters. + (toggle_sz): New insn. + (addsf3, subsf3, mulsf3, divsf3): Now define_expands. + (addsf3_i, subsf3_i, mulsf3_i4, mulsf3_ie, divsf3_i): New insns. + (macsf3): Make use of fpscr visible. Disable for SH4. + (floatsisf2): Make use of fpscr visible. + (floatsisf2_i4): New insn. + (floatsisf2_ie, fixsfsi, cmpgtsf_t, cmpeqsf_t): Disable for SH4. + (ieee_ccmpeqsf_t): Likewise. + (fix_truncsfsi2): Emit different code for SH4. + (fix_truncsfsi2_i4, fix_truncsfsi2_i4_2, cmpgtsf_t_i4): New insns. + (cmpeqsf_t_i4, ieee_ccmpeqsf_t_4): New insns. + (negsf2, sqrtsf2, abssf2): Now expanders. + (adddf3, subdf3i, muldf2, divdf3, floatsidf2): New expanders. + (negsf2_i, sqrtsf2_i, abssf2_i, adddf3_i, subdf3_i): New insns. 
+ (muldf3_i, divdf3_i, floatsidf2_i, fix_truncdfsi2_i): New insns. + (fix_truncdfsi2, cmpdf, negdf2, sqrtdf2, absdf2): New expanders. + (fix_truncdfsi2_i4, cmpgtdf_t, cmpeqdf_t, ieee_ccmpeqdf_t): New insns. + (fix_truncdfsi2_i4_2+1): New splitters. + (negdf2_i, sqrtdf2_i, absdf2_i, extendsfdf2_i4): New insns. + (extendsfdf2, truncdfsf2): New expanders. + (truncdfsf2_i4): New insn. + * t-sh (LIB1ASMFUNCS): Add _movstr_i4, _sdivsi3_i4, _udivsi3_i4. + (MULTILIB_OPTIONS): Add m4-single-only/m4-single/m4. + * float-sh.h: When testing for __SH3E__, also test for + __SH4_SINGLE_ONLY__ . + * va-sh.h (__va_freg): Define to float. + (__va_greg, __fa_freg, __gnuc_va_list, va_start): + Define for __SH4_SINGLE_ONLY__ like for __SH3E__ . + (__PASS_AS_FLOAT, __TARGET_SH4_P): Likewise. + (__PASS_AS_FLOAT): Use different definition for __SH4__ and + __SH4_SINGLE__. + (TARGET_SH4_P): Define. + (va_arg): Use it. + + * sh.md (movdf_k, movsf_i): Tweak the condition so that + init_expr_once is satisfied about the existence of load / store insns. + + * sh.md (movsi_i, movsi_ie, movsi_i_lowpart, movsf_i, movsf_ie): + change m constraint in source operand to mr / mf . + + * va-sh.h (__va_arg_sh1): Use __asm instead of asm. + + * (__VA_REEF): Define. + (__va_arg_sh1): Use it. + + * va-sh.h (va_start, va_arg, va_copy): Add parentheses. + Sun Nov 22 21:34:02 1998 Jeffrey A Law (law@cygnus.com) * i386/dgux.c (struct option): Add new "description field". 
diff --git a/gcc/config/float-sh.h b/gcc/config/float-sh.h index 9a942987920..446692428c2 100644 --- a/gcc/config/float-sh.h +++ b/gcc/config/float-sh.h @@ -37,7 +37,7 @@ #undef FLT_MAX_10_EXP #define FLT_MAX_10_EXP 38 -#ifdef __SH3E__ +#if defined (__SH3E__) || defined (__SH4_SINGLE_ONLY__) /* Number of base-FLT_RADIX digits in the significand of a double */ #undef DBL_MANT_DIG diff --git a/gcc/config/sh/lib1funcs.asm b/gcc/config/sh/lib1funcs.asm index 5084e9830ef..06017e6e873 100644 --- a/gcc/config/sh/lib1funcs.asm +++ b/gcc/config/sh/lib1funcs.asm @@ -770,6 +770,64 @@ ___movstr: add #64,r4 #endif +#ifdef L_movstr_i4 +#if defined(__SH4__) || defined(__SH4_SINGLE__) || defined(__SH4_SINGLE_ONLY__) + .text + .global ___movstr_i4_even + .global ___movstr_i4_odd + .global ___movstrSI12_i4 + + .p2align 5 +L_movstr_2mod4_end: + mov.l r0,@(16,r4) + rts + mov.l r1,@(20,r4) + + .p2align 2 + +___movstr_i4_odd: + mov.l @r5+,r1 + add #-4,r4 + mov.l @r5+,r2 + mov.l @r5+,r3 + mov.l r1,@(4,r4) + mov.l r2,@(8,r4) + +L_movstr_loop: + mov.l r3,@(12,r4) + dt r6 + mov.l @r5+,r0 + bt/s L_movstr_2mod4_end + mov.l @r5+,r1 + add #16,r4 +L_movstr_start_even: + mov.l @r5+,r2 + mov.l @r5+,r3 + mov.l r0,@r4 + dt r6 + mov.l r1,@(4,r4) + bf/s L_movstr_loop + mov.l r2,@(8,r4) + rts + mov.l r3,@(12,r4) + +___movstr_i4_even: + mov.l @r5+,r0 + bra L_movstr_start_even + mov.l @r5+,r1 + + .p2align 4 +___movstrSI12_i4: + mov.l @r5,r0 + mov.l @(4,r5),r1 + mov.l @(8,r5),r2 + mov.l r0,@r4 + mov.l r1,@(4,r4) + rts + mov.l r2,@(8,r4) +#endif /* ! __SH4__ */ +#endif + #ifdef L_mulsi3 @@ -808,9 +866,47 @@ hiset: sts macl,r0 ! r0 = bb*dd #endif -#ifdef L_sdivsi3 +#ifdef L_sdivsi3_i4 .title "SH DIVIDE" !! 4 byte integer Divide code for the Hitachi SH +#ifdef __SH4__ +!! 
args in r4 and r5, result in fpul, clobber dr0, dr2 + + .global ___sdivsi3_i4 +___sdivsi3_i4: + lds r4,fpul + float fpul,dr0 + lds r5,fpul + float fpul,dr2 + fdiv dr2,dr0 + rts + ftrc dr0,fpul + +#elif defined(__SH4_SINGLE__) || defined(__SH4_SINGLE_ONLY__) +!! args in r4 and r5, result in fpul, clobber r2, dr0, dr2 + + .global ___sdivsi3_i4 +___sdivsi3_i4: + sts.l fpscr,@-r15 + mov #8,r2 + swap.w r2,r2 + lds r2,fpscr + lds r4,fpul + float fpul,dr0 + lds r5,fpul + float fpul,dr2 + fdiv dr2,dr0 + ftrc dr0,fpul + rts + lds.l @r15+,fpscr + +#endif /* ! __SH4__ */ +#endif + +#ifdef L_sdivsi3 +/* __SH4_SINGLE_ONLY__ keeps this part for link compatibility with + sh3e code. */ +#if ! defined(__SH4__) && ! defined (__SH4_SINGLE__) !! !! Steve Chamberlain !! sac@cygnus.com @@ -904,11 +1000,109 @@ ___sdivsi3: div0: rts mov #0,r0 +#endif /* ! __SH4__ */ #endif -#ifdef L_udivsi3 +#ifdef L_udivsi3_i4 .title "SH DIVIDE" !! 4 byte integer Divide code for the Hitachi SH +#ifdef __SH4__ +!! args in r4 and r5, result in fpul, clobber r0, r1, r4, r5, dr0, dr2, dr4 + + .global ___udivsi3_i4 +___udivsi3_i4: + mov #1,r1 + cmp/hi r1,r5 + bf trivial + rotr r1 + xor r1,r4 + lds r4,fpul + mova L1,r0 +#ifdef FMOVD_WORKS + fmov.d @r0+,dr4 +#else +#ifdef __LITTLE_ENDIAN__ + fmov.s @r0+,fr5 + fmov.s @r0,fr4 +#else + fmov.s @r0+,fr4 + fmov.s @r0,fr5 +#endif +#endif + float fpul,dr0 + xor r1,r5 + lds r5,fpul + float fpul,dr2 + fadd dr4,dr0 + fadd dr4,dr2 + fdiv dr2,dr0 + rts + ftrc dr0,fpul + +trivial: + rts + lds r4,fpul + + .align 2 +L1: + .double 2147483648 + +#elif defined(__SH4_SINGLE__) || defined(__SH4_SINGLE_ONLY__) +!! 
args in r4 and r5, result in fpul, clobber r0, r1, r4, r5, dr0, dr2, dr4 + + .global ___udivsi3_i4 +___udivsi3_i4: + mov #1,r1 + cmp/hi r1,r5 + bf trivial + sts.l fpscr,@-r15 + mova L1,r0 + lds.l @r0+,fpscr + rotr r1 + xor r1,r4 + lds r4,fpul +#ifdef FMOVD_WORKS + fmov.d @r0+,dr4 +#else +#ifdef __LITTLE_ENDIAN__ + fmov.s @r0+,fr5 + fmov.s @r0,fr4 +#else + fmov.s @r0+,fr4 + fmov.s @r0,fr5 +#endif +#endif + float fpul,dr0 + xor r1,r5 + lds r5,fpul + float fpul,dr2 + fadd dr4,dr0 + fadd dr4,dr2 + fdiv dr2,dr0 + ftrc dr0,fpul + rts + lds.l @r15+,fpscr + +trivial: + rts + lds r4,fpul + + .align 2 +L1: +#ifdef __LITTLE_ENDIAN__ + .long 0x80000 +#else + .long 0x180000 +#endif + .double 2147483648 + +#endif /* ! __SH4__ */ +#endif + +#ifdef L_udivsi3 +/* __SH4_SINGLE_ONLY__ keeps this part for link compatibility with + sh3e code. */ +#if ! defined(__SH4__) && ! defined (__SH4_SINGLE__) !! !! Steve Chamberlain !! sac@cygnus.com @@ -966,22 +1160,40 @@ vshortway: ret: rts mov r4,r0 +#endif /* __SH4__ */ #endif #ifdef L_set_fpscr -#if defined (__SH3E__) +#if defined (__SH3E__) || defined(__SH4_SINGLE__) || defined(__SH4__) || defined(__SH4_SINGLE_ONLY__) .global ___set_fpscr ___set_fpscr: lds r4,fpscr mov.l ___set_fpscr_L1,r1 swap.w r4,r0 or #24,r0 +#ifndef FMOVD_WORKS xor #16,r0 +#endif +#if defined(__SH4__) + swap.w r0,r3 + mov.l r3,@(4,r1) +#else /* defined(__SH3E__) || defined(__SH4_SINGLE*__) */ swap.w r0,r2 mov.l r2,@r1 +#endif +#ifndef FMOVD_WORKS xor #8,r0 +#else + xor #24,r0 +#endif +#if defined(__SH4__) + swap.w r0,r2 + rts + mov.l r2,@r1 +#else /* defined(__SH3E__) || defined(__SH4_SINGLE*__) */ swap.w r0,r3 rts mov.l r3,@(4,r1) +#endif .align 2 ___set_fpscr_L1: .long ___fpscr_values @@ -990,5 +1202,5 @@ ___set_fpscr_L1: #else .comm ___fpscr_values,8 #endif /* ELF */ -#endif /* SH3E */ +#endif /* SH3E / SH4 */ #endif /* L_set_fpscr */ diff --git a/gcc/config/sh/sh.c b/gcc/config/sh/sh.c index 12f1b74c317..9184528933b 100644 --- a/gcc/config/sh/sh.c +++ 
b/gcc/config/sh/sh.c @@ -1,5 +1,5 @@ /* Output routines for GCC for Hitachi Super-H. - Copyright (C) 1993, 1994, 1995, 1996, 1997 Free Software Foundation, Inc. + Copyright (C) 1993-1998 Free Software Foundation, Inc. This file is part of GNU CC. @@ -103,6 +103,17 @@ int regno_reg_class[FIRST_PSEUDO_REGISTER] = FP_REGS, FP_REGS, FP_REGS, FP_REGS, FP_REGS, FP_REGS, FP_REGS, FP_REGS, FP_REGS, FP_REGS, FP_REGS, FP_REGS, + DF_REGS, DF_REGS, DF_REGS, DF_REGS, + DF_REGS, DF_REGS, DF_REGS, DF_REGS, + FPSCR_REGS, +}; + +char fp_reg_names[][5] = +{ + "fr0", "fr1", "fr2", "fr3", "fr4", "fr5", "fr6", "fr7", + "fr8", "fr9", "fr10", "fr11", "fr12", "fr13", "fr14", "fr15", + "fpul", + "xd0","xd2","xd4", "xd6", "xd8", "xd10", "xd12", "xd14", }; /* Provide reg_class from a letter such as appears in the machine @@ -110,7 +121,7 @@ int regno_reg_class[FIRST_PSEUDO_REGISTER] = enum reg_class reg_class_from_letter[] = { - /* a */ NO_REGS, /* b */ NO_REGS, /* c */ NO_REGS, /* d */ NO_REGS, + /* a */ ALL_REGS, /* b */ NO_REGS, /* c */ FPSCR_REGS, /* d */ DF_REGS, /* e */ NO_REGS, /* f */ FP_REGS, /* g */ NO_REGS, /* h */ NO_REGS, /* i */ NO_REGS, /* j */ NO_REGS, /* k */ NO_REGS, /* l */ PR_REGS, /* m */ NO_REGS, /* n */ NO_REGS, /* o */ NO_REGS, /* p */ NO_REGS, @@ -119,6 +130,12 @@ enum reg_class reg_class_from_letter[] = /* y */ FPUL_REGS, /* z */ R0_REGS }; +int assembler_dialect; + +rtx get_fpscr_rtx (); +void emit_sf_insn (); +void emit_df_insn (); + static void split_branches PROTO ((rtx)); /* Print the operand address in x to the stream. 
*/ @@ -131,7 +148,8 @@ print_operand_address (stream, x) switch (GET_CODE (x)) { case REG: - fprintf (stream, "@%s", reg_names[REGNO (x)]); + case SUBREG: + fprintf (stream, "@%s", reg_names[true_regnum (x)]); break; case PLUS: @@ -143,13 +161,19 @@ print_operand_address (stream, x) { case CONST_INT: fprintf (stream, "@(%d,%s)", INTVAL (index), - reg_names[REGNO (base)]); + reg_names[true_regnum (base)]); break; case REG: - fprintf (stream, "@(r0,%s)", - reg_names[MAX (REGNO (base), REGNO (index))]); - break; + case SUBREG: + { + int base_num = true_regnum (base); + int index_num = true_regnum (index); + + fprintf (stream, "@(r0,%s)", + reg_names[MAX (base_num, index_num)]); + break; + } default: debug_rtx (x); @@ -159,11 +183,11 @@ print_operand_address (stream, x) break; case PRE_DEC: - fprintf (stream, "@-%s", reg_names[REGNO (XEXP (x, 0))]); + fprintf (stream, "@-%s", reg_names[true_regnum (XEXP (x, 0))]); break; case POST_INC: - fprintf (stream, "@%s+", reg_names[REGNO (XEXP (x, 0))]); + fprintf (stream, "@%s+", reg_names[true_regnum (XEXP (x, 0))]); break; default: @@ -182,7 +206,8 @@ print_operand_address (stream, x) 'O' print a constant without the # 'R' print the LSW of a dp value - changes if in little endian 'S' print the MSW of a dp value - changes if in little endian - 'T' print the next word of a dp value - same as 'R' in big endian mode. */ + 'T' print the next word of a dp value - same as 'R' in big endian mode. + 'o' output an operator. 
*/ void print_operand (stream, x, code) @@ -230,16 +255,31 @@ print_operand (stream, x, code) fputs (reg_names[REGNO (x) + 1], (stream)); break; case MEM: - print_operand_address (stream, - XEXP (adj_offsettable_operand (x, 4), 0)); + if (GET_CODE (XEXP (x, 0)) != PRE_DEC + && GET_CODE (XEXP (x, 0)) != POST_INC) + x = adj_offsettable_operand (x, 4); + print_operand_address (stream, XEXP (x, 0)); break; } break; + case 'o': + switch (GET_CODE (x)) + { + case PLUS: fputs ("add", stream); break; + case MINUS: fputs ("sub", stream); break; + case MULT: fputs ("mul", stream); break; + case DIV: fputs ("div", stream); break; + } + break; default: switch (GET_CODE (x)) { case REG: - fputs (reg_names[REGNO (x)], (stream)); + if (REGNO (x) >= FIRST_FP_REG && REGNO (x) <= LAST_FP_REG + && GET_MODE_SIZE (GET_MODE (x)) > 4) + fprintf ((stream), "d%s", reg_names[REGNO (x)]+1); + else + fputs (reg_names[REGNO (x)], (stream)); break; case MEM: output_address (XEXP (x, 0)); @@ -273,6 +313,55 @@ expand_block_move (operands) if (! constp || align < 4 || (bytes % 4 != 0)) return 0; + if (TARGET_HARD_SH4) + { + if (bytes < 12) + return 0; + else if (bytes == 12) + { + tree entry_name; + rtx func_addr_rtx; + rtx r4 = gen_rtx (REG, SImode, 4); + rtx r5 = gen_rtx (REG, SImode, 5); + + entry_name = get_identifier ("__movstrSI12_i4"); + + func_addr_rtx + = copy_to_mode_reg (Pmode, + gen_rtx_SYMBOL_REF (Pmode, + IDENTIFIER_POINTER (entry_name))); + emit_insn (gen_move_insn (r4, XEXP (operands[0], 0))); + emit_insn (gen_move_insn (r5, XEXP (operands[1], 0))); + emit_insn (gen_block_move_real_i4 (func_addr_rtx)); + return 1; + } + else if (! TARGET_SMALLCODE) + { + tree entry_name; + rtx func_addr_rtx; + int dwords; + rtx r4 = gen_rtx (REG, SImode, 4); + rtx r5 = gen_rtx (REG, SImode, 5); + rtx r6 = gen_rtx (REG, SImode, 6); + + entry_name = get_identifier (bytes & 4 + ? 
"__movstr_i4_odd" + : "__movstr_i4_even"); + func_addr_rtx + = copy_to_mode_reg (Pmode, + gen_rtx_SYMBOL_REF (Pmode, + IDENTIFIER_POINTER (entry_name))); + emit_insn (gen_move_insn (r4, XEXP (operands[0], 0))); + emit_insn (gen_move_insn (r5, XEXP (operands[1], 0))); + + dwords = bytes >> 3; + emit_insn (gen_move_insn (r6, GEN_INT (dwords - 1))); + emit_insn (gen_block_lump_real_i4 (func_addr_rtx)); + return 1; + } + else + return 0; + } if (bytes < 64) { char entry[30]; @@ -405,9 +494,17 @@ prepare_scc_operands (code) || TARGET_SH3E && GET_MODE_CLASS (mode) == MODE_FLOAT) sh_compare_op1 = force_reg (mode, sh_compare_op1); - emit_insn (gen_rtx (SET, VOIDmode, t_reg, - gen_rtx (code, SImode, sh_compare_op0, - sh_compare_op1))); + if (TARGET_SH4 && GET_MODE_CLASS (mode) == MODE_FLOAT) + (mode == SFmode ? emit_sf_insn : emit_df_insn) + (gen_rtx (PARALLEL, VOIDmode, gen_rtvec (2, + gen_rtx (SET, VOIDmode, t_reg, + gen_rtx (code, SImode, + sh_compare_op0, sh_compare_op1)), + gen_rtx (USE, VOIDmode, get_fpscr_rtx ())))); + else + emit_insn (gen_rtx (SET, VOIDmode, t_reg, + gen_rtx (code, SImode, sh_compare_op0, + sh_compare_op1))); return t_reg; } @@ -443,7 +540,15 @@ from_compare (operands, code) insn = gen_rtx (SET, VOIDmode, gen_rtx (REG, SImode, 18), gen_rtx (code, SImode, sh_compare_op0, sh_compare_op1)); - emit_insn (insn); + if (TARGET_SH4 && GET_MODE_CLASS (mode) == MODE_FLOAT) + { + insn = gen_rtx (PARALLEL, VOIDmode, + gen_rtvec (2, insn, + gen_rtx (USE, VOIDmode, get_fpscr_rtx ()))); + (mode == SFmode ? emit_sf_insn : emit_df_insn) (insn); + } + else + emit_insn (insn); } /* Functions to output assembly code. */ @@ -1722,7 +1827,8 @@ static int pool_size; /* Add a constant to the pool and return its label. 
*/ static rtx -add_constant (x, mode) +add_constant (x, mode, last_value) + rtx last_value; rtx x; enum machine_mode mode; { @@ -1741,13 +1847,27 @@ add_constant (x, mode) continue; } if (rtx_equal_p (x, pool_vector[i].value)) - return pool_vector[i].label; + { + lab = 0; + if (! last_value + || ! i + || ! rtx_equal_p (last_value, pool_vector[i-1].value)) + { + lab = pool_vector[i].label; + if (! lab) + pool_vector[i].label = lab = gen_label_rtx (); + } + return lab; + } } } /* Need a new one. */ pool_vector[pool_size].value = x; - lab = gen_label_rtx (); + if (last_value && rtx_equal_p (last_value, pool_vector[pool_size - 1].value)) + lab = 0; + else + lab = gen_label_rtx (); pool_vector[pool_size].mode = mode; pool_vector[pool_size].label = lab; pool_size++; @@ -1965,7 +2085,8 @@ find_barrier (num_mova, mova, from) /* We must explicitly check the mode, because sometimes the front end will generate code to load unsigned constants into HImode targets without properly sign extending them. */ - if (mode == HImode || (mode == SImode && hi_const (src))) + if (mode == HImode + || (mode == SImode && hi_const (src) && REGNO (dst) != FPUL_REG)) { found_hi += 2; /* We put the short constants before the long constants, so @@ -2130,7 +2251,7 @@ sfunc_uses_reg (insn) for (i = XVECLEN (pattern, 0) - 1; i >= 0; i--) { part = XVECEXP (pattern, 0, i); - if (part == reg_part) + if (part == reg_part || GET_CODE (part) == CLOBBER) continue; if (reg_mentioned_p (reg, ((GET_CODE (part) == SET && GET_CODE (SET_DEST (part)) == REG) @@ -2470,6 +2591,13 @@ gen_far_branch (bp) } else jump = emit_jump_insn_after (gen_return (), insn); + /* Emit a barrier so that reorg knows that any following instructions + are not reachable via a fall-through path. + But don't do this when not optimizing, since we wouldn't supress the + alignment for the barrier then, and could end up with out-of-range + pc-relative loads. 
*/ + if (optimize) + emit_barrier_after (jump); emit_label_after (bp->near_label, insn); JUMP_LABEL (jump) = bp->far_label; if (! invert_jump (insn, label)) @@ -2556,36 +2684,42 @@ barrier_align (barrier_or_label) if (! TARGET_SH3 || ! optimize) return CACHE_LOG; - /* Check if there is an immediately preceding branch to the insn beyond - the barrier. We must weight the cost of discarding useful information - from the current cache line when executing this branch and there is - an alignment, against that of fetching unneeded insn in front of the - branch target when there is no alignment. */ - - /* PREV is presumed to be the JUMP_INSN for the barrier under - investigation. Skip to the insn before it. */ - prev = prev_real_insn (prev); - - for (slot = 2, credit = 1 << (CACHE_LOG - 2) + 2; - credit >= 0 && prev && GET_CODE (prev) == INSN; - prev = prev_real_insn (prev)) + /* When fixing up pcloads, a constant table might be inserted just before + the basic block that ends with the barrier. Thus, we can't trust the + instruction lengths before that. */ + if (mdep_reorg_phase > SH_FIXUP_PCLOAD) { - if (GET_CODE (PATTERN (prev)) == USE - || GET_CODE (PATTERN (prev)) == CLOBBER) - continue; - if (GET_CODE (PATTERN (prev)) == SEQUENCE) - prev = XVECEXP (PATTERN (prev), 0, 1); - if (slot && - get_attr_in_delay_slot (prev) == IN_DELAY_SLOT_YES) - slot = 0; - credit -= get_attr_length (prev); + /* Check if there is an immediately preceding branch to the insn beyond + the barrier. We must weight the cost of discarding useful information + from the current cache line when executing this branch and there is + an alignment, against that of fetching unneeded insn in front of the + branch target when there is no alignment. */ + + /* PREV is presumed to be the JUMP_INSN for the barrier under + investigation. Skip to the insn before it. 
*/ + prev = prev_real_insn (prev); + + for (slot = 2, credit = 1 << (CACHE_LOG - 2) + 2; + credit >= 0 && prev && GET_CODE (prev) == INSN; + prev = prev_real_insn (prev)) + { + if (GET_CODE (PATTERN (prev)) == USE + || GET_CODE (PATTERN (prev)) == CLOBBER) + continue; + if (GET_CODE (PATTERN (prev)) == SEQUENCE) + prev = XVECEXP (PATTERN (prev), 0, 1); + if (slot && + get_attr_in_delay_slot (prev) == IN_DELAY_SLOT_YES) + slot = 0; + credit -= get_attr_length (prev); + } + if (prev + && GET_CODE (prev) == JUMP_INSN + && JUMP_LABEL (prev) + && next_real_insn (JUMP_LABEL (prev)) == next_real_insn (barrier_or_label) + && (credit - slot >= (GET_CODE (SET_SRC (PATTERN (prev))) == PC ? 2 : 0))) + return 0; } - if (prev - && GET_CODE (prev) == JUMP_INSN - && JUMP_LABEL (prev) - && next_real_insn (JUMP_LABEL (prev)) == next_real_insn (barrier_or_label) - && (credit - slot >= (GET_CODE (SET_SRC (PATTERN (prev))) == PC ? 2 : 0))) - return 0; return CACHE_LOG; } @@ -2914,7 +3048,8 @@ machine_dependent_reorg (first) dst = SET_DEST (pat); mode = GET_MODE (dst); - if (mode == SImode && hi_const (src)) + if (mode == SImode && hi_const (src) + && REGNO (dst) != FPUL_REG) { int offset = 0; @@ -2929,7 +3064,7 @@ machine_dependent_reorg (first) if (GET_CODE (dst) == REG && ((REGNO (dst) >= FIRST_FP_REG - && REGNO (dst) <= LAST_FP_REG) + && REGNO (dst) <= LAST_XD_REG) || REGNO (dst) == FPUL_REG)) { if (last_float @@ -2943,7 +3078,8 @@ machine_dependent_reorg (first) last_float_move = scan; last_float = src; newsrc = gen_rtx (MEM, mode, - (REGNO (dst) == FPUL_REG + ((TARGET_SH4 && ! TARGET_FMOVD + || REGNO (dst) == FPUL_REG) ? r0_inc_rtx : r0_rtx)); last_float_addr = &XEXP (newsrc, 0); @@ -2983,6 +3119,16 @@ machine_dependent_reorg (first) emit_insn_before (gen_use_sfunc_addr (reg), insn); } } +#if 0 + /* fpscr is not actually a user variable, but we pretend it is for the + sake of the previous optimization passes, since we want it handled like + one. 
However, we don't have eny debugging information for it, so turn + it into a non-user variable now. */ + if (TARGET_SH4) + REG_USERVAR_P (get_fpscr_rtx ()) = 0; +#endif + if (optimize) + sh_flag_remove_dead_before_cse = 1; mdep_reorg_phase = SH_AFTER_MDEP_REORG; } @@ -3386,8 +3532,16 @@ push (rn) int rn; { rtx x; - if ((rn >= FIRST_FP_REG && rn <= LAST_FP_REG) - || rn == FPUL_REG) + if (rn == FPUL_REG) + x = gen_push_fpul (); + else if (TARGET_SH4 && TARGET_FMOVD && ! TARGET_FPU_SINGLE + && rn >= FIRST_FP_REG && rn <= LAST_XD_REG) + { + if ((rn - FIRST_FP_REG) & 1 && rn <= LAST_FP_REG) + return; + x = gen_push_4 (gen_rtx (REG, DFmode, rn)); + } + else if (TARGET_SH3E && rn >= FIRST_FP_REG && rn <= LAST_FP_REG) x = gen_push_e (gen_rtx (REG, SFmode, rn)); else x = gen_push (gen_rtx (REG, SImode, rn)); @@ -3404,8 +3558,16 @@ pop (rn) int rn; { rtx x; - if ((rn >= FIRST_FP_REG && rn <= LAST_FP_REG) - || rn == FPUL_REG) + if (rn == FPUL_REG) + x = gen_pop_fpul (); + else if (TARGET_SH4 && TARGET_FMOVD && ! TARGET_FPU_SINGLE + && rn >= FIRST_FP_REG && rn <= LAST_XD_REG) + { + if ((rn - FIRST_FP_REG) & 1 && rn <= LAST_FP_REG) + return; + x = gen_pop_4 (gen_rtx (REG, DFmode, rn)); + } + else if (TARGET_SH3E && rn >= FIRST_FP_REG && rn <= LAST_FP_REG) x = gen_pop_e (gen_rtx (REG, SFmode, rn)); else x = gen_pop (gen_rtx (REG, SImode, rn)); @@ -3453,6 +3615,16 @@ calc_live_regs (count_ptr, live_regs_mask2) int count; *live_regs_mask2 = 0; + /* If we can save a lot of saves by switching to double mode, do that. */ + if (TARGET_SH4 && TARGET_FMOVD && TARGET_FPU_SINGLE) + for (count = 0, reg = FIRST_FP_REG; reg <= LAST_FP_REG; reg += 2) + if (regs_ever_live[reg] && regs_ever_live[reg+1] + && (! call_used_regs[reg] || (pragma_interrupt && ! pragma_trapa)) + && ++count > 2) + { + target_flags &= ~FPU_SINGLE_BIT; + break; + } for (count = 0, reg = FIRST_PSEUDO_REGISTER - 1; reg >= 0; reg--) { if ((pragma_interrupt && ! 
pragma_trapa) @@ -3463,7 +3635,7 @@ calc_live_regs (count_ptr, live_regs_mask2) && regs_ever_live[PR_REG])) && reg != STACK_POINTER_REGNUM && reg != ARG_POINTER_REGNUM && reg != RETURN_ADDRESS_POINTER_REGNUM - && reg != T_REG && reg != GBR_REG) + && reg != T_REG && reg != GBR_REG && reg != FPSCR_REG) : (/* Only push those regs which are used and need to be saved. */ regs_ever_live[reg] && ! call_used_regs[reg])) { @@ -3472,6 +3644,24 @@ calc_live_regs (count_ptr, live_regs_mask2) else live_regs_mask |= 1 << reg; count++; + if (TARGET_SH4 && TARGET_FMOVD && reg >= FIRST_FP_REG) + if (reg <= LAST_FP_REG) + { + if (! TARGET_FPU_SINGLE && ! regs_ever_live[reg ^ 1]) + { + if (reg >= 32) + *live_regs_mask2 |= 1 << ((reg ^ 1) - 32); + else + live_regs_mask |= 1 << (reg ^ 1); + count++; + } + } + else if (reg <= LAST_XD_REG) + { + /* Must switch to double mode to access these registers. */ + target_flags &= ~FPU_SINGLE_BIT; + count++; + } } } @@ -3487,6 +3677,7 @@ sh_expand_prologue () int live_regs_mask; int d, i; int live_regs_mask2; + int save_flags = target_flags; int double_align = 0; /* We have pretend args if we had an object sent partially in registers @@ -3524,11 +3715,19 @@ sh_expand_prologue () emit_insn (gen_sp_switch_1 ()); live_regs_mask = calc_live_regs (&d, &live_regs_mask2); + /* ??? Maybe we could save some switching if we can move a mode switch + that already happens to be at the function start into the prologue. 
*/ + if (target_flags != save_flags) + emit_insn (gen_toggle_sz ()); push_regs (live_regs_mask, live_regs_mask2); + if (target_flags != save_flags) + emit_insn (gen_toggle_sz ()); if (TARGET_ALIGN_DOUBLE && d & 1) double_align = 4; + target_flags = save_flags; + output_stack_adjust (-get_frame_size () - double_align, stack_pointer_rtx, 3); @@ -3543,6 +3742,7 @@ sh_expand_epilogue () int d, i; int live_regs_mask2; + int save_flags = target_flags; int frame_size = get_frame_size (); live_regs_mask = calc_live_regs (&d, &live_regs_mask2); @@ -3573,7 +3773,8 @@ sh_expand_epilogue () /* Pop all the registers. */ - live_regs_mask = calc_live_regs (&d, &live_regs_mask2); + if (target_flags != save_flags) + emit_insn (gen_toggle_sz ()); if (live_regs_mask & (1 << PR_REG)) pop (PR_REG); for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) @@ -3584,6 +3785,9 @@ sh_expand_epilogue () else if (j >= 32 && (live_regs_mask2 & (1 << (j - 32)))) pop (j); } + if (target_flags != save_flags) + emit_insn (gen_toggle_sz ()); + target_flags = save_flags; output_stack_adjust (extra_push + current_function_pretend_args_size, stack_pointer_rtx, 7); @@ -3651,6 +3855,25 @@ sh_builtin_saveregs (arglist) emit_move_insn (fpregs, XEXP (regbuf, 0)); emit_insn (gen_addsi3 (fpregs, fpregs, GEN_INT (n_floatregs * UNITS_PER_WORD))); + if (TARGET_SH4) + { + for (regno = NPARM_REGS (DFmode) - 2; regno >= first_floatreg; regno -= 2) + { + emit_insn (gen_addsi3 (fpregs, fpregs, + GEN_INT (-2 * UNITS_PER_WORD))); + emit_move_insn (gen_rtx (MEM, DFmode, fpregs), + gen_rtx (REG, DFmode, BASE_ARG_REG (DFmode) + regno)); + } + regno = first_floatreg; + if (regno & 1) + { + emit_insn (gen_addsi3 (fpregs, fpregs, GEN_INT (- UNITS_PER_WORD))); + emit_move_insn (gen_rtx (MEM, SFmode, fpregs), + gen_rtx (REG, SFmode, BASE_ARG_REG (SFmode) + regno + - (TARGET_LITTLE_ENDIAN != 0))); + } + } + else for (regno = NPARM_REGS (SFmode) - 1; regno >= first_floatreg; regno--) { emit_insn (gen_addsi3 (fpregs, fpregs, GEN_INT (- 
UNITS_PER_WORD))); @@ -3677,6 +3900,8 @@ initial_elimination_offset (from, to) int live_regs_mask, live_regs_mask2; live_regs_mask = calc_live_regs (®s_saved, &live_regs_mask2); + if (TARGET_ALIGN_DOUBLE && regs_saved & 1) + total_auto_space += 4; target_flags = save_flags; total_saved_regs_space = (regs_saved) * 4; @@ -3885,12 +4110,48 @@ arith_reg_operand (op, mode) else return 1; - return (regno != T_REG && regno != PR_REG && regno != FPUL_REG + return (regno != T_REG && regno != PR_REG + && (regno != FPUL_REG || TARGET_SH4) + && regno != MACH_REG && regno != MACL_REG); + } + return 0; +} + +int +fp_arith_reg_operand (op, mode) + rtx op; + enum machine_mode mode; +{ + if (register_operand (op, mode)) + { + int regno; + + if (GET_CODE (op) == REG) + regno = REGNO (op); + else if (GET_CODE (op) == SUBREG && GET_CODE (SUBREG_REG (op)) == REG) + regno = REGNO (SUBREG_REG (op)); + else + return 1; + + return (regno != T_REG && regno != PR_REG && regno > 15 && regno != MACH_REG && regno != MACL_REG); } return 0; } +int +fp_extended_operand (op, mode) + rtx op; + enum machine_mode mode; +{ + if (GET_CODE (op) == FLOAT_EXTEND && GET_MODE (op) == mode) + { + op = XEXP (op, 0); + mode = GET_MODE (op); + } + return fp_arith_reg_operand (op, mode); +} + /* Returns 1 if OP is a valid source operand for an arithmetic insn. 
*/ int @@ -3991,6 +4252,73 @@ braf_label_ref_operand(op, mode) if (GET_CODE (prev) != PLUS || XEXP (prev, 1) != op) return 0; } + +int +tertiary_reload_operand (op, mode) + rtx op; + enum machine_mode mode; +{ + enum rtx_code code = GET_CODE (op); + return code == MEM || (TARGET_SH4 && code == CONST_DOUBLE); +} + +int +fpscr_operand (op) + rtx op; +{ + return (GET_CODE (op) == REG && REGNO (op) == FPSCR_REG + && GET_MODE (op) == PSImode); +} + +int +commutative_float_operator (op, mode) + rtx op; + enum machine_mode mode; +{ + if (GET_MODE (op) != mode) + return 0; + switch (GET_CODE (op)) + { + case PLUS: + case MULT: + return 1; + } + return 0; +} + +int +noncommutative_float_operator (op, mode) + rtx op; + enum machine_mode mode; +{ + if (GET_MODE (op) != mode) + return 0; + switch (GET_CODE (op)) + { + case MINUS: + case DIV: + return 1; + } + return 0; +} + +int +binary_float_operator (op, mode) + rtx op; + enum machine_mode mode; +{ + if (GET_MODE (op) != mode) + return 0; + switch (GET_CODE (op)) + { + case PLUS: + case MINUS: + case MULT: + case DIV: + return 1; + } + return 0; +} /* Return the destination address of a branch. */ @@ -4102,3 +4430,304 @@ reg_unused_after (reg, insn) } return 1; } + +extern struct obstack permanent_obstack; + +rtx +get_fpscr_rtx () +{ + static rtx fpscr_rtx; + + if (! fpscr_rtx) + { + push_obstacks (&permanent_obstack, &permanent_obstack); + fpscr_rtx = gen_rtx (REG, PSImode, 48); + REG_USERVAR_P (fpscr_rtx) = 1; + pop_obstacks (); + mark_user_reg (fpscr_rtx); + } + if (! reload_completed || mdep_reorg_phase != SH_AFTER_MDEP_REORG) + mark_user_reg (fpscr_rtx); + return fpscr_rtx; +} + +void +emit_sf_insn (pat) + rtx pat; +{ + rtx addr; + /* When generating reload insns, we must not create new registers. FPSCR + should already have the correct value, so do nothing to change it. */ + if (! TARGET_FPU_SINGLE && ! 
reload_in_progress) + { + addr = gen_reg_rtx (SImode); + emit_insn (gen_fpu_switch0 (addr)); + } + emit_insn (pat); + if (! TARGET_FPU_SINGLE && ! reload_in_progress) + { + addr = gen_reg_rtx (SImode); + emit_insn (gen_fpu_switch1 (addr)); + } +} + +void +emit_df_insn (pat) + rtx pat; +{ + rtx addr; + if (TARGET_FPU_SINGLE && ! reload_in_progress) + { + addr = gen_reg_rtx (SImode); + emit_insn (gen_fpu_switch0 (addr)); + } + emit_insn (pat); + if (TARGET_FPU_SINGLE && ! reload_in_progress) + { + addr = gen_reg_rtx (SImode); + emit_insn (gen_fpu_switch1 (addr)); + } +} + +void +expand_sf_unop (fun, operands) + rtx (*fun)(); + rtx *operands; +{ + emit_sf_insn ((*fun) (operands[0], operands[1], get_fpscr_rtx ())); +} + +void +expand_sf_binop (fun, operands) + rtx (*fun)(); + rtx *operands; +{ + emit_sf_insn ((*fun) (operands[0], operands[1], operands[2], + get_fpscr_rtx ())); +} + +void +expand_df_unop (fun, operands) + rtx (*fun)(); + rtx *operands; +{ + emit_df_insn ((*fun) (operands[0], operands[1], get_fpscr_rtx ())); +} + +void +expand_df_binop (fun, operands) + rtx (*fun)(); + rtx *operands; +{ + emit_df_insn ((*fun) (operands[0], operands[1], operands[2], + get_fpscr_rtx ())); +} + +void +expand_fp_branch (compare, branch) + rtx (*compare) (), (*branch) (); +{ + (GET_MODE (sh_compare_op0) == SFmode ? emit_sf_insn : emit_df_insn) + ((*compare) ()); + emit_jump_insn ((*branch) ()); +} + +/* We don't want to make fpscr call-saved, because that would prevent + channging it, and it would also cost an exstra instruction to save it. + We don't want it to be known as a global register either, because + that disables all flow analysis. But it has to be live at the function + return. Thus, we need to insert a USE at the end of the function. */ +/* This should best be called at about the time FINALIZE_PIC is called, + but not dependent on flag_pic. Alas, there is no suitable hook there, + so this gets called from HAVE_RETURN. 
*/ +int +emit_fpscr_use () +{ + static int fpscr_uses = 0; + + if (rtx_equal_function_value_matters) + { + emit_insn (gen_rtx (USE, VOIDmode, get_fpscr_rtx ())); + fpscr_uses++; + } + else + { + if (fpscr_uses > 1) + { + /* Due to he crude way we emit the USEs, we might end up with + some extra ones. Delete all but the last one. */ + rtx insn; + + for (insn = get_last_insn(); insn; insn = PREV_INSN (insn)) + if (GET_CODE (insn) == INSN + && GET_CODE (PATTERN (insn)) == USE + && GET_CODE (XEXP (PATTERN (insn), 0)) == REG + && REGNO (XEXP (PATTERN (insn), 0)) == FPSCR_REG) + { + insn = PREV_INSN (insn); + break; + } + for (; insn; insn = PREV_INSN (insn)) + if (GET_CODE (insn) == INSN + && GET_CODE (PATTERN (insn)) == USE + && GET_CODE (XEXP (PATTERN (insn), 0)) == REG + && REGNO (XEXP (PATTERN (insn), 0)) == FPSCR_REG) + { + PUT_CODE (insn, NOTE); + NOTE_LINE_NUMBER (insn) = NOTE_INSN_DELETED; + NOTE_SOURCE_FILE (insn) = 0; + } + } + fpscr_uses = 0; + } +} + +/* ??? gcc does flow analysis strictly after common subexpression + elimination. As a result, common subespression elimination fails + when there are some intervening statements setting the same register. + If we did nothing about this, this would hurt the precision switching + for SH4 badly. There is some cse after reload, but it is unable to + undo the extra register pressure from the unused instructions, and + it cannot remove auto-increment loads. + + A C code example that shows this flow/cse weakness for (at least) SH + and sparc (as of gcc ss-970706) is this: + +double +f(double a) +{ + double d; + d = 0.1; + a += d; + d = 1.1; + d = 0.1; + a *= d; + return a; +} + + So we add another pass before common subexpression elimination, to + remove assignments that are dead due to a following assignment in the + same basic block. */ + +int sh_flag_remove_dead_before_cse; + +static void +mark_use (x, reg_set_block) + rtx x, *reg_set_block; +{ + enum rtx_code code; + + if (! 
x) + return; + code = GET_CODE (x); + switch (code) + { + case REG: + { + int regno = REGNO (x); + int nregs = (regno < FIRST_PSEUDO_REGISTER + ? HARD_REGNO_NREGS (regno, GET_MODE (x)) + : 1); + do + { + reg_set_block[regno + nregs - 1] = 0; + } + while (--nregs); + break; + } + case SET: + { + rtx dest = SET_DEST (x); + + if (GET_CODE (dest) == SUBREG) + dest = SUBREG_REG (dest); + if (GET_CODE (dest) != REG) + mark_use (dest, reg_set_block); + mark_use (SET_SRC (x), reg_set_block); + break; + } + case CLOBBER: + break; + default: + { + char *fmt = GET_RTX_FORMAT (code); + int i, j; + for (i = GET_RTX_LENGTH (code) - 1; i >= 0; i--) + { + if (fmt[i] == 'e') + mark_use (XEXP (x, i), reg_set_block); + else if (fmt[i] == 'E') + for (j = XVECLEN (x, i) - 1; j >= 0; j--) + mark_use (XVECEXP (x, i, j), reg_set_block); + } + break; + } + } +} + +int +remove_dead_before_cse () +{ + rtx *reg_set_block, last, last_call, insn, set; + int in_libcall = 0; + + /* This pass should run just once, after rtl generation. */ + + if (! sh_flag_remove_dead_before_cse + || rtx_equal_function_value_matters + || reload_completed) + return; + + sh_flag_remove_dead_before_cse = 0; + + reg_set_block = (rtx *)alloca (max_reg_num () * sizeof (rtx)); + bzero ((char *)reg_set_block, max_reg_num () * sizeof (rtx)); + last_call = last = get_last_insn (); + for (insn = last; insn; insn = PREV_INSN (insn)) + { + if (GET_RTX_CLASS (GET_CODE (insn)) != 'i') + continue; + if (GET_CODE (insn) == JUMP_INSN) + { + last_call = last = insn; + continue; + } + set = single_set (insn); + + /* Don't delete parts of libcalls, since that would confuse cse, loop + and flow. */ + if (find_reg_note (insn, REG_RETVAL, NULL_RTX)) + in_libcall = 1; + else if (in_libcall) + { + if (find_reg_note (insn, REG_LIBCALL, NULL_RTX)) + in_libcall = 0; + } + else if (set && GET_CODE (SET_DEST (set)) == REG) + { + int regno = REGNO (SET_DEST (set)); + rtx ref_insn = (regno < FIRST_PSEUDO_REGISTER && call_used_regs[regno] + ? 
last_call + : last); + if (reg_set_block[regno] == ref_insn + && (regno >= FIRST_PSEUDO_REGISTER + || HARD_REGNO_NREGS (regno, GET_MODE (SET_DEST (set))) == 1) + && (GET_CODE (insn) != CALL_INSN || CONST_CALL_P (insn))) + { + PUT_CODE (insn, NOTE); + NOTE_LINE_NUMBER (insn) = NOTE_INSN_DELETED; + NOTE_SOURCE_FILE (insn) = 0; + continue; + } + else + reg_set_block[REGNO (SET_DEST (set))] = ref_insn; + } + if (GET_CODE (insn) == CALL_INSN) + { + last_call = insn; + mark_use (CALL_INSN_FUNCTION_USAGE (insn), reg_set_block); + } + mark_use (PATTERN (insn), reg_set_block); + } + return 0; +} diff --git a/gcc/config/sh/sh.h b/gcc/config/sh/sh.h index 1798d3ffd13..c32b13a000d 100644 --- a/gcc/config/sh/sh.h +++ b/gcc/config/sh/sh.h @@ -1,5 +1,5 @@ /* Definitions of target machine for GNU compiler for Hitachi Super-H. - Copyright (C) 1993, 1994, 1995, 1996, 1997 Free Software Foundation, Inc. + Copyright (C) 1993-1998 Free Software Foundation, Inc. Contributed by Steve Chamberlain (sac@cygnus.com). Improved by Jim Wilson (wilson@cygnus.com). @@ -43,7 +43,10 @@ extern int code_for_indirect_jump_scratch; %{m2:-D__sh2__} \ %{m3:-D__sh3__} \ %{m3e:-D__SH3E__} \ -%{!m1:%{!m2:%{!m3:%{!m3e:-D__sh1__}}}}" +%{m4-single-only:-D__SH4_SINGLE_ONLY__} \ +%{m4-single:-D__SH4_SINGLE__} \ +%{m4:-D__SH4__} \ +%{!m1:%{!m2:%{!m3:%{!m3e:%{!m4:%{!m4-single:%{!m4-single-only:-D__sh1__}}}}}}}" #define CPP_PREDEFINES "-D__sh__ -Acpu(sh) -Amachine(sh)" @@ -54,19 +57,28 @@ extern int code_for_indirect_jump_scratch; /* We can not debug without a frame pointer. */ /* #define CAN_DEBUG_WITHOUT_FP */ -#define CONDITIONAL_REGISTER_USAGE \ - if (! TARGET_SH3E) \ - { \ - int regno; \ - for (regno = FIRST_FP_REG; regno <= LAST_FP_REG; regno++) \ - fixed_regs[regno] = call_used_regs[regno] = 1; \ - fixed_regs[FPUL_REG] = call_used_regs[FPUL_REG] = 1; \ - } \ - /* Hitachi saves and restores mac registers on call. 
*/ \ - if (TARGET_HITACHI) \ - { \ - call_used_regs[MACH_REG] = 0; \ - call_used_regs[MACL_REG] = 0; \ +#define CONDITIONAL_REGISTER_USAGE \ + if (! TARGET_SH4 || ! TARGET_FMOVD) \ + { \ + int regno; \ + for (regno = FIRST_XD_REG; regno <= LAST_XD_REG; regno++) \ + fixed_regs[regno] = call_used_regs[regno] = 1; \ + if (! TARGET_SH4) \ + { \ + if (! TARGET_SH3E) \ + { \ + int regno; \ + for (regno = FIRST_FP_REG; regno <= LAST_FP_REG; regno++) \ + fixed_regs[regno] = call_used_regs[regno] = 1; \ + fixed_regs[FPUL_REG] = call_used_regs[FPUL_REG] = 1; \ + } \ + } \ + } \ + /* Hitachi saves and restores mac registers on call. */ \ + if (TARGET_HITACHI) \ + { \ + call_used_regs[MACH_REG] = 0; \ + call_used_regs[MACL_REG] = 0; \ } /* ??? Need to write documentation for all SH options and add it to the @@ -81,6 +93,10 @@ extern int target_flags; #define SH2_BIT (1<<9) #define SH3_BIT (1<<10) #define SH3E_BIT (1<<11) +#define HARD_SH4_BIT (1<<5) +#define FPU_SINGLE_BIT (1<<7) +#define SH4_BIT (1<<12) +#define FMOVD_BIT (1<<4) #define SPACE_BIT (1<<13) #define BIGTABLE_BIT (1<<14) #define RELAX_BIT (1<<15) @@ -107,6 +123,27 @@ extern int target_flags; /* Nonzero if we should generate code using type 3E insns. */ #define TARGET_SH3E (target_flags & SH3E_BIT) +/* Nonzero if the cache line size is 32. */ +#define TARGET_CACHE32 (target_flags & HARD_SH4_BIT) + +/* Nonzero if we schedule for a superscalar implementation. */ +#define TARGET_SUPERSCALAR (target_flags & HARD_SH4_BIT) + +/* Nonzero if the target has separate instruction and data caches. */ +#define TARGET_HARWARD (target_flags & HARD_SH4_BIT) + +/* Nonzero if compiling for SH4 hardware (to be used for insn costs etc.) */ +#define TARGET_HARD_SH4 (target_flags & HARD_SH4_BIT) + +/* Nonzero if the default precision of th FPU is single */ +#define TARGET_FPU_SINGLE (target_flags & FPU_SINGLE_BIT) + +/* Nonzero if we should generate code using type 4 insns. 
*/ +#define TARGET_SH4 (target_flags & SH4_BIT) + +/* Nonzero if we should generate fmovd. */ +#define TARGET_FMOVD (target_flags & FMOVD_BIT) + /* Nonzero if we respect NANs. */ #define TARGET_IEEE (target_flags & IEEE_BIT) @@ -137,10 +174,14 @@ extern int target_flags; { {"1", SH1_BIT}, \ {"2", SH2_BIT}, \ {"3", SH3_BIT|SH2_BIT}, \ - {"3e", SH3E_BIT|SH3_BIT|SH2_BIT}, \ + {"3e", SH3E_BIT|SH3_BIT|SH2_BIT|FPU_SINGLE_BIT}, \ + {"4-single-only", SH3E_BIT|SH3_BIT|SH2_BIT|SH3E_BIT|HARD_SH4_BIT|FPU_SINGLE_BIT}, \ + {"4-single", SH4_BIT|SH3E_BIT|SH3_BIT|SH2_BIT|HARD_SH4_BIT|FPU_SINGLE_BIT},\ + {"4", SH4_BIT|SH3E_BIT|SH3_BIT|SH2_BIT|HARD_SH4_BIT}, \ {"b", -LITTLE_ENDIAN_BIT}, \ {"bigtable", BIGTABLE_BIT}, \ {"dalign", DALIGN_BIT}, \ + {"fmovd", FMOVD_BIT}, \ {"hitachi", HITACHI_BIT}, \ {"ieee", IEEE_BIT}, \ {"isize", ISIZE_BIT}, \ @@ -160,26 +201,58 @@ extern int target_flags; #define OPTIMIZATION_OPTIONS(LEVEL,SIZE) \ do { \ + if (LEVEL) \ + flag_omit_frame_pointer = -1; \ + if (LEVEL) \ + sh_flag_remove_dead_before_cse = 1; \ if (SIZE) \ target_flags |= SPACE_BIT; \ } while (0) -#define ASSEMBLER_DIALECT 0 /* will allow to distinguish b[tf].s and b[tf]/s . */ -#define OVERRIDE_OPTIONS \ -do { \ - sh_cpu = CPU_SH1; \ - if (TARGET_SH2) \ - sh_cpu = CPU_SH2; \ - if (TARGET_SH3) \ - sh_cpu = CPU_SH3; \ - if (TARGET_SH3E) \ - sh_cpu = CPU_SH3E; \ - \ - /* Never run scheduling before reload, since that can \ - break global alloc, and generates slower code anyway due \ - to the pressure on R0. */ \ - flag_schedule_insns = 0; \ - sh_addr_diff_vec_mode = TARGET_BIGTABLE ? SImode : HImode; \ +#define ASSEMBLER_DIALECT assembler_dialect + +extern int assembler_dialect; + +#define OVERRIDE_OPTIONS \ +do { \ + sh_cpu = CPU_SH1; \ + assembler_dialect = 0; \ + if (TARGET_SH2) \ + sh_cpu = CPU_SH2; \ + if (TARGET_SH3) \ + sh_cpu = CPU_SH3; \ + if (TARGET_SH3E) \ + sh_cpu = CPU_SH3E; \ + if (TARGET_SH4) \ + { \ + assembler_dialect = 1; \ + sh_cpu = CPU_SH4; \ + } \ + if (! 
TARGET_SH4 || ! TARGET_FMOVD) \ + { \ + /* Prevent usage of explicit register names for variables \ + for registers not present / not addressable in the \ + target architecture. */ \ + int regno; \ + for (regno = (TARGET_SH3E) ? 17 : 0; \ + regno <= 24; regno++) \ + fp_reg_names[regno][0] = 0; \ + } \ + if (flag_omit_frame_pointer < 0) \ + /* The debugging information is sufficient, \ + but gdb doesn't implement this yet */ \ + if (0) \ + flag_omit_frame_pointer \ + = (PREFERRED_DEBUGGING_TYPE == DWARF_DEBUG \ + || PREFERRED_DEBUGGING_TYPE == DWARF2_DEBUG); \ + else \ + flag_omit_frame_pointer = 0; \ + \ + /* Never run scheduling before reload, since that can \ + break global alloc, and generates slower code anyway due \ + to the pressure on R0. */ \ + flag_schedule_insns = 0; \ + sh_addr_diff_vec_mode = TARGET_BIGTABLE ? SImode : HImode; \ } while (0) /* Target machine storage layout. */ @@ -233,7 +306,7 @@ do { \ /* The log (base 2) of the cache line size, in bytes. Processors prior to SH3 have no actual cache, but they fetch code in chunks of 4 bytes. */ -#define CACHE_LOG (TARGET_SH3 ? 4 : 2) +#define CACHE_LOG (TARGET_CACHE32 ? 5 : TARGET_SH3 ? 4 : 2) /* Allocation boundary (in *bits*) for the code of a function. 32 bit alignment is faster, because instructions are always fetched as a @@ -279,7 +352,7 @@ do { \ barrier_align (LABEL_AFTER_BARRIER) #define LOOP_ALIGN(A_LABEL) \ - ((! optimize || TARGET_SMALLCODE) ? 0 : 2) + ((! optimize || TARGET_HARWARD || TARGET_SMALLCODE) ? 0 : 2) #define LABEL_ALIGN(A_LABEL) \ ( \ @@ -341,8 +414,11 @@ do { \ #define RAP_REG 23 #define FIRST_FP_REG 24 #define LAST_FP_REG 39 +#define FIRST_XD_REG 40 +#define LAST_XD_REG 47 +#define FPSCR_REG 48 -#define FIRST_PSEUDO_REGISTER 40 +#define FIRST_PSEUDO_REGISTER 49 /* 1 for registers that have pervasive standard uses and are not available for the register allocator. 
@@ -361,6 +437,9 @@ do { \ 0, 0, 0, 0, \ 0, 0, 0, 0, \ 0, 0, 0, 0, \ + 0, 0, 0, 0, \ + 0, 0, 0, 0, \ + 1, \ } /* 1 for registers not available across function calls. @@ -381,6 +460,9 @@ do { \ 1, 1, 1, 1, \ 1, 1, 1, 1, \ 0, 0, 0, 0, \ + 1, 1, 1, 1, \ + 1, 1, 0, 0, \ + 1, \ } /* Return number of consecutive hard regs needed starting at reg REGNO @@ -388,20 +470,39 @@ do { \ This is ordinarily the length in words of a value of mode MODE but can be less for certain modes in special long registers. - On the SH regs are UNITS_PER_WORD bits wide. */ + On the SH all but the XD regs are UNITS_PER_WORD bits wide. */ #define HARD_REGNO_NREGS(REGNO, MODE) \ - (((GET_MODE_SIZE (MODE) + UNITS_PER_WORD - 1) / UNITS_PER_WORD)) + ((REGNO) >= FIRST_XD_REG && (REGNO) <= LAST_XD_REG \ + ? (GET_MODE_SIZE (MODE) / (2 * UNITS_PER_WORD)) \ + : ((GET_MODE_SIZE (MODE) + UNITS_PER_WORD - 1) / UNITS_PER_WORD)) \ /* Value is 1 if hard register REGNO can hold a value of machine-mode MODE. We can allow any mode in any general register. The special registers only allow SImode. Don't allow any mode in the PR. */ +/* We cannot hold DCmode values in the XD registers because alter_reg + handles subregs of them incorrectly. We could work around this by + spacing the XD registers like the DR registers, but this would require + additional memory in every compilation to hold larger register vectors. + We could hold SFmode / SCmode values in XD registers, but that + would require a tertiary reload when reloading from / to memory, + and a secondary reload to reload from / to general regs; that + seems to be a loosing proposition. */ #define HARD_REGNO_MODE_OK(REGNO, MODE) \ (SPECIAL_REG (REGNO) ? (MODE) == SImode \ : (REGNO) == FPUL_REG ? (MODE) == SImode || (MODE) == SFmode \ - : (REGNO) >= FIRST_FP_REG && (REGNO) <= LAST_FP_REG ? (MODE) == SFmode \ + : (REGNO) >= FIRST_FP_REG && (REGNO) <= LAST_FP_REG && (MODE) == SFmode \ + ? 1 \ + : (REGNO) >= FIRST_FP_REG && (REGNO) <= LAST_FP_REG \ + ? 
((MODE) == SFmode \ + || (TARGET_SH3E && (MODE) == SCmode) \ + || (((TARGET_SH4 && (MODE) == DFmode) || (MODE) == DCmode) \ + && (((REGNO) - FIRST_FP_REG) & 1) == 0)) \ + : (REGNO) >= FIRST_XD_REG && (REGNO) <= LAST_XD_REG \ + ? (MODE) == DFmode \ : (REGNO) == PR_REG ? 0 \ + : (REGNO) == FPSCR_REG ? (MODE) == PSImode \ : 1) /* Value is 1 if it is a good idea to tie two pseudo registers @@ -541,6 +642,8 @@ enum reg_class GENERAL_REGS, FP0_REGS, FP_REGS, + DF_REGS, + FPSCR_REGS, GENERAL_FP_REGS, ALL_REGS, LIM_REG_CLASSES @@ -560,6 +663,8 @@ enum reg_class "GENERAL_REGS", \ "FP0_REGS", \ "FP_REGS", \ + "DF_REGS", \ + "FPSCR_REGS", \ "GENERAL_FP_REGS", \ "ALL_REGS", \ } @@ -579,8 +684,10 @@ enum reg_class { 0x0081FFFF, 0x00000000 }, /* GENERAL_REGS */ \ { 0x01000000, 0x00000000 }, /* FP0_REGS */ \ { 0xFF000000, 0x000000FF }, /* FP_REGS */ \ - { 0xFF81FFFF, 0x000000FF }, /* GENERAL_FP_REGS */ \ - { 0xFFFFFFFF, 0x000000FF }, /* ALL_REGS */ \ + { 0xFF000000, 0x0000FFFF }, /* DF_REGS */ \ + { 0x00000000, 0x00010000 }, /* FPSCR_REGS */ \ + { 0xFF81FFFF, 0x0000FFFF }, /* GENERAL_FP_REGS */ \ + { 0xFFFFFFFF, 0x0001FFFF }, /* ALL_REGS */ \ } /* The same information, inverted: @@ -603,6 +710,7 @@ extern int regno_reg_class[]; spilled or used otherwise, we better have the FP_REGS allocated first. */ #define REG_ALLOC_ORDER \ { 25,26,27,28,29,30,31,24,32,33,34,35,36,37,38,39, \ + 40,41,42,43,44,45,46,47,48, \ 1,2,3,7,6,5,4,0,8,9,10,11,12,13,14, \ 22,15,16,17,18,19,20,21,23 } @@ -657,7 +765,8 @@ extern enum reg_class reg_class_from_letter[]; #define PREFERRED_RELOAD_CLASS(X, CLASS) (CLASS) #define SECONDARY_OUTPUT_RELOAD_CLASS(CLASS,MODE,X) \ - ((((((CLASS) == FP_REGS || (CLASS) == FP0_REGS) \ + ((((((CLASS) == FP_REGS || (CLASS) == FP0_REGS \ + || (CLASS) == DF_REGS) \ && (GET_CODE (X) == REG && REGNO (X) <= AP_REG)) \ || (((CLASS) == GENERAL_REGS || (CLASS) == R0_REGS) \ && GET_CODE (X) == REG \ @@ -666,7 +775,7 @@ extern enum reg_class reg_class_from_letter[]; ? 
FPUL_REGS \ : ((CLASS) == FPUL_REGS \ && (GET_CODE (X) == MEM \ - || GET_CODE (X) == REG && REGNO (X) >= FIRST_PSEUDO_REGISTER))\ + || (GET_CODE (X) == REG && REGNO (X) >= FIRST_PSEUDO_REGISTER)))\ ? GENERAL_REGS \ : (((CLASS) == MAC_REGS || (CLASS) == PR_REGS) \ && GET_CODE (X) == REG && REGNO (X) > 15 \ @@ -674,10 +783,19 @@ extern enum reg_class reg_class_from_letter[]; ? GENERAL_REGS : NO_REGS) #define SECONDARY_INPUT_RELOAD_CLASS(CLASS,MODE,X) \ - ((((CLASS) == FP_REGS || (CLASS) == FP0_REGS) \ + ((((CLASS) == FP_REGS || (CLASS) == FP0_REGS || (CLASS) == DF_REGS) \ && immediate_operand ((X), (MODE)) \ - && ! (fp_zero_operand (X) || fp_one_operand (X))) \ - ? R0_REGS : SECONDARY_OUTPUT_RELOAD_CLASS((CLASS),(MODE),(X))) + && ! ((fp_zero_operand (X) || fp_one_operand (X)) && (MODE) == SFmode))\ + ? R0_REGS \ + : CLASS == FPUL_REGS && immediate_operand ((X), (MODE)) \ + ? (GET_CODE (X) == CONST_INT && CONST_OK_FOR_I (INTVAL (X)) \ + ? GENERAL_REGS \ + : R0_REGS) \ + : (CLASS == FPSCR_REGS \ + && ((GET_CODE (X) == REG && REGNO (X) >= FIRST_PSEUDO_REGISTER) \ + || GET_CODE (X) == MEM && GET_CODE (XEXP ((X), 0)) == PLUS)) \ + ? GENERAL_REGS \ + : SECONDARY_OUTPUT_RELOAD_CLASS((CLASS),(MODE),(X))) /* Return the maximum number of consecutive registers needed to represent mode MODE in a register of class CLASS. @@ -685,6 +803,11 @@ extern enum reg_class reg_class_from_letter[]; On SH this is the size of MODE in words. */ #define CLASS_MAX_NREGS(CLASS, MODE) \ ((GET_MODE_SIZE (MODE) + UNITS_PER_WORD - 1) / UNITS_PER_WORD) + +/* If defined, gives a class of registers that cannot be used as the + operand of a SUBREG that changes the size of the object. */ + +#define CLASS_CANNOT_CHANGE_SIZE DF_REGS /* Stack layout; function entry, exit and calling. */ @@ -694,6 +817,9 @@ extern enum reg_class reg_class_from_letter[]; #define NPARM_REGS(MODE) \ (TARGET_SH3E && (MODE) == SFmode \ ? 
8 \ + : TARGET_SH4 && (GET_MODE_CLASS (MODE) == MODE_FLOAT \ + || GET_MODE_CLASS (MODE) == MODE_COMPLEX_FLOAT) \ + ? 8 \ : 4) #define FIRST_PARM_REG 4 @@ -752,25 +878,48 @@ extern enum reg_class reg_class_from_letter[]; #define BASE_RETURN_VALUE_REG(MODE) \ ((TARGET_SH3E && ((MODE) == SFmode)) \ ? FIRST_FP_RET_REG \ + : TARGET_SH3E && (MODE) == SCmode \ + ? FIRST_FP_RET_REG \ + : (TARGET_SH4 \ + && ((MODE) == DFmode || (MODE) == SFmode \ + || (MODE) == DCmode || (MODE) == SCmode )) \ + ? FIRST_FP_RET_REG \ : FIRST_RET_REG) #define BASE_ARG_REG(MODE) \ ((TARGET_SH3E && ((MODE) == SFmode)) \ ? FIRST_FP_PARM_REG \ + : TARGET_SH4 && (GET_MODE_CLASS (MODE) == MODE_FLOAT \ + || GET_MODE_CLASS (MODE) == MODE_COMPLEX_FLOAT)\ + ? FIRST_FP_PARM_REG \ : FIRST_PARM_REG) /* Define how to find the value returned by a function. VALTYPE is the data type of the value (as a tree). If the precise function being called is known, FUNC is its FUNCTION_DECL; - otherwise, FUNC is 0. */ - -#define FUNCTION_VALUE(VALTYPE, FUNC) \ - LIBCALL_VALUE (TYPE_MODE (VALTYPE)) + otherwise, FUNC is 0. + For the SH, this is like LIBCALL_VALUE, except that we must change the + mode like PROMOTE_MODE does. + ??? PROMOTE_MODE is ignored for non-scalar types. The set of types + tested here has to be kept in sync with the one in explow.c:promote_mode. */ + +#define FUNCTION_VALUE(VALTYPE, FUNC) \ + gen_rtx (REG, \ + ((GET_MODE_CLASS (TYPE_MODE (VALTYPE)) == MODE_INT \ + && GET_MODE_SIZE (TYPE_MODE (VALTYPE)) < UNITS_PER_WORD \ + && (TREE_CODE (VALTYPE) == INTEGER_TYPE \ + || TREE_CODE (VALTYPE) == ENUMERAL_TYPE \ + || TREE_CODE (VALTYPE) == BOOLEAN_TYPE \ + || TREE_CODE (VALTYPE) == CHAR_TYPE \ + || TREE_CODE (VALTYPE) == REAL_TYPE \ + || TREE_CODE (VALTYPE) == OFFSET_TYPE)) \ + ? SImode : TYPE_MODE (VALTYPE)), \ + BASE_RETURN_VALUE_REG (TYPE_MODE (VALTYPE))) /* Define how to find the value returned by a library function assuming the value has mode MODE. 
*/ #define LIBCALL_VALUE(MODE) \ - gen_rtx (REG, (MODE), BASE_RETURN_VALUE_REG (MODE)); + gen_rtx (REG, (MODE), BASE_RETURN_VALUE_REG (MODE)) /* 1 if N is a possible register number for a function value. */ #define FUNCTION_VALUE_REGNO_P(REGNO) \ @@ -801,7 +950,11 @@ struct sh_args { #define CUMULATIVE_ARGS struct sh_args #define GET_SH_ARG_CLASS(MODE) \ - ((TARGET_SH3E && ((MODE) == SFmode)) ? SH_ARG_FLOAT : SH_ARG_INT) + ((TARGET_SH3E && (MODE) == SFmode) \ + ? SH_ARG_FLOAT \ + : TARGET_SH4 && (GET_MODE_CLASS (MODE) == MODE_FLOAT \ + || GET_MODE_CLASS (MODE) == MODE_COMPLEX_FLOAT) \ + ? SH_ARG_FLOAT : SH_ARG_INT) #define ROUND_ADVANCE(SIZE) \ (((SIZE) + UNITS_PER_WORD - 1) / UNITS_PER_WORD) @@ -813,7 +966,9 @@ struct sh_args { round doubles to even regs when asked to explicitly. */ #define ROUND_REG(CUM, MODE) \ - ((TARGET_ALIGN_DOUBLE \ + (((TARGET_ALIGN_DOUBLE \ + || (TARGET_SH4 && ((MODE) == DFmode || (MODE) == DCmode) \ + && (CUM).arg_count[(int) SH_ARG_FLOAT] < NPARM_REGS (MODE)))\ && GET_MODE_UNIT_SIZE ((MODE)) > UNITS_PER_WORD) \ ? ((CUM).arg_count[(int) GET_SH_ARG_CLASS (MODE)] \ + ((CUM).arg_count[(int) GET_SH_ARG_CLASS (MODE)] & 1)) \ @@ -838,11 +993,12 @@ struct sh_args { available.) */ #define FUNCTION_ARG_ADVANCE(CUM, MODE, TYPE, NAMED) \ - ((CUM).arg_count[(int) GET_SH_ARG_CLASS (MODE)] = \ - (ROUND_REG ((CUM), (MODE)) \ - + ((MODE) != BLKmode \ - ? ROUND_ADVANCE (GET_MODE_SIZE (MODE)) \ - : ROUND_ADVANCE (int_size_in_bytes (TYPE))))) + if (! TARGET_SH4 || PASS_IN_REG_P ((CUM), (MODE), (TYPE))) \ + ((CUM).arg_count[(int) GET_SH_ARG_CLASS (MODE)] \ + = (ROUND_REG ((CUM), (MODE)) \ + + ((MODE) == BLKmode \ + ? ROUND_ADVANCE (int_size_in_bytes (TYPE)) \ + : ROUND_ADVANCE (GET_MODE_SIZE (MODE))))) /* Return boolean indicating arg of mode MODE will be passed in a reg. This macro is only used in this file. */ @@ -883,7 +1039,9 @@ extern int current_function_varargs; ((PASS_IN_REG_P ((CUM), (MODE), (TYPE)) \ && ((NAMED) || TARGET_SH3E || ! 
current_function_varargs)) \ ? gen_rtx (REG, (MODE), \ - (BASE_ARG_REG (MODE) + ROUND_REG ((CUM), (MODE)))) \ + ((BASE_ARG_REG (MODE) + ROUND_REG ((CUM), (MODE))) \ + ^ ((MODE) == SFmode && TARGET_SH4 \ + && TARGET_LITTLE_ENDIAN != 0))) \ : 0) /* For an arg passed partly in registers and partly in memory, @@ -894,8 +1052,9 @@ extern int current_function_varargs; #define FUNCTION_ARG_PARTIAL_NREGS(CUM, MODE, TYPE, NAMED) \ ((PASS_IN_REG_P ((CUM), (MODE), (TYPE)) \ + && ! TARGET_SH4 \ && (ROUND_REG ((CUM), (MODE)) \ - + (MODE != BLKmode \ + + ((MODE) != BLKmode \ ? ROUND_ADVANCE (GET_MODE_SIZE (MODE)) \ : ROUND_ADVANCE (int_size_in_bytes (TYPE))) \ - NPARM_REGS (MODE) > 0)) \ @@ -955,7 +1114,7 @@ extern int current_function_anonymous_args; /* Alignment required for a trampoline in bits . */ #define TRAMPOLINE_ALIGNMENT \ - ((CACHE_LOG < 3 || TARGET_SMALLCODE) ? 32 : 64) \ + ((CACHE_LOG < 3 || TARGET_SMALLCODE && ! TARGET_HARWARD) ? 32 : 64) /* Emit RTL insns to initialize the variable parts of a trampoline. FNADDR is an RTX for the address of the function's pure code. @@ -971,6 +1130,8 @@ extern int current_function_anonymous_args; (CXT)); \ emit_move_insn (gen_rtx (MEM, SImode, plus_constant ((TRAMP), 12)), \ (FNADDR)); \ + if (TARGET_HARWARD) \ + emit_insn (gen_ic_invalidate_line (TRAMP)); \ } /* A C expression whose value is RTL representing the value of the return @@ -1086,7 +1247,10 @@ extern struct rtx_def *sh_builtin_saveregs (); #define MODE_DISP_OK_4(X,MODE) \ (GET_MODE_SIZE (MODE) == 4 && (unsigned) INTVAL (X) < 64 \ && ! (INTVAL (X) & 3) && ! (TARGET_SH3E && (MODE) == SFmode)) -#define MODE_DISP_OK_8(X,MODE) ((GET_MODE_SIZE(MODE)==8) && ((unsigned)INTVAL(X)<60) && (!(INTVAL(X) &3))) + +#define MODE_DISP_OK_8(X,MODE) \ +((GET_MODE_SIZE(MODE)==8) && ((unsigned)INTVAL(X)<60) \ + && ! (INTVAL(X) & 3) && ! 
(TARGET_SH4 && (MODE) == DFmode)) #define BASE_REGISTER_RTX_P(X) \ ((GET_CODE (X) == REG && REG_OK_FOR_BASE_P (X)) \ @@ -1141,13 +1305,15 @@ extern struct rtx_def *sh_builtin_saveregs (); else if ((GET_CODE (X) == POST_INC || GET_CODE (X) == PRE_DEC) \ && BASE_REGISTER_RTX_P (XEXP ((X), 0))) \ goto LABEL; \ - else if (GET_CODE (X) == PLUS && MODE != PSImode) \ + else if (GET_CODE (X) == PLUS \ + && ((MODE) != PSImode || reload_completed)) \ { \ rtx xop0 = XEXP ((X), 0); \ rtx xop1 = XEXP ((X), 1); \ if (GET_MODE_SIZE (MODE) <= 8 && BASE_REGISTER_RTX_P (xop0)) \ GO_IF_LEGITIMATE_INDEX ((MODE), xop1, LABEL); \ - if (GET_MODE_SIZE (MODE) <= 4) \ + if (GET_MODE_SIZE (MODE) <= 4 \ + || TARGET_SH4 && TARGET_FMOVD && MODE == DFmode) \ { \ if (BASE_REGISTER_RTX_P (xop1) && INDEX_REGISTER_RTX_P (xop0))\ goto LABEL; \ @@ -1181,6 +1347,7 @@ extern struct rtx_def *sh_builtin_saveregs (); || GET_MODE_SIZE (MODE) == 8) \ && GET_CODE (XEXP ((X), 1)) == CONST_INT \ && BASE_REGISTER_RTX_P (XEXP ((X), 0)) \ + && ! (TARGET_SH4 && (MODE) == DFmode) \ && ! (TARGET_SH3E && (MODE) == SFmode)) \ { \ rtx index_rtx = XEXP ((X), 1); \ @@ -1228,12 +1395,21 @@ extern struct rtx_def *sh_builtin_saveregs (); && (GET_MODE_SIZE (MODE) == 4 || GET_MODE_SIZE (MODE) == 8) \ && GET_CODE (XEXP (X, 1)) == CONST_INT \ && BASE_REGISTER_RTX_P (XEXP (X, 0)) \ - && ! (TARGET_SH3E && MODE == SFmode)) \ + && ! (TARGET_SH4 && (MODE) == DFmode) \ + && ! ((MODE) == PSImode && (TYPE) == RELOAD_FOR_INPUT_ADDRESS)) \ { \ rtx index_rtx = XEXP (X, 1); \ HOST_WIDE_INT offset = INTVAL (index_rtx), offset_base; \ rtx sum; \ \ + if (TARGET_SH3E && MODE == SFmode) \ + { \ + X = copy_rtx (X); \ + push_reload (index_rtx, NULL_RTX, &XEXP (X, 1), NULL_PTR, \ + INDEX_REG_CLASS, Pmode, VOIDmode, 0, 0, (OPNUM), \ + (TYPE)); \ + goto WIN; \ + } \ /* Instead of offset_base 128..131 use 124..127, so that \ simple add suffices. 
*/ \ if (offset > 127) \ @@ -1315,7 +1491,7 @@ extern struct rtx_def *sh_builtin_saveregs (); /* Since the SH3e has only `float' support, it is desirable to make all floating point types equivalent to `float'. */ -#define DOUBLE_TYPE_SIZE (TARGET_SH3E ? 32 : 64) +#define DOUBLE_TYPE_SIZE ((TARGET_SH3E && ! TARGET_SH4) ? 32 : 64) /* 'char' is signed by default. */ #define DEFAULT_SIGNED_CHAR 1 @@ -1407,6 +1583,11 @@ extern struct rtx_def *sh_builtin_saveregs (); return 10; #define RTX_COSTS(X, CODE, OUTER_CODE) \ + case PLUS: \ + return (COSTS_N_INSNS (1) \ + + rtx_cost (XEXP ((X), 0), PLUS) \ + + (rtx_equal_p (XEXP ((X), 0), XEXP ((X), 1))\ + ? 0 : rtx_cost (XEXP ((X), 1), PLUS)));\ case AND: \ return COSTS_N_INSNS (andcosts (X)); \ case MULT: \ @@ -1414,7 +1595,13 @@ extern struct rtx_def *sh_builtin_saveregs (); case ASHIFT: \ case ASHIFTRT: \ case LSHIFTRT: \ - return COSTS_N_INSNS (shiftcosts (X)) ; \ + /* Add one extra unit for the matching constraint. \ + Otherwise loop strength reduction would think that\ + a shift with different sourc and destination is \ + as cheap as adding a constant to a register. */ \ + return (COSTS_N_INSNS (shiftcosts (X)) \ + + rtx_cost (XEXP ((X), 0), (CODE)) \ + + 1); \ case DIV: \ case UDIV: \ case MOD: \ @@ -1462,11 +1649,29 @@ extern struct rtx_def *sh_builtin_saveregs (); /* Compute extra cost of moving data between one register class and another. */ +/* Regclass always uses 2 for moves in the same register class; + If SECONDARY*_RELOAD_CLASS says something about the src/dst pair, + it uses this information. Hence, the general register <-> floating point + register information here is not used for SFmode. */ #define REGISTER_MOVE_COST(SRCCLASS, DSTCLASS) \ - ((DSTCLASS) == PR_REG ? 10 \ - : (((DSTCLASS) == FP_REGS && (SRCCLASS) == GENERAL_REGS) \ - || ((DSTCLASS) == GENERAL_REGS && (SRCCLASS) == FP_REGS)) ? 4 \ - : 1) + ((((DSTCLASS) == T_REGS) || ((DSTCLASS) == PR_REG)) ? 
10 \ + : ((((DSTCLASS) == FP0_REGS || (DSTCLASS) == FP_REGS || (DSTCLASS) == DF_REGS) \ + && ((SRCCLASS) == GENERAL_REGS || (SRCCLASS) == R0_REGS)) \ + || (((DSTCLASS) == GENERAL_REGS || (DSTCLASS) == R0_REGS) \ + && ((SRCCLASS) == FP0_REGS || (SRCCLASS) == FP_REGS \ + || (SRCCLASS) == DF_REGS))) \ + ? TARGET_FMOVD ? 8 : 12 \ + : (((DSTCLASS) == FPUL_REGS \ + && ((SRCCLASS) == GENERAL_REGS || (SRCCLASS) == R0_REGS)) \ + || (SRCCLASS == FPUL_REGS \ + && ((DSTCLASS) == GENERAL_REGS || (DSTCLASS) == R0_REGS))) \ + ? 5 \ + : (((DSTCLASS) == FPUL_REGS \ + && ((SRCCLASS) == PR_REGS || (SRCCLASS) == MAC_REGS)) \ + || ((SRCCLASS) == FPUL_REGS \ + && ((DSTCLASS) == PR_REGS || (DSTCLASS) == MAC_REGS))) \ + ? 7 \ + : 2) /* ??? Perhaps make MEMORY_MOVE_COST depend on compiler option? This would be so that people would slow memory systems could generate @@ -1573,13 +1778,32 @@ dtors_section() \ the Real framepointer; it can also be used as a normal general register. Note that the name `fp' is horribly misleading since `fp' is in fact only the argument-and-return-context pointer. 
*/ + +extern char fp_reg_names[][5]; + #define REGISTER_NAMES \ { \ "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", \ "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", \ + "ap", "pr", "t", "gbr", "mach","macl", fp_reg_names[16], "rap", \ + fp_reg_names[0], fp_reg_names[1] , fp_reg_names[2], fp_reg_names[3], \ + fp_reg_names[4], fp_reg_names[5], fp_reg_names[6], fp_reg_names[7], \ + fp_reg_names[8], fp_reg_names[9], fp_reg_names[10], fp_reg_names[11], \ + fp_reg_names[12], fp_reg_names[13], fp_reg_names[14], fp_reg_names[15], \ + fp_reg_names[17], fp_reg_names[18], fp_reg_names[19], fp_reg_names[20], \ + fp_reg_names[21], fp_reg_names[22], fp_reg_names[23], fp_reg_names[24], \ + "fpscr", \ +} + +#define DEBUG_REGISTER_NAMES \ +{ \ + "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", \ + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", \ "ap", "pr", "t", "gbr", "mach","macl", "fpul","rap", \ "fr0","fr1","fr2", "fr3", "fr4", "fr5", "fr6", "fr7", \ "fr8","fr9","fr10","fr11","fr12","fr13","fr14","fr15",\ + "xd0","xd2","xd4", "xd6", "xd8", "xd10","xd12","xd14", \ + "fpscr", \ } /* DBX register number for a given compiler register number. */ @@ -1773,7 +1997,8 @@ enum processor_type { PROCESSOR_SH1, PROCESSOR_SH2, PROCESSOR_SH3, - PROCESSOR_SH3E + PROCESSOR_SH3E, + PROCESSOR_SH4 }; #define sh_cpu_attr ((enum attr_cpu)sh_cpu) @@ -1837,6 +2062,11 @@ extern int sh_valid_machine_decl_attribute (); #define VALID_MACHINE_DECL_ATTRIBUTE(DECL, ATTRIBUTES, IDENTIFIER, ARGS) \ sh_valid_machine_decl_attribute (DECL, ATTRIBUTES, IDENTIFIER, ARGS) +extern int sh_flag_remove_dead_before_cse; +extern int rtx_equal_function_value_matters; +extern struct rtx_def *fpscr_rtx; +extern struct rtx_def *get_fpscr_rtx (); + #define MOVE_RATIO (TARGET_SMALLCODE ? 
2 : 16) @@ -1860,10 +2090,16 @@ sh_valid_machine_decl_attribute (DECL, ATTRIBUTES, IDENTIFIER, ARGS) {"arith_operand", {SUBREG, REG, CONST_INT}}, \ {"arith_reg_operand", {SUBREG, REG}}, \ {"arith_reg_or_0_operand", {SUBREG, REG, CONST_INT}}, \ + {"binary_float_operator", {PLUS, MULT}}, \ {"braf_label_ref_operand", {LABEL_REF}}, \ + {"commutative_float_operator", {PLUS, MULT}}, \ + {"fp_arith_reg_operand", {SUBREG, REG}}, \ + {"fp_extended_operand", {SUBREG, REG, FLOAT_EXTEND}}, \ + {"fpscr_operand", {REG}}, \ {"general_movsrc_operand", {SUBREG, REG, CONST_INT, MEM}}, \ {"general_movdst_operand", {SUBREG, REG, CONST_INT, MEM}}, \ {"logical_operand", {SUBREG, REG, CONST_INT}}, \ + {"noncommutative_float_operator", {MINUS, DIV}}, \ {"register_operand", {SUBREG, REG}}, /* Define this macro if it is advisable to hold scalars in registers @@ -1929,7 +2165,7 @@ do { \ using their arguments pretty quickly. \ Assume a four cycle delay before they are needed. */ \ if (! reg_set_p (reg, dep_insn)) \ - cost -= 4; \ + cost -= TARGET_SUPERSCALAR ? 40 : 4; \ } \ /* Adjust load_si / pcload_si type insns latency. Use the known \ nominal latency and form of the insn to speed up the check. */ \ @@ -1939,9 +2175,14 @@ do { \ it's actually a move insn. */ \ && general_movsrc_operand (SET_SRC (PATTERN (dep_insn)), SImode))\ cost = 2; \ + else if (cost == 30 \ + && GET_CODE (PATTERN (dep_insn)) == SET \ + && GET_MODE (SET_SRC (PATTERN (dep_insn))) == SImode) \ + cost = 20; \ } while (0) \ /* For the sake of libgcc2.c, indicate target supports atexit. */ #define HAVE_ATEXIT -#define SH_DYNAMIC_SHIFT_COST (TARGET_SH3 ? (TARGET_SMALLCODE ? 1 : 2) : 20) +#define SH_DYNAMIC_SHIFT_COST \ + (TARGET_HARD_SH4 ? 1 : TARGET_SH3 ? (TARGET_SMALLCODE ? 1 : 2) : 20) diff --git a/gcc/config/sh/sh.md b/gcc/config/sh/sh.md index 3ca60b5d9ba..8dde68e856f 100644 --- a/gcc/config/sh/sh.md +++ b/gcc/config/sh/sh.md @@ -70,13 +70,20 @@ ;; Target CPU. 
(define_attr "cpu" - "sh1,sh2,sh3,sh3e" + "sh1,sh2,sh3,sh3e,sh4" (const (symbol_ref "sh_cpu_attr"))) (define_attr "endian" "big,little" (const (if_then_else (symbol_ref "TARGET_LITTLE_ENDIAN") (const_string "little") (const_string "big")))) +(define_attr "fmovd" "yes,no" + (const (if_then_else (symbol_ref "TARGET_FMOVD") + (const_string "yes") (const_string "no")))) +;; issues/clock +(define_attr "issues" "1,2" + (const (if_then_else (symbol_ref "TARGET_SUPERSCALAR") (const_string "2") (const_string "1")))) + ;; cbranch conditional branch instructions ;; jump unconditional jumps ;; arith ordinary arithmetic @@ -101,10 +108,12 @@ ;; fp floating point ;; fdiv floating point divide (or square root) ;; gp_fpul move between general purpose register and fpul +;; dfp_arith, dfp_cmp,dfp_conv +;; dfdiv double precision floating point divide (or square root) ;; nil no-op move, will be deleted. (define_attr "type" - "cbranch,jump,jump_ind,arith,arith3,arith3b,dyn_shift,other,load,load_si,store,move,fmove,smpy,dmpy,return,pload,pstore,pcload,pcload_si,rte,sfunc,call,fp,fdiv,gp_fpul,nil" + "cbranch,jump,jump_ind,arith,arith3,arith3b,dyn_shift,other,load,load_si,store,move,fmove,smpy,dmpy,return,pload,pstore,pcload,pcload_si,rte,sfunc,call,fp,fdiv,dfp_arith,dfp_cmp,dfp_conv,dfdiv,gp_fpul,nil" (const_string "other")) ; If a conditional branch destination is within -252..258 bytes away @@ -252,34 +261,216 @@ ;; We only do this for SImode loads of general registers, to make the work ;; for ADJUST_COST easier. 
(define_function_unit "memory" 1 0 - (eq_attr "type" "load_si,pcload_si") + (and (eq_attr "issues" "1") + (eq_attr "type" "load_si,pcload_si")) 3 2) (define_function_unit "memory" 1 0 - (eq_attr "type" "load,pcload,pload,store,pstore") + (and (eq_attr "issues" "1") + (eq_attr "type" "load,pcload,pload,store,pstore")) 2 2) (define_function_unit "int" 1 0 - (eq_attr "type" "arith3,arith3b") 3 3) + (and (eq_attr "issues" "1") (eq_attr "type" "arith3,arith3b")) 3 3) (define_function_unit "int" 1 0 - (eq_attr "type" "dyn_shift") 2 2) + (and (eq_attr "issues" "1") (eq_attr "type" "dyn_shift")) 2 2) (define_function_unit "int" 1 0 - (eq_attr "type" "arith,arith3b,dyn_shift") 2 2) + (and (eq_attr "issues" "1") (eq_attr "type" "!arith3,arith3b,dyn_shift")) 1 1) ;; ??? These are approximations. -(define_function_unit "mpy" 1 0 (eq_attr "type" "smpy") 2 2) -(define_function_unit "mpy" 1 0 (eq_attr "type" "dmpy") 3 3) +(define_function_unit "mpy" 1 0 + (and (eq_attr "issues" "1") (eq_attr "type" "smpy")) 2 2) +(define_function_unit "mpy" 1 0 + (and (eq_attr "issues" "1") (eq_attr "type" "dmpy")) 3 3) + +(define_function_unit "fp" 1 0 + (and (eq_attr "issues" "1") (eq_attr "type" "fp,fmove")) 2 1) +(define_function_unit "fp" 1 0 + (and (eq_attr "issues" "1") (eq_attr "type" "fdiv")) 13 12) + + +;; SH4 scheduling +;; The SH4 is a dual-issue implementation, thus we have to multiply all +;; costs by at least two. +;; There will be single increments of the modeled that don't correspond +;; to the actual target ;; whenever two insns to be issued depend one a +;; single resource, and the scheduler picks to be the first one. +;; If we multiplied the costs just by two, just two of these single +;; increments would amount to an actual cycle. By picking a larger +;; factor, we can ameliorate the effect; However, we then have to make sure +;; that only two insns are modeled as issued per actual cycle. 
+;; Moreover, we need a way to specify the latency of insns that don't +;; use an actual function unit. +;; We use an 'issue' function unit to do that, and a cost factor of 10. + +(define_function_unit "issue" 2 0 + (and (eq_attr "issues" "2") (eq_attr "type" "!nil,arith3")) + 10 10) + +(define_function_unit "issue" 2 0 + (and (eq_attr "issues" "2") (eq_attr "type" "arith3")) + 30 30) + +;; There is no point in providing exact scheduling information about branches, +;; because they are at the starts / ends of basic blocks anyways. + +;; Some insns cannot be issued before/after another insn in the same cycle, +;; irrespective of the type of the other insn. + +;; default is dual-issue, but can't be paired with an insn that +;; uses multiple function units. +(define_function_unit "single_issue" 1 0 + (and (eq_attr "issues" "2") + (eq_attr "type" "!smpy,dmpy,pload,pstore,dfp_cmp,gp_fpul,call,sfunc,arith3,arith3b")) + 1 10 + [(eq_attr "type" "smpy,dmpy,pload,pstore,dfp_cmp,gp_fpul")]) + +(define_function_unit "single_issue" 1 0 + (and (eq_attr "issues" "2") + (eq_attr "type" "smpy,dmpy,pload,pstore,dfp_cmp,gp_fpul")) + 10 10 + [(const_int 1)]) + +;; arith3 insns are always pairable at the start, but not inecessarily at +;; the end; however, there doesn;t seem to be a way to express that. +(define_function_unit "single_issue" 1 0 + (and (eq_attr "issues" "2") + (eq_attr "type" "arith3")) + 30 20 + [(const_int 1)]) + +;; arith3b insn are pairable at the end and have latency that prevents pairing +;; with the following branch, but we don't want this latency be respected; +;; When the following branch is immediately adjacent, we can redirect the +;; internal branch, which is likly to be a larger win. +(define_function_unit "single_issue" 1 0 + (and (eq_attr "issues" "2") + (eq_attr "type" "arith3b")) + 20 20 + [(const_int 1)]) + +;; calls introduce a longisch delay that is likely to flush the pipelines. 
+(define_function_unit "single_issue" 1 0 + (and (eq_attr "issues" "2") + (eq_attr "type" "call,sfunc")) + 160 160 + [(eq_attr "type" "!call") (eq_attr "type" "call")]) + +;; Load and store instructions have no alignment peculiarities for the SH4, +;; but they use the load-store unit, which they share with the fmove type +;; insns (fldi[01]; fmov frn,frm; flds; fsts; fabs; fneg) . +;; Loads have a latency of two. +;; However, call insns can only paired with a preceding insn, and have +;; a delay slot, so that we want two more insns to be scheduled between the +;; load of the function address and the call. This is equivalent to a +;; latency of three. +;; We cannot use a conflict list for this, because we need to distinguish +;; between the actual call address and the function arguments. +;; ADJUST_COST can only properly handle reductions of the cost, so we +;; use a latency of three here, which gets multiplied by 10 to yield 30. +;; We only do this for SImode loads of general registers, to make the work +;; for ADJUST_COST easier. -(define_function_unit "fp" 1 0 (eq_attr "type" "fp,fmove") 2 1) -(define_function_unit "fp" 1 0 (eq_attr "type" "fdiv") 13 12) +;; When specifying different latencies for different insns using the +;; the same function unit, genattrtab.c assumes a 'FIFO constraint' +;; so that the blockage is at least READY-COST (E) + 1 - READY-COST (C) +;; for an executing insn E and a candidate insn C. +;; Therefore, we define three different function units for load_store: +;; load_store, load and load_si. 
+ +(define_function_unit "load_si" 1 0 + (and (eq_attr "issues" "2") + (eq_attr "type" "load_si,pcload_si")) 30 10) +(define_function_unit "load" 1 0 + (and (eq_attr "issues" "2") + (eq_attr "type" "load,pcload,pload")) 20 10) +(define_function_unit "load_store" 1 0 + (and (eq_attr "issues" "2") + (eq_attr "type" "load_si,pcload_si,load,pcload,pload,store,pstore,fmove")) + 10 10) +(define_function_unit "int" 1 0 + (and (eq_attr "issues" "2") (eq_attr "type" "arith,dyn_shift")) 10 10) + +;; Again, we have to pretend a lower latency for the "int" unit to avoid a +;; spurious FIFO constraint; the multiply instructions use the "int" +;; unit actually only for two cycles. +(define_function_unit "int" 1 0 + (and (eq_attr "issues" "2") (eq_attr "type" "smpy,dmpy")) 20 20) + +;; We use a fictous "mpy" unit to express the actual latency. +(define_function_unit "mpy" 1 0 + (and (eq_attr "issues" "2") (eq_attr "type" "smpy,dmpy")) 40 20) + +;; Again, we have to pretend a lower latency for the "int" unit to avoid a +;; spurious FIFO constraint. +(define_function_unit "int" 1 0 + (and (eq_attr "issues" "2") (eq_attr "type" "gp_fpul")) 10 10) + +;; We use a fictous "gp_fpul" unit to express the actual latency. +(define_function_unit "gp_fpul" 1 0 + (and (eq_attr "issues" "2") (eq_attr "type" "gp_fpul")) 20 10) + +;; ??? multiply uses the floating point unit, but with a two cycle delay. +;; Thus, a simple single-precision fp operation could finish if issued in +;; the very next cycle, but stalls when issued two or three cycles later. +;; Similarily, a divide / sqrt can work without stalls if issued in +;; the very next cycle, while it would have to block if issued two or +;; three cycles later. +;; There is no way to model this with gcc's function units. This problem is +;; actually mentioned in md.texi. Tackling this problem requires first that +;; it is possible to speak about the target in an open discussion. +;; +;; However, simple double-precision operations always conflict. 
+ +(define_function_unit "fp" 1 0 + (and (eq_attr "issues" "2") (eq_attr "type" "smpy,dmpy")) 40 40 + [(eq_attr "type" "dfp_cmp,dfp_conv,dfp_arith")]) + +;; The "fp" unit is for pipeline stages F1 and F2. + +(define_function_unit "fp" 1 0 + (and (eq_attr "issues" "2") (eq_attr "type" "fp")) 30 10) + +;; Again, we have to pretend a lower latency for the "fp" unit to avoid a +;; spurious FIFO constraint; the bulk of the fdiv type insns executes in +;; the F3 stage. +(define_function_unit "fp" 1 0 + (and (eq_attr "issues" "2") (eq_attr "type" "fdiv")) 30 10) + +;; The "fdiv" function unit models the aggregate effect of the F1, F2 and F3 +;; pipeline stages on the pipelining of fdiv/fsqrt insns. +;; We also use it to give the actual latency here. +;; fsqrt is actually one cycle faster than fdiv (and the value used here), +;; but that will hardly matter in practice for scheduling. +(define_function_unit "fdiv" 1 0 + (and (eq_attr "issues" "2") (eq_attr "type" "fdiv")) 120 100) + +;; There is again a late use of the "fp" unit by [d]fdiv type insns +;; that we can't express. + +(define_function_unit "fp" 1 0 + (and (eq_attr "issues" "2") (eq_attr "type" "dfp_cmp,dfp_conv")) 40 20) + +(define_function_unit "fp" 1 0 + (and (eq_attr "issues" "2") (eq_attr "type" "dfp_arith")) 80 60) + +(define_function_unit "fp" 1 0 + (and (eq_attr "issues" "2") (eq_attr "type" "dfdiv")) 230 10) + +(define_function_unit "fdiv" 1 0 + (and (eq_attr "issues" "2") (eq_attr "type" "dfdiv")) 230 210) ; Definitions for filling branch delay slots. (define_attr "needs_delay_slot" "yes,no" (const_string "no")) -(define_attr "hit_stack" "yes,no" (const_string "no")) +;; ??? 
This should be (nil) instead of (const_int 0) +(define_attr "hit_stack" "yes,no" + (cond [(eq (symbol_ref "find_regno_note (insn, REG_INC, 15)") (const_int 0)) + (const_string "no")] + (const_string "yes"))) (define_attr "interrupt_function" "no,yes" (const (symbol_ref "pragma_interrupt"))) @@ -668,7 +859,42 @@ (clobber (reg:SI 17)) (clobber (reg:SI 4)) (use (match_operand:SI 1 "arith_reg_operand" "r"))] - "" + "! TARGET_SH4" + "jsr @%1%#" + [(set_attr "type" "sfunc") + (set_attr "needs_delay_slot" "yes")]) + +(define_insn "udivsi3_i4" + [(set (match_operand:SI 0 "register_operand" "=y") + (udiv:SI (reg:SI 4) (reg:SI 5))) + (clobber (reg:SI 17)) + (clobber (reg:DF 24)) + (clobber (reg:DF 26)) + (clobber (reg:DF 28)) + (clobber (reg:SI 0)) + (clobber (reg:SI 1)) + (clobber (reg:SI 4)) + (clobber (reg:SI 5)) + (use (reg:PSI 48)) + (use (match_operand:SI 1 "arith_reg_operand" "r"))] + "TARGET_SH4 && ! TARGET_FPU_SINGLE" + "jsr @%1%#" + [(set_attr "type" "sfunc") + (set_attr "needs_delay_slot" "yes")]) + +(define_insn "udivsi3_i4_single" + [(set (match_operand:SI 0 "register_operand" "=y") + (udiv:SI (reg:SI 4) (reg:SI 5))) + (clobber (reg:SI 17)) + (clobber (reg:DF 24)) + (clobber (reg:DF 26)) + (clobber (reg:DF 28)) + (clobber (reg:SI 0)) + (clobber (reg:SI 1)) + (clobber (reg:SI 4)) + (clobber (reg:SI 5)) + (use (match_operand:SI 1 "arith_reg_operand" "r"))] + "TARGET_HARD_SH4 && TARGET_FPU_SINGLE" "jsr @%1%#" [(set_attr "type" "sfunc") (set_attr "needs_delay_slot" "yes")]) @@ -685,7 +911,22 @@ (clobber (reg:SI 4)) (use (match_dup 3))])] "" - "operands[3] = gen_reg_rtx(SImode);") + " +{ + operands[3] = gen_reg_rtx(SImode); + if (TARGET_HARD_SH4) + { + emit_move_insn (gen_rtx (REG, SImode, 4), operands[1]); + emit_move_insn (gen_rtx (REG, SImode, 5), operands[2]); + emit_move_insn (operands[3], + gen_rtx_SYMBOL_REF (SImode, \"__udivsi3_i4\")); + if (TARGET_FPU_SINGLE) + emit_insn (gen_udivsi3_i4_single (operands[0], operands[3])); + else + emit_insn (gen_udivsi3_i4 
(operands[0], operands[3])); + DONE; + } +}") (define_insn "" [(set (match_operand:SI 0 "register_operand" "=z") @@ -696,7 +937,33 @@ (clobber (reg:SI 2)) (clobber (reg:SI 3)) (use (match_operand:SI 1 "arith_reg_operand" "r"))] - "" + "! TARGET_SH4" + "jsr @%1%#" + [(set_attr "type" "sfunc") + (set_attr "needs_delay_slot" "yes")]) + +(define_insn "divsi3_i4" + [(set (match_operand:SI 0 "register_operand" "=y") + (div:SI (reg:SI 4) (reg:SI 5))) + (clobber (reg:SI 17)) + (clobber (reg:DF 24)) + (clobber (reg:DF 26)) + (use (reg:PSI 48)) + (use (match_operand:SI 1 "arith_reg_operand" "r"))] + "TARGET_SH4 && ! TARGET_FPU_SINGLE" + "jsr @%1%#" + [(set_attr "type" "sfunc") + (set_attr "needs_delay_slot" "yes")]) + +(define_insn "divsi3_i4_single" + [(set (match_operand:SI 0 "register_operand" "=y") + (div:SI (reg:SI 4) (reg:SI 5))) + (clobber (reg:SI 17)) + (clobber (reg:DF 24)) + (clobber (reg:DF 26)) + (clobber (reg:SI 2)) + (use (match_operand:SI 1 "arith_reg_operand" "r"))] + "TARGET_HARD_SH4 && TARGET_FPU_SINGLE" "jsr @%1%#" [(set_attr "type" "sfunc") (set_attr "needs_delay_slot" "yes")]) @@ -715,7 +982,22 @@ (clobber (reg:SI 3)) (use (match_dup 3))])] "" - "operands[3] = gen_reg_rtx(SImode);") + " +{ + operands[3] = gen_reg_rtx(SImode); + if (TARGET_HARD_SH4) + { + emit_move_insn (gen_rtx (REG, SImode, 4), operands[1]); + emit_move_insn (gen_rtx (REG, SImode, 5), operands[2]); + emit_move_insn (operands[3], + gen_rtx_SYMBOL_REF (SImode, \"__sdivsi3_i4\")); + if (TARGET_FPU_SINGLE) + emit_insn (gen_divsi3_i4_single (operands[0], operands[3])); + else + emit_insn (gen_divsi3_i4 (operands[0], operands[3])); + DONE; + } +}") ;; ------------------------------------------------------------------------- ;; Multiplication instructions @@ -782,7 +1064,6 @@ (define_expand "mulsi3_call" [(set (reg:SI 4) (match_operand:SI 1 "general_operand" "")) (set (reg:SI 5) (match_operand:SI 2 "general_operand" "")) - (set (match_dup 3) (symbol_ref:SI "__mulsi3")) (parallel[(set 
(match_operand:SI 0 "register_operand" "") (mult:SI (reg:SI 4) (reg:SI 5))) @@ -792,9 +1073,9 @@ (clobber (reg:SI 3)) (clobber (reg:SI 2)) (clobber (reg:SI 1)) - (use (match_dup 3))])] + (use (match_operand:SI 3 "register_operand" ""))])] "" - "operands[3] = gen_reg_rtx(SImode);") + "") (define_insn "mul_l" [(set (reg:SI 21) @@ -813,13 +1094,29 @@ "" " { + rtx first, last; + if (!TARGET_SH2) { - FAIL; - /* ??? Does this give worse or better code? */ - emit_insn (gen_mulsi3_call (operands[0], operands[1], operands[2])); - DONE; + /* The address must be set outside the libcall, + since it goes into a pseudo. */ + rtx addr = force_reg (SImode, gen_rtx_SYMBOL_REF (SImode, \"__mulsi3\")); + rtx insns = gen_mulsi3_call (operands[0], operands[1], operands[2], addr); + first = XVECEXP (insns, 0, 0); + last = XVECEXP (insns, 0, XVECLEN (insns, 0) - 1); + emit_insn (insns); } + else + { + rtx macl = gen_rtx_REG (SImode, MACL_REG); + first = emit_insn (gen_mul_l (operands[1], operands[2])); + last = emit_insn (gen_movsi_i ((operands[0]), macl)); + } + /* Wrap the sequence in REG_LIBCALL / REG_RETVAL notes so that loop + invariant code motion can move it. 
*/ + REG_NOTES (first) = gen_rtx_INSN_LIST (REG_LIBCALL, last, REG_NOTES (first)); + REG_NOTES (last) = gen_rtx_INSN_LIST (REG_RETVAL, first, REG_NOTES (last)); + DONE; }") (define_insn "mulsidi3_i" @@ -1767,50 +2064,65 @@ ;; define push and pop so it is easy for sh.c -(define_insn "push" +(define_expand "push" [(set (mem:SI (pre_dec:SI (reg:SI 15))) (match_operand:SI 0 "register_operand" "r,l,x"))] "" - "@ - mov.l %0,@-r15 - sts.l %0,@-r15 - sts.l %0,@-r15" - [(set_attr "type" "store,pstore,store") - (set_attr "hit_stack" "yes")]) + "") -(define_insn "pop" +(define_expand "pop" [(set (match_operand:SI 0 "register_operand" "=r,l,x") (mem:SI (post_inc:SI (reg:SI 15))))] "" - "@ - mov.l @r15+,%0 - lds.l @r15+,%0 - lds.l @r15+,%0" - [(set_attr "type" "load,pload,load") - (set_attr "hit_stack" "yes")]) + "") + +(define_expand "push_e" + [(parallel [(set (mem:SF (pre_dec:SI (reg:SI 15))) + (match_operand:SF 0 "" "")) + (use (reg:PSI 48)) + (clobber (scratch:SI))])] + "" + "") -(define_insn "push_e" - [(set (mem:SF (pre_dec:SI (reg:SI 15))) - (match_operand:SF 0 "register_operand" "r,f,y"))] +(define_insn "push_fpul" + [(set (mem:SF (pre_dec:SI (reg:SI 15))) (reg:SF 22))] "TARGET_SH3E" - "@ - mov.l %0,@-r15 - fmov.s %0,@-r15 - sts.l %0,@-r15" + "sts.l fpul,@-r15" [(set_attr "type" "store") (set_attr "hit_stack" "yes")]) -(define_insn "pop_e" - [(set (match_operand:SF 0 "register_operand" "=r,f,y") - (mem:SF (post_inc:SI (reg:SI 15))))] +;; DFmode pushes for sh4 require a lot of what is defined for movdf_i4, +;; so use that. 
+(define_expand "push_4" + [(parallel [(set (mem:DF (pre_dec:SI (reg:SI 15))) (match_operand:DF 0 "" "")) + (use (reg:PSI 48)) + (clobber (scratch:SI))])] + "" + "") + +(define_expand "pop_e" + [(parallel [(set (match_operand:SF 0 "" "") + (mem:SF (post_inc:SI (reg:SI 15)))) + (use (reg:PSI 48)) + (clobber (scratch:SI))])] + "" + "") + +(define_insn "pop_fpul" + [(set (reg:SF 22) (mem:SF (post_inc:SI (reg:SI 15))))] "TARGET_SH3E" - "@ - mov.l @r15+,%0 - fmov.s @r15+,%0 - lds.l @r15+,%0" + "lds.l @r15+,fpul" [(set_attr "type" "load") (set_attr "hit_stack" "yes")]) +(define_expand "pop_4" + [(parallel [(set (match_operand:DF 0 "" "") + (mem:DF (post_inc:SI (reg:SI 15)))) + (use (reg:PSI 48)) + (clobber (scratch:SI))])] + "" + "") + ;; These two patterns can happen as the result of optimization, when ;; comparisons get simplified to a move of zero or 1 into the T reg. ;; They don't disappear completely, because the T reg is a fixed hard reg. @@ -1829,7 +2141,7 @@ ;; of a pseudo-reg into the T reg (define_insn "movsi_i" [(set (match_operand:SI 0 "general_movdst_operand" "=t,r,r,r,r,r,m,<,<,xl,x,l,r") - (match_operand:SI 1 "general_movsrc_operand" "r,Q,rI,m,xl,t,r,x,l,r,>,>,i"))] + (match_operand:SI 1 "general_movsrc_operand" "r,Q,rI,mr,xl,t,r,x,l,r,>,>,i"))] " ! TARGET_SH3E && (register_operand (operands[0], SImode) @@ -1856,8 +2168,8 @@ ;; ??? This allows moves from macl to fpul to be recognized, but these moves ;; will require a reload. 
(define_insn "movsi_ie" - [(set (match_operand:SI 0 "general_movdst_operand" "=r,r,t,r,r,r,m,<,<,xl,x,l,r,y,r,y") - (match_operand:SI 1 "general_movsrc_operand" "Q,rI,r,m,xl,t,r,x,l,r,>,>,i,r,y,y"))] + [(set (match_operand:SI 0 "general_movdst_operand" "=r,r,t,r,r,r,m,<,<,xl,x,l,y,r,y,r,y") + (match_operand:SI 1 "general_movsrc_operand" "Q,rI,r,mr,xl,t,r,x,l,r,>,>,>,i,r,y,y"))] "TARGET_SH3E && (register_operand (operands[0], SImode) || register_operand (operands[1], SImode))" @@ -1874,16 +2186,17 @@ lds %1,%0 lds.l %1,%0 lds.l %1,%0 + lds.l %1,%0 fake %1,%0 lds %1,%0 sts %1,%0 ! move optimized away" - [(set_attr "type" "pcload_si,move,*,load_si,move,move,store,store,pstore,move,load,pload,pcload_si,gp_fpul,gp_fpul,nil") - (set_attr "length" "*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,0")]) + [(set_attr "type" "pcload_si,move,*,load_si,move,move,store,store,pstore,move,load,pload,load,pcload_si,gp_fpul,gp_fpul,nil") + (set_attr "length" "*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,0")]) (define_insn "movsi_i_lowpart" [(set (strict_low_part (match_operand:SI 0 "general_movdst_operand" "=r,r,r,r,r,m,r")) - (match_operand:SI 1 "general_movsrc_operand" "Q,rI,m,xl,t,r,i"))] + (match_operand:SI 1 "general_movsrc_operand" "Q,rI,mr,xl,t,r,i"))] "register_operand (operands[0], SImode) || register_operand (operands[1], SImode)" "@ @@ -1901,6 +2214,30 @@ "" "{ if (prepare_move_operands (operands, SImode)) DONE; }") +(define_expand "ic_invalidate_line" + [(parallel [(unspec_volatile [(match_operand:SI 0 "register_operand" "+r") + (match_dup 1)] 12) + (clobber (scratch:SI))])] + "TARGET_HARD_SH4" + " +{ + operands[0] = force_reg (Pmode, operands[0]); + operands[1] = force_reg (Pmode, GEN_INT (0xf0000008)); +}") + +;; The address %0 is assumed to be 4-aligned at least. Thus, by ORing +;; 0xf0000008, we get the low-oder bits *1*00 (binary), ;; which fits +;; the requirement *0*00 for associative address writes. 
The alignment of +;; %0 implies that its least significant bit is cleared, +;; thus we clear the V bit of a matching entry if there is one. +(define_insn "ic_invalidate_line_i" + [(unspec_volatile [(match_operand:SI 0 "register_operand" "r,r") + (match_operand:SI 1 "register_operand" "r,r")] 12) + (clobber (match_scratch:SI 2 "=&r,1"))] + "TARGET_HARD_SH4" + "ocbwb\\t@%0\;extu.w\\t%0,%2\;or\\t%r1,%r2\;mov.l\\t%0,@%2" + [(set_attr "length" "8")]) + (define_insn "movqi_i" [(set (match_operand:QI 0 "general_movdst_operand" "=r,r,m,r,r,l") (match_operand:QI 1 "general_movsrc_operand" "ri,m,r,t,l,r"))] @@ -2014,12 +2351,330 @@ (define_insn "movdf_k" [(set (match_operand:DF 0 "general_movdst_operand" "=r,r,r,m") (match_operand:DF 1 "general_movsrc_operand" "r,FQ,m,r"))] - "arith_reg_operand (operands[0], DFmode) - || arith_reg_operand (operands[1], DFmode)" + "(! TARGET_SH4 || reload_completed + /* ??? We provide some insn so that direct_{load,store}[DFmode] get set */ + || GET_CODE (operands[0]) == REG && REGNO (operands[0]) == 3 + || GET_CODE (operands[1]) == REG && REGNO (operands[1]) == 3) + && (arith_reg_operand (operands[0], DFmode) + || arith_reg_operand (operands[1], DFmode))" "* return output_movedouble (insn, operands, DFmode);" [(set_attr "length" "4") (set_attr "type" "move,pcload,load,store")]) +;; All alternatives of movdf_i4 are split for ! TARGET_FMOVD. +;; However, the d/F/c/z alternative cannot be split directly; it is converted +;; with special code in machine_dependent_reorg into a load of the R0_REG and +;; the d/m/c/X alternative, which is split later into single-precision +;; instructions. And when not optimizing, no splits are done before fixing +;; up pcloads, so we need usable length information for that. 
+(define_insn "movdf_i4" + [(set (match_operand:DF 0 "general_movdst_operand" "=d,r,d,d,m,r,r,m,!??r,!???d") + (match_operand:DF 1 "general_movsrc_operand" "d,r,F,m,d,FQ,m,r,d,r")) + (use (match_operand:PSI 2 "fpscr_operand" "c,c,c,c,c,c,c,c,c,c")) + (clobber (match_scratch:SI 3 "=X,X,&z,X,X,X,X,X,X,X"))] + "TARGET_SH4 + && (arith_reg_operand (operands[0], DFmode) + || arith_reg_operand (operands[1], DFmode))" + "@ + fmov %1,%0 + # + # + fmov.d %1,%0 + fmov.d %1,%0 + # + # + # + # + #" + [(set_attr_alternative "length" + [(if_then_else (eq_attr "fmovd" "yes") (const_int 2) (const_int 4)) + (const_int 4) + (if_then_else (eq_attr "fmovd" "yes") (const_int 4) (const_int 6)) + (if_then_else (eq_attr "fmovd" "yes") (const_int 2) (const_int 6)) + (if_then_else (eq_attr "fmovd" "yes") (const_int 2) (const_int 6)) + (const_int 4) + (const_int 8) (const_int 8) ;; these need only 8 bytes for @(r0,rn) + (const_int 8) (const_int 8)]) + (set_attr "type" "fmove,move,pcload,load,store,pcload,load,store,load,load")]) + +;; Moving DFmode between fp/general registers through memory +;; (the top of the stack) is faster than moving through fpul even for +;; little endian. Because the type of an instruction is important for its +;; scheduling, it is beneficial to split these operations, rather than +;; emitting them in one single chunk, even if this will expose a stack +;; use that will prevent scheduling of other stack accesses beyond this +;; instruction. 
+(define_split + [(set (match_operand:DF 0 "register_operand" "") + (match_operand:DF 1 "register_operand" "")) + (use (match_operand:PSI 2 "fpscr_operand" "c")) + (clobber (match_scratch:SI 3 "=X"))] + "TARGET_SH4 && reload_completed + && (true_regnum (operands[0]) < 16) != (true_regnum (operands[1]) < 16)" + [(const_int 0)] + " +{ + rtx insn, tos; + + tos = gen_rtx (MEM, DFmode, gen_rtx (PRE_DEC, Pmode, stack_pointer_rtx)); + insn = emit_insn (gen_movdf_i4 (tos, operands[1], operands[2])); + REG_NOTES (insn) = gen_rtx (EXPR_LIST, REG_INC, stack_pointer_rtx, NULL_RTX); + tos = gen_rtx (MEM, DFmode, gen_rtx (POST_INC, Pmode, stack_pointer_rtx)); + insn = emit_insn (gen_movdf_i4 (operands[0], tos, operands[2])); + REG_NOTES (insn) = gen_rtx (EXPR_LIST, REG_INC, stack_pointer_rtx, NULL_RTX); + DONE; +}") + +;; local-alloc sometimes allocates scratch registers even when not required, +;; so we must be prepared to handle these. + +;; Remove the use and clobber from a movdf_i4 so that we can use movdf_k. +(define_split + [(set (match_operand:DF 0 "general_movdst_operand" "") + (match_operand:DF 1 "general_movsrc_operand" "")) + (use (match_operand:PSI 2 "fpscr_operand" "c")) + (clobber (match_scratch:SI 3 "X"))] + "TARGET_SH4 + && reload_completed + && true_regnum (operands[0]) < 16 + && true_regnum (operands[1]) < 16" + [(set (match_dup 0) (match_dup 1))] + " +{ + /* If this was a reg <-> mem operation with base + index reg addressing, + we have to handle this in a special way. */ + rtx mem = operands[0]; + int store_p = 1; + if (! 
memory_operand (mem, DFmode)) + { + mem = operands[1]; + store_p = 0; + } + if (GET_CODE (mem) == SUBREG && SUBREG_WORD (mem) == 0) + mem = SUBREG_REG (mem); + if (GET_CODE (mem) == MEM) + { + rtx addr = XEXP (mem, 0); + if (GET_CODE (addr) == PLUS + && GET_CODE (XEXP (addr, 0)) == REG + && GET_CODE (XEXP (addr, 1)) == REG) + { + int offset; + rtx reg0 = gen_rtx (REG, Pmode, 0); + rtx regop = operands[store_p], word0 ,word1; + + if (GET_CODE (regop) == SUBREG) + regop = alter_subreg (regop); + if (REGNO (XEXP (addr, 0)) == REGNO (XEXP (addr, 1))) + offset = 2; + else + offset = 4; + mem = copy_rtx (mem); + PUT_MODE (mem, SImode); + word0 = gen_rtx(SUBREG, SImode, regop, 0); + emit_insn (store_p + ? gen_movsi_ie (mem, word0) : gen_movsi_ie (word0, mem)); + emit_insn (gen_addsi3 (reg0, reg0, GEN_INT (offset))); + mem = copy_rtx (mem); + word1 = gen_rtx(SUBREG, SImode, regop, 1); + emit_insn (store_p + ? gen_movsi_ie (mem, word1) : gen_movsi_ie (word1, mem)); + emit_insn (gen_addsi3 (reg0, reg0, GEN_INT (-offset))); + DONE; + } + } +}") + +;; Split away the clobber of r0 after machine_dependent_reorg has fixed pcloads. +(define_split + [(set (match_operand:DF 0 "register_operand" "") + (match_operand:DF 1 "memory_operand" "")) + (use (match_operand:PSI 2 "fpscr_operand" "c")) + (clobber (reg:SI 0))] + "TARGET_SH4 && reload_completed" + [(parallel [(set (match_dup 0) (match_dup 1)) + (use (match_dup 2)) + (clobber (scratch:SI))])] + "") + +(define_expand "reload_indf" + [(parallel [(set (match_operand:DF 0 "register_operand" "=f") + (match_operand:DF 1 "immediate_operand" "FQ")) + (use (reg:PSI 48)) + (clobber (match_operand:SI 2 "register_operand" "=&z"))])] + "" + "") + +(define_expand "reload_outdf" + [(parallel [(set (match_operand:DF 0 "register_operand" "=r,f") + (match_operand:DF 1 "register_operand" "af,r")) + (clobber (match_operand:SI 2 "register_operand" "=&y,y"))])] + "" + "") + +;; Simplify no-op moves. 
+(define_split + [(set (match_operand:SF 0 "register_operand" "") + (match_operand:SF 1 "register_operand" "")) + (use (match_operand:PSI 2 "fpscr_operand" "")) + (clobber (match_scratch:SI 3 "X"))] + "TARGET_SH3E && reload_completed + && true_regnum (operands[0]) == true_regnum (operands[1])" + [(set (match_dup 0) (match_dup 0))] + "") + +;; fmovd substitute post-reload splits +(define_split + [(set (match_operand:DF 0 "register_operand" "") + (match_operand:DF 1 "register_operand" "")) + (use (match_operand:PSI 2 "fpscr_operand" "c")) + (clobber (match_scratch:SI 3 "X"))] + "TARGET_SH4 && ! TARGET_FMOVD && reload_completed + && true_regnum (operands[0]) >= FIRST_FP_REG + && true_regnum (operands[1]) >= FIRST_FP_REG" + [(const_int 0)] + " +{ + int dst = true_regnum (operands[0]), src = true_regnum (operands[1]); + emit_insn (gen_movsf_ie (gen_rtx (REG, SFmode, dst), + gen_rtx (REG, SFmode, src), operands[2])); + emit_insn (gen_movsf_ie (gen_rtx (REG, SFmode, dst + 1), + gen_rtx (REG, SFmode, src + 1), operands[2])); + DONE; +}") + +(define_split + [(set (match_operand:DF 0 "register_operand" "") + (mem:DF (match_operand:SI 1 "register_operand" ""))) + (use (match_operand:PSI 2 "fpscr_operand" "c")) + (clobber (match_scratch:SI 3 "X"))] + "TARGET_SH4 && ! TARGET_FMOVD && reload_completed + && true_regnum (operands[0]) >= FIRST_FP_REG + && find_regno_note (insn, REG_DEAD, true_regnum (operands[1]))" + [(const_int 0)] + " +{ + int regno = true_regnum (operands[0]); + rtx insn; + rtx mem2 = gen_rtx (MEM, SFmode, gen_rtx (POST_INC, Pmode, operands[1])); + + insn = emit_insn (gen_movsf_ie (gen_rtx (REG, SFmode, + regno + !! TARGET_LITTLE_ENDIAN), + mem2, operands[2])); + REG_NOTES (insn) = gen_rtx (EXPR_LIST, REG_INC, operands[1], NULL_RTX); + insn = emit_insn (gen_movsf_ie (gen_rtx (REG, SFmode, + regno + ! 
TARGET_LITTLE_ENDIAN), + gen_rtx (MEM, SFmode, operands[1]), + operands[2])); + DONE; +}") + +(define_split + [(set (match_operand:DF 0 "register_operand" "") + (match_operand:DF 1 "memory_operand" "")) + (use (match_operand:PSI 2 "fpscr_operand" "c")) + (clobber (match_scratch:SI 3 "X"))] + "TARGET_SH4 && ! TARGET_FMOVD && reload_completed + && true_regnum (operands[0]) >= FIRST_FP_REG" + [(const_int 0)] + " +{ + int regno = true_regnum (operands[0]); + rtx addr, insn, adjust = NULL_RTX; + rtx mem2 = copy_rtx (operands[1]); + rtx reg0 = gen_rtx_REG (SFmode, regno + !! TARGET_LITTLE_ENDIAN); + rtx reg1 = gen_rtx_REG (SFmode, regno + ! TARGET_LITTLE_ENDIAN); + + PUT_MODE (mem2, SFmode); + operands[1] = copy_rtx (mem2); + addr = XEXP (mem2, 0); + if (GET_CODE (addr) != POST_INC) + { + /* If we have to modify the stack pointer, the value that we have + read with post-increment might be modified by an interrupt, + so write it back. */ + if (REGNO (addr) == STACK_POINTER_REGNUM) + adjust = gen_push_e (reg0); + else + adjust = gen_addsi3 (addr, addr, GEN_INT (-4)); + XEXP (mem2, 0) = addr = gen_rtx_POST_INC (SImode, addr); + } + addr = XEXP (addr, 0); + insn = emit_insn (gen_movsf_ie (reg0, mem2, operands[2])); + REG_NOTES (insn) = gen_rtx_EXPR_LIST (REG_INC, addr, NULL_RTX); + insn = emit_insn (gen_movsf_ie (reg1, operands[1], operands[2])); + if (adjust) + emit_insn (adjust); + else + REG_NOTES (insn) = gen_rtx_EXPR_LIST (REG_INC, addr, NULL_RTX); + DONE; +}") + +(define_split + [(set (match_operand:DF 0 "memory_operand" "") + (match_operand:DF 1 "register_operand" "")) + (use (match_operand:PSI 2 "fpscr_operand" "c")) + (clobber (match_scratch:SI 3 "X"))] + "TARGET_SH4 && ! 
TARGET_FMOVD && reload_completed + && true_regnum (operands[1]) >= FIRST_FP_REG" + [(const_int 0)] + " +{ + int regno = true_regnum (operands[1]); + rtx insn, addr, adjust = NULL_RTX; + + operands[0] = copy_rtx (operands[0]); + PUT_MODE (operands[0], SFmode); + insn = emit_insn (gen_movsf_ie (operands[0], + gen_rtx (REG, SFmode, + regno + ! TARGET_LITTLE_ENDIAN), + operands[2])); + operands[0] = copy_rtx (operands[0]); + addr = XEXP (operands[0], 0); + if (GET_CODE (addr) != PRE_DEC) + { + adjust = gen_addsi3 (addr, addr, GEN_INT (4)); + emit_insn_before (adjust, insn); + XEXP (operands[0], 0) = addr = gen_rtx (PRE_DEC, SImode, addr); + } + addr = XEXP (addr, 0); + if (! adjust) + REG_NOTES (insn) = gen_rtx (EXPR_LIST, REG_INC, addr, NULL_RTX); + insn = emit_insn (gen_movsf_ie (operands[0], + gen_rtx (REG, SFmode, + regno + !! TARGET_LITTLE_ENDIAN), + operands[2])); + REG_NOTES (insn) = gen_rtx (EXPR_LIST, REG_INC, addr, NULL_RTX); + DONE; +}") + +;; The '&' for operand 2 is not really true, but push_secondary_reload +;; insists on it. +;; Operand 1 must accept FPUL_REGS in case fpul is reloaded to memory, +;; to avoid a bogus tertiary reload. +;; We need a tertiary reload when a floating point register is reloaded +;; to memory, so the predicate for operand 0 must accept this, while the +;; constraint of operand 1 must reject the secondary reload register. +;; Thus, the secondary reload register for this case has to be GENERAL_REGS, +;; too. +;; By having the predicate for operand 0 reject any register, we make +;; sure that the ordinary moves that just need an intermediate register +;; won't get a bogus tertiary reload. 
+;; We use tertiary_reload_operand instead of memory_operand here because +;; memory_operand rejects operands that are not directly addressible, e.g.: +;; (mem:SF (plus:SI (reg:SI 14 r14) +;; (const_int 132))) + +(define_expand "reload_outsf" + [(parallel [(set (match_operand:SF 2 "register_operand" "=&r") + (match_operand:SF 1 "register_operand" "y")) + (clobber (scratch:SI))]) + (parallel [(set (match_operand:SF 0 "tertiary_reload_operand" "=m") + (match_dup 2)) + (clobber (scratch:SI))])] + "" + "") + ;; If the output is a register and the input is memory or a register, we have ;; to be careful and see which word needs to be loaded first. @@ -2129,14 +2784,26 @@ " { if (prepare_move_operands (operands, DFmode)) DONE; + if (TARGET_SH4) + { + emit_df_insn (gen_movdf_i4 (operands[0], operands[1], get_fpscr_rtx ())); + /* We need something to tag possible REG_LIBCALL notes on to. */ + if (TARGET_FPU_SINGLE && rtx_equal_function_value_matters + && GET_CODE (operands[0]) == REG) + emit_insn (gen_mov_nop (operands[0])); + DONE; + } }") (define_insn "movsf_i" [(set (match_operand:SF 0 "general_movdst_operand" "=r,r,r,r,m,l,r") - (match_operand:SF 1 "general_movsrc_operand" "r,I,FQ,m,r,r,l"))] + (match_operand:SF 1 "general_movsrc_operand" "r,I,FQ,mr,r,r,l"))] " - ! TARGET_SH3E + (! TARGET_SH3E + /* ??? 
We provide some insn so that direct_{load,store}[SFmode] get set */ + || GET_CODE (operands[0]) == REG && REGNO (operands[0]) == 3 + || GET_CODE (operands[1]) == REG && REGNO (operands[1]) == 3) && (arith_reg_operand (operands[0], SFmode) || arith_reg_operand (operands[1], SFmode))" "@ @@ -2156,8 +2823,9 @@ [(set (match_operand:SF 0 "general_movdst_operand" "=f,r,f,f,fy,f,m,r,r,m,f,y,y,rf,r,y,y") (match_operand:SF 1 "general_movsrc_operand" - "f,r,G,H,FQ,m,f,FQ,m,r,y,f,>,fr,y,r,y")) - (clobber (match_scratch:SI 2 "=X,X,X,X,&z,X,X,X,X,X,X,X,X,y,X,X,X"))] + "f,r,G,H,FQ,mf,f,FQ,mr,r,y,f,>,fr,y,r,y")) + (use (match_operand:PSI 2 "fpscr_operand" "c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c")) + (clobber (match_scratch:SI 3 "=X,X,X,X,&z,X,X,X,X,X,X,X,X,y,X,X,X"))] "TARGET_SH3E && (arith_reg_operand (operands[0], SFmode) @@ -2181,16 +2849,19 @@ lds %1,%0 ! move optimized away" [(set_attr "type" "fmove,move,fmove,fmove,pcload,load,store,pcload,load,store,fmove,fmove,load,*,gp_fpul,gp_fpul,nil") - (set_attr "length" "*,*,*,*,4,*,*,*,*,*,2,2,2,*,2,2,0")]) + (set_attr "length" "*,*,*,*,4,*,*,*,*,*,2,2,2,4,2,2,0")]) (define_split [(set (match_operand:SF 0 "register_operand" "") (match_operand:SF 1 "register_operand" "")) + (use (match_operand:PSI 2 "fpscr_operand" "c")) (clobber (reg:SI 22))] "" [(parallel [(set (reg:SF 22) (match_dup 1)) + (use (match_dup 2)) (clobber (scratch:SI))]) (parallel [(set (match_dup 0) (reg:SF 22)) + (use (match_dup 2)) (clobber (scratch:SI))])] "") @@ -2204,17 +2875,63 @@ DONE; if (TARGET_SH3E) { - emit_insn (gen_movsf_ie (operands[0], operands[1])); + emit_sf_insn (gen_movsf_ie (operands[0], operands[1], get_fpscr_rtx ())); + /* We need something to tag possible REG_LIBCALL notes on to. */ + if (! 
TARGET_FPU_SINGLE && rtx_equal_function_value_matters + && GET_CODE (operands[0]) == REG) + emit_insn (gen_mov_nop (operands[0])); DONE; } }") +(define_insn "mov_nop" + [(set (match_operand 0 "register_operand" "") (match_dup 0))] + "TARGET_SH3E" + "" + [(set_attr "length" "0") + (set_attr "type" "nil")]) + (define_expand "reload_insf" [(parallel [(set (match_operand:SF 0 "register_operand" "=f") (match_operand:SF 1 "immediate_operand" "FQ")) + (use (reg:PSI 48)) (clobber (match_operand:SI 2 "register_operand" "=&z"))])] "" "") + +(define_expand "reload_insi" + [(parallel [(set (match_operand:SF 0 "register_operand" "=y") + (match_operand:SF 1 "immediate_operand" "FQ")) + (clobber (match_operand:SI 2 "register_operand" "=&z"))])] + "" + "") + +(define_insn "*movsi_y" + [(set (match_operand:SI 0 "register_operand" "=y,y") + (match_operand:SI 1 "immediate_operand" "Qi,I")) + (clobber (match_scratch:SI 3 "=&z,r"))] + "TARGET_SH3E + && (reload_in_progress || reload_completed)" + "#" + [(set_attr "length" "4") + (set_attr "type" "pcload,move")]) + +(define_split + [(set (match_operand:SI 0 "register_operand" "y") + (match_operand:SI 1 "immediate_operand" "I")) + (clobber (match_operand:SI 2 "register_operand" "r"))] + "" + [(set (match_dup 2) (match_dup 1)) + (set (match_dup 0) (match_dup 2))] + "") + +(define_split + [(set (match_operand:SI 0 "register_operand" "y") + (match_operand:SI 1 "memory_operand" ">")) + (clobber (reg:SI 0))] + "" + [(set (match_dup 0) (match_dup 1))] + "") ;; ------------------------------------------------------------------------ ;; Define the real conditional branch instructions. 
@@ -2289,7 +3006,7 @@ "" " { - if (GET_MODE (sh_compare_op0) == SFmode) + if (GET_MODE_CLASS (GET_MODE (sh_compare_op0)) == MODE_FLOAT) { rtx tmp = sh_compare_op0; sh_compare_op0 = sh_compare_op1; @@ -2396,6 +3113,7 @@ (define_insn "calli" [(call (mem:SI (match_operand:SI 0 "arith_reg_operand" "r")) (match_operand 1 "" "")) + (use (reg:SI 48)) (clobber (reg:SI 17))] "" "jsr @%0%#" @@ -2406,6 +3124,7 @@ [(set (match_operand 0 "" "=rf") (call (mem:SI (match_operand:SI 1 "arith_reg_operand" "r")) (match_operand 2 "" ""))) + (use (reg:SI 48)) (clobber (reg:SI 17))] "" "jsr @%1%#" @@ -2415,6 +3134,7 @@ (define_expand "call" [(parallel [(call (mem:SI (match_operand 0 "arith_reg_operand" "")) (match_operand 1 "" "")) + (use (reg:SI 48)) (clobber (reg:SI 17))])] "" "operands[0] = force_reg (SImode, XEXP (operands[0], 0));") @@ -2423,6 +3143,7 @@ [(parallel [(set (match_operand 0 "arith_reg_operand" "") (call (mem:SI (match_operand 1 "arith_reg_operand" "")) (match_operand 2 "" ""))) + (use (reg:SI 48)) (clobber (reg:SI 17))])] "" "operands[1] = force_reg (SImode, XEXP (operands[1], 0));") @@ -2656,9 +3377,16 @@ }" [(set_attr "length" "4")]) +;; ??? This is not the proper place to invoke another compiler pass; +;; Alas, there is no proper place to put it. +;; ??? This is also an odd place for the call to emit_fpscr_use. It +;; would be all right if it were for an define_expand for return, but +;; that doesn't mix with emitting a prologue. 
(define_insn "return" [(return)] - "reload_completed" + "emit_fpscr_use (), + remove_dead_before_cse (), + reload_completed" "%@ %#" [(set_attr "type" "return") (set_attr "needs_delay_slot" "yes")]) @@ -2726,19 +3454,15 @@ "" " { - if (GET_MODE (sh_compare_op0) == SFmode) + if (GET_MODE_CLASS (GET_MODE (sh_compare_op0)) == MODE_FLOAT) { if (TARGET_IEEE) { rtx t_reg = gen_rtx (REG, SImode, T_REG); rtx lab = gen_label_rtx (); - emit_insn (gen_rtx (SET, VOIDmode, t_reg, - gen_rtx (EQ, SImode, sh_compare_op0, - sh_compare_op1))); + prepare_scc_operands (EQ); emit_jump_insn (gen_branch_true (lab)); - emit_insn (gen_rtx (SET, VOIDmode, t_reg, - gen_rtx (GT, SImode, sh_compare_op0, - sh_compare_op1))); + prepare_scc_operands (GT); emit_label (lab); emit_insn (gen_movt (operands[0])); } @@ -2963,7 +3687,7 @@ (use (match_operand:SI 0 "arith_reg_operand" "r")) (clobber (reg:SI 17)) (clobber (reg:SI 0))])] - "" + "! TARGET_HARD_SH4" "jsr @%0%#" [(set_attr "type" "sfunc") (set_attr "needs_delay_slot" "yes")]) @@ -2978,7 +3702,38 @@ (clobber (reg:SI 5)) (clobber (reg:SI 6)) (clobber (reg:SI 0))])] - "" + "! 
TARGET_HARD_SH4" + "jsr @%0%#" + [(set_attr "type" "sfunc") + (set_attr "needs_delay_slot" "yes")]) + +(define_insn "block_move_real_i4" + [(parallel [(set (mem:BLK (reg:SI 4)) + (mem:BLK (reg:SI 5))) + (use (match_operand:SI 0 "arith_reg_operand" "r")) + (clobber (reg:SI 17)) + (clobber (reg:SI 0)) + (clobber (reg:SI 1)) + (clobber (reg:SI 2))])] + "TARGET_HARD_SH4" + "jsr @%0%#" + [(set_attr "type" "sfunc") + (set_attr "needs_delay_slot" "yes")]) + +(define_insn "block_lump_real_i4" + [(parallel [(set (mem:BLK (reg:SI 4)) + (mem:BLK (reg:SI 5))) + (use (match_operand:SI 0 "arith_reg_operand" "r")) + (use (reg:SI 6)) + (clobber (reg:SI 17)) + (clobber (reg:SI 4)) + (clobber (reg:SI 5)) + (clobber (reg:SI 6)) + (clobber (reg:SI 0)) + (clobber (reg:SI 1)) + (clobber (reg:SI 2)) + (clobber (reg:SI 3))])] + "TARGET_HARD_SH4" "jsr @%0%#" [(set_attr "type" "sfunc") (set_attr "needs_delay_slot" "yes")]) @@ -2989,43 +3744,188 @@ ;; ??? All patterns should have a type attribute. -(define_insn "addsf3" +(define_expand "fpu_switch0" + [(set (match_operand:SI 0 "" "") (symbol_ref "__fpscr_values")) + (set (match_dup 2) (match_dup 1))] + "" + " +{ + operands[1] = gen_rtx (MEM, PSImode, operands[0]); + RTX_UNCHANGING_P (operands[1]) = 1; + operands[2] = get_fpscr_rtx (); +}") + +(define_expand "fpu_switch1" + [(set (match_operand:SI 0 "" "") (symbol_ref "__fpscr_values")) + (set (match_dup 1) (plus:SI (match_dup 0) (const_int 4))) + (set (match_dup 3) (match_dup 2))] + "" + " +{ + operands[1] = gen_reg_rtx (SImode); + operands[2] = gen_rtx (MEM, PSImode, operands[1]); + RTX_UNCHANGING_P (operands[2]) = 1; + operands[3] = get_fpscr_rtx (); +}") + +(define_expand "movpsi" + [(set (match_operand:PSI 0 "register_operand" "") + (match_operand:PSI 1 "general_movsrc_operand" ""))] + "" + "") + +;; The c / m alternative is a fake to guide reload to load directly into +;; fpscr, since reload doesn't know how to use post-increment. 
+;; GO_IF_LEGITIMATE_ADDRESS guards about bogus addresses before reload, +;; SECONDARY_INPUT_RELOAD_CLASS does this during reload, and the insn's +;; predicate after reload. +;; The gp_fpul type for r/!c might look a bit odd, but it actually schedules +;; like a gpr <-> fpul move. +(define_insn "fpu_switch" + [(set (match_operand:PSI 0 "register_operand" "c,c,r,c,c,r,m,r") + (match_operand:PSI 1 "general_movsrc_operand" "c,>,m,m,r,r,r,!c"))] + "! reload_completed + || true_regnum (operands[0]) != FPSCR_REG || GET_CODE (operands[1]) != MEM + || GET_CODE (XEXP (operands[1], 0)) != PLUS" + "@ + ! precision stays the same + lds.l %1,fpscr + mov.l %1,%0 + # + lds %1,fpscr + mov %1,%0 + mov.l %1,%0 + sts fpscr,%0" + [(set_attr "length" "0,2,2,4,2,2,2,2") + (set_attr "type" "dfp_conv,dfp_conv,load,dfp_conv,dfp_conv,move,store,gp_fpul")]) + +(define_split + [(set (reg:PSI 48) (mem:PSI (match_operand:SI 0 "register_operand" "r")))] + "find_regno_note (insn, REG_DEAD, true_regnum (operands[0]))" + [(set (match_dup 0) (match_dup 0))] + " +{ + rtx insn = emit_insn (gen_fpu_switch (get_fpscr_rtx (), + gen_rtx (MEM, PSImode, + gen_rtx (POST_INC, Pmode, + operands[0])))); + REG_NOTES (insn) = gen_rtx (EXPR_LIST, REG_INC, operands[0], NULL_RTX); +}") + +(define_split + [(set (reg:PSI 48) (mem:PSI (match_operand:SI 0 "register_operand" "r")))] + "" + [(set (match_dup 0) (plus:SI (match_dup 0) (const_int -4)))] + " +{ + rtx insn = emit_insn (gen_fpu_switch (get_fpscr_rtx (), + gen_rtx (MEM, PSImode, + gen_rtx (POST_INC, Pmode, + operands[0])))); + REG_NOTES (insn) = gen_rtx (EXPR_LIST, REG_INC, operands[0], NULL_RTX); +}") + +;; ??? This uses the fp unit, but has no type indicating that. +;; If we did that, this would either give a bogus latency or introduce +;; a bogus FIFO constraint. +;; Since this insn is currently only used for prologues/epilogues, +;; it is probably best to claim no function unit, which matches the +;; current setting. 
+(define_insn "toggle_sz" + [(set (reg:PSI 48) (xor:PSI (reg:PSI 48) (const_int 1048576)))] + "TARGET_SH4" + "fschg") + +(define_expand "addsf3" + [(match_operand:SF 0 "arith_reg_operand" "") + (match_operand:SF 1 "arith_reg_operand" "") + (match_operand:SF 2 "arith_reg_operand" "")] + "TARGET_SH3E" + "{ expand_sf_binop (&gen_addsf3_i, operands); DONE; }") + +(define_insn "addsf3_i" [(set (match_operand:SF 0 "arith_reg_operand" "=f") (plus:SF (match_operand:SF 1 "arith_reg_operand" "%0") - (match_operand:SF 2 "arith_reg_operand" "f")))] + (match_operand:SF 2 "arith_reg_operand" "f"))) + (use (match_operand:PSI 3 "fpscr_operand" "c"))] "TARGET_SH3E" "fadd %2,%0" [(set_attr "type" "fp")]) -(define_insn "subsf3" +(define_expand "subsf3" + [(match_operand:SF 0 "arith_reg_operand" "") + (match_operand:SF 1 "arith_reg_operand" "") + (match_operand:SF 2 "arith_reg_operand" "")] + "TARGET_SH3E" + "{ expand_sf_binop (&gen_subsf3_i, operands); DONE; }") + +(define_insn "subsf3_i" [(set (match_operand:SF 0 "arith_reg_operand" "=f") (minus:SF (match_operand:SF 1 "arith_reg_operand" "0") - (match_operand:SF 2 "arith_reg_operand" "f")))] + (match_operand:SF 2 "arith_reg_operand" "f"))) + (use (match_operand:PSI 3 "fpscr_operand" "c"))] "TARGET_SH3E" "fsub %2,%0" [(set_attr "type" "fp")]) -(define_insn "mulsf3" +;; Unfortunately, the combiner is unable to cope with the USE of the FPSCR +;; register in feeding fp instructions. Thus, we cannot generate fmac for +;; mixed-precision SH4 targets. To allow it to be still generated for the +;; SH3E, we use a separate insn for SH3E mulsf3. 
+ +(define_expand "mulsf3" + [(match_operand:SF 0 "arith_reg_operand" "") + (match_operand:SF 1 "arith_reg_operand" "") + (match_operand:SF 2 "arith_reg_operand" "")] + "TARGET_SH3E" + " +{ + if (TARGET_SH4) + expand_sf_binop (&gen_mulsf3_i4, operands); + else + emit_insn (gen_mulsf3_ie (operands[0], operands[1], operands[2])); + DONE; +}") + +(define_insn "mulsf3_i4" [(set (match_operand:SF 0 "arith_reg_operand" "=f") (mult:SF (match_operand:SF 1 "arith_reg_operand" "%0") - (match_operand:SF 2 "arith_reg_operand" "f")))] + (match_operand:SF 2 "arith_reg_operand" "f"))) + (use (match_operand:PSI 3 "fpscr_operand" "c"))] "TARGET_SH3E" "fmul %2,%0" [(set_attr "type" "fp")]) +(define_insn "mulsf3_ie" + [(set (match_operand:SF 0 "arith_reg_operand" "=f") + (mult:SF (match_operand:SF 1 "arith_reg_operand" "%0") + (match_operand:SF 2 "arith_reg_operand" "f")))] + "TARGET_SH3E && ! TARGET_SH4" + "fmul %2,%0" + [(set_attr "type" "fp")]) + (define_insn "*macsf3" [(set (match_operand:SF 0 "arith_reg_operand" "=f") (plus:SF (mult:SF (match_operand:SF 1 "arith_reg_operand" "%w") (match_operand:SF 2 "arith_reg_operand" "f")) - (match_operand:SF 3 "arith_reg_operand" "0")))] - "TARGET_SH3E" + (match_operand:SF 3 "arith_reg_operand" "0"))) + (use (match_operand:PSI 4 "fpscr_operand" "c"))] + "TARGET_SH3E && ! 
TARGET_SH4" "fmac fr0,%2,%0" [(set_attr "type" "fp")]) -(define_insn "divsf3" +(define_expand "divsf3" + [(match_operand:SF 0 "arith_reg_operand" "") + (match_operand:SF 1 "arith_reg_operand" "") + (match_operand:SF 2 "arith_reg_operand" "")] + "TARGET_SH3E" + "{ expand_sf_binop (&gen_divsf3_i, operands); DONE; }") + +(define_insn "divsf3_i" [(set (match_operand:SF 0 "arith_reg_operand" "=f") (div:SF (match_operand:SF 1 "arith_reg_operand" "0") - (match_operand:SF 2 "arith_reg_operand" "f")))] + (match_operand:SF 2 "arith_reg_operand" "f"))) + (use (match_operand:PSI 3 "fpscr_operand" "c"))] "TARGET_SH3E" "fdiv %2,%0" [(set_attr "type" "fdiv")]) @@ -3033,15 +3933,34 @@ (define_expand "floatsisf2" [(set (reg:SI 22) (match_operand:SI 1 "arith_reg_operand" "")) - (set (match_operand:SF 0 "arith_reg_operand" "") - (float:SF (reg:SI 22)))] + (parallel [(set (match_operand:SF 0 "arith_reg_operand" "") + (float:SF (reg:SI 22))) + (use (match_dup 2))])] "TARGET_SH3E" - "") + " +{ + if (TARGET_SH4) + { + emit_insn (gen_rtx (SET, VOIDmode, gen_rtx (REG, SImode, 22), + operands[1])); + emit_sf_insn (gen_floatsisf2_i4 (operands[0], get_fpscr_rtx ())); + DONE; + } + operands[2] = get_fpscr_rtx (); +}") + +(define_insn "floatsisf2_i4" + [(set (match_operand:SF 0 "arith_reg_operand" "=f") + (float:SF (reg:SI 22))) + (use (match_operand:PSI 1 "fpscr_operand" "c"))] + "TARGET_SH3E" + "float fpul,%0" + [(set_attr "type" "fp")]) (define_insn "*floatsisf2_ie" [(set (match_operand:SF 0 "arith_reg_operand" "=f") (float:SF (reg:SI 22)))] - "TARGET_SH3E" + "TARGET_SH3E && ! 
TARGET_SH4" "float fpul,%0" [(set_attr "type" "fp")]) @@ -3051,26 +3970,62 @@ (set (match_operand:SI 0 "arith_reg_operand" "=r") (reg:SI 22))] "TARGET_SH3E" - "") + " +{ + if (TARGET_SH4) + { + emit_sf_insn (gen_fix_truncsfsi2_i4 (operands[1], get_fpscr_rtx ())); + emit_insn (gen_rtx (SET, VOIDmode, operands[0], + gen_rtx (REG, SImode, 22))); + DONE; + } +}") + +(define_insn "fix_truncsfsi2_i4" + [(set (reg:SI 22) + (fix:SI (match_operand:SF 0 "arith_reg_operand" "f"))) + (use (match_operand:PSI 1 "fpscr_operand" "c"))] + "TARGET_SH4" + "ftrc %0,fpul" + [(set_attr "type" "fp")]) + +(define_insn "fix_truncsfsi2_i4_2" + [(set (match_operand:SI 0 "arith_reg_operand" "=r") + (fix:SI (match_operand:SF 1 "arith_reg_operand" "f"))) + (use (reg:SI 48)) + (clobber (reg:SI 22))] + "TARGET_SH4" + "#" + [(set_attr "length" "4")]) + +(define_split + [(set (match_operand:SI 0 "arith_reg_operand" "=r") + (fix:SI (match_operand:SF 1 "arith_reg_operand" "f"))) + (use (match_operand:PSI 2 "fpscr_operand" "c")) + (clobber (reg:SI 22))] + "TARGET_SH4" + [(parallel [(set (reg:SI 22) (fix:SI (match_dup 1))) + (use (match_dup 2))]) + (set (match_dup 0) (reg:SI 22))]) (define_insn "*fixsfsi" [(set (reg:SI 22) (fix:SI (match_operand:SF 0 "arith_reg_operand" "f")))] - "TARGET_SH3E" + "TARGET_SH3E && ! TARGET_SH4" "ftrc %0,fpul" [(set_attr "type" "fp")]) (define_insn "cmpgtsf_t" [(set (reg:SI 18) (gt:SI (match_operand:SF 0 "arith_reg_operand" "f") (match_operand:SF 1 "arith_reg_operand" "f")))] - "TARGET_SH3E" + "TARGET_SH3E && ! TARGET_SH4" "fcmp/gt %1,%0" [(set_attr "type" "fp")]) (define_insn "cmpeqsf_t" [(set (reg:SI 18) (eq:SI (match_operand:SF 0 "arith_reg_operand" "f") (match_operand:SF 1 "arith_reg_operand" "f")))] - "TARGET_SH3E" + "TARGET_SH3E && ! 
TARGET_SH4" "fcmp/eq %1,%0" [(set_attr "type" "fp")]) @@ -3078,11 +4033,36 @@ [(set (reg:SI 18) (ior:SI (reg:SI 18) (eq:SI (match_operand:SF 0 "arith_reg_operand" "f") (match_operand:SF 1 "arith_reg_operand" "f"))))] - "TARGET_SH3E && TARGET_IEEE" + "TARGET_SH3E && TARGET_IEEE && ! TARGET_SH4" "* return output_ieee_ccmpeq (insn, operands);" [(set_attr "length" "4")]) +(define_insn "cmpgtsf_t_i4" + [(set (reg:SI 18) (gt:SI (match_operand:SF 0 "arith_reg_operand" "f") + (match_operand:SF 1 "arith_reg_operand" "f"))) + (use (match_operand:PSI 2 "fpscr_operand" "c"))] + "TARGET_SH4" + "fcmp/gt %1,%0" + [(set_attr "type" "fp")]) + +(define_insn "cmpeqsf_t_i4" + [(set (reg:SI 18) (eq:SI (match_operand:SF 0 "arith_reg_operand" "f") + (match_operand:SF 1 "arith_reg_operand" "f"))) + (use (match_operand:PSI 2 "fpscr_operand" "c"))] + "TARGET_SH4" + "fcmp/eq %1,%0" + [(set_attr "type" "fp")]) + +(define_insn "*ieee_ccmpeqsf_t_4" + [(set (reg:SI 18) (ior:SI (reg:SI 18) + (eq:SI (match_operand:SF 0 "arith_reg_operand" "f") + (match_operand:SF 1 "arith_reg_operand" "f")))) + (use (match_operand:PSI 2 "fpscr_operand" "c"))] + "TARGET_IEEE && TARGET_SH4" + "* return output_ieee_ccmpeq (insn, operands);" + [(set_attr "length" "4")]) + (define_expand "cmpsf" [(set (reg:SI 18) (compare (match_operand:SF 0 "arith_operand" "") (match_operand:SF 1 "arith_operand" "")))] @@ -3094,25 +4074,285 @@ DONE; }") -(define_insn "negsf2" +(define_expand "negsf2" + [(match_operand:SF 0 "arith_reg_operand" "") + (match_operand:SF 1 "arith_reg_operand" "")] + "TARGET_SH3E" + "{ expand_sf_unop (&gen_negsf2_i, operands); DONE; }") + +(define_insn "negsf2_i" [(set (match_operand:SF 0 "arith_reg_operand" "=f") - (neg:SF (match_operand:SF 1 "arith_reg_operand" "0")))] + (neg:SF (match_operand:SF 1 "arith_reg_operand" "0"))) + (use (match_operand:PSI 2 "fpscr_operand" "c"))] "TARGET_SH3E" "fneg %0" - [(set_attr "type" "fp")]) + [(set_attr "type" "fmove")]) -(define_insn "sqrtsf2" +(define_expand "sqrtsf2" 
+ [(match_operand:SF 0 "arith_reg_operand" "") + (match_operand:SF 1 "arith_reg_operand" "")] + "TARGET_SH3E" + "{ expand_sf_unop (&gen_sqrtsf2_i, operands); DONE; }") + +(define_insn "sqrtsf2_i" [(set (match_operand:SF 0 "arith_reg_operand" "=f") - (sqrt:SF (match_operand:SF 1 "arith_reg_operand" "0")))] + (sqrt:SF (match_operand:SF 1 "arith_reg_operand" "0"))) + (use (match_operand:PSI 2 "fpscr_operand" "c"))] "TARGET_SH3E" "fsqrt %0" [(set_attr "type" "fdiv")]) -(define_insn "abssf2" +(define_expand "abssf2" + [(match_operand:SF 0 "arith_reg_operand" "") + (match_operand:SF 1 "arith_reg_operand" "")] + "TARGET_SH3E" + "{ expand_sf_unop (&gen_abssf2_i, operands); DONE; }") + +(define_insn "abssf2_i" [(set (match_operand:SF 0 "arith_reg_operand" "=f") - (abs:SF (match_operand:SF 1 "arith_reg_operand" "0")))] + (abs:SF (match_operand:SF 1 "arith_reg_operand" "0"))) + (use (match_operand:PSI 2 "fpscr_operand" "c"))] "TARGET_SH3E" "fabs %0" + [(set_attr "type" "fmove")]) + +(define_expand "adddf3" + [(match_operand:DF 0 "arith_reg_operand" "") + (match_operand:DF 1 "arith_reg_operand" "") + (match_operand:DF 2 "arith_reg_operand" "")] + "TARGET_SH4" + "{ expand_df_binop (&gen_adddf3_i, operands); DONE; }") + +(define_insn "adddf3_i" + [(set (match_operand:DF 0 "arith_reg_operand" "=f") + (plus:DF (match_operand:DF 1 "arith_reg_operand" "%0") + (match_operand:DF 2 "arith_reg_operand" "f"))) + (use (match_operand:PSI 3 "fpscr_operand" "c"))] + "TARGET_SH4" + "fadd %2,%0" + [(set_attr "type" "dfp_arith")]) + +(define_expand "subdf3" + [(match_operand:DF 0 "arith_reg_operand" "") + (match_operand:DF 1 "arith_reg_operand" "") + (match_operand:DF 2 "arith_reg_operand" "")] + "TARGET_SH4" + "{ expand_df_binop (&gen_subdf3_i, operands); DONE; }") + +(define_insn "subdf3_i" + [(set (match_operand:DF 0 "arith_reg_operand" "=f") + (minus:DF (match_operand:DF 1 "arith_reg_operand" "0") + (match_operand:DF 2 "arith_reg_operand" "f"))) + (use (match_operand:PSI 3 "fpscr_operand" 
"c"))] + "TARGET_SH4" + "fsub %2,%0" + [(set_attr "type" "dfp_arith")]) + +(define_expand "muldf3" + [(match_operand:DF 0 "arith_reg_operand" "") + (match_operand:DF 1 "arith_reg_operand" "") + (match_operand:DF 2 "arith_reg_operand" "")] + "TARGET_SH4" + "{ expand_df_binop (&gen_muldf3_i, operands); DONE; }") + +(define_insn "muldf3_i" + [(set (match_operand:DF 0 "arith_reg_operand" "=f") + (mult:DF (match_operand:DF 1 "arith_reg_operand" "%0") + (match_operand:DF 2 "arith_reg_operand" "f"))) + (use (match_operand:PSI 3 "fpscr_operand" "c"))] + "TARGET_SH4" + "fmul %2,%0" + [(set_attr "type" "dfp_arith")]) + +(define_expand "divdf3" + [(match_operand:DF 0 "arith_reg_operand" "") + (match_operand:DF 1 "arith_reg_operand" "") + (match_operand:DF 2 "arith_reg_operand" "")] + "TARGET_SH4" + "{ expand_df_binop (&gen_divdf3_i, operands); DONE; }") + +(define_insn "divdf3_i" + [(set (match_operand:DF 0 "arith_reg_operand" "=f") + (div:DF (match_operand:DF 1 "arith_reg_operand" "0") + (match_operand:DF 2 "arith_reg_operand" "f"))) + (use (match_operand:PSI 3 "fpscr_operand" "c"))] + "TARGET_SH4" + "fdiv %2,%0" + [(set_attr "type" "dfdiv")]) + +(define_expand "floatsidf2" + [(match_operand:DF 0 "arith_reg_operand" "") + (match_operand:SI 1 "arith_reg_operand" "")] + "TARGET_SH4" + " +{ + emit_insn (gen_rtx (SET, VOIDmode, gen_rtx (REG, SImode, 22), operands[1])); + emit_df_insn (gen_floatsidf2_i (operands[0], get_fpscr_rtx ())); + DONE; +}") + +(define_insn "floatsidf2_i" + [(set (match_operand:DF 0 "arith_reg_operand" "=f") + (float:DF (reg:SI 22))) + (use (match_operand:PSI 1 "fpscr_operand" "c"))] + "TARGET_SH4" + "float fpul,%0" + [(set_attr "type" "dfp_conv")]) + +(define_expand "fix_truncdfsi2" + [(match_operand:SI 0 "arith_reg_operand" "=r") + (match_operand:DF 1 "arith_reg_operand" "f")] + "TARGET_SH4" + " +{ + emit_df_insn (gen_fix_truncdfsi2_i (operands[1], get_fpscr_rtx ())); + emit_insn (gen_rtx (SET, VOIDmode, operands[0], gen_rtx (REG, SImode, 22))); + DONE; 
+}") + +(define_insn "fix_truncdfsi2_i" + [(set (reg:SI 22) + (fix:SI (match_operand:DF 0 "arith_reg_operand" "f"))) + (use (match_operand:PSI 1 "fpscr_operand" "c"))] + "TARGET_SH4" + "ftrc %0,fpul" + [(set_attr "type" "dfp_conv")]) + +(define_insn "fix_truncdfsi2_i4" + [(set (match_operand:SI 0 "arith_reg_operand" "=r") + (fix:SI (match_operand:DF 1 "arith_reg_operand" "f"))) + (use (match_operand:PSI 2 "fpscr_operand" "c")) + (clobber (reg:SI 22))] + "TARGET_SH4" + "#" + [(set_attr "length" "4")]) + +(define_split + [(set (match_operand:SI 0 "arith_reg_operand" "=r") + (fix:SI (match_operand:DF 1 "arith_reg_operand" "f"))) + (use (match_operand:PSI 2 "fpscr_operand" "c")) + (clobber (reg:SI 22))] + "TARGET_SH4" + [(parallel [(set (reg:SI 22) (fix:SI (match_dup 1))) + (use (match_dup 2))]) + (set (match_dup 0) (reg:SI 22))]) + +(define_insn "cmpgtdf_t" + [(set (reg:SI 18) (gt:SI (match_operand:DF 0 "arith_reg_operand" "f") + (match_operand:DF 1 "arith_reg_operand" "f"))) + (use (match_operand:PSI 2 "fpscr_operand" "c"))] + "TARGET_SH4" + "fcmp/gt %1,%0" + [(set_attr "type" "dfp_cmp")]) + +(define_insn "cmpeqdf_t" + [(set (reg:SI 18) (eq:SI (match_operand:DF 0 "arith_reg_operand" "f") + (match_operand:DF 1 "arith_reg_operand" "f"))) + (use (match_operand:PSI 2 "fpscr_operand" "c"))] + "TARGET_SH4" + "fcmp/eq %1,%0" + [(set_attr "type" "dfp_cmp")]) + +(define_insn "*ieee_ccmpeqdf_t" + [(set (reg:SI 18) (ior:SI (reg:SI 18) + (eq:SI (match_operand:DF 0 "arith_reg_operand" "f") + (match_operand:DF 1 "arith_reg_operand" "f")))) + (use (match_operand:PSI 2 "fpscr_operand" "c"))] + "TARGET_IEEE && TARGET_SH4" + "* return output_ieee_ccmpeq (insn, operands);" + [(set_attr "length" "4")]) + +(define_expand "cmpdf" + [(set (reg:SI 18) (compare (match_operand:DF 0 "arith_operand" "") + (match_operand:DF 1 "arith_operand" "")))] + "TARGET_SH4" + " +{ + sh_compare_op0 = operands[0]; + sh_compare_op1 = operands[1]; + DONE; +}") + +(define_expand "negdf2" + [(match_operand:DF 0 
"arith_reg_operand" "") + (match_operand:DF 1 "arith_reg_operand" "")] + "TARGET_SH4" + "{ expand_df_unop (&gen_negdf2_i, operands); DONE; }") + +(define_insn "negdf2_i" + [(set (match_operand:DF 0 "arith_reg_operand" "=f") + (neg:DF (match_operand:DF 1 "arith_reg_operand" "0"))) + (use (match_operand:PSI 2 "fpscr_operand" "c"))] + "TARGET_SH4" + "fneg %0" + [(set_attr "type" "fmove")]) + +(define_expand "sqrtdf2" + [(match_operand:DF 0 "arith_reg_operand" "") + (match_operand:DF 1 "arith_reg_operand" "")] + "TARGET_SH4" + "{ expand_df_unop (&gen_sqrtdf2_i, operands); DONE; }") + +(define_insn "sqrtdf2_i" + [(set (match_operand:DF 0 "arith_reg_operand" "=f") + (sqrt:DF (match_operand:DF 1 "arith_reg_operand" "0"))) + (use (match_operand:PSI 2 "fpscr_operand" "c"))] + "TARGET_SH4" + "fsqrt %0" + [(set_attr "type" "dfdiv")]) + +(define_expand "absdf2" + [(match_operand:DF 0 "arith_reg_operand" "") + (match_operand:DF 1 "arith_reg_operand" "")] + "TARGET_SH4" + "{ expand_df_unop (&gen_absdf2_i, operands); DONE; }") + +(define_insn "absdf2_i" + [(set (match_operand:DF 0 "arith_reg_operand" "=f") + (abs:DF (match_operand:DF 1 "arith_reg_operand" "0"))) + (use (match_operand:PSI 2 "fpscr_operand" "c"))] + "TARGET_SH4" + "fabs %0" + [(set_attr "type" "fmove")]) + +(define_expand "extendsfdf2" + [(match_operand:DF 0 "arith_reg_operand" "") + (match_operand:SF 1 "arith_reg_operand" "")] + "TARGET_SH4" + " +{ + emit_sf_insn (gen_movsf_ie (gen_rtx (REG, SFmode, 22), operands[1], + get_fpscr_rtx ())); + emit_df_insn (gen_extendsfdf2_i4 (operands[0], get_fpscr_rtx ())); + DONE; +}") + +(define_insn "extendsfdf2_i4" + [(set (match_operand:DF 0 "arith_reg_operand" "=f") + (float_extend:DF (reg:SF 22))) + (use (match_operand:PSI 1 "fpscr_operand" "c"))] + "TARGET_SH4" + "fcnvsd fpul,%0" + [(set_attr "type" "fp")]) + +(define_expand "truncdfsf2" + [(match_operand:SF 0 "arith_reg_operand" "") + (match_operand:DF 1 "arith_reg_operand" "")] + "TARGET_SH4" + " +{ + emit_df_insn 
(gen_truncdfsf2_i4 (operands[1], get_fpscr_rtx ())); + emit_sf_insn (gen_movsf_ie (operands[0], gen_rtx (REG, SFmode, 22), + get_fpscr_rtx ())); + DONE; +}") + +(define_insn "truncdfsf2_i4" + [(set (reg:SF 22) + (float_truncate:SF (match_operand:DF 0 "arith_reg_operand" "f"))) + (use (match_operand:PSI 1 "fpscr_operand" "c"))] + "TARGET_SH4" + "fcnvds %0,fpul" [(set_attr "type" "fp")]) ;; Bit field extract patterns. These give better code for packed bitfields, diff --git a/gcc/config/sh/t-sh b/gcc/config/sh/t-sh index c6af7c1839b..bfbf45ea6a0 100644 --- a/gcc/config/sh/t-sh +++ b/gcc/config/sh/t-sh @@ -1,7 +1,7 @@ CROSS_LIBGCC1 = libgcc1-asm.a LIB1ASMSRC = sh/lib1funcs.asm LIB1ASMFUNCS = _ashiftrt _ashiftrt_n _ashiftlt _lshiftrt _movstr \ - _mulsi3 _sdivsi3 _udivsi3 _set_fpscr + _movstr_i4 _mulsi3 _sdivsi3 _sdivsi3_i4 _udivsi3 _udivsi3_i4 _set_fpscr # These are really part of libgcc1, but this will cause them to be # built correctly, so... @@ -21,7 +21,7 @@ fp-bit.c: $(srcdir)/config/fp-bit.c echo '#endif' >> fp-bit.c cat $(srcdir)/config/fp-bit.c >> fp-bit.c -MULTILIB_OPTIONS= ml m2/m3e +MULTILIB_OPTIONS= ml m2/m3e/m4-single-only/m4-single/m4 MULTILIB_DIRNAMES= MULTILIB_MATCHES = m2=m3 diff --git a/gcc/ginclude/va-sh.h b/gcc/ginclude/va-sh.h index f1671c7b0b6..0bfc84c1350 100644 --- a/gcc/ginclude/va-sh.h +++ b/gcc/ginclude/va-sh.h @@ -6,10 +6,10 @@ #ifndef __GNUC_VA_LIST #define __GNUC_VA_LIST -#ifdef __SH3E__ +#if defined (__SH3E__) || defined (__SH4_SINGLE__) || defined (__SH4__) || defined (__SH4_SINGLE_ONLY__) typedef long __va_greg; -typedef double __va_freg; +typedef float __va_freg; typedef struct { __va_greg * __va_next_o; /* next available register */ @@ -33,24 +33,24 @@ typedef void *__gnuc_va_list; #ifdef _STDARG_H -#ifdef __SH3E__ +#if defined (__SH3E__) || defined (__SH4_SINGLE__) || defined (__SH4__) || defined (__SH4_SINGLE_ONLY__) #define va_start(AP, LASTARG) \ __extension__ \ ({ \ - AP.__va_next_fp = (__va_freg *) __builtin_saveregs (); \ - 
AP.__va_next_fp_limit = (AP.__va_next_fp + \ + (AP).__va_next_fp = (__va_freg *) __builtin_saveregs (); \ + (AP).__va_next_fp_limit = ((AP).__va_next_fp + \ (__builtin_args_info (1) < 8 ? 8 - __builtin_args_info (1) : 0)); \ - AP.__va_next_o = (__va_greg *) AP.__va_next_fp_limit; \ - AP.__va_next_o_limit = (AP.__va_next_o + \ + (AP).__va_next_o = (__va_greg *) (AP).__va_next_fp_limit; \ + (AP).__va_next_o_limit = ((AP).__va_next_o + \ (__builtin_args_info (0) < 4 ? 4 - __builtin_args_info (0) : 0)); \ - AP.__va_next_stack = (__va_greg *) __builtin_next_arg (LASTARG); \ + (AP).__va_next_stack = (__va_greg *) __builtin_next_arg (LASTARG); \ }) #else /* ! SH3E */ #define va_start(AP, LASTARG) \ - (AP = ((__gnuc_va_list) __builtin_next_arg (LASTARG))) + ((AP) = ((__gnuc_va_list) __builtin_next_arg (LASTARG))) #endif /* ! SH3E */ @@ -59,24 +59,26 @@ __extension__ \ #define va_alist __builtin_va_alist #define va_dcl int __builtin_va_alist;... -#ifdef __SH3E__ +#if defined (__SH3E__) || defined (__SH4_SINGLE__) || defined (__SH4__) || defined (__SH4_SINGLE_ONLY__) #define va_start(AP) \ __extension__ \ ({ \ - AP.__va_next_fp = (__va_freg *) __builtin_saveregs (); \ - AP.__va_next_fp_limit = (AP.__va_next_fp + \ + (AP).__va_next_fp = (__va_freg *) __builtin_saveregs (); \ + (AP).__va_next_fp_limit = ((AP).__va_next_fp + \ (__builtin_args_info (1) < 8 ? 8 - __builtin_args_info (1) : 0)); \ - AP.__va_next_o = (__va_greg *) AP.__va_next_fp_limit; \ - AP.__va_next_o_limit = (AP.__va_next_o + \ + (AP).__va_next_o = (__va_greg *) (AP).__va_next_fp_limit; \ + (AP).__va_next_o_limit = ((AP).__va_next_o + \ (__builtin_args_info (0) < 4 ? 4 - __builtin_args_info (0) : 0)); \ - AP.__va_next_stack = (__va_greg *) __builtin_next_arg (__builtin_va_alist) \ - - (__builtin_args_info (0) >= 4 || __builtin_args_info (1) >= 8 ? 
1 : 0); \ + (AP).__va_next_stack \ + = ((__va_greg *) __builtin_next_arg (__builtin_va_alist) \ + - (__builtin_args_info (0) >= 4 || __builtin_args_info (1) >= 8 \ + ? 1 : 0)); \ }) #else /* ! SH3E */ -#define va_start(AP) AP=(char *) &__builtin_va_alist +#define va_start(AP) ((AP) = (char *) &__builtin_va_alist) #endif /* ! SH3E */ @@ -136,53 +138,78 @@ enum __va_type_classes { We want the MEM_IN_STRUCT_P bit set in the emitted RTL, therefore we use unions even when it would otherwise be unnecessary. */ +/* gcc has an extension that allows to use a casted lvalue as an lvalue, + But it doesn't work in C++ with -pedantic - even in the presence of + __extension__ . We work around this problem by using a reference type. */ +#ifdef __cplusplus +#define __VA_REF & +#else +#define __VA_REF +#endif + #define __va_arg_sh1(AP, TYPE) __extension__ \ -__extension__ \ ({(sizeof (TYPE) == 1 \ ? ({union {TYPE t; char c;} __t; \ - asm("" \ - : "=r" (__t.c) \ - : "0" ((((union { int i, j; } *) (AP))++)->i)); \ + __asm("" \ + : "=r" (__t.c) \ + : "0" ((((union { int i, j; } *__VA_REF) (AP))++)->i)); \ __t.t;}) \ : sizeof (TYPE) == 2 \ ? ({union {TYPE t; short s;} __t; \ - asm("" \ - : "=r" (__t.s) \ - : "0" ((((union { int i, j; } *) (AP))++)->i)); \ + __asm("" \ + : "=r" (__t.s) \ + : "0" ((((union { int i, j; } *__VA_REF) (AP))++)->i)); \ __t.t;}) \ : sizeof (TYPE) >= 4 || __LITTLE_ENDIAN_P \ - ? (((union { TYPE t; int i;} *) (AP))++)->t \ - : ((union {TYPE t;TYPE u;}*) ((char *)++(int *)(AP) - sizeof (TYPE)))->t);}) + ? 
(((union { TYPE t; int i;} *__VA_REF) (AP))++)->t \ + : ((union {TYPE t;TYPE u;}*) ((char *)++(int *__VA_REF)(AP) - sizeof (TYPE)))->t);}) -#ifdef __SH3E__ +#if defined (__SH3E__) || defined (__SH4_SINGLE__) || defined (__SH4__) || defined (__SH4_SINGLE_ONLY__) #define __PASS_AS_FLOAT(TYPE_CLASS,SIZE) \ (TYPE_CLASS == __real_type_class && SIZE == 4) +#define __TARGET_SH4_P 0 + +#if defined(__SH4__) || defined(__SH4_SINGLE__) +#undef __PASS_AS_FLOAT +#define __PASS_AS_FLOAT(TYPE_CLASS,SIZE) \ + (TYPE_CLASS == __real_type_class && SIZE <= 8 \ + || TYPE_CLASS == __complex_type_class && SIZE <= 16) +#undef __TARGET_SH4_P +#define __TARGET_SH4_P 1 +#endif + #define va_arg(pvar,TYPE) \ __extension__ \ ({int __type = __builtin_classify_type (* (TYPE *) 0); \ void * __result_p; \ if (__PASS_AS_FLOAT (__type, sizeof(TYPE))) \ { \ - if (pvar.__va_next_fp < pvar.__va_next_fp_limit) \ + if ((pvar).__va_next_fp < (pvar).__va_next_fp_limit) \ { \ - __result_p = &pvar.__va_next_fp; \ + if (((__type == __real_type_class && sizeof (TYPE) > 4)\ + || sizeof (TYPE) > 8) \ + && (((int) (pvar).__va_next_fp ^ (int) (pvar).__va_next_fp_limit)\ + & 4)) \ + (pvar).__va_next_fp++; \ + __result_p = &(pvar).__va_next_fp; \ } \ else \ - __result_p = &pvar.__va_next_stack; \ + __result_p = &(pvar).__va_next_stack; \ } \ else \ { \ - if (pvar.__va_next_o + ((sizeof (TYPE) + 3) / 4) \ - <= pvar.__va_next_o_limit) \ - __result_p = &pvar.__va_next_o; \ + if ((pvar).__va_next_o + ((sizeof (TYPE) + 3) / 4) \ + <= (pvar).__va_next_o_limit) \ + __result_p = &(pvar).__va_next_o; \ else \ { \ if (sizeof (TYPE) > 4) \ - pvar.__va_next_o = pvar.__va_next_o_limit; \ + if (! __TARGET_SH4_P) \ + (pvar).__va_next_o = (pvar).__va_next_o_limit; \ \ - __result_p = &pvar.__va_next_stack; \ + __result_p = &(pvar).__va_next_stack; \ } \ } \ __va_arg_sh1(*(void **)__result_p, TYPE);}) @@ -194,6 +221,6 @@ __extension__ \ #endif /* SH3E */ /* Copy __gnuc_va_list into another variable of this type. 
*/ -#define __va_copy(dest, src) (dest) = (src) +#define __va_copy(dest, src) ((dest) = (src)) #endif /* defined (_STDARG_H) || defined (_VARARGS_H) */ |