author | bstarynk <bstarynk@138bc75d-0d04-0410-961f-82ee72b054a4> | 2016-04-17 11:37:12 +0000
committer | bstarynk <bstarynk@138bc75d-0d04-0410-961f-82ee72b054a4> | 2016-04-17 11:37:12 +0000
commit | 10bc620ff573f2d1059378d0684bdf985028fe6d (patch)
tree | 2686bd58e4311af8f17902454d2c7509c057d67c /gcc/config/s390/s390.c
parent | bd356bb6d247b18723734d4d1d0b32191cfb1a9a (diff)
download | gcc-10bc620ff573f2d1059378d0684bdf985028fe6d.tar.gz
2016-04-17 Basile Starynkevitch <basile@starynkevitch.net>
{{merging with even more of GCC 6, using subversion 1.9
svn merge -r233051:233720 ^/trunk
}}
git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/branches/melt-branch@235079 138bc75d-0d04-0410-961f-82ee72b054a4
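The merge itself follows the routine Subversion branch-maintenance pattern. Assuming a working copy of branches/melt-branch (only the merge command and revision range come from the message above; the surrounding steps are the generic workflow, not something recorded in this commit):

```
svn update                          # bring the branch working copy up to date
svn merge -r233051:233720 ^/trunk   # replay trunk r233051..r233720 onto the branch
svn commit                          # the result became r235079 (see git-svn-id above)
```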
Diffstat (limited to 'gcc/config/s390/s390.c')
-rw-r--r-- | gcc/config/s390/s390.c | 741
1 file changed, 618 insertions(+), 123 deletions(-)
```diff
diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c
index 3be64de3570..cd53b15f112 100644
--- a/gcc/config/s390/s390.c
+++ b/gcc/config/s390/s390.c
@@ -340,6 +340,19 @@ extern int reload_completed;
 
 /* Kept up to date using the SCHED_VARIABLE_ISSUE hook.  */
 static rtx_insn *last_scheduled_insn;
+#define MAX_SCHED_UNITS 3
+static int last_scheduled_unit_distance[MAX_SCHED_UNITS];
+
+/* The maximum score added for an instruction whose unit hasn't been
+   in use for MAX_SCHED_MIX_DISTANCE steps.  Increase this value to
+   give instruction mix scheduling more priority over instruction
+   grouping.  */
+#define MAX_SCHED_MIX_SCORE 8
+
+/* The maximum distance up to which individual scores will be
+   calculated.  Everything beyond this gives MAX_SCHED_MIX_SCORE.
+   Increase this with the OOO windows size of the machine.  */
+#define MAX_SCHED_MIX_DISTANCE 100
 
 /* Structure used to hold the components of a S/390 memory
    address.  A legitimate address on S/390 is of the general
@@ -380,6 +393,8 @@ struct GTY (()) s390_frame_layout
      be saved to.
       0 - does not need to be saved at all
      -1 - stack slot  */
+#define SAVE_SLOT_NONE   0
+#define SAVE_SLOT_STACK -1
   signed char gpr_save_slots[16];
 
   /* Number of first and last gpr to be saved, restored.  */
@@ -426,6 +441,13 @@ struct GTY(()) machine_function
   /* True if the current function may contain a tbegin clobbering
      FPRs.  */
   bool tbegin_p;
+
+  /* For -fsplit-stack support: A stack local which holds a pointer to
+     the stack arguments for a function with a variable number of
+     arguments.  This is set at the start of the function and is used
+     to initialize the overflow_arg_area field of the va_list
+     structure.  */
+  rtx split_stack_varargs_pointer;
 };
 
 /* Few accessor macros for struct cfun->machine->s390_frame_layout.  */
@@ -5600,6 +5622,124 @@ s390_expand_vec_strlen (rtx target, rtx string, rtx alignment)
   emit_move_insn (target, temp);
 }
 
+void
+s390_expand_vec_movstr (rtx result, rtx dst, rtx src)
+{
+  int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
+  rtx temp = gen_reg_rtx (Pmode);
+  rtx src_addr = XEXP (src, 0);
+  rtx dst_addr = XEXP (dst, 0);
+  rtx src_addr_reg = gen_reg_rtx (Pmode);
+  rtx dst_addr_reg = gen_reg_rtx (Pmode);
+  rtx offset = gen_reg_rtx (Pmode);
+  rtx vsrc = gen_reg_rtx (V16QImode);
+  rtx vpos = gen_reg_rtx (V16QImode);
+  rtx loadlen = gen_reg_rtx (SImode);
+  rtx gpos_qi = gen_reg_rtx(QImode);
+  rtx gpos = gen_reg_rtx (SImode);
+  rtx done_label = gen_label_rtx ();
+  rtx loop_label = gen_label_rtx ();
+  rtx exit_label = gen_label_rtx ();
+  rtx full_label = gen_label_rtx ();
+
+  /* Perform a quick check for string ending on the first up to 16
+     bytes and exit early if successful.  */
+
+  emit_insn (gen_vlbb (vsrc, src, GEN_INT (6)));
+  emit_insn (gen_lcbb (loadlen, src_addr, GEN_INT (6)));
+  emit_insn (gen_vfenezv16qi (vpos, vsrc, vsrc));
+  emit_insn (gen_vec_extractv16qi (gpos_qi, vpos, GEN_INT (7)));
+  emit_move_insn (gpos, gen_rtx_SUBREG (SImode, gpos_qi, 0));
+  /* gpos is the byte index if a zero was found and 16 otherwise.
+     So if it is lower than the loaded bytes we have a hit.  */
+  emit_cmp_and_jump_insns (gpos, loadlen, GE, NULL_RTX, SImode, 1,
+                           full_label);
+  emit_insn (gen_vstlv16qi (vsrc, gpos, dst));
+
+  force_expand_binop (Pmode, add_optab, dst_addr, gpos, result,
+                      1, OPTAB_DIRECT);
+  emit_jump (exit_label);
+  emit_barrier ();
+
+  emit_label (full_label);
+  LABEL_NUSES (full_label) = 1;
+
+  /* Calculate `offset' so that src + offset points to the last byte
+     before 16 byte alignment.  */
+
+  /* temp = src_addr & 0xf */
+  force_expand_binop (Pmode, and_optab, src_addr, GEN_INT (15), temp,
+                      1, OPTAB_DIRECT);
+
+  /* offset = 0xf - temp */
+  emit_move_insn (offset, GEN_INT (15));
+  force_expand_binop (Pmode, sub_optab, offset, temp, offset,
+                      1, OPTAB_DIRECT);
+
+  /* Store `offset' bytes in the dstination string.  The quick check
+     has loaded at least `offset' bytes into vsrc.  */
+
+  emit_insn (gen_vstlv16qi (vsrc, gen_lowpart (SImode, offset), dst));
+
+  /* Advance to the next byte to be loaded.  */
+  force_expand_binop (Pmode, add_optab, offset, const1_rtx, offset,
+                      1, OPTAB_DIRECT);
+
+  /* Make sure the addresses are single regs which can be used as a
+     base.  */
+  emit_move_insn (src_addr_reg, src_addr);
+  emit_move_insn (dst_addr_reg, dst_addr);
+
+  /* MAIN LOOP */
+
+  emit_label (loop_label);
+  LABEL_NUSES (loop_label) = 1;
+
+  emit_move_insn (vsrc,
+                  gen_rtx_MEM (V16QImode,
+                               gen_rtx_PLUS (Pmode, src_addr_reg, offset)));
+
+  emit_insn (gen_vec_vfenesv16qi (vpos, vsrc, vsrc,
+                                  GEN_INT (VSTRING_FLAG_ZS | VSTRING_FLAG_CS)));
+  add_int_reg_note (s390_emit_ccraw_jump (8, EQ, done_label),
+                    REG_BR_PROB, very_unlikely);
+
+  emit_move_insn (gen_rtx_MEM (V16QImode,
+                               gen_rtx_PLUS (Pmode, dst_addr_reg, offset)),
+                  vsrc);
+  /* offset += 16 */
+  force_expand_binop (Pmode, add_optab, offset, GEN_INT (16),
+                      offset, 1, OPTAB_DIRECT);
+
+  emit_jump (loop_label);
+  emit_barrier ();
+
+  /* REGULAR EXIT */
+
+  /* We are done.  Add the offset of the zero character to the dst_addr
+     pointer to get the result.  */
+
+  emit_label (done_label);
+  LABEL_NUSES (done_label) = 1;
+
+  force_expand_binop (Pmode, add_optab, dst_addr_reg, offset, dst_addr_reg,
+                      1, OPTAB_DIRECT);
+
+  emit_insn (gen_vec_extractv16qi (gpos_qi, vpos, GEN_INT (7)));
+  emit_move_insn (gpos, gen_rtx_SUBREG (SImode, gpos_qi, 0));
+
+  emit_insn (gen_vstlv16qi (vsrc, gpos, gen_rtx_MEM (BLKmode, dst_addr_reg)));
+
+  force_expand_binop (Pmode, add_optab, dst_addr_reg, gpos, result,
+                      1, OPTAB_DIRECT);
+
+  /* EARLY EXIT */
+
+  emit_label (exit_label);
+  LABEL_NUSES (exit_label) = 1;
+}
+
+
 /* Expand conditional increment or decrement using alc/slb instructions.
    Should generate code setting DST to either SRC or SRC + INCREMENT,
    depending on the result of the comparison CMP_OP0 CMP_CODE CMP_OP1.
@@ -6189,10 +6329,10 @@ s390_expand_vcond (rtx target, rtx then, rtx els,
      can be handled by the optimization above but not by the
      following code.  Hence, force them into registers here.  */
   if (!REG_P (cmp_op1))
-    cmp_op1 = force_reg (target_mode, cmp_op1);
+    cmp_op1 = force_reg (GET_MODE (cmp_op1), cmp_op1);
   if (!REG_P (cmp_op2))
-    cmp_op2 = force_reg (target_mode, cmp_op2);
+    cmp_op2 = force_reg (GET_MODE (cmp_op2), cmp_op2);
 
   s390_expand_vec_compare (result_target, cond, cmp_op1, cmp_op2);
 
@@ -9198,7 +9338,7 @@ s390_register_info_gprtofpr ()
 
   for (i = 15; i >= 6; i--)
     {
-      if (cfun_gpr_save_slot (i) == 0)
+      if (cfun_gpr_save_slot (i) == SAVE_SLOT_NONE)
        continue;
 
       /* Advance to the next FP register which can be used as a
@@ -9215,7 +9355,7 @@ s390_register_info_gprtofpr ()
             case we ran out of FPR save slots.  */
          for (j = 6; j <= 15; j++)
            if (FP_REGNO_P (cfun_gpr_save_slot (j)))
-             cfun_gpr_save_slot (j) = -1;
+             cfun_gpr_save_slot (j) = SAVE_SLOT_STACK;
          break;
        }
       cfun_gpr_save_slot (i) = save_reg_slot++;
@@ -9242,12 +9382,16 @@ s390_register_info_stdarg_fpr ()
     return;
 
   min_fpr = crtl->args.info.fprs;
-  max_fpr = min_fpr + cfun->va_list_fpr_size;
-  if (max_fpr > FP_ARG_NUM_REG)
-    max_fpr = FP_ARG_NUM_REG;
+  max_fpr = min_fpr + cfun->va_list_fpr_size - 1;
+  if (max_fpr >= FP_ARG_NUM_REG)
+    max_fpr = FP_ARG_NUM_REG - 1;
+
+  /* FPR argument regs start at f0.  */
+  min_fpr += FPR0_REGNUM;
+  max_fpr += FPR0_REGNUM;
 
-  for (i = min_fpr; i < max_fpr; i++)
-    cfun_set_fpr_save (i + FPR0_REGNUM);
+  for (i = min_fpr; i <= max_fpr; i++)
+    cfun_set_fpr_save (i);
 }
 
 /* Reserve the GPR save slots for GPRs which need to be saved due to
@@ -9267,12 +9411,61 @@ s390_register_info_stdarg_gpr ()
     return;
 
   min_gpr = crtl->args.info.gprs;
-  max_gpr = min_gpr + cfun->va_list_gpr_size;
-  if (max_gpr > GP_ARG_NUM_REG)
-    max_gpr = GP_ARG_NUM_REG;
+  max_gpr = min_gpr + cfun->va_list_gpr_size - 1;
+  if (max_gpr >= GP_ARG_NUM_REG)
+    max_gpr = GP_ARG_NUM_REG - 1;
 
-  for (i = min_gpr; i < max_gpr; i++)
-    cfun_gpr_save_slot (2 + i) = -1;
+  /* GPR argument regs start at r2.  */
+  min_gpr += GPR2_REGNUM;
+  max_gpr += GPR2_REGNUM;
+
+  /* If r6 was supposed to be saved into an FPR and now needs to go to
+     the stack for vararg we have to adjust the restore range to make
+     sure that the restore is done from stack as well.  */
+  if (FP_REGNO_P (cfun_gpr_save_slot (GPR6_REGNUM))
+      && min_gpr <= GPR6_REGNUM
+      && max_gpr >= GPR6_REGNUM)
+    {
+      if (cfun_frame_layout.first_restore_gpr == -1
+          || cfun_frame_layout.first_restore_gpr > GPR6_REGNUM)
+       cfun_frame_layout.first_restore_gpr = GPR6_REGNUM;
+      if (cfun_frame_layout.last_restore_gpr == -1
+          || cfun_frame_layout.last_restore_gpr < GPR6_REGNUM)
+       cfun_frame_layout.last_restore_gpr = GPR6_REGNUM;
+    }
+
+  if (cfun_frame_layout.first_save_gpr == -1
+      || cfun_frame_layout.first_save_gpr > min_gpr)
+    cfun_frame_layout.first_save_gpr = min_gpr;
+
+  if (cfun_frame_layout.last_save_gpr == -1
+      || cfun_frame_layout.last_save_gpr < max_gpr)
+    cfun_frame_layout.last_save_gpr = max_gpr;
+
+  for (i = min_gpr; i <= max_gpr; i++)
+    cfun_gpr_save_slot (i) = SAVE_SLOT_STACK;
+}
+
+/* Calculate the save and restore ranges for stm(g) and lm(g) in the
+   prologue and epilogue.  */
+
+static void
+s390_register_info_set_ranges ()
+{
+  int i, j;
+
+  /* Find the first and the last save slot supposed to use the stack
+     to set the restore range.
+     Vararg regs might be marked as save to stack but only the
+     call-saved regs really need restoring (i.e. r6).  This code
+     assumes that the vararg regs have not yet been recorded in
+     cfun_gpr_save_slot.  */
+  for (i = 0; i < 16 && cfun_gpr_save_slot (i) != SAVE_SLOT_STACK; i++);
+  for (j = 15; j > i && cfun_gpr_save_slot (j) != SAVE_SLOT_STACK; j--);
+  cfun_frame_layout.first_restore_gpr = (i == 16) ? -1 : i;
+  cfun_frame_layout.last_restore_gpr = (i == 16) ? -1 : j;
+  cfun_frame_layout.first_save_gpr = (i == 16) ? -1 : i;
+  cfun_frame_layout.last_save_gpr = (i == 16) ? -1 : j;
 }
 
 /* The GPR and FPR save slots in cfun->machine->frame_layout are set
@@ -9283,7 +9476,7 @@ s390_register_info_stdarg_gpr ()
 static void
 s390_register_info ()
 {
-  int i, j;
+  int i;
   char clobbered_regs[32];
 
   gcc_assert (!epilogue_completed);
@@ -9316,9 +9509,13 @@ s390_register_info ()
          cfun_frame_layout.high_fprs++;
        }
 
-  if (flag_pic)
-    clobbered_regs[PIC_OFFSET_TABLE_REGNUM]
-      |= !!df_regs_ever_live_p (PIC_OFFSET_TABLE_REGNUM);
+  /* Register 12 is used for GOT address, but also as temp in prologue
+     for split-stack stdarg functions (unless r14 is available).  */
+  clobbered_regs[12]
+    |= ((flag_pic && df_regs_ever_live_p (PIC_OFFSET_TABLE_REGNUM))
+       || (flag_split_stack && cfun->stdarg
+           && (crtl->is_leaf || TARGET_TPF_PROFILING
+               || has_hard_reg_initial_val (Pmode, RETURN_REGNUM))));
 
   clobbered_regs[BASE_REGNUM]
     |= (cfun->machine->base_reg
@@ -9347,33 +9544,20 @@ s390_register_info ()
        || (reload_completed && cfun_frame_layout.frame_size > 0)
        || cfun->calls_alloca);
 
-  memset (cfun_frame_layout.gpr_save_slots, 0, 16);
+  memset (cfun_frame_layout.gpr_save_slots, SAVE_SLOT_NONE, 16);
 
   for (i = 6; i < 16; i++)
    if (clobbered_regs[i])
-      cfun_gpr_save_slot (i) = -1;
+      cfun_gpr_save_slot (i) = SAVE_SLOT_STACK;
 
   s390_register_info_stdarg_fpr ();
   s390_register_info_gprtofpr ();
-
-  /* First find the range of GPRs to be restored.  Vararg regs don't
-     need to be restored so we do it before assigning slots to the
-     vararg GPRs.  */
-  for (i = 0; i < 16 && cfun_gpr_save_slot (i) != -1; i++);
-  for (j = 15; j > i && cfun_gpr_save_slot (j) != -1; j--);
-  cfun_frame_layout.first_restore_gpr = (i == 16) ? -1 : i;
-  cfun_frame_layout.last_restore_gpr = (i == 16) ? -1 : j;
-
+  s390_register_info_set_ranges ();
   /* stdarg functions might need to save GPRs 2 to 6.  This might
-     override the GPR->FPR save decision made above for r6 since
-     vararg regs must go to the stack.  */
+     override the GPR->FPR save decision made by
+     s390_register_info_gprtofpr for r6 since vararg regs must go to
+     the stack.  */
   s390_register_info_stdarg_gpr ();
-
-  /* Now the range of GPRs which need saving.  */
-  for (i = 0; i < 16 && cfun_gpr_save_slot (i) != -1; i++);
-  for (j = 15; j > i && cfun_gpr_save_slot (j) != -1; j--);
-  cfun_frame_layout.first_save_gpr = (i == 16) ? -1 : i;
-  cfun_frame_layout.last_save_gpr = (i == 16) ? -1 : j;
 }
 
 /* This function is called by s390_optimize_prologue in order to get
@@ -9384,7 +9568,7 @@ static void
 s390_optimize_register_info ()
 {
   char clobbered_regs[32];
-  int i, j;
+  int i;
 
   gcc_assert (epilogue_completed);
   gcc_assert (!cfun->machine->split_branches_pending_p);
@@ -9407,23 +9591,14 @@ s390_optimize_register_info ()
                      || cfun_frame_layout.save_return_addr_p
                      || crtl->calls_eh_return);
 
-  memset (cfun_frame_layout.gpr_save_slots, 0, 6);
+  memset (cfun_frame_layout.gpr_save_slots, SAVE_SLOT_NONE, 6);
 
   for (i = 6; i < 16; i++)
     if (!clobbered_regs[i])
-      cfun_gpr_save_slot (i) = 0;
-
-  for (i = 0; i < 16 && cfun_gpr_save_slot (i) != -1; i++);
-  for (j = 15; j > i && cfun_gpr_save_slot (j) != -1; j--);
-  cfun_frame_layout.first_restore_gpr = (i == 16) ? -1 : i;
-  cfun_frame_layout.last_restore_gpr = (i == 16) ? -1 : j;
+      cfun_gpr_save_slot (i) = SAVE_SLOT_NONE;
 
+  s390_register_info_set_ranges ();
   s390_register_info_stdarg_gpr ();
-
-  for (i = 0; i < 16 && cfun_gpr_save_slot (i) != -1; i++);
-  for (j = 15; j > i && cfun_gpr_save_slot (j) != -1; j--);
-  cfun_frame_layout.first_save_gpr = (i == 16) ? -1 : i;
-  cfun_frame_layout.last_save_gpr = (i == 16) ? -1 : j;
 }
 
 /* Fill cfun->machine with info about frame of current function.  */
@@ -9844,7 +10019,7 @@ s390_hard_regno_rename_ok (unsigned int old_reg, unsigned int new_reg)
      regrename manually about it.  */
   if (GENERAL_REGNO_P (new_reg)
       && !call_really_used_regs[new_reg]
-      && cfun_gpr_save_slot (new_reg) == 0)
+      && cfun_gpr_save_slot (new_reg) == SAVE_SLOT_NONE)
     return false;
 
   return true;
@@ -9859,7 +10034,7 @@ s390_hard_regno_scratch_ok (unsigned int regno)
   /* See s390_hard_regno_rename_ok.  */
   if (GENERAL_REGNO_P (regno)
      && !call_really_used_regs[regno]
-      && cfun_gpr_save_slot (regno) == 0)
+      && cfun_gpr_save_slot (regno) == SAVE_SLOT_NONE)
    return false;
 
   return true;
@@ -10440,12 +10615,15 @@ s390_emit_prologue (void)
   int next_fpr = 0;
 
   /* Choose best register to use for temp use within prologue.
-     See below for why TPF must use the register 1.  */
+     TPF with profiling must avoid the register 14 - the tracing function
+     needs the original contents of r14 to be preserved.  */
 
   if (!has_hard_reg_initial_val (Pmode, RETURN_REGNUM)
       && !crtl->is_leaf
      && !TARGET_TPF_PROFILING)
    temp_reg = gen_rtx_REG (Pmode, RETURN_REGNUM);
+  else if (flag_split_stack && cfun->stdarg)
+    temp_reg = gen_rtx_REG (Pmode, 12);
  else
    temp_reg = gen_rtx_REG (Pmode, 1);
 
@@ -10875,7 +11053,7 @@ s390_emit_epilogue (bool sibcall)
         be in between two GPRs which need saving.)  Otherwise it
         would be difficult to take that decision back in
         s390_optimize_prologue.  */
-      if (cfun_gpr_save_slot (RETURN_REGNUM) == -1)
+      if (cfun_gpr_save_slot (RETURN_REGNUM) == SAVE_SLOT_STACK)
       {
         int return_regnum = find_unused_clobbered_reg();
         if (!return_regnum)
@@ -10939,6 +11117,166 @@ s300_set_up_by_prologue (hard_reg_set_container *regs)
    SET_HARD_REG_BIT (regs->set, REGNO (cfun->machine->base_reg));
 }
 
+/* -fsplit-stack support.  */
+
+/* A SYMBOL_REF for __morestack.  */
+static GTY(()) rtx morestack_ref;
+
+/* When using -fsplit-stack, the allocation routines set a field in
+   the TCB to the bottom of the stack plus this much space, measured
+   in bytes.  */
+
+#define SPLIT_STACK_AVAILABLE 1024
+
+/* Emit -fsplit-stack prologue, which goes before the regular function
+   prologue.  */
+
+void
+s390_expand_split_stack_prologue (void)
+{
+  rtx r1, guard, cc = NULL;
+  rtx_insn *insn;
+  /* Offset from thread pointer to __private_ss.  */
+  int psso = TARGET_64BIT ? 0x38 : 0x20;
+  /* Pointer size in bytes.  */
+  /* Frame size and argument size - the two parameters to __morestack.  */
+  HOST_WIDE_INT frame_size = cfun_frame_layout.frame_size;
+  /* Align argument size to 8 bytes - simplifies __morestack code.  */
+  HOST_WIDE_INT args_size = crtl->args.size >= 0
+                           ? ((crtl->args.size + 7) & ~7)
+                           : 0;
+  /* Label to be called by __morestack.  */
+  rtx_code_label *call_done = NULL;
+  rtx_code_label *parm_base = NULL;
+  rtx tmp;
+
+  gcc_assert (flag_split_stack && reload_completed);
+  if (!TARGET_CPU_ZARCH)
+    {
+      sorry ("CPUs older than z900 are not supported for -fsplit-stack");
+      return;
+    }
+
+  r1 = gen_rtx_REG (Pmode, 1);
+
+  /* If no stack frame will be allocated, don't do anything.  */
+  if (!frame_size)
+    {
+      if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
+       {
+         /* If va_start is used, just use r15.  */
+         emit_move_insn (r1,
+                         gen_rtx_PLUS (Pmode, stack_pointer_rtx,
+                                       GEN_INT (STACK_POINTER_OFFSET)));
+
+       }
+      return;
+    }
+
+  if (morestack_ref == NULL_RTX)
+    {
+      morestack_ref = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
+      SYMBOL_REF_FLAGS (morestack_ref) |= (SYMBOL_FLAG_LOCAL
+                                          | SYMBOL_FLAG_FUNCTION);
+    }
+
+  if (CONST_OK_FOR_K (frame_size) || CONST_OK_FOR_Op (frame_size))
+    {
+      /* If frame_size will fit in an add instruction, do a stack space
+        check, and only call __morestack if there's not enough space.  */
+
+      /* Get thread pointer.  r1 is the only register we can always destroy - r0
+        could contain a static chain (and cannot be used to address memory
+        anyway), r2-r6 can contain parameters, and r6-r15 are callee-saved.  */
+      emit_move_insn (r1, gen_rtx_REG (Pmode, TP_REGNUM));
+      /* Aim at __private_ss.  */
+      guard = gen_rtx_MEM (Pmode, plus_constant (Pmode, r1, psso));
+
+      /* If less that 1kiB used, skip addition and compare directly with
+        __private_ss.  */
+      if (frame_size > SPLIT_STACK_AVAILABLE)
+       {
+         emit_move_insn (r1, guard);
+         if (TARGET_64BIT)
+           emit_insn (gen_adddi3 (r1, r1, GEN_INT (frame_size)));
+         else
+           emit_insn (gen_addsi3 (r1, r1, GEN_INT (frame_size)));
+         guard = r1;
+       }
+
+      /* Compare the (maybe adjusted) guard with the stack pointer.  */
+      cc = s390_emit_compare (LT, stack_pointer_rtx, guard);
+    }
+
+  call_done = gen_label_rtx ();
+  parm_base = gen_label_rtx ();
+
+  /* Emit the parameter block.  */
+  tmp = gen_split_stack_data (parm_base, call_done,
+                             GEN_INT (frame_size),
+                             GEN_INT (args_size));
+  insn = emit_insn (tmp);
+  add_reg_note (insn, REG_LABEL_OPERAND, call_done);
+  LABEL_NUSES (call_done)++;
+  add_reg_note (insn, REG_LABEL_OPERAND, parm_base);
+  LABEL_NUSES (parm_base)++;
+
+  /* %r1 = litbase.  */
+  insn = emit_move_insn (r1, gen_rtx_LABEL_REF (VOIDmode, parm_base));
+  add_reg_note (insn, REG_LABEL_OPERAND, parm_base);
+  LABEL_NUSES (parm_base)++;
+
+  /* Now, we need to call __morestack.  It has very special calling
+     conventions: it preserves param/return/static chain registers for
+     calling main function body, and looks for its own parameters at %r1.  */
+
+  if (cc != NULL)
+    {
+      tmp = gen_split_stack_cond_call (morestack_ref, cc, call_done);
+
+      insn = emit_jump_insn (tmp);
+      JUMP_LABEL (insn) = call_done;
+      LABEL_NUSES (call_done)++;
+
+      /* Mark the jump as very unlikely to be taken.  */
+      add_int_reg_note (insn, REG_BR_PROB, REG_BR_PROB_BASE / 100);
+
+      if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
+       {
+         /* If va_start is used, and __morestack was not called, just use
+            r15.  */
+         emit_move_insn (r1,
+                         gen_rtx_PLUS (Pmode, stack_pointer_rtx,
+                                       GEN_INT (STACK_POINTER_OFFSET)));
+       }
+    }
+  else
+    {
+      tmp = gen_split_stack_call (morestack_ref, call_done);
+      insn = emit_jump_insn (tmp);
+      JUMP_LABEL (insn) = call_done;
+      LABEL_NUSES (call_done)++;
+      emit_barrier ();
+    }
+
+  /* __morestack will call us here.  */
+
+  emit_label (call_done);
+}
+
+/* We may have to tell the dataflow pass that the split stack prologue
+   is initializing a register.  */
+
+static void
+s390_live_on_entry (bitmap regs)
+{
+  if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
+    {
+      gcc_assert (flag_split_stack);
+      bitmap_set_bit (regs, 1);
+    }
+}
+
 /* Return true if the function can use simple_return to return outside
    of a shrink-wrapped region.  At present shrink-wrapping is supported
    in all cases.  */
@@ -10969,7 +11307,7 @@ s390_can_use_return_insn (void)
    return false;
 
  for (i = 0; i < 16; i++)
-    if (cfun_gpr_save_slot (i))
+    if (cfun_gpr_save_slot (i) != SAVE_SLOT_NONE)
      return false;
 
  /* For 31 bit this is not covered by the frame_size check below
@@ -11541,6 +11879,27 @@ s390_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
       expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
     }
 
+  if (flag_split_stack
+     && (lookup_attribute ("no_split_stack", DECL_ATTRIBUTES (cfun->decl))
+         == NULL)
+     && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
+    {
+      rtx reg;
+      rtx_insn *seq;
+
+      reg = gen_reg_rtx (Pmode);
+      cfun->machine->split_stack_varargs_pointer = reg;
+
+      start_sequence ();
+      emit_move_insn (reg, gen_rtx_REG (Pmode, 1));
+      seq = get_insns ();
+      end_sequence ();
+
+      push_topmost_sequence ();
+      emit_insn_after (seq, entry_of_function ());
+      pop_topmost_sequence ();
+    }
+
   /* Find the overflow area.
      FIXME: This currently is too pessimistic when the vector ABI is
      enabled.  In that case we *always* set up the overflow area
@@ -11549,7 +11908,10 @@
       || n_fpr + cfun->va_list_fpr_size > FP_ARG_NUM_REG
       || TARGET_VX_ABI)
     {
-      t = make_tree (TREE_TYPE (ovf), virtual_incoming_args_rtx);
+      if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
+       t = make_tree (TREE_TYPE (ovf), virtual_incoming_args_rtx);
+      else
+       t = make_tree (TREE_TYPE (ovf), cfun->machine->split_stack_varargs_pointer);
 
       off = INTVAL (crtl->args.arg_offset_rtx);
       off = off < 0 ? 0 : off;
@@ -11974,6 +12336,13 @@ s390_function_profiler (FILE *file, int labelno)
       output_asm_insn ("brasl\t%0,%4", op);
       output_asm_insn ("lg\t%0,%1", op);
     }
+  else if (TARGET_CPU_ZARCH)
+    {
+      output_asm_insn ("st\t%0,%1", op);
+      output_asm_insn ("larl\t%2,%3", op);
+      output_asm_insn ("brasl\t%0,%4", op);
+      output_asm_insn ("l\t%0,%1", op);
+    }
   else if (!flag_pic)
     {
       op[6] = gen_label_rtx ();
@@ -12470,7 +12839,7 @@ s390_emit_call (rtx addr_location, rtx tls_call, rtx result_reg,
         replace the symbol itself with the PLT stub.  */
       if (flag_pic && !SYMBOL_REF_LOCAL_P (addr_location))
       {
-         if (retaddr_reg != NULL_RTX)
+         if (TARGET_64BIT || retaddr_reg != NULL_RTX)
           {
             addr_location = gen_rtx_UNSPEC (Pmode,
                                             gen_rtvec (1, addr_location),
@@ -12677,9 +13046,9 @@ s390_optimize_prologue (void)
          /* It must not happen that what we once saved in an FPR now
             needs a stack slot.  */
-         gcc_assert (cfun_gpr_save_slot (gpr_regno) != -1);
+         gcc_assert (cfun_gpr_save_slot (gpr_regno) != SAVE_SLOT_STACK);
 
-         if (cfun_gpr_save_slot (gpr_regno) == 0)
+         if (cfun_gpr_save_slot (gpr_regno) == SAVE_SLOT_NONE)
           {
             remove_insn (insn);
             continue;
@@ -13322,27 +13691,66 @@ s390_z10_prevent_earlyload_conflicts (rtx_insn **ready, int *nready_p)
 
 static int s390_sched_state;
 
-#define S390_OOO_SCHED_STATE_NORMAL  3
-#define S390_OOO_SCHED_STATE_CRACKED 4
+#define S390_SCHED_STATE_NORMAL  3
+#define S390_SCHED_STATE_CRACKED 4
 
-#define S390_OOO_SCHED_ATTR_MASK_CRACKED    0x1
-#define S390_OOO_SCHED_ATTR_MASK_EXPANDED   0x2
-#define S390_OOO_SCHED_ATTR_MASK_ENDGROUP   0x4
-#define S390_OOO_SCHED_ATTR_MASK_GROUPALONE 0x8
+#define S390_SCHED_ATTR_MASK_CRACKED    0x1
+#define S390_SCHED_ATTR_MASK_EXPANDED   0x2
+#define S390_SCHED_ATTR_MASK_ENDGROUP   0x4
+#define S390_SCHED_ATTR_MASK_GROUPALONE 0x8
 
 static unsigned int
 s390_get_sched_attrmask (rtx_insn *insn)
 {
   unsigned int mask = 0;
 
-  if (get_attr_ooo_cracked (insn))
-    mask |= S390_OOO_SCHED_ATTR_MASK_CRACKED;
-  if (get_attr_ooo_expanded (insn))
-    mask |= S390_OOO_SCHED_ATTR_MASK_EXPANDED;
-  if (get_attr_ooo_endgroup (insn))
-    mask |= S390_OOO_SCHED_ATTR_MASK_ENDGROUP;
-  if (get_attr_ooo_groupalone (insn))
-    mask |= S390_OOO_SCHED_ATTR_MASK_GROUPALONE;
+  switch (s390_tune)
+    {
+    case PROCESSOR_2827_ZEC12:
+      if (get_attr_zEC12_cracked (insn))
+       mask |= S390_SCHED_ATTR_MASK_CRACKED;
+      if (get_attr_zEC12_expanded (insn))
+       mask |= S390_SCHED_ATTR_MASK_EXPANDED;
+      if (get_attr_zEC12_endgroup (insn))
+       mask |= S390_SCHED_ATTR_MASK_ENDGROUP;
+      if (get_attr_zEC12_groupalone (insn))
+       mask |= S390_SCHED_ATTR_MASK_GROUPALONE;
+      break;
+    case PROCESSOR_2964_Z13:
+      if (get_attr_z13_cracked (insn))
+       mask |= S390_SCHED_ATTR_MASK_CRACKED;
+      if (get_attr_z13_expanded (insn))
+       mask |= S390_SCHED_ATTR_MASK_EXPANDED;
+      if (get_attr_z13_endgroup (insn))
+       mask |= S390_SCHED_ATTR_MASK_ENDGROUP;
+      if (get_attr_z13_groupalone (insn))
+       mask |= S390_SCHED_ATTR_MASK_GROUPALONE;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+  return mask;
+}
+
+static unsigned int
+s390_get_unit_mask (rtx_insn *insn, int *units)
+{
+  unsigned int mask = 0;
+
+  switch (s390_tune)
+    {
+    case PROCESSOR_2964_Z13:
+      *units = 3;
+      if (get_attr_z13_unit_lsu (insn))
+       mask |= 1 << 0;
+      if (get_attr_z13_unit_fxu (insn))
+       mask |= 1 << 1;
+      if (get_attr_z13_unit_vfu (insn))
+       mask |= 1 << 2;
+      break;
+    default:
+      gcc_unreachable ();
+    }
   return mask;
 }
 
@@ -13360,48 +13768,66 @@ s390_sched_score (rtx_insn *insn)
     case 0:
       /* Try to put insns into the first slot which would otherwise
         break a group.  */
-      if ((mask & S390_OOO_SCHED_ATTR_MASK_CRACKED) != 0
-         || (mask & S390_OOO_SCHED_ATTR_MASK_EXPANDED) != 0)
+      if ((mask & S390_SCHED_ATTR_MASK_CRACKED) != 0
+         || (mask & S390_SCHED_ATTR_MASK_EXPANDED) != 0)
       score += 5;
-      if ((mask & S390_OOO_SCHED_ATTR_MASK_GROUPALONE) != 0)
+      if ((mask & S390_SCHED_ATTR_MASK_GROUPALONE) != 0)
       score += 10;
     case 1:
       /* Prefer not cracked insns while trying to put together a
        group.  */
-      if ((mask & S390_OOO_SCHED_ATTR_MASK_CRACKED) == 0
-         && (mask & S390_OOO_SCHED_ATTR_MASK_EXPANDED) == 0
-         && (mask & S390_OOO_SCHED_ATTR_MASK_GROUPALONE) == 0)
+      if ((mask & S390_SCHED_ATTR_MASK_CRACKED) == 0
+         && (mask & S390_SCHED_ATTR_MASK_EXPANDED) == 0
+         && (mask & S390_SCHED_ATTR_MASK_GROUPALONE) == 0)
       score += 10;
-      if ((mask & S390_OOO_SCHED_ATTR_MASK_ENDGROUP) == 0)
+      if ((mask & S390_SCHED_ATTR_MASK_ENDGROUP) == 0)
       score += 5;
       break;
     case 2:
       /* Prefer not cracked insns while trying to put together a
        group.  */
-      if ((mask & S390_OOO_SCHED_ATTR_MASK_CRACKED) == 0
-         && (mask & S390_OOO_SCHED_ATTR_MASK_EXPANDED) == 0
-         && (mask & S390_OOO_SCHED_ATTR_MASK_GROUPALONE) == 0)
+      if ((mask & S390_SCHED_ATTR_MASK_CRACKED) == 0
+         && (mask & S390_SCHED_ATTR_MASK_EXPANDED) == 0
+         && (mask & S390_SCHED_ATTR_MASK_GROUPALONE) == 0)
       score += 10;
       /* Prefer endgroup insns in the last slot.  */
-      if ((mask & S390_OOO_SCHED_ATTR_MASK_ENDGROUP) != 0)
+      if ((mask & S390_SCHED_ATTR_MASK_ENDGROUP) != 0)
      score += 10;
      break;
-    case S390_OOO_SCHED_STATE_NORMAL:
+    case S390_SCHED_STATE_NORMAL:
      /* Prefer not cracked insns if the last was not cracked.  */
-      if ((mask & S390_OOO_SCHED_ATTR_MASK_CRACKED) == 0
-         && (mask & S390_OOO_SCHED_ATTR_MASK_EXPANDED) == 0)
+      if ((mask & S390_SCHED_ATTR_MASK_CRACKED) == 0
+         && (mask & S390_SCHED_ATTR_MASK_EXPANDED) == 0)
      score += 5;
-      if ((mask & S390_OOO_SCHED_ATTR_MASK_GROUPALONE) != 0)
+      if ((mask & S390_SCHED_ATTR_MASK_GROUPALONE) != 0)
      score += 10;
      break;
-    case S390_OOO_SCHED_STATE_CRACKED:
+    case S390_SCHED_STATE_CRACKED:
      /* Try to keep cracked insns together to prevent them from
        interrupting groups.  */
-      if ((mask & S390_OOO_SCHED_ATTR_MASK_CRACKED) != 0
-         || (mask & S390_OOO_SCHED_ATTR_MASK_EXPANDED) != 0)
+      if ((mask & S390_SCHED_ATTR_MASK_CRACKED) != 0
+         || (mask & S390_SCHED_ATTR_MASK_EXPANDED) != 0)
      score += 5;
      break;
    }
+
+  if (s390_tune == PROCESSOR_2964_Z13)
+    {
+      int units, i;
+      unsigned unit_mask, m = 1;
+
+      unit_mask = s390_get_unit_mask (insn, &units);
+      gcc_assert (units <= MAX_SCHED_UNITS);
+
+      /* Add a score in range 0..MAX_SCHED_MIX_SCORE depending on how long
+        ago the last insn of this unit type got scheduled.  This is
+        supposed to help providing a proper instruction mix to the
+        CPU.  */
+      for (i = 0; i < units; i++, m <<= 1)
+       if (m & unit_mask)
+         score += (last_scheduled_unit_distance[i] * MAX_SCHED_MIX_SCORE /
+                   MAX_SCHED_MIX_DISTANCE);
+    }
   return score;
 }
 
@@ -13457,12 +13883,12 @@ s390_sched_reorder (FILE *file, int verbose,
 
          if (verbose > 5)
           fprintf (file,
-                    "move insn %d to the top of list\n",
+                    ";;\t\tBACKEND: move insn %d to the top of list\n",
                    INSN_UID (ready[last_index]));
       }
       else if (verbose > 5)
       fprintf (file,
-                "best insn %d already on top\n",
+                ";;\t\tBACKEND: best insn %d already on top\n",
               INSN_UID (ready[last_index]));
     }
 
@@ -13473,16 +13899,35 @@ s390_sched_reorder (FILE *file, int verbose,
 
       for (i = last_index; i >= 0; i--)
       {
-         if (recog_memoized (ready[i]) < 0)
+         unsigned int sched_mask;
+         rtx_insn *insn = ready[i];
+
+         if (recog_memoized (insn) < 0)
           continue;
-         fprintf (file, "insn %d score: %d: ", INSN_UID (ready[i]),
-                  s390_sched_score (ready[i]));
-#define PRINT_OOO_ATTR(ATTR) fprintf (file, "%s ", get_attr_##ATTR (ready[i]) ? #ATTR : "!" #ATTR);
-         PRINT_OOO_ATTR (ooo_cracked);
-         PRINT_OOO_ATTR (ooo_expanded);
-         PRINT_OOO_ATTR (ooo_endgroup);
-         PRINT_OOO_ATTR (ooo_groupalone);
-#undef PRINT_OOO_ATTR
+
+         sched_mask = s390_get_sched_attrmask (insn);
+         fprintf (file, ";;\t\tBACKEND: insn %d score: %d: ",
+                  INSN_UID (insn),
+                  s390_sched_score (insn));
+#define PRINT_SCHED_ATTR(M, ATTR) fprintf (file, "%s ",\
+                                          ((M) & sched_mask) ? #ATTR : "");
+         PRINT_SCHED_ATTR (S390_SCHED_ATTR_MASK_CRACKED, cracked);
+         PRINT_SCHED_ATTR (S390_SCHED_ATTR_MASK_EXPANDED, expanded);
+         PRINT_SCHED_ATTR (S390_SCHED_ATTR_MASK_ENDGROUP, endgroup);
+         PRINT_SCHED_ATTR (S390_SCHED_ATTR_MASK_GROUPALONE, groupalone);
+#undef PRINT_SCHED_ATTR
+         if (s390_tune == PROCESSOR_2964_Z13)
+           {
+             unsigned int unit_mask, m = 1;
+             int units, j;
+
+             unit_mask = s390_get_unit_mask (insn, &units);
+             fprintf (file, "(units:");
+             for (j = 0; j < units; j++, m <<= 1)
+               if (m & unit_mask)
+                 fprintf (file, " u%d", j);
+             fprintf (file, ")");
+           }
         fprintf (file, "\n");
       }
     }
@@ -13507,12 +13952,12 @@ s390_sched_variable_issue (FILE *file, int verbose, rtx_insn *insn, int more)
     {
       unsigned int mask = s390_get_sched_attrmask (insn);
 
-      if ((mask & S390_OOO_SCHED_ATTR_MASK_CRACKED) != 0
-         || (mask & S390_OOO_SCHED_ATTR_MASK_EXPANDED) != 0)
-       s390_sched_state = S390_OOO_SCHED_STATE_CRACKED;
-      else if ((mask & S390_OOO_SCHED_ATTR_MASK_ENDGROUP) != 0
-              || (mask & S390_OOO_SCHED_ATTR_MASK_GROUPALONE) != 0)
-       s390_sched_state = S390_OOO_SCHED_STATE_NORMAL;
+      if ((mask & S390_SCHED_ATTR_MASK_CRACKED) != 0
+         || (mask & S390_SCHED_ATTR_MASK_EXPANDED) != 0)
+       s390_sched_state = S390_SCHED_STATE_CRACKED;
+      else if ((mask & S390_SCHED_ATTR_MASK_ENDGROUP) != 0
+              || (mask & S390_SCHED_ATTR_MASK_GROUPALONE) != 0)
+       s390_sched_state = S390_SCHED_STATE_NORMAL;
       else
       {
         /* Only normal insns are left (mask == 0).  */
@@ -13521,30 +13966,73 @@ s390_sched_variable_issue (FILE *file, int verbose, rtx_insn *insn, int more)
           case 0:
           case 1:
           case 2:
-           case S390_OOO_SCHED_STATE_NORMAL:
-             if (s390_sched_state == S390_OOO_SCHED_STATE_NORMAL)
+           case S390_SCHED_STATE_NORMAL:
+             if (s390_sched_state == S390_SCHED_STATE_NORMAL)
              s390_sched_state = 1;
            else
              s390_sched_state++;
 
            break;
-           case S390_OOO_SCHED_STATE_CRACKED:
-             s390_sched_state = S390_OOO_SCHED_STATE_NORMAL;
+           case S390_SCHED_STATE_CRACKED:
+             s390_sched_state = S390_SCHED_STATE_NORMAL;
            break;
          }
       }
+
+      if (s390_tune == PROCESSOR_2964_Z13)
+       {
+         int units, i;
+         unsigned unit_mask, m = 1;
+
+         unit_mask = s390_get_unit_mask (insn, &units);
+         gcc_assert (units <= MAX_SCHED_UNITS);
+
+         for (i = 0; i < units; i++, m <<= 1)
+           if (m & unit_mask)
+             last_scheduled_unit_distance[i] = 0;
+           else if (last_scheduled_unit_distance[i] < MAX_SCHED_MIX_DISTANCE)
+             last_scheduled_unit_distance[i]++;
+       }
+
       if (verbose > 5)
       {
-         fprintf (file, "insn %d: ", INSN_UID (insn));
-#define PRINT_OOO_ATTR(ATTR)                                           \
-         fprintf (file, "%s ", get_attr_##ATTR (insn) ? #ATTR : "");
-         PRINT_OOO_ATTR (ooo_cracked);
-         PRINT_OOO_ATTR (ooo_expanded);
-         PRINT_OOO_ATTR (ooo_endgroup);
-         PRINT_OOO_ATTR (ooo_groupalone);
-#undef PRINT_OOO_ATTR
-         fprintf (file, "\n");
-         fprintf (file, "sched state: %d\n", s390_sched_state);
+         unsigned int sched_mask;
+
+         sched_mask = s390_get_sched_attrmask (insn);
+
+         fprintf (file, ";;\t\tBACKEND: insn %d: ", INSN_UID (insn));
+#define PRINT_SCHED_ATTR(M, ATTR) fprintf (file, "%s ", ((M) & sched_mask) ? #ATTR : "");
+         PRINT_SCHED_ATTR (S390_SCHED_ATTR_MASK_CRACKED, cracked);
+         PRINT_SCHED_ATTR (S390_SCHED_ATTR_MASK_EXPANDED, expanded);
+         PRINT_SCHED_ATTR (S390_SCHED_ATTR_MASK_ENDGROUP, endgroup);
+         PRINT_SCHED_ATTR (S390_SCHED_ATTR_MASK_GROUPALONE, groupalone);
+#undef PRINT_SCHED_ATTR
+
+         if (s390_tune == PROCESSOR_2964_Z13)
+           {
+             unsigned int unit_mask, m = 1;
+             int units, j;
+
+             unit_mask = s390_get_unit_mask (insn, &units);
+             fprintf (file, "(units:");
+             for (j = 0; j < units; j++, m <<= 1)
+               if (m & unit_mask)
+                 fprintf (file, " %d", j);
+             fprintf (file, ")");
+           }
+         fprintf (file, " sched state: %d\n", s390_sched_state);
+
+         if (s390_tune == PROCESSOR_2964_Z13)
+           {
+             int units, j;
+
+             s390_get_unit_mask (insn, &units);
+
+             fprintf (file, ";;\t\tBACKEND: units unused for: ");
+             for (j = 0; j < units; j++)
+               fprintf (file, "%d:%d ", j, last_scheduled_unit_distance[j]);
+             fprintf (file, "\n");
+           }
       }
     }
 
@@ -13561,6 +14049,7 @@ s390_sched_init (FILE *file ATTRIBUTE_UNUSED,
                int max_ready ATTRIBUTE_UNUSED)
 {
   last_scheduled_insn = NULL;
+  memset (last_scheduled_unit_distance, 0, MAX_SCHED_UNITS * sizeof (int));
   s390_sched_state = 0;
 }
 
@@ -13570,7 +14059,7 @@
    The loop is analyzed for memory accesses by calling check_dpu for
    each rtx of the loop.  Depending on the loop_depth and the amount of
    memory accesses a new number <=nunroll is returned to improve the
-   behaviour of the hardware prefetch unit.  */
+   behavior of the hardware prefetch unit.  */
 static unsigned
 s390_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
 {
@@ -14469,6 +14958,9 @@ s390_asm_file_end (void)
                            s390_vector_abi);
 #endif
   file_end_indicate_exec_stack ();
+
+  if (flag_split_stack)
+    file_end_indicate_split_stack ();
 }
 
 /* Return true if TYPE is a vector bool type.  */
@@ -14724,6 +15216,9 @@ s390_invalid_binary_op (int op ATTRIBUTE_UNUSED, const_tree type1, const_tree ty
 #undef TARGET_SET_UP_BY_PROLOGUE
 #define TARGET_SET_UP_BY_PROLOGUE s300_set_up_by_prologue
 
+#undef TARGET_EXTRA_LIVE_ON_ENTRY
+#define TARGET_EXTRA_LIVE_ON_ENTRY s390_live_on_entry
+
 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
   s390_use_by_pieces_infrastructure_p
```
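As a reading aid (not part of the commit), the scoring rule described in the MAX_SCHED_MIX_SCORE comment can be checked in isolation. The standalone C sketch below copies the two constants and the distance table from the first hunk and reproduces the bonus loop added to s390_sched_score; the unit numbering (0 = LSU, 1 = FXU, 2 = VFU) follows s390_get_unit_mask, and the sample distances are made up for illustration:

```c
#include <stdio.h>

/* Constants as introduced in the patch.  */
#define MAX_SCHED_UNITS        3
#define MAX_SCHED_MIX_SCORE    8
#define MAX_SCHED_MIX_DISTANCE 100

/* Steps since each unit (0 = LSU, 1 = FXU, 2 = VFU) last issued.  */
static int last_scheduled_unit_distance[MAX_SCHED_UNITS];

/* Model of the bonus loop in s390_sched_score: for every unit the
   candidate insn occupies (bit set in unit_mask), add a score that
   grows linearly with the time the unit has been idle, saturating
   at MAX_SCHED_MIX_SCORE once the distance reaches
   MAX_SCHED_MIX_DISTANCE.  */
static int
mix_bonus (unsigned int unit_mask)
{
  int score = 0;
  unsigned int m = 1;
  for (int i = 0; i < MAX_SCHED_UNITS; i++, m <<= 1)
    if (m & unit_mask)
      score += (last_scheduled_unit_distance[i] * MAX_SCHED_MIX_SCORE
                / MAX_SCHED_MIX_DISTANCE);
  return score;
}

int
main (void)
{
  /* Suppose the LSU issued 2 insns ago, the FXU 50 insns ago, and
     the VFU has been idle for at least 100 insns.  */
  last_scheduled_unit_distance[0] = 2;
  last_scheduled_unit_distance[1] = 50;
  last_scheduled_unit_distance[2] = MAX_SCHED_MIX_DISTANCE;

  printf ("LSU insn bonus: %d\n", mix_bonus (1 << 0)); /* 2*8/100   = 0 */
  printf ("FXU insn bonus: %d\n", mix_bonus (1 << 1)); /* 50*8/100  = 4 */
  printf ("VFU insn bonus: %d\n", mix_bonus (1 << 2)); /* 100*8/100 = 8 */
  return 0;
}
```

Because of the integer division, a unit idle for fewer than 13 steps contributes nothing (12 * 8 / 100 == 0), and the bonus saturates at 8, so it can tip one of the 5-point grouping preferences in s390_sched_score but never a 10-point one.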