diff options
Diffstat (limited to 'gcc/config/i386/i386.c')
-rw-r--r-- | gcc/config/i386/i386.c | 1269 |
1 files changed, 887 insertions, 382 deletions
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 619b13b3d09..4b684522082 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -102,6 +102,9 @@ static rtx legitimize_pe_coff_symbol (rtx, bool); static void ix86_print_operand_address_as (FILE *, rtx, addr_space_t, bool); static bool ix86_save_reg (unsigned int, bool, bool); static bool ix86_function_naked (const_tree); +static bool ix86_notrack_prefixed_insn_p (rtx); +static void ix86_emit_restore_reg_using_pop (rtx); + #ifndef CHECK_STACK_LIMIT #define CHECK_STACK_LIMIT (-1) @@ -302,7 +305,7 @@ int const dbx64_register_map[FIRST_PSEUDO_REGISTER] = 7 for %edi (gcc regno = 5) The following three DWARF register numbers are never generated by the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4 - believes these numbers have these meanings. + believed these numbers have these meanings. 8 for %eip (no gcc equivalent) 9 for %eflags (gcc regno = 17) 10 for %trapno (no gcc equivalent) @@ -310,20 +313,20 @@ int const dbx64_register_map[FIRST_PSEUDO_REGISTER] = for the x86 architecture. If the version of SDB on x86/svr4 were a bit less brain dead with respect to floating-point then we would have a precedent to follow with respect to DWARF register numbers - for x86 FP registers, but the SDB on x86/svr4 is so completely + for x86 FP registers, but the SDB on x86/svr4 was so completely broken with respect to FP registers that it is hardly worth thinking of it as something to strive for compatibility with. - The version of x86/svr4 SDB I have at the moment does (partially) + The version of x86/svr4 SDB I had does (partially) seem to believe that DWARF register number 11 is associated with the x86 register %st(0), but that's about all. Higher DWARF register numbers don't seem to be associated with anything in - particular, and even for DWARF regno 11, SDB only seems to under- + particular, and even for DWARF regno 11, SDB only seemed to under- stand that it should say that a variable lives in %st(0) (when asked via an `=' command) if we said it was in DWARF regno 11, - but SDB still prints garbage when asked for the value of the + but SDB still printed garbage when asked for the value of the variable in question (via a `/' command). - (Also note that the labels SDB prints for various FP stack regs - when doing an `x' command are all wrong.) + (Also note that the labels SDB printed for various FP stack regs + when doing an `x' command were all wrong.) Note that these problems generally don't affect the native SVR4 C compiler because it doesn't allow the use of -O with -g and because when it is *not* optimizing, it allocates a memory @@ -1602,7 +1605,7 @@ dimode_scalar_chain::compute_convert_gain () rtx dst = SET_DEST (def_set); if (REG_P (src) && REG_P (dst)) - gain += COSTS_N_INSNS (2) - ix86_cost->sse_move; + gain += COSTS_N_INSNS (2) - ix86_cost->xmm_move; else if (REG_P (src) && MEM_P (dst)) gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1]; else if (MEM_P (src) && REG_P (dst)) @@ -2570,6 +2573,151 @@ make_pass_stv (gcc::context *ctxt) return new pass_stv (ctxt); } +/* Inserting ENDBRANCH instructions. */ + +static unsigned int +rest_of_insert_endbranch (void) +{ + timevar_push (TV_MACH_DEP); + + rtx cet_eb; + rtx_insn *insn; + basic_block bb; + + /* Currently emit EB if it's a tracking function, i.e. 'nocf_check' is + absent among function attributes. Later an optimization will be + introduced to make analysis if an address of a static function is + taken. A static function whose address is not taken will get a + nocf_check attribute. This will allow to reduce the number of EB. */ + + if (!lookup_attribute ("nocf_check", + TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl))) + && !cgraph_node::get (cfun->decl)->only_called_directly_p ()) + { + cet_eb = gen_nop_endbr (); + + bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb; + insn = BB_HEAD (bb); + emit_insn_before (cet_eb, insn); + } + + bb = 0; + FOR_EACH_BB_FN (bb, cfun) + { + for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb)); + insn = NEXT_INSN (insn)) + { + if (INSN_P (insn) && GET_CODE (insn) == CALL_INSN) + { + rtx_insn *next_insn = insn; + + while ((next_insn != BB_END (bb)) + && (DEBUG_INSN_P (NEXT_INSN (next_insn)) + || NOTE_P (NEXT_INSN (next_insn)) + || BARRIER_P (NEXT_INSN (next_insn)))) + next_insn = NEXT_INSN (next_insn); + + /* Generate ENDBRANCH after CALL, which can return more than + twice, setjmp-like functions. */ + if (find_reg_note (insn, REG_SETJMP, NULL) != NULL) + { + cet_eb = gen_nop_endbr (); + emit_insn_after (cet_eb, next_insn); + } + continue; + } + + if (INSN_P (insn) && JUMP_P (insn) && flag_cet_switch) + { + rtx target = JUMP_LABEL (insn); + if (target == NULL_RTX || ANY_RETURN_P (target)) + continue; + + /* Check the jump is a switch table. */ + rtx_insn *label = as_a<rtx_insn *> (target); + rtx_insn *table = next_insn (label); + if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table)) + continue; + + /* For the indirect jump find out all places it jumps and insert + ENDBRANCH there. It should be done under a special flag to + control ENDBRANCH generation for switch stmts. */ + edge_iterator ei; + edge e; + basic_block dest_blk; + + FOR_EACH_EDGE (e, ei, bb->succs) + { + rtx_insn *insn; + + dest_blk = e->dest; + insn = BB_HEAD (dest_blk); + gcc_assert (LABEL_P (insn)); + cet_eb = gen_nop_endbr (); + emit_insn_after (cet_eb, insn); + } + continue; + } + + if ((LABEL_P (insn) && LABEL_PRESERVE_P (insn)) + || (NOTE_P (insn) + && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)) +/* TODO. Check /s bit also. */ + { + cet_eb = gen_nop_endbr (); + emit_insn_after (cet_eb, insn); + continue; + } + } + } + + timevar_pop (TV_MACH_DEP); + return 0; +} + +namespace { + +const pass_data pass_data_insert_endbranch = +{ + RTL_PASS, /* type. */ + "cet", /* name. */ + OPTGROUP_NONE, /* optinfo_flags. */ + TV_MACH_DEP, /* tv_id. */ + 0, /* properties_required. */ + 0, /* properties_provided. */ + 0, /* properties_destroyed. */ + 0, /* todo_flags_start. */ + 0, /* todo_flags_finish. */ +}; + +class pass_insert_endbranch : public rtl_opt_pass +{ +public: + pass_insert_endbranch (gcc::context *ctxt) + : rtl_opt_pass (pass_data_insert_endbranch, ctxt) + {} + + /* opt_pass methods: */ + virtual bool gate (function *) + { + return ((flag_cf_protection & CF_BRANCH) && TARGET_IBT); + } + + virtual unsigned int execute (function *) + { + return rest_of_insert_endbranch (); + } + +}; // class pass_insert_endbranch + +} // anon namespace + +rtl_opt_pass * +make_pass_insert_endbranch (gcc::context *ctxt) +{ + return new pass_insert_endbranch (ctxt); +} + /* Return true if a red-zone is in use. */ bool @@ -2597,11 +2745,14 @@ ix86_target_string (HOST_WIDE_INT isa, HOST_WIDE_INT isa2, ISAs come first. Target string will be displayed in the same order. */ static struct ix86_target_opts isa2_opts[] = { + { "-mgfni", OPTION_MASK_ISA_GFNI }, { "-mrdpid", OPTION_MASK_ISA_RDPID }, { "-msgx", OPTION_MASK_ISA_SGX }, { "-mavx5124vnniw", OPTION_MASK_ISA_AVX5124VNNIW }, { "-mavx5124fmaps", OPTION_MASK_ISA_AVX5124FMAPS }, - { "-mavx512vpopcntdq", OPTION_MASK_ISA_AVX512VPOPCNTDQ } + { "-mavx512vpopcntdq", OPTION_MASK_ISA_AVX512VPOPCNTDQ }, + { "-mibt", OPTION_MASK_ISA_IBT }, + { "-mshstk", OPTION_MASK_ISA_SHSTK } }; static struct ix86_target_opts isa_opts[] = { @@ -4694,6 +4845,37 @@ ix86_option_override_internal (bool main_args_p, target_option_default_node = target_option_current_node = build_target_option_node (opts); + /* Do not support control flow instrumentation if CET is not enabled. */ + if (opts->x_flag_cf_protection != CF_NONE) + { + if (!(TARGET_IBT_P (opts->x_ix86_isa_flags2) + || TARGET_SHSTK_P (opts->x_ix86_isa_flags2))) + { + if (flag_cf_protection == CF_FULL) + { + error ("%<-fcf-protection=full%> requires CET support " + "on this target. Use -mcet or one of -mibt, " + "-mshstk options to enable CET"); + } + else if (flag_cf_protection == CF_BRANCH) + { + error ("%<-fcf-protection=branch%> requires CET support " + "on this target. Use -mcet or one of -mibt, " + "-mshstk options to enable CET"); + } + else if (flag_cf_protection == CF_RETURN) + { + error ("%<-fcf-protection=return%> requires CET support " + "on this target. Use -mcet or one of -mibt, " + "-mshstk options to enable CET"); + } + flag_cf_protection = CF_NONE; + return false; + } + opts->x_flag_cf_protection = + (cf_protection_level) (opts->x_flag_cf_protection | CF_SET); + } + return true; } @@ -5123,6 +5305,9 @@ ix86_valid_target_attribute_inner_p (tree args, char *p_strings[], IX86_ATTR_ISA ("mpx", OPT_mmpx), IX86_ATTR_ISA ("clwb", OPT_mclwb), IX86_ATTR_ISA ("rdpid", OPT_mrdpid), + IX86_ATTR_ISA ("gfni", OPT_mgfni), + IX86_ATTR_ISA ("ibt", OPT_mibt), + IX86_ATTR_ISA ("shstk", OPT_mshstk), /* enum options */ IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_), @@ -11943,8 +12128,14 @@ ix86_adjust_stack_and_probe_stack_clash (const HOST_WIDE_INT size) we just probe when we cross PROBE_INTERVAL. */ if (TREE_THIS_VOLATILE (cfun->decl)) { - emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx, - -GET_MODE_SIZE (word_mode))); + /* We can safely use any register here since we're just going to push + its value and immediately pop it back. But we do try and avoid + argument passing registers so as not to introduce dependencies in + the pipeline. For 32 bit we use %esi and for 64 bit we use %rax. */ + rtx dummy_reg = gen_rtx_REG (word_mode, TARGET_64BIT ? AX_REG : SI_REG); + rtx_insn *insn = emit_insn (gen_push (dummy_reg)); + RTX_FRAME_RELATED_P (insn) = 1; + ix86_emit_restore_reg_using_pop (dummy_reg); emit_insn (gen_blockage ()); } @@ -12512,10 +12703,13 @@ ix86_finalize_stack_frame_flags (void) for (ref = DF_REG_USE_CHAIN (HARD_FRAME_POINTER_REGNUM); ref; ref = next) { - rtx_insn *insn = DF_REF_INSN (ref); + next = DF_REF_NEXT_REG (ref); + if (!DF_REF_INSN_INFO (ref)) + continue; + /* Make sure the next ref is for a different instruction, so that we're not affected by the rescan. */ - next = DF_REF_NEXT_REG (ref); + rtx_insn *insn = DF_REF_INSN (ref); while (next && DF_REF_INSN (next) == insn) next = DF_REF_NEXT_REG (next); @@ -12836,7 +13030,7 @@ ix86_expand_prologue (void) if (frame_pointer_needed && !m->fs.fp_valid) { /* Note: AT&T enter does NOT have reversed args. Enter is probably - slower on all targets. Also sdb doesn't like it. */ + slower on all targets. Also sdb didn't like it. */ insn = emit_insn (gen_push (hard_frame_pointer_rtx)); RTX_FRAME_RELATED_P (insn) = 1; @@ -12983,8 +13177,12 @@ ix86_expand_prologue (void) && (flag_stack_check == STATIC_BUILTIN_STACK_CHECK || flag_stack_clash_protection)) { - /* We expect the GP registers to be saved when probes are used. */ - gcc_assert (int_registers_saved); + /* This assert wants to verify that integer registers were saved + prior to probing. This is necessary when probing may be implemented + as a function call (Windows). It is not necessary for stack clash + protection probing. */ + if (!flag_stack_clash_protection) + gcc_assert (int_registers_saved); if (flag_stack_clash_protection) { @@ -13628,7 +13826,7 @@ ix86_expand_epilogue (int style) the stack pointer, if we will restore SSE regs via sp. */ if (TARGET_64BIT && m->fs.sp_offset > 0x7fffffff - && sp_valid_at (frame.stack_realign_offset) + && sp_valid_at (frame.stack_realign_offset + 1) && (frame.nsseregs + frame.nregs) != 0) { pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, @@ -14895,10 +15093,16 @@ legitimate_pic_address_disp_p (rtx disp) break; op0 = XEXP (XEXP (disp, 0), 0); op1 = XEXP (XEXP (disp, 0), 1); - if (!CONST_INT_P (op1) - || INTVAL (op1) >= 16*1024*1024 + if (!CONST_INT_P (op1)) + break; + if (GET_CODE (op0) == UNSPEC + && (XINT (op0, 1) == UNSPEC_DTPOFF + || XINT (op0, 1) == UNSPEC_NTPOFF) + && trunc_int_for_mode (INTVAL (op1), SImode) == INTVAL (op1)) + return true; + if (INTVAL (op1) >= 16*1024*1024 || INTVAL (op1) < -16*1024*1024) - break; + break; if (GET_CODE (op0) == LABEL_REF) return true; if (GET_CODE (op0) == CONST @@ -16657,13 +16861,17 @@ ix86_delegitimize_address_1 (rtx x, bool base_term_p) movl foo@GOTOFF(%ecx), %edx in which case we return (%ecx - %ebx) + foo or (%ecx - _GLOBAL_OFFSET_TABLE_) + foo if pseudo_pic_reg - and reload has completed. */ + and reload has completed. Don't do the latter for debug, + as _GLOBAL_OFFSET_TABLE_ can't be expressed in the assembly. */ if (pic_offset_table_rtx && (!reload_completed || !ix86_use_pseudo_pic_reg ())) result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend), pic_offset_table_rtx), result); - else if (pic_offset_table_rtx && !TARGET_MACHO && !TARGET_VXWORKS_RTP) + else if (base_term_p + && pic_offset_table_rtx + && !TARGET_MACHO + && !TARGET_VXWORKS_RTP) { rtx tmp = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME); tmp = gen_rtx_MINUS (Pmode, copy_rtx (addend), tmp); @@ -16716,6 +16924,25 @@ ix86_find_base_term (rtx x) return ix86_delegitimize_address_1 (x, true); } + +/* Return true if X shouldn't be emitted into the debug info. + Disallow UNSPECs other than @gotoff - we can't emit _GLOBAL_OFFSET_TABLE_ + symbol easily into the .debug_info section, so we need not to + delegitimize, but instead assemble as @gotoff. + Disallow _GLOBAL_OFFSET_TABLE_ SYMBOL_REF - the assembler magically + assembles that as _GLOBAL_OFFSET_TABLE_-. expression. */ + +static bool +ix86_const_not_ok_for_debug_p (rtx x) +{ + if (GET_CODE (x) == UNSPEC && XINT (x, 1) != UNSPEC_GOTOFF) + return true; + + if (SYMBOL_REF_P (x) && strcmp (XSTR (x, 0), GOT_SYMBOL_NAME) == 0) + return true; + + return false; +} static void put_condition_code (enum rtx_code code, machine_mode mode, bool reverse, @@ -16723,7 +16950,7 @@ put_condition_code (enum rtx_code code, machine_mode mode, bool reverse, { const char *suffix; - if (mode == CCFPmode || mode == CCFPUmode) + if (mode == CCFPmode) { code = ix86_fp_compare_code_to_integer (code); mode = CCmode; @@ -16734,6 +16961,7 @@ put_condition_code (enum rtx_code code, machine_mode mode, bool reverse, switch (code) { case EQ: + gcc_assert (mode != CCGZmode); switch (mode) { case E_CCAmode: @@ -16757,6 +16985,7 @@ put_condition_code (enum rtx_code code, machine_mode mode, bool reverse, } break; case NE: + gcc_assert (mode != CCGZmode); switch (mode) { case E_CCAmode: @@ -16801,6 +17030,7 @@ put_condition_code (enum rtx_code code, machine_mode mode, bool reverse, case E_CCmode: case E_CCGCmode: + case E_CCGZmode: suffix = "l"; break; @@ -16809,7 +17039,7 @@ put_condition_code (enum rtx_code code, machine_mode mode, bool reverse, } break; case LTU: - if (mode == CCmode) + if (mode == CCmode || mode == CCGZmode) suffix = "b"; else if (mode == CCCmode) suffix = fp ? "b" : "c"; @@ -16826,6 +17056,7 @@ put_condition_code (enum rtx_code code, machine_mode mode, bool reverse, case E_CCmode: case E_CCGCmode: + case E_CCGZmode: suffix = "ge"; break; @@ -16834,7 +17065,7 @@ put_condition_code (enum rtx_code code, machine_mode mode, bool reverse, } break; case GEU: - if (mode == CCmode) + if (mode == CCmode || mode == CCGZmode) suffix = "nb"; else if (mode == CCCmode) suffix = fp ? "nb" : "nc"; @@ -17613,6 +17844,8 @@ ix86_print_operand (FILE *file, rtx x, int code) case '!': if (ix86_bnd_prefixed_insn_p (current_output_insn)) fputs ("bnd ", file); + if (ix86_notrack_prefixed_insn_p (current_output_insn)) + fputs ("notrack ", file); return; default: @@ -18028,6 +18261,10 @@ i386_asm_output_addr_const_extra (FILE *file, rtx x) op = XVECEXP (x, 0, 0); switch (XINT (x, 1)) { + case UNSPEC_GOTOFF: + output_addr_const (file, op); + fputs ("@gotoff", file); + break; case UNSPEC_GOTTPOFF: output_addr_const (file, op); /* FIXME: This might be @TPOFF in Sun ld. */ @@ -18147,89 +18384,66 @@ output_387_binary_op (rtx_insn *insn, rtx *operands) { static char buf[40]; const char *p; - const char *ssep; - int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]); + bool is_sse + = (SSE_REG_P (operands[0]) + || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2])); - /* Even if we do not want to check the inputs, this documents input - constraints. Which helps in understanding the following code. */ - if (flag_checking) - { - if (STACK_REG_P (operands[0]) - && ((REG_P (operands[1]) - && REGNO (operands[0]) == REGNO (operands[1]) - && (STACK_REG_P (operands[2]) || MEM_P (operands[2]))) - || (REG_P (operands[2]) - && REGNO (operands[0]) == REGNO (operands[2]) - && (STACK_REG_P (operands[1]) || MEM_P (operands[1])))) - && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2]))) - ; /* ok */ - else - gcc_assert (is_sse); - } + if (is_sse) + p = "%v"; + else if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT + || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT) + p = "fi"; + else + p = "f"; + + strcpy (buf, p); switch (GET_CODE (operands[3])) { case PLUS: - if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT - || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT) - p = "fiadd"; - else - p = "fadd"; - ssep = "vadd"; - break; - + p = "add"; break; case MINUS: - if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT - || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT) - p = "fisub"; - else - p = "fsub"; - ssep = "vsub"; - break; - + p = "sub"; break; case MULT: - if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT - || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT) - p = "fimul"; - else - p = "fmul"; - ssep = "vmul"; - break; - + p = "mul"; break; case DIV: - if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT - || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT) - p = "fidiv"; - else - p = "fdiv"; - ssep = "vdiv"; - break; - + p = "div"; break; default: gcc_unreachable (); } + strcat (buf, p); + if (is_sse) { + p = (GET_MODE (operands[0]) == SFmode) ? "ss" : "sd"; + strcat (buf, p); + if (TARGET_AVX) - { - strcpy (buf, ssep); - if (GET_MODE (operands[0]) == SFmode) - strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}"); - else - strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}"); - } + p = "\t{%2, %1, %0|%0, %1, %2}"; else - { - strcpy (buf, ssep + 1); - if (GET_MODE (operands[0]) == SFmode) - strcat (buf, "ss\t{%2, %0|%0, %2}"); - else - strcat (buf, "sd\t{%2, %0|%0, %2}"); - } - return buf; + p = "\t{%2, %0|%0, %2}"; + + strcat (buf, p); + return buf; } - strcpy (buf, p); + + /* Even if we do not want to check the inputs, this documents input + constraints. Which helps in understanding the following code. */ + if (flag_checking) + { + if (STACK_REG_P (operands[0]) + && ((REG_P (operands[1]) + && REGNO (operands[0]) == REGNO (operands[1]) + && (STACK_REG_P (operands[2]) || MEM_P (operands[2]))) + || (REG_P (operands[2]) + && REGNO (operands[0]) == REGNO (operands[2]) + && (STACK_REG_P (operands[1]) || MEM_P (operands[1])))) + && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2]))) + ; /* ok */ + else + gcc_unreachable (); + } switch (GET_CODE (operands[3])) { @@ -18818,10 +19032,13 @@ ix86_emit_mode_set (int entity, int mode, int prev_mode ATTRIBUTE_UNUSED, const char * output_fix_trunc (rtx_insn *insn, rtx *operands, bool fisttp) { - int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0; - int dimode_p = GET_MODE (operands[0]) == DImode; + bool stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG); + bool dimode_p = GET_MODE (operands[0]) == DImode; int round_mode = get_attr_i387_cw (insn); + static char buf[40]; + const char *p; + /* Jump through a hoop or two for DImode, since the hardware has no non-popping instruction. We used to do this a different way, but that was somewhat fragile and broke with post-reload splitters. */ @@ -18833,18 +19050,20 @@ output_fix_trunc (rtx_insn *insn, rtx *operands, bool fisttp) gcc_assert (GET_MODE (operands[1]) != TFmode); if (fisttp) - output_asm_insn ("fisttp%Z0\t%0", operands); - else - { - if (round_mode != I387_CW_ANY) - output_asm_insn ("fldcw\t%3", operands); - if (stack_top_dies || dimode_p) - output_asm_insn ("fistp%Z0\t%0", operands); - else - output_asm_insn ("fist%Z0\t%0", operands); - if (round_mode != I387_CW_ANY) - output_asm_insn ("fldcw\t%2", operands); - } + return "fisttp%Z0\t%0"; + + strcpy (buf, "fist"); + + if (round_mode != I387_CW_ANY) + output_asm_insn ("fldcw\t%3", operands); + + p = "p%Z0\t%0"; + strcat (buf, p + !(stack_top_dies || dimode_p)); + + output_asm_insn (buf, operands); + + if (round_mode != I387_CW_ANY) + output_asm_insn ("fldcw\t%2", operands); return ""; } @@ -18881,120 +19100,65 @@ output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno) should be used. UNORDERED_P is true when fucom should be used. */ const char * -output_fp_compare (rtx_insn *insn, rtx *operands, bool eflags_p, bool unordered_p) +output_fp_compare (rtx_insn *insn, rtx *operands, + bool eflags_p, bool unordered_p) { - int stack_top_dies; - rtx cmp_op0, cmp_op1; - int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]); - - if (eflags_p) - { - cmp_op0 = operands[0]; - cmp_op1 = operands[1]; - } - else - { - cmp_op0 = operands[1]; - cmp_op1 = operands[2]; - } + rtx *xops = eflags_p ? &operands[0] : &operands[1]; + bool stack_top_dies; - if (is_sse) - { - if (GET_MODE (operands[0]) == SFmode) - if (unordered_p) - return "%vucomiss\t{%1, %0|%0, %1}"; - else - return "%vcomiss\t{%1, %0|%0, %1}"; - else - if (unordered_p) - return "%vucomisd\t{%1, %0|%0, %1}"; - else - return "%vcomisd\t{%1, %0|%0, %1}"; - } + static char buf[40]; + const char *p; - gcc_assert (STACK_TOP_P (cmp_op0)); + gcc_assert (STACK_TOP_P (xops[0])); - stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0; + stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG); - if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1))) + if (eflags_p) { - if (stack_top_dies) - { - output_asm_insn ("ftst\n\tfnstsw\t%0", operands); - return output_387_ffreep (operands, 1); - } - else - return "ftst\n\tfnstsw\t%0"; + p = unordered_p ? "fucomi" : "fcomi"; + strcpy (buf, p); + + p = "p\t{%y1, %0|%0, %y1}"; + strcat (buf, p + !stack_top_dies); + + return buf; } - if (STACK_REG_P (cmp_op1) + if (STACK_REG_P (xops[1]) && stack_top_dies - && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1)) - && REGNO (cmp_op1) != FIRST_STACK_REG) + && find_regno_note (insn, REG_DEAD, FIRST_STACK_REG + 1)) { - /* If both the top of the 387 stack dies, and the other operand - is also a stack register that dies, then this must be a - `fcompp' float compare */ + gcc_assert (REGNO (xops[1]) == FIRST_STACK_REG + 1); - if (eflags_p) - { - /* There is no double popping fcomi variant. Fortunately, - eflags is immune from the fstp's cc clobbering. */ - if (unordered_p) - output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands); - else - output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands); - return output_387_ffreep (operands, 0); - } - else - { - if (unordered_p) - return "fucompp\n\tfnstsw\t%0"; - else - return "fcompp\n\tfnstsw\t%0"; - } + /* If both the top of the 387 stack die, and the other operand + is also a stack register that dies, then this must be a + `fcompp' float compare. */ + p = unordered_p ? "fucompp" : "fcompp"; + strcpy (buf, p); + } + else if (const0_operand (xops[1], VOIDmode)) + { + gcc_assert (!unordered_p); + strcpy (buf, "ftst"); } else { - /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */ - - static const char * const alt[16] = - { - "fcom%Z2\t%y2\n\tfnstsw\t%0", - "fcomp%Z2\t%y2\n\tfnstsw\t%0", - "fucom%Z2\t%y2\n\tfnstsw\t%0", - "fucomp%Z2\t%y2\n\tfnstsw\t%0", - - "ficom%Z2\t%y2\n\tfnstsw\t%0", - "ficomp%Z2\t%y2\n\tfnstsw\t%0", - NULL, - NULL, - - "fcomi\t{%y1, %0|%0, %y1}", - "fcomip\t{%y1, %0|%0, %y1}", - "fucomi\t{%y1, %0|%0, %y1}", - "fucomip\t{%y1, %0|%0, %y1}", - - NULL, - NULL, - NULL, - NULL - }; - - int mask; - const char *ret; - - mask = eflags_p << 3; - mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2; - mask |= unordered_p << 1; - mask |= stack_top_dies; + if (GET_MODE_CLASS (GET_MODE (xops[1])) == MODE_INT) + { + gcc_assert (!unordered_p); + p = "ficom"; + } + else + p = unordered_p ? "fucom" : "fcom"; - gcc_assert (mask < 16); - ret = alt[mask]; - gcc_assert (ret); + strcpy (buf, p); - return ret; + p = "p%Z2\t%y2"; + strcat (buf, p + !stack_top_dies); } + + output_asm_insn (buf, operands); + return "fnstsw\t%0"; } void @@ -19067,20 +19231,6 @@ ix86_expand_clear (rtx dest) emit_insn (tmp); } -/* X is an unchanging MEM. If it is a constant pool reference, return - the constant pool rtx, else NULL. */ - -rtx -maybe_get_pool_constant (rtx x) -{ - x = ix86_delegitimize_address (XEXP (x, 0)); - - if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)) - return get_pool_constant (x); - - return NULL_RTX; -} - void ix86_expand_move (machine_mode mode, rtx operands[]) { @@ -21526,6 +21676,8 @@ ix86_match_ccmode (rtx insn, machine_mode req_mode) case E_CCZmode: break; + case E_CCGZmode: + case E_CCAmode: case E_CCCmode: case E_CCOmode: @@ -21563,18 +21715,38 @@ ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1) return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx); } -/* Figure out whether to use ordered or unordered fp comparisons. - Return the appropriate mode to use. */ +/* Figure out whether to use unordered fp comparisons. */ -machine_mode -ix86_fp_compare_mode (enum rtx_code) +static bool +ix86_unordered_fp_compare (enum rtx_code code) { - /* ??? In order to make all comparisons reversible, we do all comparisons - non-trapping when compiling for IEEE. Once gcc is able to distinguish - all forms trapping and nontrapping comparisons, we can make inequality - comparisons trapping again, since it results in better code when using - FCOM based compares. */ - return TARGET_IEEE_FP ? CCFPUmode : CCFPmode; + if (!TARGET_IEEE_FP) + return false; + + switch (code) + { + case GT: + case GE: + case LT: + case LE: + return false; + + case EQ: + case NE: + + case LTGT: + case UNORDERED: + case ORDERED: + case UNLT: + case UNLE: + case UNGT: + case UNGE: + case UNEQ: + return true; + + default: + gcc_unreachable (); + } } machine_mode @@ -21585,7 +21757,7 @@ ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1) if (SCALAR_FLOAT_MODE_P (mode)) { gcc_assert (!DECIMAL_FLOAT_MODE_P (mode)); - return ix86_fp_compare_mode (code); + return CCFPmode; } switch (code) @@ -21707,7 +21879,6 @@ ix86_cc_modes_compatible (machine_mode m1, machine_mode m2) } case E_CCFPmode: - case E_CCFPUmode: /* These are only compatible with themselves, which we already checked above. */ return VOIDmode; @@ -21811,10 +21982,10 @@ ix86_fp_comparison_strategy (enum rtx_code) static enum rtx_code ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1) { - machine_mode fpcmp_mode = ix86_fp_compare_mode (code); + bool unordered_compare = ix86_unordered_fp_compare (code); rtx op0 = *pop0, op1 = *pop1; machine_mode op_mode = GET_MODE (op0); - int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode); + bool is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode); /* All of the unordered compare instructions only work on registers. The same is true of the fcomi compare instructions. The XFmode @@ -21823,7 +21994,7 @@ ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1) floating point. */ if (!is_sse - && (fpcmp_mode == CCFPUmode + && (unordered_compare || (op_mode == XFmode && ! (standard_80387_constant_p (op0) == 1 || standard_80387_constant_p (op1) == 1) @@ -21920,27 +22091,29 @@ ix86_fp_compare_code_to_integer (enum rtx_code code) static rtx ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch) { - machine_mode fpcmp_mode, intcmp_mode; + bool unordered_compare = ix86_unordered_fp_compare (code); + machine_mode intcmp_mode; rtx tmp, tmp2; - fpcmp_mode = ix86_fp_compare_mode (code); code = ix86_prepare_fp_compare_args (code, &op0, &op1); /* Do fcomi/sahf based test when profitable. */ switch (ix86_fp_comparison_strategy (code)) { case IX86_FPCMP_COMI: - intcmp_mode = fpcmp_mode; - tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1); - tmp = gen_rtx_SET (gen_rtx_REG (fpcmp_mode, FLAGS_REG), tmp); - emit_insn (tmp); + intcmp_mode = CCFPmode; + tmp = gen_rtx_COMPARE (CCFPmode, op0, op1); + if (unordered_compare) + tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP); + emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp)); break; case IX86_FPCMP_SAHF: - intcmp_mode = fpcmp_mode; - tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1); - tmp = gen_rtx_SET (gen_rtx_REG (fpcmp_mode, FLAGS_REG), tmp); - + intcmp_mode = CCFPmode; + tmp = gen_rtx_COMPARE (CCFPmode, op0, op1); + if (unordered_compare) + tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP); + tmp = gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp); if (!scratch) scratch = gen_reg_rtx (HImode); tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch); @@ -21949,11 +22122,13 @@ ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch) case IX86_FPCMP_ARITH: /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */ - tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1); - tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW); + tmp = gen_rtx_COMPARE (CCFPmode, op0, op1); + if (unordered_compare) + tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP); + tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW); if (!scratch) scratch = gen_reg_rtx (HImode); - emit_insn (gen_rtx_SET (scratch, tmp2)); + emit_insn (gen_rtx_SET (scratch, tmp)); /* In the unordered case, we have to check C2 for NaN's, which doesn't happen to work out to anything nice combination-wise. @@ -22234,6 +22409,62 @@ ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label) break; } + /* Emulate comparisons that do not depend on Zero flag with + double-word subtraction. Note that only Overflow, Sign + and Carry flags are valid, so swap arguments and condition + of comparisons that would otherwise test Zero flag. */ + + switch (code) + { + case LE: case LEU: case GT: case GTU: + std::swap (lo[0], lo[1]); + std::swap (hi[0], hi[1]); + code = swap_condition (code); + /* FALLTHRU */ + + case LT: case LTU: case GE: case GEU: + { + rtx (*cmp_insn) (rtx, rtx); + rtx (*sbb_insn) (rtx, rtx, rtx); + bool uns = (code == LTU || code == GEU); + + if (TARGET_64BIT) + { + cmp_insn = gen_cmpdi_1; + sbb_insn + = uns ? gen_subdi3_carry_ccc : gen_subdi3_carry_ccgz; + } + else + { + cmp_insn = gen_cmpsi_1; + sbb_insn + = uns ? gen_subsi3_carry_ccc : gen_subsi3_carry_ccgz; + } + + if (!nonimmediate_operand (lo[0], submode)) + lo[0] = force_reg (submode, lo[0]); + if (!x86_64_general_operand (lo[1], submode)) + lo[1] = force_reg (submode, lo[1]); + + if (!register_operand (hi[0], submode)) + hi[0] = force_reg (submode, hi[0]); + if ((uns && !nonimmediate_operand (hi[1], submode)) + || (!uns && !x86_64_general_operand (hi[1], submode))) + hi[1] = force_reg (submode, hi[1]); + + emit_insn (cmp_insn (lo[0], lo[1])); + emit_insn (sbb_insn (gen_rtx_SCRATCH (submode), hi[0], hi[1])); + + tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG); + + ix86_expand_branch (code, tmp, const0_rtx, label); + return; + } + + default: + break; + } + /* Otherwise, we need two or three jumps. */ label2 = gen_label_rtx (); @@ -22339,8 +22570,7 @@ ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop) compare_seq = get_insns (); end_sequence (); - if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode - || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode) + if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode) code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op)); else code = GET_CODE (compare_op); @@ -22480,8 +22710,7 @@ ix86_expand_int_movcc (rtx operands[]) flags = XEXP (compare_op, 0); - if (GET_MODE (flags) == CCFPmode - || GET_MODE (flags) == CCFPUmode) + if (GET_MODE (flags) == CCFPmode) { fpcmp = true; compare_code @@ -23826,10 +24055,10 @@ struct expand_vec_perm_d }; static bool -ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1, +ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1, struct expand_vec_perm_d *d) { - /* ix86_expand_vec_perm_vpermi2 is called from both const and non-const + /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const expander, so args are either in d, or in op0, op1 etc. */ machine_mode mode = GET_MODE (d ? d->op0 : op0); machine_mode maskmode = mode; @@ -23839,83 +24068,83 @@ ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1, { case E_V8HImode: if (TARGET_AVX512VL && TARGET_AVX512BW) - gen = gen_avx512vl_vpermi2varv8hi3; + gen = gen_avx512vl_vpermt2varv8hi3; break; case E_V16HImode: if (TARGET_AVX512VL && TARGET_AVX512BW) - gen = gen_avx512vl_vpermi2varv16hi3; + gen = gen_avx512vl_vpermt2varv16hi3; break; case E_V64QImode: if (TARGET_AVX512VBMI) - gen = gen_avx512bw_vpermi2varv64qi3; + gen = gen_avx512bw_vpermt2varv64qi3; break; case E_V32HImode: if (TARGET_AVX512BW) - gen = gen_avx512bw_vpermi2varv32hi3; + gen = gen_avx512bw_vpermt2varv32hi3; break; case E_V4SImode: if (TARGET_AVX512VL) - gen = gen_avx512vl_vpermi2varv4si3; + gen = gen_avx512vl_vpermt2varv4si3; break; case E_V8SImode: if (TARGET_AVX512VL) - gen = gen_avx512vl_vpermi2varv8si3; + gen = gen_avx512vl_vpermt2varv8si3; break; case E_V16SImode: if (TARGET_AVX512F) - gen = gen_avx512f_vpermi2varv16si3; + gen = gen_avx512f_vpermt2varv16si3; break; case E_V4SFmode: if (TARGET_AVX512VL) { - gen = gen_avx512vl_vpermi2varv4sf3; + gen = gen_avx512vl_vpermt2varv4sf3; maskmode = V4SImode; } break; case E_V8SFmode: if (TARGET_AVX512VL) { - gen = gen_avx512vl_vpermi2varv8sf3; + gen = gen_avx512vl_vpermt2varv8sf3; maskmode = V8SImode; } break; case E_V16SFmode: if (TARGET_AVX512F) { - gen = gen_avx512f_vpermi2varv16sf3; + gen = gen_avx512f_vpermt2varv16sf3; maskmode = V16SImode; } break; case E_V2DImode: if (TARGET_AVX512VL) - gen = gen_avx512vl_vpermi2varv2di3; + gen = gen_avx512vl_vpermt2varv2di3; break; case E_V4DImode: if (TARGET_AVX512VL) - gen = gen_avx512vl_vpermi2varv4di3; + gen = gen_avx512vl_vpermt2varv4di3; break; case E_V8DImode: if (TARGET_AVX512F) - gen = gen_avx512f_vpermi2varv8di3; + gen = gen_avx512f_vpermt2varv8di3; break; case E_V2DFmode: if (TARGET_AVX512VL) { - gen = gen_avx512vl_vpermi2varv2df3; + gen = gen_avx512vl_vpermt2varv2df3; maskmode = V2DImode; } break; case E_V4DFmode: if (TARGET_AVX512VL) { - gen = gen_avx512vl_vpermi2varv4df3; + gen = gen_avx512vl_vpermt2varv4df3; maskmode = V4DImode; } break; case E_V8DFmode: if (TARGET_AVX512F) { - gen = gen_avx512f_vpermi2varv8df3; + gen = gen_avx512f_vpermt2varv8df3; maskmode = V8DImode; } break; @@ -23926,7 +24155,7 @@ ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1, if (gen == NULL) return false; - /* ix86_expand_vec_perm_vpermi2 is called from both const and non-const + /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const expander, so args are either in d, or in op0, op1 etc. */ if (d) { @@ -23939,7 +24168,7 @@ ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1, mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec)); } - emit_insn (gen (target, op0, force_reg (maskmode, mask), op1)); + emit_insn (gen (target, force_reg (maskmode, mask), op0, op1)); return true; } @@ -23990,7 +24219,7 @@ ix86_expand_vec_perm (rtx operands[]) } } - if (ix86_expand_vec_perm_vpermi2 (target, op0, mask, op1, NULL)) + if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL)) return; if (TARGET_AVX2) @@ -24515,8 +24744,7 @@ ix86_expand_int_addcc (rtx operands[]) flags = XEXP (compare_op, 0); - if (GET_MODE (flags) == CCFPmode - || GET_MODE (flags) == CCFPUmode) + if (GET_MODE (flags) == CCFPmode) { fpcmp = true; code = ix86_fp_compare_code_to_integer (code); @@ -24603,11 +24831,7 @@ ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode) /* Optimize constant pool reference to immediates. This is used by fp moves, that force all constants to memory to allow combining. */ if (MEM_P (operand) && MEM_READONLY_P (operand)) - { - rtx tmp = maybe_get_pool_constant (operand); - if (tmp) - operand = tmp; - } + operand = avoid_constant_pool_reference (operand); if (MEM_P (operand) && !offsettable_memref_p (operand)) { @@ -29804,8 +30028,12 @@ BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_FIRST, IX86_BUILTIN__BDESC_MPX_LAST, 1); BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, IX86_BUILTIN__BDESC_MPX_CONST_LAST, 1); -BDESC_VERIFYS (IX86_BUILTIN_MAX, +BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_FIRST, IX86_BUILTIN__BDESC_MULTI_ARG_LAST, 1); +BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_NORMAL_FIRST, + IX86_BUILTIN__BDESC_CET_LAST, 1); +BDESC_VERIFYS (IX86_BUILTIN_MAX, + IX86_BUILTIN__BDESC_CET_NORMAL_LAST, 1); /* Set up all the MMX/SSE builtins, even builtins for instructions that are not in the current target ISA to allow the user to compile particular modules @@ -30472,6 +30700,35 @@ ix86_init_mmx_sse_builtins (void) BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_LAST, IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, ARRAY_SIZE (bdesc_multi_arg) - 1); + + /* Add CET inrinsics. */ + for (i = 0, d = bdesc_cet; i < ARRAY_SIZE (bdesc_cet); i++, d++) + { + BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_CET_FIRST, i); + if (d->name == 0) + continue; + + ftype = (enum ix86_builtin_func_type) d->flag; + def_builtin2 (d->mask, d->name, ftype, d->code); + } + BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_LAST, + IX86_BUILTIN__BDESC_CET_FIRST, + ARRAY_SIZE (bdesc_cet) - 1); + + for (i = 0, d = bdesc_cet_rdssp; + i < ARRAY_SIZE (bdesc_cet_rdssp); + i++, d++) + { + BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_CET_NORMAL_FIRST, i); + if (d->name == 0) + continue; + + ftype = (enum ix86_builtin_func_type) d->flag; + def_builtin2 (d->mask, d->name, ftype, d->code); + } + BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_NORMAL_LAST, + IX86_BUILTIN__BDESC_CET_NORMAL_FIRST, + ARRAY_SIZE (bdesc_cet_rdssp) - 1); } static void @@ -33425,6 +33682,7 @@ ix86_expand_args_builtin (const struct builtin_description *d, case UQI_FTYPE_V4SF_V4SF_INT: case UHI_FTYPE_V16SI_V16SI_INT: case UHI_FTYPE_V16SF_V16SF_INT: + case V64QI_FTYPE_V64QI_V64QI_INT: nargs = 3; nargs_constant = 1; break; @@ -33652,6 +33910,13 @@ ix86_expand_args_builtin (const struct builtin_description *d, mask_pos = 1; nargs_constant = 1; break; + case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI: + case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI: + case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI: + nargs = 5; + mask_pos = 1; + nargs_constant = 2; + break; default: gcc_unreachable (); @@ -34830,10 +35095,10 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode, int ignore) { size_t i; - enum insn_code icode; + enum insn_code icode, icode2; tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0); tree arg0, arg1, arg2, arg3, arg4; - rtx op0, op1, op2, op3, op4, pat, insn; + rtx op0, op1, op2, op3, op4, pat, pat2, insn; machine_mode mode0, mode1, mode2, mode3, mode4; unsigned int fcode = DECL_FUNCTION_CODE (fndecl); @@ -35808,22 +36073,34 @@ rdseed_step: case IX86_BUILTIN_SBB32: icode = CODE_FOR_subborrowsi; + icode2 = CODE_FOR_subborrowsi_0; mode0 = SImode; + mode1 = DImode; + mode2 = CCmode; goto handlecarry; case IX86_BUILTIN_SBB64: icode = CODE_FOR_subborrowdi; + icode2 = CODE_FOR_subborrowdi_0; mode0 = DImode; + mode1 = TImode; + mode2 = CCmode; goto handlecarry; case IX86_BUILTIN_ADDCARRYX32: icode = CODE_FOR_addcarrysi; + icode2 = CODE_FOR_addcarrysi_0; mode0 = SImode; + mode1 = DImode; + mode2 = CCCmode; goto handlecarry; case IX86_BUILTIN_ADDCARRYX64: icode = CODE_FOR_addcarrydi; + icode2 = CODE_FOR_addcarrydi_0; mode0 = DImode; + mode1 = TImode; + mode2 = CCCmode; handlecarry: arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */ @@ -35832,7 +36109,8 @@ rdseed_step: arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */ op1 = expand_normal (arg0); - op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1)); + if (!integer_zerop (arg0)) + op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1)); op2 = expand_normal (arg1); if (!register_operand (op2, mode0)) @@ -35849,21 +36127,31 @@ rdseed_step: op4 = copy_addr_to_reg (op4); } - /* Generate CF from input operand. */ - emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx)); - - /* Generate instruction that consumes CF. */ op0 = gen_reg_rtx (mode0); + if (integer_zerop (arg0)) + { + /* If arg0 is 0, optimize right away into add or sub + instruction that sets CCCmode flags. */ + op1 = gen_rtx_REG (mode2, FLAGS_REG); + emit_insn (GEN_FCN (icode2) (op0, op2, op3)); + } + else + { + /* Generate CF from input operand. */ + emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx)); - op1 = gen_rtx_REG (CCCmode, FLAGS_REG); - pat = gen_rtx_LTU (mode0, op1, const0_rtx); - emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat)); + /* Generate instruction that consumes CF. */ + op1 = gen_rtx_REG (CCCmode, FLAGS_REG); + pat = gen_rtx_LTU (mode1, op1, const0_rtx); + pat2 = gen_rtx_LTU (mode0, op1, const0_rtx); + emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2)); + } /* Return current CF value. */ if (target == 0) target = gen_reg_rtx (QImode); - PUT_MODE (pat, QImode); + pat = gen_rtx_LTU (QImode, op1, const0_rtx); emit_insn (gen_rtx_SET (target, pat)); /* Store the result. */ @@ -36656,6 +36944,57 @@ rdseed_step: emit_insn (gen_xabort (op0)); return 0; + case IX86_BUILTIN_RSTORSSP: + case IX86_BUILTIN_CLRSSBSY: + arg0 = CALL_EXPR_ARG (exp, 0); + op0 = expand_normal (arg0); + icode = (fcode == IX86_BUILTIN_RSTORSSP + ? CODE_FOR_rstorssp + : CODE_FOR_clrssbsy); + if (!address_operand (op0, VOIDmode)) + { + op1 = convert_memory_address (Pmode, op0); + op0 = copy_addr_to_reg (op1); + } + emit_insn (GEN_FCN (icode) (gen_rtx_MEM (Pmode, op0))); + return 0; + + case IX86_BUILTIN_WRSSD: + case IX86_BUILTIN_WRSSQ: + case IX86_BUILTIN_WRUSSD: + case IX86_BUILTIN_WRUSSQ: + arg0 = CALL_EXPR_ARG (exp, 0); + op0 = expand_normal (arg0); + arg1 = CALL_EXPR_ARG (exp, 1); + op1 = expand_normal (arg1); + switch (fcode) + { + case IX86_BUILTIN_WRSSD: + icode = CODE_FOR_wrsssi; + mode = SImode; + break; + case IX86_BUILTIN_WRSSQ: + icode = CODE_FOR_wrssdi; + mode = DImode; + break; + case IX86_BUILTIN_WRUSSD: + icode = CODE_FOR_wrusssi; + mode = SImode; + break; + case IX86_BUILTIN_WRUSSQ: + icode = CODE_FOR_wrussdi; + mode = DImode; + break; + } + op0 = force_reg (mode, op0); + if (!address_operand (op1, VOIDmode)) + { + op2 = convert_memory_address (Pmode, op1); + op1 = copy_addr_to_reg (op2); + } + emit_insn (GEN_FCN (icode) (op0, gen_rtx_MEM (mode, op1))); + return 0; + default: break; } @@ -36958,6 +37297,22 @@ s4fma_expand: d->flag, d->comparison); } + if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST + && fcode <= IX86_BUILTIN__BDESC_CET_LAST) + { + i = fcode - IX86_BUILTIN__BDESC_CET_FIRST; + return ix86_expand_special_args_builtin (bdesc_cet + i, exp, + target); + } + + if (fcode >= IX86_BUILTIN__BDESC_CET_NORMAL_FIRST + && fcode <= IX86_BUILTIN__BDESC_CET_NORMAL_LAST) + { + i = fcode - IX86_BUILTIN__BDESC_CET_NORMAL_FIRST; + return ix86_expand_args_builtin (bdesc_cet_rdssp + i, exp, + target); + } + gcc_unreachable (); } @@ -38347,6 +38702,28 @@ ix86_can_change_mode_class (machine_mode from, machine_mode to, return true; } +/* Return index of MODE in the sse load/store tables. */ + +static inline int +sse_store_index (machine_mode mode) +{ + switch (GET_MODE_SIZE (mode)) + { + case 4: + return 0; + case 8: + return 1; + case 16: + return 2; + case 32: + return 3; + case 64: + return 4; + default: + return -1; + } +} + /* Return the cost of moving data of mode M between a register and memory. A value of 2 is the default; this cost is relative to those in `REGISTER_MOVE_COST'. @@ -38390,21 +38767,9 @@ inline_memory_move_cost (machine_mode mode, enum reg_class regclass, } if (SSE_CLASS_P (regclass)) { - int index; - switch (GET_MODE_SIZE (mode)) - { - case 4: - index = 0; - break; - case 8: - index = 1; - break; - case 16: - index = 2; - break; - default: - return 100; - } + int index = sse_store_index (mode); + if (index == -1) + return 100; if (in == 2) return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]); return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index]; @@ -38507,8 +38872,10 @@ ix86_register_move_cost (machine_mode mode, reg_class_t class1_i, /* In case of copying from general_purpose_register we may emit multiple stores followed by single load causing memory size mismatch stall. Count this as arbitrarily high cost of 20. */ - if (targetm.class_max_nregs (class1, mode) - > targetm.class_max_nregs (class2, mode)) + if (GET_MODE_BITSIZE (mode) > BITS_PER_WORD + && TARGET_MEMORY_MISMATCH_STALL + && targetm.class_max_nregs (class1, mode) + > targetm.class_max_nregs (class2, mode)) cost += 20; /* In the case of FP/MMX moves, the registers actually overlap, and we @@ -38530,12 +38897,19 @@ ix86_register_move_cost (machine_mode mode, reg_class_t class1_i, where integer modes in MMX/SSE registers are not tieable because of missing QImode and HImode moves to, from or between MMX/SSE registers. */ - return MAX (8, ix86_cost->mmxsse_to_integer); + return MAX (8, MMX_CLASS_P (class1) || MMX_CLASS_P (class2) + ? ix86_cost->mmxsse_to_integer : ix86_cost->ssemmx_to_integer); if (MAYBE_FLOAT_CLASS_P (class1)) return ix86_cost->fp_move; if (MAYBE_SSE_CLASS_P (class1)) - return ix86_cost->sse_move; + { + if (GET_MODE_BITSIZE (mode) <= 128) + return ix86_cost->xmm_move; + if (GET_MODE_BITSIZE (mode) <= 256) + return ix86_cost->ymm_move; + return ix86_cost->zmm_move; + } if (MAYBE_MMX_CLASS_P (class1)) return ix86_cost->mmx_move; return 2; @@ -38806,6 +39180,27 @@ ix86_set_reg_reg_cost (machine_mode mode) return COSTS_N_INSNS (CEIL (GET_MODE_SIZE (mode), units)); } +/* Return cost of vector operation in MODE given that scalar version has + COST. If PARALLEL is true assume that CPU has more than one unit + performing the operation. */ + +static int +ix86_vec_cost (machine_mode mode, int cost, bool parallel) +{ + if (!VECTOR_MODE_P (mode)) + return cost; + + if (!parallel) + return cost * GET_MODE_NUNITS (mode); + if (GET_MODE_BITSIZE (mode) == 128 + && TARGET_SSE_SPLIT_REGS) + return cost * 2; + if (GET_MODE_BITSIZE (mode) > 128 + && TARGET_AVX128_OPTIMAL) + return cost * GET_MODE_BITSIZE (mode) / 128; + return cost; +} + /* Compute a (partial) cost for rtx X. Return true if the complete cost has been computed, and false if subexpressions should be scanned. In either case, *TOTAL contains the cost result. */ @@ -38819,6 +39214,9 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, enum rtx_code outer_code = (enum rtx_code) outer_code_i; const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost; int src_cost; + machine_mode inner_mode = mode; + if (VECTOR_MODE_P (mode)) + inner_mode = GET_MODE_INNER (mode); switch (code) { @@ -38963,19 +39361,20 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, shift with one insn set the cost to prefer paddb. */ if (CONSTANT_P (XEXP (x, 1))) { - *total = (cost->fabs + *total = ix86_vec_cost (mode, + cost->sse_op + rtx_cost (XEXP (x, 0), mode, code, 0, speed) - + (speed ? 2 : COSTS_N_BYTES (16))); + + (speed ? 2 : COSTS_N_BYTES (16)), true); return true; } count = 3; } else if (TARGET_SSSE3) count = 7; - *total = cost->fabs * count; + *total = ix86_vec_cost (mode, cost->sse_op * count, true); } else - *total = cost->fabs; + *total = ix86_vec_cost (mode, cost->sse_op, true); } else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) { @@ -39017,9 +39416,9 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, gcc_assert (FLOAT_MODE_P (mode)); gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F); - /* ??? SSE scalar/vector cost should be used here. */ - /* ??? Bald assumption that fma has the same cost as fmul. */ - *total = cost->fmul; + *total = ix86_vec_cost (mode, + mode == SFmode ? cost->fmass : cost->fmasd, + true); *total += rtx_cost (XEXP (x, 1), mode, FMA, 1, speed); /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */ @@ -39038,8 +39437,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, case MULT: if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) { - /* ??? SSE scalar cost should be used here. */ - *total = cost->fmul; + *total = inner_mode == DFmode ? cost->mulsd : cost->mulss; return false; } else if (X87_FLOAT_MODE_P (mode)) @@ -39049,8 +39447,9 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, } else if (FLOAT_MODE_P (mode)) { - /* ??? SSE vector cost should be used here. */ - *total = cost->fmul; + *total = ix86_vec_cost (mode, + inner_mode == DFmode + ? cost->mulsd : cost->mulss, true); return false; } else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) @@ -39063,22 +39462,29 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, extra = 5; else if (TARGET_SSSE3) extra = 6; - *total = cost->fmul * 2 + cost->fabs * extra; + *total = ix86_vec_cost (mode, + cost->mulss * 2 + cost->sse_op * extra, + true); } /* V*DImode is emulated with 5-8 insns. */ else if (mode == V2DImode || mode == V4DImode) { if (TARGET_XOP && mode == V2DImode) - *total = cost->fmul * 2 + cost->fabs * 3; + *total = ix86_vec_cost (mode, + cost->mulss * 2 + cost->sse_op * 3, + true); else - *total = cost->fmul * 3 + cost->fabs * 5; + *total = ix86_vec_cost (mode, + cost->mulss * 3 + cost->sse_op * 5, + true); } /* Without sse4.1, we don't have PMULLD; it's emulated with 7 insns, including two PMULUDQ. */ else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX)) - *total = cost->fmul * 2 + cost->fabs * 5; + *total = ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 5, + true); else - *total = cost->fmul; + *total = ix86_vec_cost (mode, cost->mulss, true); return false; } else @@ -39132,13 +39538,13 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, case MOD: case UMOD: if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) - /* ??? SSE cost should be used here. */ - *total = cost->fdiv; + *total = inner_mode == DFmode ? cost->divsd : cost->divss; else if (X87_FLOAT_MODE_P (mode)) *total = cost->fdiv; else if (FLOAT_MODE_P (mode)) - /* ??? SSE vector cost should be used here. */ - *total = cost->fdiv; + *total = ix86_vec_cost (mode, + inner_mode == DFmode ? cost->divsd : cost->divss, + true); else *total = cost->divide[MODE_INDEX (mode)]; return false; @@ -39217,8 +39623,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) { - /* ??? SSE cost should be used here. */ - *total = cost->fadd; + *total = cost->addss; return false; } else if (X87_FLOAT_MODE_P (mode)) @@ -39228,8 +39633,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, } else if (FLOAT_MODE_P (mode)) { - /* ??? SSE vector cost should be used here. */ - *total = cost->fadd; + *total = ix86_vec_cost (mode, cost->addss, true); return false; } /* FALLTHRU */ @@ -39252,8 +39656,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, case NEG: if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) { - /* ??? SSE cost should be used here. */ - *total = cost->fchs; + *total = cost->sse_op; return false; } else if (X87_FLOAT_MODE_P (mode)) @@ -39263,20 +39666,14 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, } else if (FLOAT_MODE_P (mode)) { - /* ??? SSE vector cost should be used here. */ - *total = cost->fchs; + *total = ix86_vec_cost (mode, cost->sse_op, true); return false; } /* FALLTHRU */ case NOT: if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) - { - /* ??? Should be SSE vector operation cost. */ - /* At least for published AMD latencies, this really is the same - as the latency for a simple fpu operation like fabs. */ - *total = cost->fabs; - } + *total = ix86_vec_cost (mode, cost->sse_op, true); else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) *total = cost->add * 2; else @@ -39309,28 +39706,38 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, case FLOAT_EXTEND: if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)) *total = 0; + else + *total = ix86_vec_cost (mode, cost->addss, true); + return false; + + case FLOAT_TRUNCATE: + if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)) + *total = cost->fadd; + else + *total = ix86_vec_cost (mode, cost->addss, true); return false; case ABS: + /* SSE requires memory load for the constant operand. It may make + sense to account for this. Of course the constant operand may or + may not be reused. */ if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) - /* ??? SSE cost should be used here. */ - *total = cost->fabs; + *total = cost->sse_op; else if (X87_FLOAT_MODE_P (mode)) *total = cost->fabs; else if (FLOAT_MODE_P (mode)) - /* ??? SSE vector cost should be used here. */ - *total = cost->fabs; + *total = ix86_vec_cost (mode, cost->sse_op, true); return false; case SQRT: if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) - /* ??? SSE cost should be used here. */ - *total = cost->fsqrt; + *total = mode == SFmode ? cost->sqrtss : cost->sqrtsd; else if (X87_FLOAT_MODE_P (mode)) *total = cost->fsqrt; else if (FLOAT_MODE_P (mode)) - /* ??? SSE vector cost should be used here. */ - *total = cost->fsqrt; + *total = ix86_vec_cost (mode, + mode == SFmode ? cost->sqrtss : cost->sqrtsd, + true); return false; case UNSPEC: @@ -39344,7 +39751,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, /* ??? Assume all of these vector manipulation patterns are recognizable. In which case they all pretty much have the same cost. */ - *total = cost->fabs; + *total = cost->sse_op; return true; case VEC_MERGE: mask = XEXP (x, 2); @@ -39353,7 +39760,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask))) *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed); else - *total = cost->fabs; + *total = cost->sse_op; return true; default: @@ -39818,6 +40225,10 @@ x86_output_mi_thunk (FILE *file, tree, HOST_WIDE_INT delta, emit_note (NOTE_INSN_PROLOGUE_END); + /* CET is enabled, insert EB instruction. */ + if ((flag_cf_protection & CF_BRANCH) && TARGET_IBT) + emit_insn (gen_nop_endbr ()); + /* If VCALL_OFFSET, we'll need THIS in a register. Might as well pull it in now and let DELTA benefit. */ if (REG_P (this_param)) @@ -40835,7 +41246,7 @@ ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val) reg = force_reg (innermode, val); if (GET_MODE (reg) != innermode) reg = gen_lowpart (innermode, reg); - XEXP (dup, 0) = reg; + SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg); seq = get_insns (); end_sequence (); if (seq) @@ -42800,9 +43211,9 @@ ix86_encode_section_info (tree decl, rtx rtl, int first) enum rtx_code ix86_reverse_condition (enum rtx_code code, machine_mode mode) { - return (mode != CCFPmode && mode != CCFPUmode - ? reverse_condition (code) - : reverse_condition_maybe_unordered (code)); + return (mode == CCFPmode + ? reverse_condition_maybe_unordered (code) + : reverse_condition (code)); } /* Output code to perform an x87 FP register move, from OPERANDS[1] @@ -43415,17 +43826,20 @@ static rtx_code_label * ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1, bool swap_operands) { - machine_mode fpcmp_mode = ix86_fp_compare_mode (code); + bool unordered_compare = ix86_unordered_fp_compare (code); rtx_code_label *label; - rtx tmp; + rtx tmp, reg; if (swap_operands) std::swap (op0, op1); label = gen_label_rtx (); - tmp = gen_rtx_REG (fpcmp_mode, FLAGS_REG); - emit_insn (gen_rtx_SET (tmp, gen_rtx_COMPARE (fpcmp_mode, op0, op1))); - tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx); + tmp = gen_rtx_COMPARE (CCFPmode, op0, op1); + if (unordered_compare) + tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP); + reg = gen_rtx_REG (CCFPmode, FLAGS_REG); + emit_insn (gen_rtx_SET (reg, tmp)); + tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx); tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx); tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); @@ -44044,35 +44458,83 @@ static int ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, tree vectype, int) { + bool fp = false; + machine_mode mode = TImode; + int index; + if (vectype != NULL) + { + fp = FLOAT_TYPE_P (vectype); + mode = TYPE_MODE (vectype); + } + switch (type_of_cost) { case scalar_stmt: - return ix86_cost->scalar_stmt_cost; + return fp ? ix86_cost->addss : COSTS_N_INSNS (1); case scalar_load: - return ix86_cost->scalar_load_cost; + /* load/store costs are relative to register move which is 2. Recompute + it to COSTS_N_INSNS so everything have same base. */ + return COSTS_N_INSNS (fp ? ix86_cost->sse_load[0] + : ix86_cost->int_load [2]) / 2; case scalar_store: - return ix86_cost->scalar_store_cost; + return COSTS_N_INSNS (fp ? ix86_cost->sse_store[0] + : ix86_cost->int_store [2]) / 2; case vector_stmt: - return ix86_cost->vec_stmt_cost; + return ix86_vec_cost (mode, + fp ? ix86_cost->addss : ix86_cost->sse_op, + true); case vector_load: - return ix86_cost->vec_align_load_cost; + index = sse_store_index (mode); + gcc_assert (index >= 0); + return ix86_vec_cost (mode, + COSTS_N_INSNS (ix86_cost->sse_load[index]) / 2, + true); case vector_store: - return ix86_cost->vec_store_cost; + index = sse_store_index (mode); + return ix86_vec_cost (mode, + COSTS_N_INSNS (ix86_cost->sse_store[index]) / 2, + true); case vec_to_scalar: - return ix86_cost->vec_to_scalar_cost; - case scalar_to_vec: - return ix86_cost->scalar_to_vec_cost; + return ix86_vec_cost (mode, ix86_cost->sse_op, true); + /* We should have separate costs for unaligned loads and gather/scatter. + Do that incrementally. */ case unaligned_load: + index = sse_store_index (mode); + return ix86_vec_cost (mode, + COSTS_N_INSNS + (ix86_cost->sse_unaligned_load[index]) / 2, + true); + case unaligned_store: - return ix86_cost->vec_unalign_load_cost; + index = sse_store_index (mode); + return ix86_vec_cost (mode, + COSTS_N_INSNS + (ix86_cost->sse_unaligned_store[index]) / 2, + true); + + case vector_gather_load: + return ix86_vec_cost (mode, + COSTS_N_INSNS + (ix86_cost->gather_static + + ix86_cost->gather_per_elt + * TYPE_VECTOR_SUBPARTS (vectype)) / 2, + true); + + case vector_scatter_store: + return ix86_vec_cost (mode, + COSTS_N_INSNS + (ix86_cost->scatter_static + + ix86_cost->scatter_per_elt + * TYPE_VECTOR_SUBPARTS (vectype)) / 2, + true); case cond_branch_taken: return ix86_cost->cond_taken_branch_cost; @@ -44082,10 +44544,11 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, case vec_perm: case vec_promote_demote: - return ix86_cost->vec_stmt_cost; + return ix86_vec_cost (mode, + ix86_cost->sse_op, true); case vec_construct: - return ix86_cost->vec_stmt_cost * (TYPE_VECTOR_SUBPARTS (vectype) - 1); + return ix86_vec_cost (mode, ix86_cost->sse_op, false); default: gcc_unreachable (); @@ -44963,8 +45426,8 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) if (ix86_expand_vec_one_operand_perm_avx512 (d)) return true; - /* Try the AVX512F vpermi2 instructions. */ - if (ix86_expand_vec_perm_vpermi2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d)) + /* Try the AVX512F vpermt2/vpermi2 instructions. */ + if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d)) return true; /* See if we can get the same permutation in different vector integer @@ -46621,9 +47084,9 @@ expand_vec_perm_broadcast (struct expand_vec_perm_d *d) } /* Implement arbitrary permutations of two V64QImode operands - will 2 vpermi2w, 2 vpshufb and one vpor instruction. */ + with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. */ static bool -expand_vec_perm_vpermi2_vpshub2 (struct expand_vec_perm_d *d) +expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d) { if (!TARGET_AVX512BW || !(d->vmode == V64QImode)) return false; @@ -46868,7 +47331,7 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) if (expand_vec_perm_vpshufb2_vpermq_even_odd (d)) return true; - if (expand_vec_perm_vpermi2_vpshub2 (d)) + if (expand_vec_perm_vpermt2_vpshub2 (d)) return true; /* ??? Look for narrow permutations whose element orderings would @@ -47016,17 +47479,17 @@ ix86_vectorize_vec_perm_const_ok (machine_mode vmode, vec_perm_indices sel) case E_V8DImode: case E_V8DFmode: if (TARGET_AVX512F) - /* All implementable with a single vpermi2 insn. */ + /* All implementable with a single vperm[it]2 insn. */ return true; break; case E_V32HImode: if (TARGET_AVX512BW) - /* All implementable with a single vpermi2 insn. */ + /* All implementable with a single vperm[it]2 insn. */ return true; break; case E_V64QImode: if (TARGET_AVX512BW) - /* Implementable with 2 vpermi2, 2 vpshufb and 1 or insn. */ + /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */ return true; break; case E_V8SImode: @@ -47034,7 +47497,7 @@ ix86_vectorize_vec_perm_const_ok (machine_mode vmode, vec_perm_indices sel) case E_V4DFmode: case E_V4DImode: if (TARGET_AVX512VL) - /* All implementable with a single vpermi2 insn. */ + /* All implementable with a single vperm[it]2 insn. */ return true; break; case E_V16HImode: @@ -47204,7 +47667,6 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2) op2_h = gen_reg_rtx (qimode); emit_insn (gen_il (op2_l, op2, op2)); emit_insn (gen_ih (op2_h, op2, op2)); - /* FALLTHRU */ op1_l = gen_reg_rtx (qimode); op1_h = gen_reg_rtx (qimode); @@ -47632,6 +48094,46 @@ ix86_bnd_prefixed_insn_p (rtx insn) return chkp_function_instrumented_p (current_function_decl); } +/* Return 1 if control tansfer instruction INSN + should be encoded with notrack prefix. */ + +static bool +ix86_notrack_prefixed_insn_p (rtx insn) +{ + if (!insn || !((flag_cf_protection & CF_BRANCH) && TARGET_IBT)) + return false; + + if (CALL_P (insn)) + { + rtx call = get_call_rtx_from (insn); + gcc_assert (call != NULL_RTX); + rtx addr = XEXP (call, 0); + + /* Do not emit 'notrack' if it's not an indirect call. */ + if (MEM_P (addr) + && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF) + return false; + else + return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0); + } + + if (JUMP_P (insn) && !flag_cet_switch) + { + rtx target = JUMP_LABEL (insn); + if (target == NULL_RTX || ANY_RETURN_P (target)) + return false; + + /* Check the jump is a switch table. */ + rtx_insn *label = as_a<rtx_insn *> (target); + rtx_insn *table = next_insn (label); + if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table)) + return false; + else + return true; + } + return false; +} + /* Calculate integer abs() using only SSE2 instructions. */ void @@ -49420,6 +49922,9 @@ ix86_run_selftests (void) #undef TARGET_DELEGITIMIZE_ADDRESS #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address +#undef TARGET_CONST_NOT_OK_FOR_DEBUG_P +#define TARGET_CONST_NOT_OK_FOR_DEBUG_P ix86_const_not_ok_for_debug_p + #undef TARGET_MS_BITFIELD_LAYOUT_P #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p |