Diffstat (limited to 'gcc/config/i386/i386.c')
 -rw-r--r--  gcc/config/i386/i386.c | 1269
 1 file changed, 887 insertions(+), 382 deletions(-)
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 619b13b3d09..4b684522082 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -102,6 +102,9 @@ static rtx legitimize_pe_coff_symbol (rtx, bool);
static void ix86_print_operand_address_as (FILE *, rtx, addr_space_t, bool);
static bool ix86_save_reg (unsigned int, bool, bool);
static bool ix86_function_naked (const_tree);
+static bool ix86_notrack_prefixed_insn_p (rtx);
+static void ix86_emit_restore_reg_using_pop (rtx);
+
#ifndef CHECK_STACK_LIMIT
#define CHECK_STACK_LIMIT (-1)
@@ -302,7 +305,7 @@ int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
7 for %edi (gcc regno = 5)
The following three DWARF register numbers are never generated by
the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
- believes these numbers have these meanings.
+ believed these numbers have these meanings.
8 for %eip (no gcc equivalent)
9 for %eflags (gcc regno = 17)
10 for %trapno (no gcc equivalent)
@@ -310,20 +313,20 @@ int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
for the x86 architecture. If the version of SDB on x86/svr4 were
a bit less brain dead with respect to floating-point then we would
have a precedent to follow with respect to DWARF register numbers
- for x86 FP registers, but the SDB on x86/svr4 is so completely
+ for x86 FP registers, but the SDB on x86/svr4 was so completely
broken with respect to FP registers that it is hardly worth thinking
of it as something to strive for compatibility with.
- The version of x86/svr4 SDB I have at the moment does (partially)
+ The version of x86/svr4 SDB I had did (partially)
seem to believe that DWARF register number 11 is associated with
the x86 register %st(0), but that's about all. Higher DWARF
register numbers don't seem to be associated with anything in
- particular, and even for DWARF regno 11, SDB only seems to under-
+ particular, and even for DWARF regno 11, SDB only seemed to under-
stand that it should say that a variable lives in %st(0) (when
asked via an `=' command) if we said it was in DWARF regno 11,
- but SDB still prints garbage when asked for the value of the
+ but SDB still printed garbage when asked for the value of the
variable in question (via a `/' command).
- (Also note that the labels SDB prints for various FP stack regs
- when doing an `x' command are all wrong.)
+ (Also note that the labels SDB printed for various FP stack regs
+ when doing an `x' command were all wrong.)
Note that these problems generally don't affect the native SVR4
C compiler because it doesn't allow the use of -O with -g and
because when it is *not* optimizing, it allocates a memory
@@ -1602,7 +1605,7 @@ dimode_scalar_chain::compute_convert_gain ()
rtx dst = SET_DEST (def_set);
if (REG_P (src) && REG_P (dst))
- gain += COSTS_N_INSNS (2) - ix86_cost->sse_move;
+ gain += COSTS_N_INSNS (2) - ix86_cost->xmm_move;
else if (REG_P (src) && MEM_P (dst))
gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
else if (MEM_P (src) && REG_P (dst))
@@ -2570,6 +2573,151 @@ make_pass_stv (gcc::context *ctxt)
return new pass_stv (ctxt);
}
+/* Inserting ENDBRANCH instructions. */
+
+static unsigned int
+rest_of_insert_endbranch (void)
+{
+ timevar_push (TV_MACH_DEP);
+
+ rtx cet_eb;
+ rtx_insn *insn;
+ basic_block bb;
+
+ /* Currently emit EB if it's a tracking function, i.e. 'nocf_check' is
+ absent among function attributes. Later an optimization will be
+ introduced to analyze whether the address of a static function is
+ taken. A static function whose address is not taken will get a
+ nocf_check attribute; this will allow us to reduce the number of EBs. */
+
+ if (!lookup_attribute ("nocf_check",
+ TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
+ && !cgraph_node::get (cfun->decl)->only_called_directly_p ())
+ {
+ cet_eb = gen_nop_endbr ();
+
+ bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
+ insn = BB_HEAD (bb);
+ emit_insn_before (cet_eb, insn);
+ }
+
+ bb = 0;
+ FOR_EACH_BB_FN (bb, cfun)
+ {
+ for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb));
+ insn = NEXT_INSN (insn))
+ {
+ if (INSN_P (insn) && GET_CODE (insn) == CALL_INSN)
+ {
+ rtx_insn *next_insn = insn;
+
+ while ((next_insn != BB_END (bb))
+ && (DEBUG_INSN_P (NEXT_INSN (next_insn))
+ || NOTE_P (NEXT_INSN (next_insn))
+ || BARRIER_P (NEXT_INSN (next_insn))))
+ next_insn = NEXT_INSN (next_insn);
+
+ /* Generate ENDBRANCH after a CALL that can return more than
+ once, i.e. setjmp-like functions. */
+ if (find_reg_note (insn, REG_SETJMP, NULL) != NULL)
+ {
+ cet_eb = gen_nop_endbr ();
+ emit_insn_after (cet_eb, next_insn);
+ }
+ continue;
+ }
+
+ if (INSN_P (insn) && JUMP_P (insn) && flag_cet_switch)
+ {
+ rtx target = JUMP_LABEL (insn);
+ if (target == NULL_RTX || ANY_RETURN_P (target))
+ continue;
+
+ /* Check the jump is a switch table. */
+ rtx_insn *label = as_a<rtx_insn *> (target);
+ rtx_insn *table = next_insn (label);
+ if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
+ continue;
+
+ /* For an indirect jump, find all the places it can jump to and insert
+ ENDBRANCH there. This is done under a special flag to control
+ ENDBRANCH generation for switch statements. */
+ edge_iterator ei;
+ edge e;
+ basic_block dest_blk;
+
+ FOR_EACH_EDGE (e, ei, bb->succs)
+ {
+ rtx_insn *insn;
+
+ dest_blk = e->dest;
+ insn = BB_HEAD (dest_blk);
+ gcc_assert (LABEL_P (insn));
+ cet_eb = gen_nop_endbr ();
+ emit_insn_after (cet_eb, insn);
+ }
+ continue;
+ }
+
+ if ((LABEL_P (insn) && LABEL_PRESERVE_P (insn))
+ || (NOTE_P (insn)
+ && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL))
+/* TODO. Check /s bit also. */
+ {
+ cet_eb = gen_nop_endbr ();
+ emit_insn_after (cet_eb, insn);
+ continue;
+ }
+ }
+ }
+
+ timevar_pop (TV_MACH_DEP);
+ return 0;
+}
+
+namespace {
+
+const pass_data pass_data_insert_endbranch =
+{
+ RTL_PASS, /* type. */
+ "cet", /* name. */
+ OPTGROUP_NONE, /* optinfo_flags. */
+ TV_MACH_DEP, /* tv_id. */
+ 0, /* properties_required. */
+ 0, /* properties_provided. */
+ 0, /* properties_destroyed. */
+ 0, /* todo_flags_start. */
+ 0, /* todo_flags_finish. */
+};
+
+class pass_insert_endbranch : public rtl_opt_pass
+{
+public:
+ pass_insert_endbranch (gcc::context *ctxt)
+ : rtl_opt_pass (pass_data_insert_endbranch, ctxt)
+ {}
+
+ /* opt_pass methods: */
+ virtual bool gate (function *)
+ {
+ return ((flag_cf_protection & CF_BRANCH) && TARGET_IBT);
+ }
+
+ virtual unsigned int execute (function *)
+ {
+ return rest_of_insert_endbranch ();
+ }
+
+}; // class pass_insert_endbranch
+
+} // anon namespace
+
+rtl_opt_pass *
+make_pass_insert_endbranch (gcc::context *ctxt)
+{
+ return new pass_insert_endbranch (ctxt);
+}
+
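/* Illustrative sketch, not part of this patch: under -fcf-protection=branch
   -mibt the pass above emits an ENDBR at the entry of every function that
   may be reached indirectly; a function whose type carries the nocf_check
   attribute (or a static function only called directly) is skipped.  The
   function names below are invented.

     void
     tracked (void)
     {
       // entry would start with endbr64 (endbr32 for -m32)
     }

     __attribute__((nocf_check)) void
     untracked (void)
     {
       // no ENDBR emitted; indirect calls to it need the notrack prefix
     }
*/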
/* Return true if a red-zone is in use. */
bool
@@ -2597,11 +2745,14 @@ ix86_target_string (HOST_WIDE_INT isa, HOST_WIDE_INT isa2,
ISAs come first. Target string will be displayed in the same order. */
static struct ix86_target_opts isa2_opts[] =
{
+ { "-mgfni", OPTION_MASK_ISA_GFNI },
{ "-mrdpid", OPTION_MASK_ISA_RDPID },
{ "-msgx", OPTION_MASK_ISA_SGX },
{ "-mavx5124vnniw", OPTION_MASK_ISA_AVX5124VNNIW },
{ "-mavx5124fmaps", OPTION_MASK_ISA_AVX5124FMAPS },
- { "-mavx512vpopcntdq", OPTION_MASK_ISA_AVX512VPOPCNTDQ }
+ { "-mavx512vpopcntdq", OPTION_MASK_ISA_AVX512VPOPCNTDQ },
+ { "-mibt", OPTION_MASK_ISA_IBT },
+ { "-mshstk", OPTION_MASK_ISA_SHSTK }
};
static struct ix86_target_opts isa_opts[] =
{
@@ -4694,6 +4845,37 @@ ix86_option_override_internal (bool main_args_p,
target_option_default_node = target_option_current_node
= build_target_option_node (opts);
+ /* Do not support control flow instrumentation if CET is not enabled. */
+ if (opts->x_flag_cf_protection != CF_NONE)
+ {
+ if (!(TARGET_IBT_P (opts->x_ix86_isa_flags2)
+ || TARGET_SHSTK_P (opts->x_ix86_isa_flags2)))
+ {
+ if (flag_cf_protection == CF_FULL)
+ {
+ error ("%<-fcf-protection=full%> requires CET support "
+ "on this target. Use -mcet or one of -mibt, "
+ "-mshstk options to enable CET");
+ }
+ else if (flag_cf_protection == CF_BRANCH)
+ {
+ error ("%<-fcf-protection=branch%> requires CET support "
+ "on this target. Use -mcet or one of -mibt, "
+ "-mshstk options to enable CET");
+ }
+ else if (flag_cf_protection == CF_RETURN)
+ {
+ error ("%<-fcf-protection=return%> requires CET support "
+ "on this target. Use -mcet or one of -mibt, "
+ "-mshstk options to enable CET");
+ }
+ flag_cf_protection = CF_NONE;
+ return false;
+ }
+ opts->x_flag_cf_protection =
+ (cf_protection_level) (opts->x_flag_cf_protection | CF_SET);
+ }
+
return true;
}
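/* Assumed command-line usage, for illustration only (the option spellings
   follow the error messages above):

     gcc -fcf-protection=full   foo.c          -> error, CET not enabled
     gcc -fcf-protection=full   -mcet   foo.c  -> OK, IBT and SHSTK enabled
     gcc -fcf-protection=branch -mibt   foo.c  -> OK, IBT only
     gcc -fcf-protection=return -mshstk foo.c  -> OK, shadow stack only  */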
@@ -5123,6 +5305,9 @@ ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
IX86_ATTR_ISA ("mpx", OPT_mmpx),
IX86_ATTR_ISA ("clwb", OPT_mclwb),
IX86_ATTR_ISA ("rdpid", OPT_mrdpid),
+ IX86_ATTR_ISA ("gfni", OPT_mgfni),
+ IX86_ATTR_ISA ("ibt", OPT_mibt),
+ IX86_ATTR_ISA ("shstk", OPT_mshstk),
/* enum options */
IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
@@ -11943,8 +12128,14 @@ ix86_adjust_stack_and_probe_stack_clash (const HOST_WIDE_INT size)
we just probe when we cross PROBE_INTERVAL. */
if (TREE_THIS_VOLATILE (cfun->decl))
{
- emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
- -GET_MODE_SIZE (word_mode)));
+ /* We can safely use any register here since we're just going to push
+ its value and immediately pop it back. But we do try to avoid
+ argument-passing registers so as not to introduce dependencies in
+ the pipeline. For 32-bit we use %esi and for 64-bit we use %rax. */
+ rtx dummy_reg = gen_rtx_REG (word_mode, TARGET_64BIT ? AX_REG : SI_REG);
+ rtx_insn *insn = emit_insn (gen_push (dummy_reg));
+ RTX_FRAME_RELATED_P (insn) = 1;
+ ix86_emit_restore_reg_using_pop (dummy_reg);
emit_insn (gen_blockage ());
}
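/* Rough sketch of the probe sequence emitted above for 64-bit (assumed
   assembly; the register choice follows the comment above):

     pushq  %rax     # the store through the stack pointer probes the page
     popq   %rax     # restores %rsp; the value of %rax is irrelevant

   i.e. the probe is now an implicit store via push/pop rather than an
   explicit write at sp - WORD_SIZE.  */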
@@ -12512,10 +12703,13 @@ ix86_finalize_stack_frame_flags (void)
for (ref = DF_REG_USE_CHAIN (HARD_FRAME_POINTER_REGNUM);
ref; ref = next)
{
- rtx_insn *insn = DF_REF_INSN (ref);
+ next = DF_REF_NEXT_REG (ref);
+ if (!DF_REF_INSN_INFO (ref))
+ continue;
+
/* Make sure the next ref is for a different instruction,
so that we're not affected by the rescan. */
- next = DF_REF_NEXT_REG (ref);
+ rtx_insn *insn = DF_REF_INSN (ref);
while (next && DF_REF_INSN (next) == insn)
next = DF_REF_NEXT_REG (next);
@@ -12836,7 +13030,7 @@ ix86_expand_prologue (void)
if (frame_pointer_needed && !m->fs.fp_valid)
{
/* Note: AT&T enter does NOT have reversed args. Enter is probably
- slower on all targets. Also sdb doesn't like it. */
+ slower on all targets. Also sdb didn't like it. */
insn = emit_insn (gen_push (hard_frame_pointer_rtx));
RTX_FRAME_RELATED_P (insn) = 1;
@@ -12983,8 +13177,12 @@ ix86_expand_prologue (void)
&& (flag_stack_check == STATIC_BUILTIN_STACK_CHECK
|| flag_stack_clash_protection))
{
- /* We expect the GP registers to be saved when probes are used. */
- gcc_assert (int_registers_saved);
+ /* This assert wants to verify that integer registers were saved
+ prior to probing. This is necessary when probing may be implemented
+ as a function call (Windows). It is not necessary for stack clash
+ protection probing. */
+ if (!flag_stack_clash_protection)
+ gcc_assert (int_registers_saved);
if (flag_stack_clash_protection)
{
@@ -13628,7 +13826,7 @@ ix86_expand_epilogue (int style)
the stack pointer, if we will restore SSE regs via sp. */
if (TARGET_64BIT
&& m->fs.sp_offset > 0x7fffffff
- && sp_valid_at (frame.stack_realign_offset)
+ && sp_valid_at (frame.stack_realign_offset + 1)
&& (frame.nsseregs + frame.nregs) != 0)
{
pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
@@ -14895,10 +15093,16 @@ legitimate_pic_address_disp_p (rtx disp)
break;
op0 = XEXP (XEXP (disp, 0), 0);
op1 = XEXP (XEXP (disp, 0), 1);
- if (!CONST_INT_P (op1)
- || INTVAL (op1) >= 16*1024*1024
+ if (!CONST_INT_P (op1))
+ break;
+ if (GET_CODE (op0) == UNSPEC
+ && (XINT (op0, 1) == UNSPEC_DTPOFF
+ || XINT (op0, 1) == UNSPEC_NTPOFF)
+ && trunc_int_for_mode (INTVAL (op1), SImode) == INTVAL (op1))
+ return true;
+ if (INTVAL (op1) >= 16*1024*1024
|| INTVAL (op1) < -16*1024*1024)
- break;
+ break;
if (GET_CODE (op0) == LABEL_REF)
return true;
if (GET_CODE (op0) == CONST
@@ -16657,13 +16861,17 @@ ix86_delegitimize_address_1 (rtx x, bool base_term_p)
movl foo@GOTOFF(%ecx), %edx
in which case we return (%ecx - %ebx) + foo
or (%ecx - _GLOBAL_OFFSET_TABLE_) + foo if pseudo_pic_reg
- and reload has completed. */
+ and reload has completed. Don't do the latter for debug,
+ as _GLOBAL_OFFSET_TABLE_ can't be expressed in the assembly. */
if (pic_offset_table_rtx
&& (!reload_completed || !ix86_use_pseudo_pic_reg ()))
result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
pic_offset_table_rtx),
result);
- else if (pic_offset_table_rtx && !TARGET_MACHO && !TARGET_VXWORKS_RTP)
+ else if (base_term_p
+ && pic_offset_table_rtx
+ && !TARGET_MACHO
+ && !TARGET_VXWORKS_RTP)
{
rtx tmp = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
tmp = gen_rtx_MINUS (Pmode, copy_rtx (addend), tmp);
@@ -16716,6 +16924,25 @@ ix86_find_base_term (rtx x)
return ix86_delegitimize_address_1 (x, true);
}
+
+/* Return true if X shouldn't be emitted into the debug info.
+ Disallow UNSPECs other than @gotoff - we can't emit the
+ _GLOBAL_OFFSET_TABLE_ symbol easily into the .debug_info section, so
+ we don't delegitimize, but instead assemble it as @gotoff.
+ Disallow a _GLOBAL_OFFSET_TABLE_ SYMBOL_REF - the assembler magically
+ assembles that as a _GLOBAL_OFFSET_TABLE_-. expression. */
+
+static bool
+ix86_const_not_ok_for_debug_p (rtx x)
+{
+ if (GET_CODE (x) == UNSPEC && XINT (x, 1) != UNSPEC_GOTOFF)
+ return true;
+
+ if (SYMBOL_REF_P (x) && strcmp (XSTR (x, 0), GOT_SYMBOL_NAME) == 0)
+ return true;
+
+ return false;
+}
static void
put_condition_code (enum rtx_code code, machine_mode mode, bool reverse,
@@ -16723,7 +16950,7 @@ put_condition_code (enum rtx_code code, machine_mode mode, bool reverse,
{
const char *suffix;
- if (mode == CCFPmode || mode == CCFPUmode)
+ if (mode == CCFPmode)
{
code = ix86_fp_compare_code_to_integer (code);
mode = CCmode;
@@ -16734,6 +16961,7 @@ put_condition_code (enum rtx_code code, machine_mode mode, bool reverse,
switch (code)
{
case EQ:
+ gcc_assert (mode != CCGZmode);
switch (mode)
{
case E_CCAmode:
@@ -16757,6 +16985,7 @@ put_condition_code (enum rtx_code code, machine_mode mode, bool reverse,
}
break;
case NE:
+ gcc_assert (mode != CCGZmode);
switch (mode)
{
case E_CCAmode:
@@ -16801,6 +17030,7 @@ put_condition_code (enum rtx_code code, machine_mode mode, bool reverse,
case E_CCmode:
case E_CCGCmode:
+ case E_CCGZmode:
suffix = "l";
break;
@@ -16809,7 +17039,7 @@ put_condition_code (enum rtx_code code, machine_mode mode, bool reverse,
}
break;
case LTU:
- if (mode == CCmode)
+ if (mode == CCmode || mode == CCGZmode)
suffix = "b";
else if (mode == CCCmode)
suffix = fp ? "b" : "c";
@@ -16826,6 +17056,7 @@ put_condition_code (enum rtx_code code, machine_mode mode, bool reverse,
case E_CCmode:
case E_CCGCmode:
+ case E_CCGZmode:
suffix = "ge";
break;
@@ -16834,7 +17065,7 @@ put_condition_code (enum rtx_code code, machine_mode mode, bool reverse,
}
break;
case GEU:
- if (mode == CCmode)
+ if (mode == CCmode || mode == CCGZmode)
suffix = "nb";
else if (mode == CCCmode)
suffix = fp ? "nb" : "nc";
@@ -17613,6 +17844,8 @@ ix86_print_operand (FILE *file, rtx x, int code)
case '!':
if (ix86_bnd_prefixed_insn_p (current_output_insn))
fputs ("bnd ", file);
+ if (ix86_notrack_prefixed_insn_p (current_output_insn))
+ fputs ("notrack ", file);
return;
default:
@@ -18028,6 +18261,10 @@ i386_asm_output_addr_const_extra (FILE *file, rtx x)
op = XVECEXP (x, 0, 0);
switch (XINT (x, 1))
{
+ case UNSPEC_GOTOFF:
+ output_addr_const (file, op);
+ fputs ("@gotoff", file);
+ break;
case UNSPEC_GOTTPOFF:
output_addr_const (file, op);
/* FIXME: This might be @TPOFF in Sun ld. */
@@ -18147,89 +18384,66 @@ output_387_binary_op (rtx_insn *insn, rtx *operands)
{
static char buf[40];
const char *p;
- const char *ssep;
- int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
+ bool is_sse
+ = (SSE_REG_P (operands[0])
+ || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]));
- /* Even if we do not want to check the inputs, this documents input
- constraints. Which helps in understanding the following code. */
- if (flag_checking)
- {
- if (STACK_REG_P (operands[0])
- && ((REG_P (operands[1])
- && REGNO (operands[0]) == REGNO (operands[1])
- && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
- || (REG_P (operands[2])
- && REGNO (operands[0]) == REGNO (operands[2])
- && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
- && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
- ; /* ok */
- else
- gcc_assert (is_sse);
- }
+ if (is_sse)
+ p = "%v";
+ else if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
+ || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
+ p = "fi";
+ else
+ p = "f";
+
+ strcpy (buf, p);
switch (GET_CODE (operands[3]))
{
case PLUS:
- if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
- || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
- p = "fiadd";
- else
- p = "fadd";
- ssep = "vadd";
- break;
-
+ p = "add"; break;
case MINUS:
- if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
- || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
- p = "fisub";
- else
- p = "fsub";
- ssep = "vsub";
- break;
-
+ p = "sub"; break;
case MULT:
- if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
- || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
- p = "fimul";
- else
- p = "fmul";
- ssep = "vmul";
- break;
-
+ p = "mul"; break;
case DIV:
- if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
- || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
- p = "fidiv";
- else
- p = "fdiv";
- ssep = "vdiv";
- break;
-
+ p = "div"; break;
default:
gcc_unreachable ();
}
+ strcat (buf, p);
+
if (is_sse)
{
+ p = (GET_MODE (operands[0]) == SFmode) ? "ss" : "sd";
+ strcat (buf, p);
+
if (TARGET_AVX)
- {
- strcpy (buf, ssep);
- if (GET_MODE (operands[0]) == SFmode)
- strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
- else
- strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
- }
+ p = "\t{%2, %1, %0|%0, %1, %2}";
else
- {
- strcpy (buf, ssep + 1);
- if (GET_MODE (operands[0]) == SFmode)
- strcat (buf, "ss\t{%2, %0|%0, %2}");
- else
- strcat (buf, "sd\t{%2, %0|%0, %2}");
- }
- return buf;
+ p = "\t{%2, %0|%0, %2}";
+
+ strcat (buf, p);
+ return buf;
}
- strcpy (buf, p);
+
+ /* Even if we do not want to check the inputs, this documents input
+ constraints. Which helps in understanding the following code. */
+ if (flag_checking)
+ {
+ if (STACK_REG_P (operands[0])
+ && ((REG_P (operands[1])
+ && REGNO (operands[0]) == REGNO (operands[1])
+ && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
+ || (REG_P (operands[2])
+ && REGNO (operands[0]) == REGNO (operands[2])
+ && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
+ && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
+ ; /* ok */
+ else
+ gcc_unreachable ();
+ }
switch (GET_CODE (operands[3]))
{
@@ -18818,10 +19032,13 @@ ix86_emit_mode_set (int entity, int mode, int prev_mode ATTRIBUTE_UNUSED,
const char *
output_fix_trunc (rtx_insn *insn, rtx *operands, bool fisttp)
{
- int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
- int dimode_p = GET_MODE (operands[0]) == DImode;
+ bool stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG);
+ bool dimode_p = GET_MODE (operands[0]) == DImode;
int round_mode = get_attr_i387_cw (insn);
+ static char buf[40];
+ const char *p;
+
/* Jump through a hoop or two for DImode, since the hardware has no
non-popping instruction. We used to do this a different way, but
that was somewhat fragile and broke with post-reload splitters. */
@@ -18833,18 +19050,20 @@ output_fix_trunc (rtx_insn *insn, rtx *operands, bool fisttp)
gcc_assert (GET_MODE (operands[1]) != TFmode);
if (fisttp)
- output_asm_insn ("fisttp%Z0\t%0", operands);
- else
- {
- if (round_mode != I387_CW_ANY)
- output_asm_insn ("fldcw\t%3", operands);
- if (stack_top_dies || dimode_p)
- output_asm_insn ("fistp%Z0\t%0", operands);
- else
- output_asm_insn ("fist%Z0\t%0", operands);
- if (round_mode != I387_CW_ANY)
- output_asm_insn ("fldcw\t%2", operands);
- }
+ return "fisttp%Z0\t%0";
+
+ strcpy (buf, "fist");
+
+ if (round_mode != I387_CW_ANY)
+ output_asm_insn ("fldcw\t%3", operands);
+
+ p = "p%Z0\t%0";
+ strcat (buf, p + !(stack_top_dies || dimode_p));
+
+ output_asm_insn (buf, operands);
+
+ if (round_mode != I387_CW_ANY)
+ output_asm_insn ("fldcw\t%2", operands);
return "";
}
@@ -18881,120 +19100,65 @@ output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
should be used. UNORDERED_P is true when fucom should be used. */
const char *
-output_fp_compare (rtx_insn *insn, rtx *operands, bool eflags_p, bool unordered_p)
+output_fp_compare (rtx_insn *insn, rtx *operands,
+ bool eflags_p, bool unordered_p)
{
- int stack_top_dies;
- rtx cmp_op0, cmp_op1;
- int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
-
- if (eflags_p)
- {
- cmp_op0 = operands[0];
- cmp_op1 = operands[1];
- }
- else
- {
- cmp_op0 = operands[1];
- cmp_op1 = operands[2];
- }
+ rtx *xops = eflags_p ? &operands[0] : &operands[1];
+ bool stack_top_dies;
- if (is_sse)
- {
- if (GET_MODE (operands[0]) == SFmode)
- if (unordered_p)
- return "%vucomiss\t{%1, %0|%0, %1}";
- else
- return "%vcomiss\t{%1, %0|%0, %1}";
- else
- if (unordered_p)
- return "%vucomisd\t{%1, %0|%0, %1}";
- else
- return "%vcomisd\t{%1, %0|%0, %1}";
- }
+ static char buf[40];
+ const char *p;
- gcc_assert (STACK_TOP_P (cmp_op0));
+ gcc_assert (STACK_TOP_P (xops[0]));
- stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
+ stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG);
- if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
+ if (eflags_p)
{
- if (stack_top_dies)
- {
- output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
- return output_387_ffreep (operands, 1);
- }
- else
- return "ftst\n\tfnstsw\t%0";
+ p = unordered_p ? "fucomi" : "fcomi";
+ strcpy (buf, p);
+
+ p = "p\t{%y1, %0|%0, %y1}";
+ strcat (buf, p + !stack_top_dies);
+
+ return buf;
}
- if (STACK_REG_P (cmp_op1)
+ if (STACK_REG_P (xops[1])
&& stack_top_dies
- && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
- && REGNO (cmp_op1) != FIRST_STACK_REG)
+ && find_regno_note (insn, REG_DEAD, FIRST_STACK_REG + 1))
{
- /* If both the top of the 387 stack dies, and the other operand
- is also a stack register that dies, then this must be a
- `fcompp' float compare */
+ gcc_assert (REGNO (xops[1]) == FIRST_STACK_REG + 1);
- if (eflags_p)
- {
- /* There is no double popping fcomi variant. Fortunately,
- eflags is immune from the fstp's cc clobbering. */
- if (unordered_p)
- output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
- else
- output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
- return output_387_ffreep (operands, 0);
- }
- else
- {
- if (unordered_p)
- return "fucompp\n\tfnstsw\t%0";
- else
- return "fcompp\n\tfnstsw\t%0";
- }
+ /* If the top of the 387 stack dies, and the other operand
+ is also a stack register that dies, then this must be a
+ `fcompp' float compare. */
+ p = unordered_p ? "fucompp" : "fcompp";
+ strcpy (buf, p);
+ }
+ else if (const0_operand (xops[1], VOIDmode))
+ {
+ gcc_assert (!unordered_p);
+ strcpy (buf, "ftst");
}
else
{
- /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
-
- static const char * const alt[16] =
- {
- "fcom%Z2\t%y2\n\tfnstsw\t%0",
- "fcomp%Z2\t%y2\n\tfnstsw\t%0",
- "fucom%Z2\t%y2\n\tfnstsw\t%0",
- "fucomp%Z2\t%y2\n\tfnstsw\t%0",
-
- "ficom%Z2\t%y2\n\tfnstsw\t%0",
- "ficomp%Z2\t%y2\n\tfnstsw\t%0",
- NULL,
- NULL,
-
- "fcomi\t{%y1, %0|%0, %y1}",
- "fcomip\t{%y1, %0|%0, %y1}",
- "fucomi\t{%y1, %0|%0, %y1}",
- "fucomip\t{%y1, %0|%0, %y1}",
-
- NULL,
- NULL,
- NULL,
- NULL
- };
-
- int mask;
- const char *ret;
-
- mask = eflags_p << 3;
- mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
- mask |= unordered_p << 1;
- mask |= stack_top_dies;
+ if (GET_MODE_CLASS (GET_MODE (xops[1])) == MODE_INT)
+ {
+ gcc_assert (!unordered_p);
+ p = "ficom";
+ }
+ else
+ p = unordered_p ? "fucom" : "fcom";
- gcc_assert (mask < 16);
- ret = alt[mask];
- gcc_assert (ret);
+ strcpy (buf, p);
- return ret;
+ p = "p%Z2\t%y2";
+ strcat (buf, p + !stack_top_dies);
}
+
+ output_asm_insn (buf, operands);
+ return "fnstsw\t%0";
}
void
@@ -19067,20 +19231,6 @@ ix86_expand_clear (rtx dest)
emit_insn (tmp);
}
-/* X is an unchanging MEM. If it is a constant pool reference, return
- the constant pool rtx, else NULL. */
-
-rtx
-maybe_get_pool_constant (rtx x)
-{
- x = ix86_delegitimize_address (XEXP (x, 0));
-
- if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
- return get_pool_constant (x);
-
- return NULL_RTX;
-}
-
void
ix86_expand_move (machine_mode mode, rtx operands[])
{
@@ -21526,6 +21676,8 @@ ix86_match_ccmode (rtx insn, machine_mode req_mode)
case E_CCZmode:
break;
+ case E_CCGZmode:
+
case E_CCAmode:
case E_CCCmode:
case E_CCOmode:
@@ -21563,18 +21715,38 @@ ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
}
-/* Figure out whether to use ordered or unordered fp comparisons.
- Return the appropriate mode to use. */
+/* Figure out whether to use unordered fp comparisons. */
-machine_mode
-ix86_fp_compare_mode (enum rtx_code)
+static bool
+ix86_unordered_fp_compare (enum rtx_code code)
{
- /* ??? In order to make all comparisons reversible, we do all comparisons
- non-trapping when compiling for IEEE. Once gcc is able to distinguish
- all forms trapping and nontrapping comparisons, we can make inequality
- comparisons trapping again, since it results in better code when using
- FCOM based compares. */
- return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
+ if (!TARGET_IEEE_FP)
+ return false;
+
+ switch (code)
+ {
+ case GT:
+ case GE:
+ case LT:
+ case LE:
+ return false;
+
+ case EQ:
+ case NE:
+
+ case LTGT:
+ case UNORDERED:
+ case ORDERED:
+ case UNLT:
+ case UNLE:
+ case UNGT:
+ case UNGE:
+ case UNEQ:
+ return true;
+
+ default:
+ gcc_unreachable ();
+ }
}
machine_mode
@@ -21585,7 +21757,7 @@ ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
if (SCALAR_FLOAT_MODE_P (mode))
{
gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
- return ix86_fp_compare_mode (code);
+ return CCFPmode;
}
switch (code)
@@ -21707,7 +21879,6 @@ ix86_cc_modes_compatible (machine_mode m1, machine_mode m2)
}
case E_CCFPmode:
- case E_CCFPUmode:
/* These are only compatible with themselves, which we already
checked above. */
return VOIDmode;
@@ -21811,10 +21982,10 @@ ix86_fp_comparison_strategy (enum rtx_code)
static enum rtx_code
ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
{
- machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
+ bool unordered_compare = ix86_unordered_fp_compare (code);
rtx op0 = *pop0, op1 = *pop1;
machine_mode op_mode = GET_MODE (op0);
- int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
+ bool is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
/* All of the unordered compare instructions only work on registers.
The same is true of the fcomi compare instructions. The XFmode
@@ -21823,7 +21994,7 @@ ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
floating point. */
if (!is_sse
- && (fpcmp_mode == CCFPUmode
+ && (unordered_compare
|| (op_mode == XFmode
&& ! (standard_80387_constant_p (op0) == 1
|| standard_80387_constant_p (op1) == 1)
@@ -21920,27 +22091,29 @@ ix86_fp_compare_code_to_integer (enum rtx_code code)
static rtx
ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
{
- machine_mode fpcmp_mode, intcmp_mode;
+ bool unordered_compare = ix86_unordered_fp_compare (code);
+ machine_mode intcmp_mode;
rtx tmp, tmp2;
- fpcmp_mode = ix86_fp_compare_mode (code);
code = ix86_prepare_fp_compare_args (code, &op0, &op1);
/* Do fcomi/sahf based test when profitable. */
switch (ix86_fp_comparison_strategy (code))
{
case IX86_FPCMP_COMI:
- intcmp_mode = fpcmp_mode;
- tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
- tmp = gen_rtx_SET (gen_rtx_REG (fpcmp_mode, FLAGS_REG), tmp);
- emit_insn (tmp);
+ intcmp_mode = CCFPmode;
+ tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
+ if (unordered_compare)
+ tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
+ emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
break;
case IX86_FPCMP_SAHF:
- intcmp_mode = fpcmp_mode;
- tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
- tmp = gen_rtx_SET (gen_rtx_REG (fpcmp_mode, FLAGS_REG), tmp);
-
+ intcmp_mode = CCFPmode;
+ tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
+ if (unordered_compare)
+ tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
+ tmp = gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp);
if (!scratch)
scratch = gen_reg_rtx (HImode);
tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
@@ -21949,11 +22122,13 @@ ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
case IX86_FPCMP_ARITH:
/* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
- tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
- tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
+ tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
+ if (unordered_compare)
+ tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
+ tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
if (!scratch)
scratch = gen_reg_rtx (HImode);
- emit_insn (gen_rtx_SET (scratch, tmp2));
+ emit_insn (gen_rtx_SET (scratch, tmp));
/* In the unordered case, we have to check C2 for NaN's, which
doesn't happen to work out to anything nice combination-wise.
@@ -22234,6 +22409,62 @@ ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
break;
}
+ /* Emulate comparisons that do not depend on Zero flag with
+ double-word subtraction. Note that only Overflow, Sign
+ and Carry flags are valid, so swap arguments and condition
+ of comparisons that would otherwise test Zero flag. */
+
+ switch (code)
+ {
+ case LE: case LEU: case GT: case GTU:
+ std::swap (lo[0], lo[1]);
+ std::swap (hi[0], hi[1]);
+ code = swap_condition (code);
+ /* FALLTHRU */
+
+ case LT: case LTU: case GE: case GEU:
+ {
+ rtx (*cmp_insn) (rtx, rtx);
+ rtx (*sbb_insn) (rtx, rtx, rtx);
+ bool uns = (code == LTU || code == GEU);
+
+ if (TARGET_64BIT)
+ {
+ cmp_insn = gen_cmpdi_1;
+ sbb_insn
+ = uns ? gen_subdi3_carry_ccc : gen_subdi3_carry_ccgz;
+ }
+ else
+ {
+ cmp_insn = gen_cmpsi_1;
+ sbb_insn
+ = uns ? gen_subsi3_carry_ccc : gen_subsi3_carry_ccgz;
+ }
+
+ if (!nonimmediate_operand (lo[0], submode))
+ lo[0] = force_reg (submode, lo[0]);
+ if (!x86_64_general_operand (lo[1], submode))
+ lo[1] = force_reg (submode, lo[1]);
+
+ if (!register_operand (hi[0], submode))
+ hi[0] = force_reg (submode, hi[0]);
+ if ((uns && !nonimmediate_operand (hi[1], submode))
+ || (!uns && !x86_64_general_operand (hi[1], submode)))
+ hi[1] = force_reg (submode, hi[1]);
+
+ emit_insn (cmp_insn (lo[0], lo[1]));
+ emit_insn (sbb_insn (gen_rtx_SCRATCH (submode), hi[0], hi[1]));
+
+ tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
+
+ ix86_expand_branch (code, tmp, const0_rtx, label);
+ return;
+ }
+
+ default:
+ break;
+ }
+
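/* Illustrative expansion (assumed assembly, register allocation invented)
   of the LTU case above for a DImode "a < b" compare on 32-bit x86:

     cmpl   %ebx, %eax     ; low halves: sets CF if lo(a) < lo(b)
     sbbl   %edx, %ecx     ; high halves minus borrow; only the flags matter
     jb     .Lless         ; LTU is tested via the Carry flag (CCCmode)

   The signed variants use CCGZmode and branch on Sign/Overflow (jl/jge)
   rather than the Zero flag.  */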
/* Otherwise, we need two or three jumps. */
label2 = gen_label_rtx ();
@@ -22339,8 +22570,7 @@ ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
compare_seq = get_insns ();
end_sequence ();
- if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
- || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
+ if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
else
code = GET_CODE (compare_op);
@@ -22480,8 +22710,7 @@ ix86_expand_int_movcc (rtx operands[])
flags = XEXP (compare_op, 0);
- if (GET_MODE (flags) == CCFPmode
- || GET_MODE (flags) == CCFPUmode)
+ if (GET_MODE (flags) == CCFPmode)
{
fpcmp = true;
compare_code
@@ -23826,10 +24055,10 @@ struct expand_vec_perm_d
};
static bool
-ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1,
+ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
struct expand_vec_perm_d *d)
{
- /* ix86_expand_vec_perm_vpermi2 is called from both const and non-const
+ /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
expander, so args are either in d, or in op0, op1 etc. */
machine_mode mode = GET_MODE (d ? d->op0 : op0);
machine_mode maskmode = mode;
@@ -23839,83 +24068,83 @@ ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1,
{
case E_V8HImode:
if (TARGET_AVX512VL && TARGET_AVX512BW)
- gen = gen_avx512vl_vpermi2varv8hi3;
+ gen = gen_avx512vl_vpermt2varv8hi3;
break;
case E_V16HImode:
if (TARGET_AVX512VL && TARGET_AVX512BW)
- gen = gen_avx512vl_vpermi2varv16hi3;
+ gen = gen_avx512vl_vpermt2varv16hi3;
break;
case E_V64QImode:
if (TARGET_AVX512VBMI)
- gen = gen_avx512bw_vpermi2varv64qi3;
+ gen = gen_avx512bw_vpermt2varv64qi3;
break;
case E_V32HImode:
if (TARGET_AVX512BW)
- gen = gen_avx512bw_vpermi2varv32hi3;
+ gen = gen_avx512bw_vpermt2varv32hi3;
break;
case E_V4SImode:
if (TARGET_AVX512VL)
- gen = gen_avx512vl_vpermi2varv4si3;
+ gen = gen_avx512vl_vpermt2varv4si3;
break;
case E_V8SImode:
if (TARGET_AVX512VL)
- gen = gen_avx512vl_vpermi2varv8si3;
+ gen = gen_avx512vl_vpermt2varv8si3;
break;
case E_V16SImode:
if (TARGET_AVX512F)
- gen = gen_avx512f_vpermi2varv16si3;
+ gen = gen_avx512f_vpermt2varv16si3;
break;
case E_V4SFmode:
if (TARGET_AVX512VL)
{
- gen = gen_avx512vl_vpermi2varv4sf3;
+ gen = gen_avx512vl_vpermt2varv4sf3;
maskmode = V4SImode;
}
break;
case E_V8SFmode:
if (TARGET_AVX512VL)
{
- gen = gen_avx512vl_vpermi2varv8sf3;
+ gen = gen_avx512vl_vpermt2varv8sf3;
maskmode = V8SImode;
}
break;
case E_V16SFmode:
if (TARGET_AVX512F)
{
- gen = gen_avx512f_vpermi2varv16sf3;
+ gen = gen_avx512f_vpermt2varv16sf3;
maskmode = V16SImode;
}
break;
case E_V2DImode:
if (TARGET_AVX512VL)
- gen = gen_avx512vl_vpermi2varv2di3;
+ gen = gen_avx512vl_vpermt2varv2di3;
break;
case E_V4DImode:
if (TARGET_AVX512VL)
- gen = gen_avx512vl_vpermi2varv4di3;
+ gen = gen_avx512vl_vpermt2varv4di3;
break;
case E_V8DImode:
if (TARGET_AVX512F)
- gen = gen_avx512f_vpermi2varv8di3;
+ gen = gen_avx512f_vpermt2varv8di3;
break;
case E_V2DFmode:
if (TARGET_AVX512VL)
{
- gen = gen_avx512vl_vpermi2varv2df3;
+ gen = gen_avx512vl_vpermt2varv2df3;
maskmode = V2DImode;
}
break;
case E_V4DFmode:
if (TARGET_AVX512VL)
{
- gen = gen_avx512vl_vpermi2varv4df3;
+ gen = gen_avx512vl_vpermt2varv4df3;
maskmode = V4DImode;
}
break;
case E_V8DFmode:
if (TARGET_AVX512F)
{
- gen = gen_avx512f_vpermi2varv8df3;
+ gen = gen_avx512f_vpermt2varv8df3;
maskmode = V8DImode;
}
break;
@@ -23926,7 +24155,7 @@ ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1,
if (gen == NULL)
return false;
- /* ix86_expand_vec_perm_vpermi2 is called from both const and non-const
+ /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
expander, so args are either in d, or in op0, op1 etc. */
if (d)
{
@@ -23939,7 +24168,7 @@ ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1,
mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
}
- emit_insn (gen (target, op0, force_reg (maskmode, mask), op1));
+ emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
return true;
}
@@ -23990,7 +24219,7 @@ ix86_expand_vec_perm (rtx operands[])
}
}
- if (ix86_expand_vec_perm_vpermi2 (target, op0, mask, op1, NULL))
+ if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
return;
if (TARGET_AVX2)
@@ -24515,8 +24744,7 @@ ix86_expand_int_addcc (rtx operands[])
flags = XEXP (compare_op, 0);
- if (GET_MODE (flags) == CCFPmode
- || GET_MODE (flags) == CCFPUmode)
+ if (GET_MODE (flags) == CCFPmode)
{
fpcmp = true;
code = ix86_fp_compare_code_to_integer (code);
@@ -24603,11 +24831,7 @@ ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
/* Optimize constant pool reference to immediates. This is used by fp
moves, that force all constants to memory to allow combining. */
if (MEM_P (operand) && MEM_READONLY_P (operand))
- {
- rtx tmp = maybe_get_pool_constant (operand);
- if (tmp)
- operand = tmp;
- }
+ operand = avoid_constant_pool_reference (operand);
if (MEM_P (operand) && !offsettable_memref_p (operand))
{
@@ -29804,8 +30028,12 @@ BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
IX86_BUILTIN__BDESC_MPX_LAST, 1);
BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
IX86_BUILTIN__BDESC_MPX_CONST_LAST, 1);
-BDESC_VERIFYS (IX86_BUILTIN_MAX,
+BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_FIRST,
IX86_BUILTIN__BDESC_MULTI_ARG_LAST, 1);
+BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_NORMAL_FIRST,
+ IX86_BUILTIN__BDESC_CET_LAST, 1);
+BDESC_VERIFYS (IX86_BUILTIN_MAX,
+ IX86_BUILTIN__BDESC_CET_NORMAL_LAST, 1);
/* Set up all the MMX/SSE builtins, even builtins for instructions that are not
in the current target ISA to allow the user to compile particular modules
@@ -30472,6 +30700,35 @@ ix86_init_mmx_sse_builtins (void)
BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_LAST,
IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
ARRAY_SIZE (bdesc_multi_arg) - 1);
+
+ /* Add CET intrinsics. */
+ for (i = 0, d = bdesc_cet; i < ARRAY_SIZE (bdesc_cet); i++, d++)
+ {
+ BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_CET_FIRST, i);
+ if (d->name == 0)
+ continue;
+
+ ftype = (enum ix86_builtin_func_type) d->flag;
+ def_builtin2 (d->mask, d->name, ftype, d->code);
+ }
+ BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_LAST,
+ IX86_BUILTIN__BDESC_CET_FIRST,
+ ARRAY_SIZE (bdesc_cet) - 1);
+
+ for (i = 0, d = bdesc_cet_rdssp;
+ i < ARRAY_SIZE (bdesc_cet_rdssp);
+ i++, d++)
+ {
+ BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_CET_NORMAL_FIRST, i);
+ if (d->name == 0)
+ continue;
+
+ ftype = (enum ix86_builtin_func_type) d->flag;
+ def_builtin2 (d->mask, d->name, ftype, d->code);
+ }
+ BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_NORMAL_LAST,
+ IX86_BUILTIN__BDESC_CET_NORMAL_FIRST,
+ ARRAY_SIZE (bdesc_cet_rdssp) - 1);
}
static void
@@ -33425,6 +33682,7 @@ ix86_expand_args_builtin (const struct builtin_description *d,
case UQI_FTYPE_V4SF_V4SF_INT:
case UHI_FTYPE_V16SI_V16SI_INT:
case UHI_FTYPE_V16SF_V16SF_INT:
+ case V64QI_FTYPE_V64QI_V64QI_INT:
nargs = 3;
nargs_constant = 1;
break;
@@ -33652,6 +33910,13 @@ ix86_expand_args_builtin (const struct builtin_description *d,
mask_pos = 1;
nargs_constant = 1;
break;
+ case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
+ case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
+ case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
+ nargs = 5;
+ mask_pos = 1;
+ nargs_constant = 2;
+ break;
default:
gcc_unreachable ();
@@ -34830,10 +35095,10 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
machine_mode mode, int ignore)
{
size_t i;
- enum insn_code icode;
+ enum insn_code icode, icode2;
tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
tree arg0, arg1, arg2, arg3, arg4;
- rtx op0, op1, op2, op3, op4, pat, insn;
+ rtx op0, op1, op2, op3, op4, pat, pat2, insn;
machine_mode mode0, mode1, mode2, mode3, mode4;
unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
@@ -35808,22 +36073,34 @@ rdseed_step:
case IX86_BUILTIN_SBB32:
icode = CODE_FOR_subborrowsi;
+ icode2 = CODE_FOR_subborrowsi_0;
mode0 = SImode;
+ mode1 = DImode;
+ mode2 = CCmode;
goto handlecarry;
case IX86_BUILTIN_SBB64:
icode = CODE_FOR_subborrowdi;
+ icode2 = CODE_FOR_subborrowdi_0;
mode0 = DImode;
+ mode1 = TImode;
+ mode2 = CCmode;
goto handlecarry;
case IX86_BUILTIN_ADDCARRYX32:
icode = CODE_FOR_addcarrysi;
+ icode2 = CODE_FOR_addcarrysi_0;
mode0 = SImode;
+ mode1 = DImode;
+ mode2 = CCCmode;
goto handlecarry;
case IX86_BUILTIN_ADDCARRYX64:
icode = CODE_FOR_addcarrydi;
+ icode2 = CODE_FOR_addcarrydi_0;
mode0 = DImode;
+ mode1 = TImode;
+ mode2 = CCCmode;
handlecarry:
arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
@@ -35832,7 +36109,8 @@ rdseed_step:
arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
op1 = expand_normal (arg0);
- op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
+ if (!integer_zerop (arg0))
+ op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
op2 = expand_normal (arg1);
if (!register_operand (op2, mode0))
@@ -35849,21 +36127,31 @@ rdseed_step:
op4 = copy_addr_to_reg (op4);
}
- /* Generate CF from input operand. */
- emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
-
- /* Generate instruction that consumes CF. */
op0 = gen_reg_rtx (mode0);
+ if (integer_zerop (arg0))
+ {
+ /* If arg0 is 0, optimize right away into add or sub
+ instruction that sets CCCmode flags. */
+ op1 = gen_rtx_REG (mode2, FLAGS_REG);
+ emit_insn (GEN_FCN (icode2) (op0, op2, op3));
+ }
+ else
+ {
+ /* Generate CF from input operand. */
+ emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
- op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
- pat = gen_rtx_LTU (mode0, op1, const0_rtx);
- emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat));
+ /* Generate instruction that consumes CF. */
+ op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
+ pat = gen_rtx_LTU (mode1, op1, const0_rtx);
+ pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
+ emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
+ }
/* Return current CF value. */
if (target == 0)
target = gen_reg_rtx (QImode);
- PUT_MODE (pat, QImode);
+ pat = gen_rtx_LTU (QImode, op1, const0_rtx);
emit_insn (gen_rtx_SET (target, pat));
/* Store the result. */
@@ -36656,6 +36944,57 @@ rdseed_step:
emit_insn (gen_xabort (op0));
return 0;
+ case IX86_BUILTIN_RSTORSSP:
+ case IX86_BUILTIN_CLRSSBSY:
+ arg0 = CALL_EXPR_ARG (exp, 0);
+ op0 = expand_normal (arg0);
+ icode = (fcode == IX86_BUILTIN_RSTORSSP
+ ? CODE_FOR_rstorssp
+ : CODE_FOR_clrssbsy);
+ if (!address_operand (op0, VOIDmode))
+ {
+ op1 = convert_memory_address (Pmode, op0);
+ op0 = copy_addr_to_reg (op1);
+ }
+ emit_insn (GEN_FCN (icode) (gen_rtx_MEM (Pmode, op0)));
+ return 0;
+
+ case IX86_BUILTIN_WRSSD:
+ case IX86_BUILTIN_WRSSQ:
+ case IX86_BUILTIN_WRUSSD:
+ case IX86_BUILTIN_WRUSSQ:
+ arg0 = CALL_EXPR_ARG (exp, 0);
+ op0 = expand_normal (arg0);
+ arg1 = CALL_EXPR_ARG (exp, 1);
+ op1 = expand_normal (arg1);
+ switch (fcode)
+ {
+ case IX86_BUILTIN_WRSSD:
+ icode = CODE_FOR_wrsssi;
+ mode = SImode;
+ break;
+ case IX86_BUILTIN_WRSSQ:
+ icode = CODE_FOR_wrssdi;
+ mode = DImode;
+ break;
+ case IX86_BUILTIN_WRUSSD:
+ icode = CODE_FOR_wrusssi;
+ mode = SImode;
+ break;
+ case IX86_BUILTIN_WRUSSQ:
+ icode = CODE_FOR_wrussdi;
+ mode = DImode;
+ break;
+ }
+ op0 = force_reg (mode, op0);
+ if (!address_operand (op1, VOIDmode))
+ {
+ op2 = convert_memory_address (Pmode, op1);
+ op1 = copy_addr_to_reg (op2);
+ }
+ emit_insn (GEN_FCN (icode) (op0, gen_rtx_MEM (mode, op1)));
+ return 0;
+
default:
break;
}
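/* Hedged usage sketch, not part of this patch: the RSTORSSP/CLRSSBSY and
   WRSS/WRUSS expanders above back the shadow-stack intrinsics, assumed to
   be declared in <immintrin.h> when -mshstk is enabled.  The function and
   parameter names below are invented for illustration.

     #include <immintrin.h>

     void
     shadow_stack_demo (void *restore_token, unsigned int val, void *slot)
     {
       _rstorssp (restore_token);  // restore SSP from a saved token
       _wrssd (val, slot);         // write VAL to the shadow stack at SLOT
       _clrssbsy (restore_token);  // clear the busy flag in the token
     }
*/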
@@ -36958,6 +37297,22 @@ s4fma_expand:
d->flag, d->comparison);
}
+ if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
+ && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
+ {
+ i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
+ return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
+ target);
+ }
+
+ if (fcode >= IX86_BUILTIN__BDESC_CET_NORMAL_FIRST
+ && fcode <= IX86_BUILTIN__BDESC_CET_NORMAL_LAST)
+ {
+ i = fcode - IX86_BUILTIN__BDESC_CET_NORMAL_FIRST;
+ return ix86_expand_args_builtin (bdesc_cet_rdssp + i, exp,
+ target);
+ }
+
gcc_unreachable ();
}
@@ -38347,6 +38702,28 @@ ix86_can_change_mode_class (machine_mode from, machine_mode to,
return true;
}
+/* Return index of MODE in the sse load/store tables. */
+
+static inline int
+sse_store_index (machine_mode mode)
+{
+ switch (GET_MODE_SIZE (mode))
+ {
+ case 4:
+ return 0;
+ case 8:
+ return 1;
+ case 16:
+ return 2;
+ case 32:
+ return 3;
+ case 64:
+ return 4;
+ default:
+ return -1;
+ }
+}
+
/* Return the cost of moving data of mode M between a
register and memory. A value of 2 is the default; this cost is
relative to those in `REGISTER_MOVE_COST'.
@@ -38390,21 +38767,9 @@ inline_memory_move_cost (machine_mode mode, enum reg_class regclass,
}
if (SSE_CLASS_P (regclass))
{
- int index;
- switch (GET_MODE_SIZE (mode))
- {
- case 4:
- index = 0;
- break;
- case 8:
- index = 1;
- break;
- case 16:
- index = 2;
- break;
- default:
- return 100;
- }
+ int index = sse_store_index (mode);
+ if (index == -1)
+ return 100;
if (in == 2)
return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
@@ -38507,8 +38872,10 @@ ix86_register_move_cost (machine_mode mode, reg_class_t class1_i,
/* In case of copying from general_purpose_register we may emit multiple
stores followed by single load causing memory size mismatch stall.
Count this as arbitrarily high cost of 20. */
- if (targetm.class_max_nregs (class1, mode)
- > targetm.class_max_nregs (class2, mode))
+ if (GET_MODE_BITSIZE (mode) > BITS_PER_WORD
+ && TARGET_MEMORY_MISMATCH_STALL
+ && targetm.class_max_nregs (class1, mode)
+ > targetm.class_max_nregs (class2, mode))
cost += 20;
/* In the case of FP/MMX moves, the registers actually overlap, and we
@@ -38530,12 +38897,19 @@ ix86_register_move_cost (machine_mode mode, reg_class_t class1_i,
where integer modes in MMX/SSE registers are not tieable
because of missing QImode and HImode moves to, from or between
MMX/SSE registers. */
- return MAX (8, ix86_cost->mmxsse_to_integer);
+ return MAX (8, MMX_CLASS_P (class1) || MMX_CLASS_P (class2)
+ ? ix86_cost->mmxsse_to_integer : ix86_cost->ssemmx_to_integer);
if (MAYBE_FLOAT_CLASS_P (class1))
return ix86_cost->fp_move;
if (MAYBE_SSE_CLASS_P (class1))
- return ix86_cost->sse_move;
+ {
+ if (GET_MODE_BITSIZE (mode) <= 128)
+ return ix86_cost->xmm_move;
+ if (GET_MODE_BITSIZE (mode) <= 256)
+ return ix86_cost->ymm_move;
+ return ix86_cost->zmm_move;
+ }
if (MAYBE_MMX_CLASS_P (class1))
return ix86_cost->mmx_move;
return 2;
@@ -38806,6 +39180,27 @@ ix86_set_reg_reg_cost (machine_mode mode)
return COSTS_N_INSNS (CEIL (GET_MODE_SIZE (mode), units));
}
+/* Return the cost of a vector operation in MODE given that the scalar
+ version has cost COST. If PARALLEL is true, assume that the CPU has
+ more than one unit performing the operation. */
+
+static int
+ix86_vec_cost (machine_mode mode, int cost, bool parallel)
+{
+ if (!VECTOR_MODE_P (mode))
+ return cost;
+
+ if (!parallel)
+ return cost * GET_MODE_NUNITS (mode);
+ if (GET_MODE_BITSIZE (mode) == 128
+ && TARGET_SSE_SPLIT_REGS)
+ return cost * 2;
+ if (GET_MODE_BITSIZE (mode) > 128
+ && TARGET_AVX128_OPTIMAL)
+ return cost * GET_MODE_BITSIZE (mode) / 128;
+ return cost;
+}
+
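/* Worked example for the helper above (cost numbers assumed): with
   cost->addss == 4, a V8SFmode (256-bit, 8-element) addition costs

     4        on a target with full-width vector units,
     4 * 2    if TARGET_AVX128_OPTIMAL splits 256-bit ops into 128-bit halves,
     4 * 8    if PARALLEL is false and the operation is done per element.  */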
/* Compute a (partial) cost for rtx X. Return true if the complete
cost has been computed, and false if subexpressions should be
scanned. In either case, *TOTAL contains the cost result. */
@@ -38819,6 +39214,9 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
enum rtx_code outer_code = (enum rtx_code) outer_code_i;
const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
int src_cost;
+ machine_mode inner_mode = mode;
+ if (VECTOR_MODE_P (mode))
+ inner_mode = GET_MODE_INNER (mode);
switch (code)
{
@@ -38963,19 +39361,20 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
shift with one insn set the cost to prefer paddb. */
if (CONSTANT_P (XEXP (x, 1)))
{
- *total = (cost->fabs
+ *total = ix86_vec_cost (mode,
+ cost->sse_op
+ rtx_cost (XEXP (x, 0), mode, code, 0, speed)
- + (speed ? 2 : COSTS_N_BYTES (16)));
+ + (speed ? 2 : COSTS_N_BYTES (16)), true);
return true;
}
count = 3;
}
else if (TARGET_SSSE3)
count = 7;
- *total = cost->fabs * count;
+ *total = ix86_vec_cost (mode, cost->sse_op * count, true);
}
else
- *total = cost->fabs;
+ *total = ix86_vec_cost (mode, cost->sse_op, true);
}
else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
{
@@ -39017,9 +39416,9 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
gcc_assert (FLOAT_MODE_P (mode));
gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F);
- /* ??? SSE scalar/vector cost should be used here. */
- /* ??? Bald assumption that fma has the same cost as fmul. */
- *total = cost->fmul;
+ *total = ix86_vec_cost (mode,
+ mode == SFmode ? cost->fmass : cost->fmasd,
+ true);
*total += rtx_cost (XEXP (x, 1), mode, FMA, 1, speed);
/* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
@@ -39038,8 +39437,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
case MULT:
if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
{
- /* ??? SSE scalar cost should be used here. */
- *total = cost->fmul;
+ *total = inner_mode == DFmode ? cost->mulsd : cost->mulss;
return false;
}
else if (X87_FLOAT_MODE_P (mode))
@@ -39049,8 +39447,9 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
}
else if (FLOAT_MODE_P (mode))
{
- /* ??? SSE vector cost should be used here. */
- *total = cost->fmul;
+ *total = ix86_vec_cost (mode,
+ inner_mode == DFmode
+ ? cost->mulsd : cost->mulss, true);
return false;
}
else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
@@ -39063,22 +39462,29 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
extra = 5;
else if (TARGET_SSSE3)
extra = 6;
- *total = cost->fmul * 2 + cost->fabs * extra;
+ *total = ix86_vec_cost (mode,
+ cost->mulss * 2 + cost->sse_op * extra,
+ true);
}
/* V*DImode is emulated with 5-8 insns. */
else if (mode == V2DImode || mode == V4DImode)
{
if (TARGET_XOP && mode == V2DImode)
- *total = cost->fmul * 2 + cost->fabs * 3;
+ *total = ix86_vec_cost (mode,
+ cost->mulss * 2 + cost->sse_op * 3,
+ true);
else
- *total = cost->fmul * 3 + cost->fabs * 5;
+ *total = ix86_vec_cost (mode,
+ cost->mulss * 3 + cost->sse_op * 5,
+ true);
}
/* Without sse4.1, we don't have PMULLD; it's emulated with 7
insns, including two PMULUDQ. */
else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
- *total = cost->fmul * 2 + cost->fabs * 5;
+ *total = ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 5,
+ true);
else
- *total = cost->fmul;
+ *total = ix86_vec_cost (mode, cost->mulss, true);
return false;
}
else
@@ -39132,13 +39538,13 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
case MOD:
case UMOD:
if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
- /* ??? SSE cost should be used here. */
- *total = cost->fdiv;
+ *total = inner_mode == DFmode ? cost->divsd : cost->divss;
else if (X87_FLOAT_MODE_P (mode))
*total = cost->fdiv;
else if (FLOAT_MODE_P (mode))
- /* ??? SSE vector cost should be used here. */
- *total = cost->fdiv;
+ *total = ix86_vec_cost (mode,
+ inner_mode == DFmode ? cost->divsd : cost->divss,
+ true);
else
*total = cost->divide[MODE_INDEX (mode)];
return false;
@@ -39217,8 +39623,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
{
- /* ??? SSE cost should be used here. */
- *total = cost->fadd;
+ *total = cost->addss;
return false;
}
else if (X87_FLOAT_MODE_P (mode))
@@ -39228,8 +39633,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
}
else if (FLOAT_MODE_P (mode))
{
- /* ??? SSE vector cost should be used here. */
- *total = cost->fadd;
+ *total = ix86_vec_cost (mode, cost->addss, true);
return false;
}
/* FALLTHRU */
@@ -39252,8 +39656,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
case NEG:
if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
{
- /* ??? SSE cost should be used here. */
- *total = cost->fchs;
+ *total = cost->sse_op;
return false;
}
else if (X87_FLOAT_MODE_P (mode))
@@ -39263,20 +39666,14 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
}
else if (FLOAT_MODE_P (mode))
{
- /* ??? SSE vector cost should be used here. */
- *total = cost->fchs;
+ *total = ix86_vec_cost (mode, cost->sse_op, true);
return false;
}
/* FALLTHRU */
case NOT:
if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
- {
- /* ??? Should be SSE vector operation cost. */
- /* At least for published AMD latencies, this really is the same
- as the latency for a simple fpu operation like fabs. */
- *total = cost->fabs;
- }
+ *total = ix86_vec_cost (mode, cost->sse_op, true);
else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
*total = cost->add * 2;
else
@@ -39309,28 +39706,38 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
case FLOAT_EXTEND:
if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
*total = 0;
+ else
+ *total = ix86_vec_cost (mode, cost->addss, true);
+ return false;
+
+ case FLOAT_TRUNCATE:
+ if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
+ *total = cost->fadd;
+ else
+ *total = ix86_vec_cost (mode, cost->addss, true);
return false;
case ABS:
+ /* SSE requires memory load for the constant operand. It may make
+ sense to account for this. Of course the constant operand may or
+ may not be reused. */
if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
- /* ??? SSE cost should be used here. */
- *total = cost->fabs;
+ *total = cost->sse_op;
else if (X87_FLOAT_MODE_P (mode))
*total = cost->fabs;
else if (FLOAT_MODE_P (mode))
- /* ??? SSE vector cost should be used here. */
- *total = cost->fabs;
+ *total = ix86_vec_cost (mode, cost->sse_op, true);
return false;
case SQRT:
if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
- /* ??? SSE cost should be used here. */
- *total = cost->fsqrt;
+ *total = mode == SFmode ? cost->sqrtss : cost->sqrtsd;
else if (X87_FLOAT_MODE_P (mode))
*total = cost->fsqrt;
else if (FLOAT_MODE_P (mode))
- /* ??? SSE vector cost should be used here. */
- *total = cost->fsqrt;
+ *total = ix86_vec_cost (mode,
+ mode == SFmode ? cost->sqrtss : cost->sqrtsd,
+ true);
return false;
case UNSPEC:
@@ -39344,7 +39751,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
/* ??? Assume all of these vector manipulation patterns are
recognizable. In which case they all pretty much have the
same cost. */
- *total = cost->fabs;
+ *total = cost->sse_op;
return true;
case VEC_MERGE:
mask = XEXP (x, 2);
@@ -39353,7 +39760,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask)))
*total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed);
else
- *total = cost->fabs;
+ *total = cost->sse_op;
return true;
default:
@@ -39818,6 +40225,10 @@ x86_output_mi_thunk (FILE *file, tree, HOST_WIDE_INT delta,
emit_note (NOTE_INSN_PROLOGUE_END);
+ /* CET is enabled, insert EB instruction. */
+ if ((flag_cf_protection & CF_BRANCH) && TARGET_IBT)
+ emit_insn (gen_nop_endbr ());
+
/* If VCALL_OFFSET, we'll need THIS in a register. Might as well
pull it in now and let DELTA benefit. */
if (REG_P (this_param))
@@ -40835,7 +41246,7 @@ ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
reg = force_reg (innermode, val);
if (GET_MODE (reg) != innermode)
reg = gen_lowpart (innermode, reg);
- XEXP (dup, 0) = reg;
+ SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
seq = get_insns ();
end_sequence ();
if (seq)
@@ -42800,9 +43211,9 @@ ix86_encode_section_info (tree decl, rtx rtl, int first)
enum rtx_code
ix86_reverse_condition (enum rtx_code code, machine_mode mode)
{
- return (mode != CCFPmode && mode != CCFPUmode
- ? reverse_condition (code)
- : reverse_condition_maybe_unordered (code));
+ return (mode == CCFPmode
+ ? reverse_condition_maybe_unordered (code)
+ : reverse_condition (code));
}
/* Output code to perform an x87 FP register move, from OPERANDS[1]
@@ -43415,17 +43826,20 @@ static rtx_code_label *
ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
bool swap_operands)
{
- machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
+ bool unordered_compare = ix86_unordered_fp_compare (code);
rtx_code_label *label;
- rtx tmp;
+ rtx tmp, reg;
if (swap_operands)
std::swap (op0, op1);
label = gen_label_rtx ();
- tmp = gen_rtx_REG (fpcmp_mode, FLAGS_REG);
- emit_insn (gen_rtx_SET (tmp, gen_rtx_COMPARE (fpcmp_mode, op0, op1)));
- tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
+ tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
+ if (unordered_compare)
+ tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
+ reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
+ emit_insn (gen_rtx_SET (reg, tmp));
+ tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
@@ -44044,35 +44458,83 @@ static int
ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
tree vectype, int)
{
+ bool fp = false;
+ machine_mode mode = TImode;
+ int index;
+ if (vectype != NULL)
+ {
+ fp = FLOAT_TYPE_P (vectype);
+ mode = TYPE_MODE (vectype);
+ }
+
switch (type_of_cost)
{
case scalar_stmt:
- return ix86_cost->scalar_stmt_cost;
+ return fp ? ix86_cost->addss : COSTS_N_INSNS (1);
case scalar_load:
- return ix86_cost->scalar_load_cost;
+ /* Load/store costs are relative to the register move cost, which is 2.
+ Recompute them in COSTS_N_INSNS units so everything has the same base. */
+ return COSTS_N_INSNS (fp ? ix86_cost->sse_load[0]
+ : ix86_cost->int_load [2]) / 2;
case scalar_store:
- return ix86_cost->scalar_store_cost;
+ return COSTS_N_INSNS (fp ? ix86_cost->sse_store[0]
+ : ix86_cost->int_store [2]) / 2;
case vector_stmt:
- return ix86_cost->vec_stmt_cost;
+ return ix86_vec_cost (mode,
+ fp ? ix86_cost->addss : ix86_cost->sse_op,
+ true);
case vector_load:
- return ix86_cost->vec_align_load_cost;
+ index = sse_store_index (mode);
+ gcc_assert (index >= 0);
+ return ix86_vec_cost (mode,
+ COSTS_N_INSNS (ix86_cost->sse_load[index]) / 2,
+ true);
case vector_store:
- return ix86_cost->vec_store_cost;
+ index = sse_store_index (mode);
+ return ix86_vec_cost (mode,
+ COSTS_N_INSNS (ix86_cost->sse_store[index]) / 2,
+ true);
case vec_to_scalar:
- return ix86_cost->vec_to_scalar_cost;
-
case scalar_to_vec:
- return ix86_cost->scalar_to_vec_cost;
+ return ix86_vec_cost (mode, ix86_cost->sse_op, true);
+ /* We should have separate costs for unaligned loads and gather/scatter.
+ Do that incrementally. */
case unaligned_load:
+ index = sse_store_index (mode);
+ return ix86_vec_cost (mode,
+ COSTS_N_INSNS
+ (ix86_cost->sse_unaligned_load[index]) / 2,
+ true);
+
case unaligned_store:
- return ix86_cost->vec_unalign_load_cost;
+ index = sse_store_index (mode);
+ return ix86_vec_cost (mode,
+ COSTS_N_INSNS
+ (ix86_cost->sse_unaligned_store[index]) / 2,
+ true);
+
+ case vector_gather_load:
+ return ix86_vec_cost (mode,
+ COSTS_N_INSNS
+ (ix86_cost->gather_static
+ + ix86_cost->gather_per_elt
+ * TYPE_VECTOR_SUBPARTS (vectype)) / 2,
+ true);
+
+ case vector_scatter_store:
+ return ix86_vec_cost (mode,
+ COSTS_N_INSNS
+ (ix86_cost->scatter_static
+ + ix86_cost->scatter_per_elt
+ * TYPE_VECTOR_SUBPARTS (vectype)) / 2,
+ true);
case cond_branch_taken:
return ix86_cost->cond_taken_branch_cost;
@@ -44082,10 +44544,11 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
case vec_perm:
case vec_promote_demote:
- return ix86_cost->vec_stmt_cost;
+ return ix86_vec_cost (mode,
+ ix86_cost->sse_op, true);
case vec_construct:
- return ix86_cost->vec_stmt_cost * (TYPE_VECTOR_SUBPARTS (vectype) - 1);
+ return ix86_vec_cost (mode, ix86_cost->sse_op, false);
default:
gcc_unreachable ();
@@ -44963,8 +45426,8 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
if (ix86_expand_vec_one_operand_perm_avx512 (d))
return true;
- /* Try the AVX512F vpermi2 instructions. */
- if (ix86_expand_vec_perm_vpermi2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
+ /* Try the AVX512F vpermt2/vpermi2 instructions. */
+ if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
return true;
/* See if we can get the same permutation in different vector integer
@@ -46621,9 +47084,9 @@ expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
}
/* Implement arbitrary permutations of two V64QImode operands
- will 2 vpermi2w, 2 vpshufb and one vpor instruction. */
+ with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. */
static bool
-expand_vec_perm_vpermi2_vpshub2 (struct expand_vec_perm_d *d)
+expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
{
if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
return false;
@@ -46868,7 +47331,7 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
return true;
- if (expand_vec_perm_vpermi2_vpshub2 (d))
+ if (expand_vec_perm_vpermt2_vpshub2 (d))
return true;
/* ??? Look for narrow permutations whose element orderings would
@@ -47016,17 +47479,17 @@ ix86_vectorize_vec_perm_const_ok (machine_mode vmode, vec_perm_indices sel)
case E_V8DImode:
case E_V8DFmode:
if (TARGET_AVX512F)
- /* All implementable with a single vpermi2 insn. */
+ /* All implementable with a single vperm[it]2 insn. */
return true;
break;
case E_V32HImode:
if (TARGET_AVX512BW)
- /* All implementable with a single vpermi2 insn. */
+ /* All implementable with a single vperm[it]2 insn. */
return true;
break;
case E_V64QImode:
if (TARGET_AVX512BW)
- /* Implementable with 2 vpermi2, 2 vpshufb and 1 or insn. */
+ /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */
return true;
break;
case E_V8SImode:
@@ -47034,7 +47497,7 @@ ix86_vectorize_vec_perm_const_ok (machine_mode vmode, vec_perm_indices sel)
case E_V4DFmode:
case E_V4DImode:
if (TARGET_AVX512VL)
- /* All implementable with a single vpermi2 insn. */
+ /* All implementable with a single vperm[it]2 insn. */
return true;
break;
case E_V16HImode:
@@ -47204,7 +47667,6 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
op2_h = gen_reg_rtx (qimode);
emit_insn (gen_il (op2_l, op2, op2));
emit_insn (gen_ih (op2_h, op2, op2));
- /* FALLTHRU */
op1_l = gen_reg_rtx (qimode);
op1_h = gen_reg_rtx (qimode);
@@ -47632,6 +48094,46 @@ ix86_bnd_prefixed_insn_p (rtx insn)
return chkp_function_instrumented_p (current_function_decl);
}
+/* Return true if control transfer instruction INSN
+ should be encoded with the notrack prefix. */
+
+static bool
+ix86_notrack_prefixed_insn_p (rtx insn)
+{
+ if (!insn || !((flag_cf_protection & CF_BRANCH) && TARGET_IBT))
+ return false;
+
+ if (CALL_P (insn))
+ {
+ rtx call = get_call_rtx_from (insn);
+ gcc_assert (call != NULL_RTX);
+ rtx addr = XEXP (call, 0);
+
+ /* Do not emit 'notrack' if it's not an indirect call. */
+ if (MEM_P (addr)
+ && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
+ return false;
+ else
+ return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
+ }
+
+ if (JUMP_P (insn) && !flag_cet_switch)
+ {
+ rtx target = JUMP_LABEL (insn);
+ if (target == NULL_RTX || ANY_RETURN_P (target))
+ return false;
+
+ /* Check the jump is a switch table. */
+ rtx_insn *label = as_a<rtx_insn *> (target);
+ rtx_insn *table = next_insn (label);
+ if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
+ return false;
+ else
+ return true;
+ }
+ return false;
+}
+
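/* Hedged illustration, not part of this patch: an indirect call through a
   nocf_check-qualified function pointer carries a REG_CALL_NOCF_CHECK note,
   so with -fcf-protection=branch -mibt the call below is printed with the
   notrack prefix via the '%!' modifier handled earlier.  The typedef and
   function names are invented.

     typedef void (*nocheck_fn) (void) __attribute__((nocf_check));

     void
     dispatch (nocheck_fn fp)
     {
       fp ();  // expected to assemble roughly as:  notrack call *%rax
     }
*/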
/* Calculate integer abs() using only SSE2 instructions. */
void
@@ -49420,6 +49922,9 @@ ix86_run_selftests (void)
#undef TARGET_DELEGITIMIZE_ADDRESS
#define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
+#undef TARGET_CONST_NOT_OK_FOR_DEBUG_P
+#define TARGET_CONST_NOT_OK_FOR_DEBUG_P ix86_const_not_ok_for_debug_p
+
#undef TARGET_MS_BITFIELD_LAYOUT_P
#define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p