author     bstarynk <bstarynk@138bc75d-0d04-0410-961f-82ee72b054a4>  2011-11-21 09:36:11 +0000
committer  bstarynk <bstarynk@138bc75d-0d04-0410-961f-82ee72b054a4>  2011-11-21 09:36:11 +0000
commit     8726b204214570b8506b06e7f1af935e21cb25c9 (patch)
tree       197000f9c24e34c902c5357dd6d522f430ac0722 /gcc/config/i386
parent     0f3b427f9513aaff0bb89af90bb60fd21aa23ce8 (diff)
download   gcc-8726b204214570b8506b06e7f1af935e21cb25c9.tar.gz
2011-11-21  Basile Starynkevitch  <basile@starynkevitch.net>

MELT branch merged with trunk rev 181552 using svnmerge

git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/branches/melt-branch@181554 138bc75d-0d04-0410-961f-82ee72b054a4
Diffstat (limited to 'gcc/config/i386')
-rw-r--r--  gcc/config/i386/emmintrin.h             |   8
-rw-r--r--  gcc/config/i386/i386-builtin-types.def  |   2
-rw-r--r--  gcc/config/i386/i386.c                  | 247
-rw-r--r--  gcc/config/i386/i386.md                 |   1
-rw-r--r--  gcc/config/i386/i386elf.h               |   4
-rw-r--r--  gcc/config/i386/mmx.md                  |   4
-rw-r--r--  gcc/config/i386/sse.md                  |  77
7 files changed, 242 insertions, 101 deletions
diff --git a/gcc/config/i386/emmintrin.h b/gcc/config/i386/emmintrin.h
index fe4cd6abaea..07ac9f3d8ae 100644
--- a/gcc/config/i386/emmintrin.h
+++ b/gcc/config/i386/emmintrin.h
@@ -1418,6 +1418,14 @@ _mm_stream_si32 (int *__A, int __B)
__builtin_ia32_movnti (__A, __B);
}
+#ifdef __x86_64__
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_stream_si64 (long long int *__A, long long int __B)
+{
+ __builtin_ia32_movnti64 (__A, __B);
+}
+#endif
+
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si128 (__m128i *__A, __m128i __B)
{
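
The hunk above adds a 64-bit counterpart to _mm_stream_si32. A minimal usage sketch follows (hypothetical caller, not part of the patch; it assumes a compiler whose <emmintrin.h> provides _mm_stream_si64 on x86-64):

/* Fill a buffer with non-temporal 64-bit stores (movnti with a 64-bit
   operand), then fence so the streaming stores are ordered before any
   later accesses.  */
#include <emmintrin.h>

void
fill_nontemporal (long long *dst, long long value, long long n)
{
  for (long long i = 0; i < n; i++)
    _mm_stream_si64 (dst + i, value);
  _mm_sfence ();
}
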
diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def
index 79fb1427787..d00b05341b9 100644
--- a/gcc/config/i386/i386-builtin-types.def
+++ b/gcc/config/i386/i386-builtin-types.def
@@ -111,6 +111,7 @@ DEF_POINTER_TYPE (PDOUBLE, DOUBLE)
DEF_POINTER_TYPE (PFLOAT, FLOAT)
DEF_POINTER_TYPE (PUSHORT, USHORT)
DEF_POINTER_TYPE (PINT, INT)
+DEF_POINTER_TYPE (PLONGLONG, LONGLONG)
DEF_POINTER_TYPE (PULONGLONG, ULONGLONG)
DEF_POINTER_TYPE (PUNSIGNED, UNSIGNED)
@@ -357,6 +358,7 @@ DEF_FUNCTION_TYPE (VOID, PDOUBLE, V4DF)
DEF_FUNCTION_TYPE (VOID, PFLOAT, V4SF)
DEF_FUNCTION_TYPE (VOID, PFLOAT, V8SF)
DEF_FUNCTION_TYPE (VOID, PINT, INT)
+DEF_FUNCTION_TYPE (VOID, PLONGLONG, LONGLONG)
DEF_FUNCTION_TYPE (VOID, PULONGLONG, ULONGLONG)
DEF_FUNCTION_TYPE (VOID, PV2SI, V2SI)
DEF_FUNCTION_TYPE (VOID, PV2DI, V2DI)
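
Taken together, the new pointer type and function type give the 64-bit non-temporal store builtin its prototype. Roughly, they correspond to a C signature of the following form (a sketch of the implied declaration, not text from the patch):

/* Implied by DEF_POINTER_TYPE (PLONGLONG, LONGLONG) and
   DEF_FUNCTION_TYPE (VOID, PLONGLONG, LONGLONG):  */
extern void __builtin_ia32_movnti64 (long long *, long long);
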
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 691c89a9d28..34368203cd9 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -1783,18 +1783,18 @@ struct processor_costs atom_cost = {
/* stringop_algs for memcpy.
SSE loops works best on Atom, but fall back into non-SSE unrolled loop variant
if that fails. */
- {{{libcall, {{4096, sse_loop}, {4096, unrolled_loop}, {-1, libcall}}}, /* Known alignment. */
- {libcall, {{4096, sse_loop}, {4096, unrolled_loop}, {-1, libcall}}}},
- {{libcall, {{-1, libcall}}}, /* Unknown alignment. */
- {libcall, {{2048, sse_loop}, {2048, unrolled_loop},
+ {{{libcall, {{4096, unrolled_loop}, {-1, libcall}}}, /* Known alignment. */
+ {libcall, {{4096, unrolled_loop}, {-1, libcall}}}},
+ {{libcall, {{2048, unrolled_loop}, {-1, libcall}}}, /* Unknown alignment. */
+ {libcall, {{2048, unrolled_loop},
{-1, libcall}}}}},
/* stringop_algs for memset. */
- {{{libcall, {{4096, sse_loop}, {4096, unrolled_loop}, {-1, libcall}}}, /* Known alignment. */
- {libcall, {{4096, sse_loop}, {4096, unrolled_loop}, {-1, libcall}}}},
- {{libcall, {{1024, sse_loop}, {1024, unrolled_loop}, /* Unknown alignment. */
+ {{{libcall, {{4096, unrolled_loop}, {-1, libcall}}}, /* Known alignment. */
+ {libcall, {{4096, unrolled_loop}, {-1, libcall}}}},
+ {{libcall, {{1024, unrolled_loop}, /* Unknown alignment. */
{-1, libcall}}},
- {libcall, {{2048, sse_loop}, {2048, unrolled_loop},
+ {libcall, {{2048, unrolled_loop},
{-1, libcall}}}}},
1, /* scalar_stmt_cost. */
1, /* scalar load_cost. */
@@ -19619,8 +19619,12 @@ ix86_expand_int_vcond (rtx operands[])
cop0 = operands[4];
cop1 = operands[5];
- /* XOP supports all of the comparisons on all vector int types. */
- if (!TARGET_XOP)
+ /* XOP supports all of the comparisons on all 128-bit vector int types. */
+ if (TARGET_XOP
+ && (mode == V16QImode || mode == V8HImode
+ || mode == V4SImode || mode == V2DImode))
+ ;
+ else
{
/* Canonicalize the comparison to EQ, GT, GTU. */
switch (code)
@@ -21149,20 +21153,25 @@ expand_set_or_movmem_via_loop_with_iter (rtx destmem, rtx srcmem,
top_label = gen_label_rtx ();
out_label = gen_label_rtx ();
- if (!reuse_iter)
- iter = gen_reg_rtx (iter_mode);
-
size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
- NULL, 1, OPTAB_DIRECT);
- /* Those two should combine. */
- if (piece_size == const1_rtx)
+ NULL, 1, OPTAB_DIRECT);
+ if (!reuse_iter)
+ {
+ iter = gen_reg_rtx (iter_mode);
+ /* Those two should combine. */
+ if (piece_size == const1_rtx)
+ {
+ emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
+ true, out_label);
+ predict_jump (REG_BR_PROB_BASE * 10 / 100);
+ }
+ emit_move_insn (iter, const0_rtx);
+ }
+ else
{
- emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
+ emit_cmp_and_jump_insns (iter, size, GE, NULL_RTX, iter_mode,
true, out_label);
- predict_jump (REG_BR_PROB_BASE * 10 / 100);
}
- if (!reuse_iter)
- emit_move_insn (iter, const0_rtx);
emit_label (top_label);
@@ -21460,7 +21469,7 @@ expand_movmem_epilogue (rtx destmem, rtx srcmem,
gcc_assert (remainder_size == 0);
return;
}
- if (max_size > 8)
+ if (max_size > 16)
{
count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
count, 1, OPTAB_DIRECT);
@@ -21475,6 +21484,25 @@ expand_movmem_epilogue (rtx destmem, rtx srcmem,
*/
if (TARGET_SINGLE_STRINGOP)
{
+ if (max_size > 8)
+ {
+ rtx label = ix86_expand_aligntest (count, 8, true);
+ if (TARGET_64BIT)
+ {
+ src = change_address (srcmem, DImode, srcptr);
+ dest = change_address (destmem, DImode, destptr);
+ emit_insn (gen_strmov (destptr, dest, srcptr, src));
+ }
+ else
+ {
+ src = change_address (srcmem, SImode, srcptr);
+ dest = change_address (destmem, SImode, destptr);
+ emit_insn (gen_strmov (destptr, dest, srcptr, src));
+ emit_insn (gen_strmov (destptr, dest, srcptr, src));
+ }
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ }
if (max_size > 4)
{
rtx label = ix86_expand_aligntest (count, 4, true);
@@ -21508,6 +21536,35 @@ expand_movmem_epilogue (rtx destmem, rtx srcmem,
rtx offset = force_reg (Pmode, const0_rtx);
rtx tmp;
+ if (max_size > 8)
+ {
+ rtx label = ix86_expand_aligntest (count, 8, true);
+ if (TARGET_64BIT)
+ {
+ src = change_address (srcmem, DImode, srcptr);
+ dest = change_address (destmem, DImode, destptr);
+ emit_move_insn (dest, src);
+ tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (8), NULL,
+ true, OPTAB_LIB_WIDEN);
+ }
+ else
+ {
+ src = change_address (srcmem, SImode, srcptr);
+ dest = change_address (destmem, SImode, destptr);
+ emit_move_insn (dest, src);
+ tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
+ true, OPTAB_LIB_WIDEN);
+ if (tmp != offset)
+ emit_move_insn (offset, tmp);
+ tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
+ true, OPTAB_LIB_WIDEN);
+ emit_move_insn (dest, src);
+ }
+ if (tmp != offset)
+ emit_move_insn (offset, tmp);
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ }
if (max_size > 4)
{
rtx label = ix86_expand_aligntest (count, 4, true);
@@ -21588,17 +21645,28 @@ expand_setmem_epilogue (rtx destmem, rtx destptr, rtx promoted_to_vector_value,
Remaining part we'll move using Pmode and narrower modes. */
if (promoted_to_vector_value)
- while (remainder_size >= 16)
- {
- if (GET_MODE (destmem) != move_mode)
- destmem = adjust_automodify_address_nv (destmem, move_mode,
- destptr, offset);
- emit_strset (destmem, promoted_to_vector_value, destptr,
- move_mode, offset);
-
- offset += 16;
- remainder_size -= 16;
- }
+ {
+ if (promoted_to_vector_value)
+ {
+ if (max_size >= GET_MODE_SIZE (V4SImode))
+ move_mode = V4SImode;
+ else if (max_size >= GET_MODE_SIZE (DImode))
+ move_mode = DImode;
+ }
+ while (remainder_size >= GET_MODE_SIZE (move_mode))
+ {
+ if (GET_MODE (destmem) != move_mode)
+ destmem = adjust_automodify_address_nv (destmem, move_mode,
+ destptr, offset);
+ emit_strset (destmem,
+ promoted_to_vector_value,
+ destptr,
+ move_mode, offset);
+
+ offset += GET_MODE_SIZE (move_mode);
+ remainder_size -= GET_MODE_SIZE (move_mode);
+ }
+ }
/* Move the remaining part of epilogue - its size might be
a size of the widest mode. */
@@ -22022,10 +22090,11 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
|| (memset
? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
-#define ALG_USABLE_P(alg) (rep_prefix_usable \
- || (alg != rep_prefix_1_byte \
- && alg != rep_prefix_4_byte \
- && alg != rep_prefix_8_byte))
+#define ALG_USABLE_P(alg) ((rep_prefix_usable \
+ || (alg != rep_prefix_1_byte \
+ && alg != rep_prefix_4_byte \
+ && alg != rep_prefix_8_byte)) \
+ && (TARGET_SSE2 || alg != sse_loop))
const struct processor_costs *cost;
/* Even if the string operation call is cold, we still might spend a lot
@@ -22037,6 +22106,9 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
else
optimize_for_speed = true;
+ if (!optimize)
+ return (rep_prefix_usable ? rep_prefix_1_byte : libcall);
+
cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
*dynamic_check = -1;
@@ -22049,10 +22121,10 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
/* rep; movq or rep; movl is the smallest variant. */
else if (!optimize_for_speed)
{
- if (!count || (count & 3))
- return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
+ if (!count || (count & 3) || memset)
+ return rep_prefix_usable ? rep_prefix_1_byte : libcall;
else
- return rep_prefix_usable ? rep_prefix_4_byte : loop;
+ return rep_prefix_usable ? rep_prefix_4_byte : libcall;
}
/* Very tiny blocks are best handled via the loop, REP is expensive to setup.
*/
@@ -22106,13 +22178,11 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
int max = -1;
enum stringop_alg alg;
int i;
- bool any_alg_usable_p = true;
bool only_libcall_fits = true;
for (i = 0; i < MAX_STRINGOP_ALGS; i++)
{
enum stringop_alg candidate = algs->size[i].alg;
- any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate);
if (candidate != libcall && candidate
&& ALG_USABLE_P (candidate))
@@ -22124,7 +22194,7 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
/* If there aren't any usable algorithms, then recursing on
smaller sizes isn't going to find anything. Just return the
simple byte-at-a-time copy loop. */
- if (!any_alg_usable_p || only_libcall_fits)
+ if (only_libcall_fits)
{
/* Pick something reasonable. */
if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
@@ -22253,7 +22323,7 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
int dynamic_check;
bool need_zero_guard = false;
bool align_unknown;
- int unroll_factor;
+ unsigned int unroll_factor;
enum machine_mode move_mode;
rtx loop_iter = NULL_RTX;
int dst_offset, src_offset;
@@ -22316,14 +22386,28 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
case unrolled_loop:
need_zero_guard = true;
move_mode = Pmode;
- unroll_factor = TARGET_64BIT ? 4 : 2;
+ unroll_factor = 1;
+ /* Select maximal available 1,2 or 4 unroll factor.
+ In 32bit we can not afford to use 4 registers inside the loop. */
+ if (!count)
+ unroll_factor = TARGET_64BIT ? 4 : 2;
+ else
+ while (GET_MODE_SIZE (move_mode) * unroll_factor * 2 < count
+ && unroll_factor < (TARGET_64BIT ? 4 :2))
+ unroll_factor *= 2;
size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
break;
case sse_loop:
need_zero_guard = true;
/* Use SSE instructions, if possible. */
- move_mode = align_unknown ? DImode : V4SImode;
- unroll_factor = TARGET_64BIT ? 4 : 2;
+ move_mode = V4SImode;
+ /* Select maximal available 1,2 or 4 unroll factor. */
+ if (!count)
+ unroll_factor = 4;
+ else
+ while (GET_MODE_SIZE (move_mode) * unroll_factor * 2 < count
+ && unroll_factor < 4)
+ unroll_factor *= 2;
size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
break;
case rep_prefix_8_byte:
@@ -22568,7 +22652,13 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
if (alg == sse_loop || alg == unrolled_loop)
{
rtx tmp;
- if (align_unknown && unroll_factor > 1)
+ int remainder_size = epilogue_size_needed;
+
+ /* We may not need the epilogue loop at all when the count is known
+ and alignment is not adjusted. */
+ if (count && desired_align <= align)
+ remainder_size = count % epilogue_size_needed;
+ if (remainder_size > 31)
{
/* Reduce epilogue's size by creating not-unrolled loop. If we won't
do this, we can have very big epilogue - when alignment is statically
@@ -22710,7 +22800,7 @@ promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int
{
rtx promoted_val = NULL_RTX;
- if (size_needed > 8 || (desired_align > align && desired_align > 8))
+ if (size_needed > 8)
{
/* We want to promote to vector register, so we expect that at least SSE
is available. */
@@ -22724,7 +22814,7 @@ promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int
else
promoted_val = promote_duplicated_reg (V4SImode, val);
}
- else if (size_needed > 4 || (desired_align > align && desired_align > 4))
+ else if (size_needed > 4)
{
gcc_assert (TARGET_64BIT);
promoted_val = promote_duplicated_reg (DImode, val);
@@ -22764,6 +22854,7 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
unsigned int unroll_factor;
enum machine_mode move_mode;
rtx loop_iter = NULL_RTX;
+ bool early_jump = false;
if (CONST_INT_P (align_exp))
align = INTVAL (align_exp);
@@ -22783,7 +22874,7 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
/* Step 0: Decide on preferred algorithm, desired alignment and
size of chunks to be copied by main loop. */
- align_unknown = CONST_INT_P (align_exp) && INTVAL (align_exp) > 0;
+ align_unknown = !(CONST_INT_P (align_exp) && INTVAL (align_exp) > 0);
alg = decide_alg (count, expected_size, true, &dynamic_check, align_unknown);
desired_align = decide_alignment (align, alg, expected_size);
unroll_factor = 1;
@@ -22813,9 +22904,12 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
move_mode = Pmode;
unroll_factor = 1;
/* Select maximal available 1,2 or 4 unroll factor. */
- while (GET_MODE_SIZE (move_mode) * unroll_factor * 2 < count
- && unroll_factor < 4)
- unroll_factor *= 2;
+ if (!count)
+ unroll_factor = 4;
+ else
+ while (GET_MODE_SIZE (move_mode) * unroll_factor * 2 < count
+ && unroll_factor < 4)
+ unroll_factor *= 2;
size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
break;
case sse_loop:
@@ -22823,9 +22917,12 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
move_mode = TARGET_64BIT ? V2DImode : V4SImode;
unroll_factor = 1;
/* Select maximal available 1,2 or 4 unroll factor. */
- while (GET_MODE_SIZE (move_mode) * unroll_factor * 2 < count
- && unroll_factor < 4)
- unroll_factor *= 2;
+ if (!count)
+ unroll_factor = 4;
+ else
+ while (GET_MODE_SIZE (move_mode) * unroll_factor * 2 < count
+ && unroll_factor < 4)
+ unroll_factor *= 2;
size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
break;
case rep_prefix_8_byte:
@@ -22904,6 +23001,7 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
emit_move_insn (loop_iter, const0_rtx);
}
label = gen_label_rtx ();
+ early_jump = true;
emit_cmp_and_jump_insns (count_exp,
GEN_INT (epilogue_size_needed),
LTU, 0, counter_mode (count_exp), 1, label);
@@ -23016,7 +23114,7 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
vec_promoted_val =
promote_duplicated_reg_to_size (gpr_promoted_val,
GET_MODE_SIZE (move_mode),
- desired_align, align);
+ GET_MODE_SIZE (move_mode), align);
loop_iter = expand_set_or_movmem_via_loop_with_iter (dst, NULL, destreg,
NULL, vec_promoted_val, count_exp,
loop_iter, move_mode, unroll_factor,
@@ -23065,21 +23163,26 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
LABEL_NUSES (label) = 1;
/* We can not rely on fact that promoved value is known. */
vec_promoted_val = 0;
- gpr_promoted_val = 0;
+ if (early_jump)
+ gpr_promoted_val = 0;
}
epilogue:
if (alg == unrolled_loop || alg == sse_loop)
{
rtx tmp;
- if (align_unknown && unroll_factor > 1
- && epilogue_size_needed >= GET_MODE_SIZE (move_mode)
- && vec_promoted_val)
+ int remainder_size = epilogue_size_needed;
+ if (count && desired_align <= align)
+ remainder_size = count % epilogue_size_needed;
+ /* We may not need the epilogue loop at all when the count is known
+ and alignment is not adjusted. */
+ if (remainder_size > 31
+ && (alg == sse_loop ? vec_promoted_val : gpr_promoted_val))
{
/* Reduce epilogue's size by creating not-unrolled loop. If we won't
do this, we can have very big epilogue - when alignment is statically
unknown we'll have the epilogue byte by byte which may be very slow. */
loop_iter = expand_set_or_movmem_via_loop_with_iter (dst, NULL, destreg,
- NULL, vec_promoted_val, count_exp,
+ NULL, (alg == sse_loop ? vec_promoted_val : gpr_promoted_val), count_exp,
loop_iter, move_mode, 1,
expected_size, false);
dst = change_address (dst, BLKmode, destreg);
@@ -23090,17 +23193,14 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
if (tmp != destreg)
emit_move_insn (destreg, tmp);
}
- if (count_exp == const0_rtx)
+ if (count_exp == const0_rtx || epilogue_size_needed <= 1)
;
- else if (!gpr_promoted_val && epilogue_size_needed > 1)
+ else if (!gpr_promoted_val)
expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
epilogue_size_needed);
else
- {
- if (epilogue_size_needed > 1)
- expand_setmem_epilogue (dst, destreg, vec_promoted_val, gpr_promoted_val,
- val_exp, count_exp, epilogue_size_needed);
- }
+ expand_setmem_epilogue (dst, destreg, vec_promoted_val, gpr_promoted_val,
+ val_exp, count_exp, epilogue_size_needed);
if (jump_around_label)
emit_label (jump_around_label);
return true;
@@ -25245,6 +25345,7 @@ enum ix86_builtins
IX86_BUILTIN_CVTTPS2DQ,
IX86_BUILTIN_MOVNTI,
+ IX86_BUILTIN_MOVNTI64,
IX86_BUILTIN_MOVNTPD,
IX86_BUILTIN_MOVNTDQ,
@@ -26318,7 +26419,7 @@ static const struct builtin_description bdesc_special_args[] =
/* SSE or 3DNow!A */
{ OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
- { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntdi, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
+ { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntq, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
/* SSE2 */
{ OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
@@ -26327,7 +26428,8 @@ static const struct builtin_description bdesc_special_args[] =
{ OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
{ OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
{ OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
- { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntsi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
+ { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
+ { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, "__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) VOID_FTYPE_PLONGLONG_LONGLONG },
{ OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
{ OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
@@ -29313,6 +29415,7 @@ ix86_expand_special_args_builtin (const struct builtin_description *d,
case VOID_FTYPE_PFLOAT_V4SF:
case VOID_FTYPE_PDOUBLE_V4DF:
case VOID_FTYPE_PDOUBLE_V2DF:
+ case VOID_FTYPE_PLONGLONG_LONGLONG:
case VOID_FTYPE_PULONGLONG_ULONGLONG:
case VOID_FTYPE_PINT_INT:
nargs = 1;
@@ -29914,7 +30017,7 @@ rdrand_step:
icode = CODE_FOR_avx2_gatherdiv8sf;
goto gather_gen;
case IX86_BUILTIN_GATHERALTSIV4DI:
- icode = CODE_FOR_avx2_gathersiv4df;
+ icode = CODE_FOR_avx2_gathersiv4di;
goto gather_gen;
case IX86_BUILTIN_GATHERALTDIV8SI:
icode = CODE_FOR_avx2_gatherdiv8si;
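
The unroll-factor selection that this patch repeats in ix86_expand_movmem and ix86_expand_setmem follows one pattern: double the factor while two full unrolled chunks still fit in the known byte count, capped at 4 (or 2 for the 32-bit unrolled_loop case). A stand-alone sketch of that logic, using plain C types in place of GCC's HOST_WIDE_INT (illustrative helper, not part of the patch):

/* Pick 1, 2 or 4 as the unroll factor.  When the count is unknown at
   compile time, use the maximum; otherwise grow the factor while two
   unrolled chunks of the move mode still fit into the count.  */
static unsigned int
choose_unroll_factor (unsigned long long count, unsigned int mode_size,
                      unsigned int max_factor)
{
  unsigned int unroll_factor = 1;
  if (count == 0)
    return max_factor;
  while (mode_size * unroll_factor * 2 < count
         && unroll_factor < max_factor)
    unroll_factor *= 2;
  return unroll_factor;
}
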
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index bc602532304..912c17229a2 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -115,6 +115,7 @@
UNSPEC_FIX_NOTRUNC
UNSPEC_MASKMOV
UNSPEC_MOVMSK
+ UNSPEC_MOVNTQ
UNSPEC_MOVNT
UNSPEC_MOVU
UNSPEC_RCP
diff --git a/gcc/config/i386/i386elf.h b/gcc/config/i386/i386elf.h
index 179c601738d..1bf3feb7479 100644
--- a/gcc/config/i386/i386elf.h
+++ b/gcc/config/i386/i386elf.h
@@ -20,10 +20,6 @@ You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3. If not see
<http://www.gnu.org/licenses/>. */
-/* Use stabs instead of DWARF debug format. */
-#undef PREFERRED_DEBUGGING_TYPE
-#define PREFERRED_DEBUGGING_TYPE DBX_DEBUG
-
/* The ELF ABI for the i386 says that records and unions are returned
in memory. */
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index f76834e5ab2..7fa072eb6fe 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -329,10 +329,10 @@
DONE;
})
-(define_insn "sse_movntdi"
+(define_insn "sse_movntq"
[(set (match_operand:DI 0 "memory_operand" "=m")
(unspec:DI [(match_operand:DI 1 "register_operand" "y")]
- UNSPEC_MOVNT))]
+ UNSPEC_MOVNTQ))]
"TARGET_SSE || TARGET_3DNOW_A"
"movntq\t{%1, %0|%0, %1}"
[(set_attr "type" "mmxmov")
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index b8e821de90e..89559966f0e 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -573,15 +573,15 @@
(set_attr "prefix" "maybe_vex")
(set_attr "mode" "<sseinsnmode>")])
-(define_insn "sse2_movntsi"
- [(set (match_operand:SI 0 "memory_operand" "=m")
- (unspec:SI [(match_operand:SI 1 "register_operand" "r")]
- UNSPEC_MOVNT))]
+(define_insn "sse2_movnti<mode>"
+ [(set (match_operand:SWI48 0 "memory_operand" "=m")
+ (unspec:SWI48 [(match_operand:SWI48 1 "register_operand" "r")]
+ UNSPEC_MOVNT))]
"TARGET_SSE2"
"movnti\t{%1, %0|%0, %1}"
[(set_attr "type" "ssemov")
(set_attr "prefix_data16" "0")
- (set_attr "mode" "V2DF")])
+ (set_attr "mode" "<MODE>")])
(define_insn "<sse>_movnt<mode>"
[(set (match_operand:VF 0 "memory_operand" "=m")
@@ -614,8 +614,9 @@
;; Modes handled by storent patterns.
(define_mode_iterator STORENT_MODE
- [(SI "TARGET_SSE2") (SF "TARGET_SSE4A") (DF "TARGET_SSE4A")
- (V2DI "TARGET_SSE2")
+ [(DI "TARGET_SSE2 && TARGET_64BIT") (SI "TARGET_SSE2")
+ (SF "TARGET_SSE4A") (DF "TARGET_SSE4A")
+ (V4DI "TARGET_AVX") (V2DI "TARGET_SSE2")
(V8SF "TARGET_AVX") V4SF
(V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")])
@@ -9962,17 +9963,32 @@
{
rtx tmp0, tmp1;
- tmp0 = gen_reg_rtx (<MODE>mode);
- tmp1 = gen_reg_rtx (<MODE>mode);
+ if (<MODE>mode == V2DFmode
+ && TARGET_AVX && !TARGET_PREFER_AVX128)
+ {
+ rtx tmp2 = gen_reg_rtx (V4DFmode);
- emit_insn
- (gen_<sse4_1>_round<ssemodesuffix><avxsizesuffix> (tmp0, operands[1],
- operands[3]));
- emit_insn
- (gen_<sse4_1>_round<ssemodesuffix><avxsizesuffix> (tmp1, operands[2],
- operands[3]));
- emit_insn
- (gen_vec_pack_sfix_trunc_<mode> (operands[0], tmp0, tmp1));
+ tmp0 = gen_reg_rtx (V4DFmode);
+ tmp1 = force_reg (V2DFmode, operands[1]);
+
+ emit_insn (gen_avx_vec_concatv4df (tmp0, tmp1, operands[2]));
+ emit_insn (gen_avx_roundpd256 (tmp2, tmp0, operands[3]));
+ emit_insn (gen_fix_truncv4dfv4si2 (operands[0], tmp2));
+ }
+ else
+ {
+ tmp0 = gen_reg_rtx (<MODE>mode);
+ tmp1 = gen_reg_rtx (<MODE>mode);
+
+ emit_insn
+ (gen_<sse4_1>_round<ssemodesuffix><avxsizesuffix> (tmp0, operands[1],
+ operands[3]));
+ emit_insn
+ (gen_<sse4_1>_round<ssemodesuffix><avxsizesuffix> (tmp1, operands[2],
+ operands[3]));
+ emit_insn
+ (gen_vec_pack_sfix_trunc_<mode> (operands[0], tmp0, tmp1));
+ }
DONE;
})
@@ -10053,14 +10069,29 @@
{
rtx tmp0, tmp1;
- tmp0 = gen_reg_rtx (<MODE>mode);
- tmp1 = gen_reg_rtx (<MODE>mode);
+ if (<MODE>mode == V2DFmode
+ && TARGET_AVX && !TARGET_PREFER_AVX128)
+ {
+ rtx tmp2 = gen_reg_rtx (V4DFmode);
- emit_insn (gen_round<mode>2 (tmp0, operands[1]));
- emit_insn (gen_round<mode>2 (tmp1, operands[2]));
+ tmp0 = gen_reg_rtx (V4DFmode);
+ tmp1 = force_reg (V2DFmode, operands[1]);
- emit_insn
- (gen_vec_pack_sfix_trunc_<mode> (operands[0], tmp0, tmp1));
+ emit_insn (gen_avx_vec_concatv4df (tmp0, tmp1, operands[2]));
+ emit_insn (gen_roundv4df2 (tmp2, tmp0));
+ emit_insn (gen_fix_truncv4dfv4si2 (operands[0], tmp2));
+ }
+ else
+ {
+ tmp0 = gen_reg_rtx (<MODE>mode);
+ tmp1 = gen_reg_rtx (<MODE>mode);
+
+ emit_insn (gen_round<mode>2 (tmp0, operands[1]));
+ emit_insn (gen_round<mode>2 (tmp1, operands[2]));
+
+ emit_insn
+ (gen_vec_pack_sfix_trunc_<mode> (operands[0], tmp0, tmp1));
+ }
DONE;
})
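
The two expander hunks above add a 256-bit path for packing rounded doubles: when AVX is enabled and 128-bit preference is off, the two V2DF operands are concatenated into a V4DF, rounded once, and converted with a single truncating V4DF-to-V4SI conversion. In intrinsics terms the shape of that path is roughly the following (a sketch only; the helper name is hypothetical and the rounding mode shown is illustrative, compile with -mavx):

/* Widen two 2-double vectors into one 4-double vector, round it, then
   do a truncating conversion to four 32-bit integers (CVTTPD2DQ).  */
#include <immintrin.h>

__m128i
pack_round_pd (__m128d a, __m128d b)
{
  __m256d wide = _mm256_insertf128_pd (_mm256_castpd128_pd256 (a), b, 1);
  __m256d rounded = _mm256_round_pd (wide,
                                     _MM_FROUND_TO_NEAREST_INT
                                     | _MM_FROUND_NO_EXC);
  return _mm256_cvttpd_epi32 (rounded);
}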