author     bstarynk <bstarynk@138bc75d-0d04-0410-961f-82ee72b054a4>   2011-11-21 09:36:11 +0000
committer  bstarynk <bstarynk@138bc75d-0d04-0410-961f-82ee72b054a4>   2011-11-21 09:36:11 +0000
commit     8726b204214570b8506b06e7f1af935e21cb25c9
tree       197000f9c24e34c902c5357dd6d522f430ac0722 /gcc/config/i386
parent     0f3b427f9513aaff0bb89af90bb60fd21aa23ce8
2011-11-21 Basile Starynkevitch <basile@starynkevitch.net>
MELT branch merged with trunk rev 181552 using svnmerge
git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/branches/melt-branch@181554 138bc75d-0d04-0410-961f-82ee72b054a4
Diffstat (limited to 'gcc/config/i386')
 gcc/config/i386/emmintrin.h             |   8
 gcc/config/i386/i386-builtin-types.def  |   2
 gcc/config/i386/i386.c                  | 247
 gcc/config/i386/i386.md                 |   1
 gcc/config/i386/i386elf.h               |   4
 gcc/config/i386/mmx.md                  |   4
 gcc/config/i386/sse.md                  |  77
7 files changed, 242 insertions, 101 deletions
diff --git a/gcc/config/i386/emmintrin.h b/gcc/config/i386/emmintrin.h
index fe4cd6abaea..07ac9f3d8ae 100644
--- a/gcc/config/i386/emmintrin.h
+++ b/gcc/config/i386/emmintrin.h
@@ -1418,6 +1418,14 @@ _mm_stream_si32 (int *__A, int __B)
   __builtin_ia32_movnti (__A, __B);
 }
 
+#ifdef __x86_64__
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_stream_si64 (long long int *__A, long long int __B)
+{
+  __builtin_ia32_movnti64 (__A, __B);
+}
+#endif
+
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_stream_si128 (__m128i *__A, __m128i __B)
 {
diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def
index 79fb1427787..d00b05341b9 100644
--- a/gcc/config/i386/i386-builtin-types.def
+++ b/gcc/config/i386/i386-builtin-types.def
@@ -111,6 +111,7 @@ DEF_POINTER_TYPE (PDOUBLE, DOUBLE)
 DEF_POINTER_TYPE (PFLOAT, FLOAT)
 DEF_POINTER_TYPE (PUSHORT, USHORT)
 DEF_POINTER_TYPE (PINT, INT)
+DEF_POINTER_TYPE (PLONGLONG, LONGLONG)
 DEF_POINTER_TYPE (PULONGLONG, ULONGLONG)
 DEF_POINTER_TYPE (PUNSIGNED, UNSIGNED)
 
@@ -357,6 +358,7 @@ DEF_FUNCTION_TYPE (VOID, PDOUBLE, V4DF)
 DEF_FUNCTION_TYPE (VOID, PFLOAT, V4SF)
 DEF_FUNCTION_TYPE (VOID, PFLOAT, V8SF)
 DEF_FUNCTION_TYPE (VOID, PINT, INT)
+DEF_FUNCTION_TYPE (VOID, PLONGLONG, LONGLONG)
 DEF_FUNCTION_TYPE (VOID, PULONGLONG, ULONGLONG)
 DEF_FUNCTION_TYPE (VOID, PV2SI, V2SI)
 DEF_FUNCTION_TYPE (VOID, PV2DI, V2DI)
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 691c89a9d28..34368203cd9 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -1783,18 +1783,18 @@ struct processor_costs atom_cost = {
   /* stringop_algs for memcpy.  SSE loops works best on Atom, but fall back
      into non-SSE unrolled loop variant if that fails.  */
-  {{{libcall, {{4096, sse_loop}, {4096, unrolled_loop}, {-1, libcall}}}, /* Known alignment.  */
-   {libcall, {{4096, sse_loop}, {4096, unrolled_loop}, {-1, libcall}}}},
-  {{libcall, {{-1, libcall}}}, /* Unknown alignment.  */
-   {libcall, {{2048, sse_loop}, {2048, unrolled_loop},
+  {{{libcall, {{4096, unrolled_loop}, {-1, libcall}}}, /* Known alignment.  */
+   {libcall, {{4096, unrolled_loop}, {-1, libcall}}}},
+  {{libcall, {{2048, unrolled_loop}, {-1, libcall}}}, /* Unknown alignment.  */
+   {libcall, {{2048, unrolled_loop},
               {-1, libcall}}}}},
 
   /* stringop_algs for memset.  */
-  {{{libcall, {{4096, sse_loop}, {4096, unrolled_loop}, {-1, libcall}}}, /* Known alignment.  */
-   {libcall, {{4096, sse_loop}, {4096, unrolled_loop}, {-1, libcall}}}},
-  {{libcall, {{1024, sse_loop}, {1024, unrolled_loop}, /* Unknown alignment.  */
+  {{{libcall, {{4096, unrolled_loop}, {-1, libcall}}}, /* Known alignment.  */
+   {libcall, {{4096, unrolled_loop}, {-1, libcall}}}},
+  {{libcall, {{1024, unrolled_loop}, /* Unknown alignment.  */
               {-1, libcall}}},
-   {libcall, {{2048, sse_loop}, {2048, unrolled_loop},
+   {libcall, {{2048, unrolled_loop},
               {-1, libcall}}}}},
   1,                                    /* scalar_stmt_cost.  */
   1,                                    /* scalar load_cost.  */
@@ -19619,8 +19619,12 @@ ix86_expand_int_vcond (rtx operands[])
   cop0 = operands[4];
   cop1 = operands[5];
 
-  /* XOP supports all of the comparisons on all vector int types.  */
-  if (!TARGET_XOP)
+  /* XOP supports all of the comparisons on all 128-bit vector int types.  */
+  if (TARGET_XOP
+      && (mode == V16QImode || mode == V8HImode
+          || mode == V4SImode || mode == V2DImode))
+    ;
+  else
     {
       /* Canonicalize the comparison to EQ, GT, GTU.  */
       switch (code)
@@ -21149,20 +21153,25 @@ expand_set_or_movmem_via_loop_with_iter (rtx destmem, rtx srcmem,
   top_label = gen_label_rtx ();
   out_label = gen_label_rtx ();
-  if (!reuse_iter)
-    iter = gen_reg_rtx (iter_mode);
-
   size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
-                              NULL, 1, OPTAB_DIRECT);
-  /* Those two should combine.  */
-  if (piece_size == const1_rtx)
+                              NULL, 1, OPTAB_DIRECT);
+  if (!reuse_iter)
+    {
+      iter = gen_reg_rtx (iter_mode);
+      /* Those two should combine.  */
+      if (piece_size == const1_rtx)
+        {
+          emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
+                                   true, out_label);
+          predict_jump (REG_BR_PROB_BASE * 10 / 100);
+        }
+      emit_move_insn (iter, const0_rtx);
+    }
+  else
     {
-      emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
+      emit_cmp_and_jump_insns (iter, size, GE, NULL_RTX, iter_mode,
                                true, out_label);
-      predict_jump (REG_BR_PROB_BASE * 10 / 100);
     }
-  if (!reuse_iter)
-    emit_move_insn (iter, const0_rtx);
 
   emit_label (top_label);
@@ -21460,7 +21469,7 @@ expand_movmem_epilogue (rtx destmem, rtx srcmem,
       gcc_assert (remainder_size == 0);
       return;
     }
-  if (max_size > 8)
+  if (max_size > 16)
     {
       count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
                                    count, 1, OPTAB_DIRECT);
@@ -21475,6 +21484,25 @@ expand_movmem_epilogue (rtx destmem, rtx srcmem,
      */
   if (TARGET_SINGLE_STRINGOP)
     {
+      if (max_size > 8)
+        {
+          rtx label = ix86_expand_aligntest (count, 8, true);
+          if (TARGET_64BIT)
+            {
+              src = change_address (srcmem, DImode, srcptr);
+              dest = change_address (destmem, DImode, destptr);
+              emit_insn (gen_strmov (destptr, dest, srcptr, src));
+            }
+          else
+            {
+              src = change_address (srcmem, SImode, srcptr);
+              dest = change_address (destmem, SImode, destptr);
+              emit_insn (gen_strmov (destptr, dest, srcptr, src));
+              emit_insn (gen_strmov (destptr, dest, srcptr, src));
+            }
+          emit_label (label);
+          LABEL_NUSES (label) = 1;
+        }
       if (max_size > 4)
         {
          rtx label = ix86_expand_aligntest (count, 4, true);
@@ -21508,6 +21536,35 @@ expand_movmem_epilogue (rtx destmem, rtx srcmem,
       rtx offset = force_reg (Pmode, const0_rtx);
       rtx tmp;
 
+      if (max_size > 8)
+        {
+          rtx label = ix86_expand_aligntest (count, 8, true);
+          if (TARGET_64BIT)
+            {
+              src = change_address (srcmem, DImode, srcptr);
+              dest = change_address (destmem, DImode, destptr);
+              emit_move_insn (dest, src);
+              tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (8), NULL,
+                                         true, OPTAB_LIB_WIDEN);
+            }
+          else
+            {
+              src = change_address (srcmem, SImode, srcptr);
+              dest = change_address (destmem, SImode, destptr);
+              emit_move_insn (dest, src);
+              tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
+                                         true, OPTAB_LIB_WIDEN);
+              if (tmp != offset)
+                emit_move_insn (offset, tmp);
+              tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
+                                         true, OPTAB_LIB_WIDEN);
+              emit_move_insn (dest, src);
+            }
+          if (tmp != offset)
+            emit_move_insn (offset, tmp);
+          emit_label (label);
+          LABEL_NUSES (label) = 1;
+        }
       if (max_size > 4)
         {
          rtx label = ix86_expand_aligntest (count, 4, true);
@@ -21588,17 +21645,28 @@ expand_setmem_epilogue (rtx destmem, rtx destptr, rtx promoted_to_vector_value,
      Remaining part we'll move using Pmode and narrower modes.  */
   if (promoted_to_vector_value)
-    while (remainder_size >= 16)
-      {
-        if (GET_MODE (destmem) != move_mode)
-          destmem = adjust_automodify_address_nv (destmem, move_mode,
-                                                  destptr, offset);
-        emit_strset (destmem, promoted_to_vector_value, destptr,
-                     move_mode, offset);
-
-        offset += 16;
-        remainder_size -= 16;
-      }
+    {
+      if (promoted_to_vector_value)
+        {
+          if (max_size >= GET_MODE_SIZE (V4SImode))
+            move_mode = V4SImode;
+          else if (max_size >= GET_MODE_SIZE (DImode))
+            move_mode = DImode;
+        }
+      while (remainder_size >= GET_MODE_SIZE (move_mode))
+        {
+          if (GET_MODE (destmem) != move_mode)
+            destmem = adjust_automodify_address_nv (destmem, move_mode,
+                                                    destptr, offset);
+          emit_strset (destmem,
+                       promoted_to_vector_value,
+                       destptr,
+                       move_mode, offset);
+
+          offset += GET_MODE_SIZE (move_mode);
+          remainder_size -= GET_MODE_SIZE (move_mode);
+        }
+    }
 
   /* Move the remaining part of epilogue - its size might be
      a size of the widest mode.  */
@@ -22022,10 +22090,11 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
                              || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
 
-#define ALG_USABLE_P(alg) (rep_prefix_usable                    \
-                           || (alg != rep_prefix_1_byte         \
-                               && alg != rep_prefix_4_byte      \
-                               && alg != rep_prefix_8_byte))
+#define ALG_USABLE_P(alg) ((rep_prefix_usable                   \
+                            || (alg != rep_prefix_1_byte        \
+                                && alg != rep_prefix_4_byte     \
+                                && alg != rep_prefix_8_byte))   \
+                           && (TARGET_SSE2 || alg != sse_loop))
 
   const struct processor_costs *cost;
   /* Even if the string operation call is cold, we still might spend a lot
@@ -22037,6 +22106,9 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
   else
     optimize_for_speed = true;
 
+  if (!optimize)
+    return (rep_prefix_usable ? rep_prefix_1_byte : libcall);
+
   cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
 
   *dynamic_check = -1;
@@ -22049,10 +22121,10 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
   /* rep; movq or rep; movl is the smallest variant.  */
   else if (!optimize_for_speed)
     {
-      if (!count || (count & 3))
-        return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
+      if (!count || (count & 3) || memset)
+        return rep_prefix_usable ? rep_prefix_1_byte : libcall;
       else
-        return rep_prefix_usable ? rep_prefix_4_byte : loop;
+        return rep_prefix_usable ? rep_prefix_4_byte : libcall;
     }
   /* Very tiny blocks are best handled via the loop, REP is expensive to
      setup.  */
@@ -22106,13 +22178,11 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
       int max = -1;
       enum stringop_alg alg;
       int i;
-      bool any_alg_usable_p = true;
       bool only_libcall_fits = true;
 
       for (i = 0; i < MAX_STRINGOP_ALGS; i++)
        {
          enum stringop_alg candidate = algs->size[i].alg;
-         any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate);
 
          if (candidate != libcall && candidate
              && ALG_USABLE_P (candidate))
@@ -22124,7 +22194,7 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
       /* If there aren't any usable algorithms, then recursing on
          smaller sizes isn't going to find anything.  Just return the
          simple byte-at-a-time copy loop.  */
-      if (!any_alg_usable_p || only_libcall_fits)
+      if (only_libcall_fits)
        {
          /* Pick something reasonable.  */
          if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
@@ -22253,7 +22323,7 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
   int dynamic_check;
   bool need_zero_guard = false;
   bool align_unknown;
-  int unroll_factor;
+  unsigned int unroll_factor;
   enum machine_mode move_mode;
   rtx loop_iter = NULL_RTX;
   int dst_offset, src_offset;
@@ -22316,14 +22386,28 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
     case unrolled_loop:
       need_zero_guard = true;
       move_mode = Pmode;
-      unroll_factor = TARGET_64BIT ? 4 : 2;
+      unroll_factor = 1;
+      /* Select maximal available 1,2 or 4 unroll factor.
+         In 32bit we can not afford to use 4 registers inside the loop.  */
+      if (!count)
+        unroll_factor = TARGET_64BIT ? 4 : 2;
+      else
+        while (GET_MODE_SIZE (move_mode) * unroll_factor * 2 < count
+               && unroll_factor < (TARGET_64BIT ? 4 :2))
+          unroll_factor *= 2;
       size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
       break;
     case sse_loop:
       need_zero_guard = true;
       /* Use SSE instructions, if possible.  */
-      move_mode = align_unknown ? DImode : V4SImode;
-      unroll_factor = TARGET_64BIT ? 4 : 2;
+      move_mode = V4SImode;
+      /* Select maximal available 1,2 or 4 unroll factor.  */
+      if (!count)
+        unroll_factor = 4;
+      else
+        while (GET_MODE_SIZE (move_mode) * unroll_factor * 2 < count
+               && unroll_factor < 4)
+          unroll_factor *= 2;
       size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
       break;
     case rep_prefix_8_byte:
@@ -22568,7 +22652,13 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
   if (alg == sse_loop || alg == unrolled_loop)
     {
       rtx tmp;
-      if (align_unknown && unroll_factor > 1)
+      int remainder_size = epilogue_size_needed;
+
+      /* We may not need the epilgoue loop at all when the count is known
+         and alignment is not adjusted.  */
+      if (count && desired_align <= align)
+        remainder_size = count % epilogue_size_needed;
+      if (remainder_size > 31)
        {
          /* Reduce epilogue's size by creating not-unrolled loop.  If we won't
             do this, we can have very big epilogue - when alignment is statically
@@ -22710,7 +22800,7 @@ promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int
 {
   rtx promoted_val = NULL_RTX;
 
-  if (size_needed > 8 || (desired_align > align && desired_align > 8))
+  if (size_needed > 8)
     {
       /* We want to promote to vector register, so we expect that at least SSE
          is available.  */
@@ -22724,7 +22814,7 @@ promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int
       else
        promoted_val = promote_duplicated_reg (V4SImode, val);
     }
-  else if (size_needed > 4 || (desired_align > align && desired_align > 4))
+  else if (size_needed > 4)
    {
      gcc_assert (TARGET_64BIT);
      promoted_val = promote_duplicated_reg (DImode, val);
@@ -22764,6 +22854,7 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
   unsigned int unroll_factor;
   enum machine_mode move_mode;
   rtx loop_iter = NULL_RTX;
+  bool early_jump = false;
 
   if (CONST_INT_P (align_exp))
     align = INTVAL (align_exp);
@@ -22783,7 +22874,7 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
 
   /* Step 0: Decide on preferred algorithm, desired alignment and
      size of chunks to be copied by main loop.  */
-  align_unknown = CONST_INT_P (align_exp) && INTVAL (align_exp) > 0;
+  align_unknown = !(CONST_INT_P (align_exp) && INTVAL (align_exp) > 0);
   alg = decide_alg (count, expected_size, true, &dynamic_check, align_unknown);
   desired_align = decide_alignment (align, alg, expected_size);
   unroll_factor = 1;
@@ -22813,9 +22904,12 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
       move_mode = Pmode;
       unroll_factor = 1;
       /* Select maximal available 1,2 or 4 unroll factor.  */
-      while (GET_MODE_SIZE (move_mode) * unroll_factor * 2 < count
-             && unroll_factor < 4)
-        unroll_factor *= 2;
+      if (!count)
+        unroll_factor = 4;
+      else
+        while (GET_MODE_SIZE (move_mode) * unroll_factor * 2 < count
+               && unroll_factor < 4)
+          unroll_factor *= 2;
       size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
       break;
     case sse_loop:
@@ -22823,9 +22917,12 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
       move_mode = TARGET_64BIT ? V2DImode : V4SImode;
       unroll_factor = 1;
       /* Select maximal available 1,2 or 4 unroll factor.  */
-      while (GET_MODE_SIZE (move_mode) * unroll_factor * 2 < count
-             && unroll_factor < 4)
-        unroll_factor *= 2;
+      if (!count)
+        unroll_factor = 4;
+      else
+        while (GET_MODE_SIZE (move_mode) * unroll_factor * 2 < count
+               && unroll_factor < 4)
+          unroll_factor *= 2;
       size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
       break;
     case rep_prefix_8_byte:
@@ -22904,6 +23001,7 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
          emit_move_insn (loop_iter, const0_rtx);
        }
       label = gen_label_rtx ();
+      early_jump = true;
       emit_cmp_and_jump_insns (count_exp,
                                GEN_INT (epilogue_size_needed),
                                LTU, 0, counter_mode (count_exp), 1, label);
@@ -23016,7 +23114,7 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
          vec_promoted_val =
            promote_duplicated_reg_to_size (gpr_promoted_val,
                                            GET_MODE_SIZE (move_mode),
-                                           desired_align, align);
+                                           GET_MODE_SIZE (move_mode), align);
          loop_iter = expand_set_or_movmem_via_loop_with_iter (dst, NULL, destreg,
                                           NULL, vec_promoted_val, count_exp,
                                           loop_iter, move_mode, unroll_factor,
@@ -23065,21 +23163,26 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
       LABEL_NUSES (label) = 1;
       /* We can not rely on fact that promoved value is known.  */
       vec_promoted_val = 0;
-      gpr_promoted_val = 0;
+      if (early_jump)
+        gpr_promoted_val = 0;
     }
  epilogue:
   if (alg == unrolled_loop || alg == sse_loop)
     {
       rtx tmp;
-      if (align_unknown && unroll_factor > 1
-         && epilogue_size_needed >= GET_MODE_SIZE (move_mode)
-         && vec_promoted_val)
+      int remainder_size = epilogue_size_needed;
+      if (count && desired_align <= align)
+        remainder_size = count % epilogue_size_needed;
+      /* We may not need the epilgoue loop at all when the count is known
+         and alignment is not adjusted.  */
+      if (remainder_size > 31
+          && (alg == sse_loop ? vec_promoted_val : gpr_promoted_val))
        {
          /* Reduce epilogue's size by creating not-unrolled loop.  If we won't
            do this, we can have very big epilogue - when alignment is statically
           unknown we'll have the epilogue byte by byte which may be very slow.  */
          loop_iter = expand_set_or_movmem_via_loop_with_iter (dst, NULL, destreg,
-             NULL, vec_promoted_val, count_exp,
+             NULL, (alg == sse_loop ? vec_promoted_val : gpr_promoted_val), count_exp,
              loop_iter, move_mode, 1,
              expected_size, false);
          dst = change_address (dst, BLKmode, destreg);
@@ -23090,17 +23193,14 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
       if (tmp != destreg)
        emit_move_insn (destreg, tmp);
     }
-  if (count_exp == const0_rtx)
+  if (count_exp == const0_rtx || epilogue_size_needed <= 1)
     ;
-  else if (!gpr_promoted_val && epilogue_size_needed > 1)
+  else if (!gpr_promoted_val)
     expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
                                     epilogue_size_needed);
   else
-    {
-      if (epilogue_size_needed > 1)
-       expand_setmem_epilogue (dst, destreg, vec_promoted_val, gpr_promoted_val,
-                               val_exp, count_exp, epilogue_size_needed);
-    }
+    expand_setmem_epilogue (dst, destreg, vec_promoted_val, gpr_promoted_val,
+                            val_exp, count_exp, epilogue_size_needed);
   if (jump_around_label)
     emit_label (jump_around_label);
   return true;
@@ -25245,6 +25345,7 @@ enum ix86_builtins
   IX86_BUILTIN_CVTTPS2DQ,
 
   IX86_BUILTIN_MOVNTI,
+  IX86_BUILTIN_MOVNTI64,
   IX86_BUILTIN_MOVNTPD,
   IX86_BUILTIN_MOVNTDQ,
@@ -26318,7 +26419,7 @@ static const struct builtin_description bdesc_special_args[] =
   /* SSE or 3DNow!A  */
   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
-  { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntdi, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
+  { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntq, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
 
   /* SSE2 */
   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
@@ -26327,7 +26428,8 @@ static const struct builtin_description bdesc_special_args[] =
   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
-  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntsi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
+  { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, "__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) VOID_FTYPE_PLONGLONG_LONGLONG },
 
   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
@@ -29313,6 +29415,7 @@ ix86_expand_special_args_builtin (const struct builtin_description *d,
     case VOID_FTYPE_PFLOAT_V4SF:
     case VOID_FTYPE_PDOUBLE_V4DF:
     case VOID_FTYPE_PDOUBLE_V2DF:
+    case VOID_FTYPE_PLONGLONG_LONGLONG:
     case VOID_FTYPE_PULONGLONG_ULONGLONG:
     case VOID_FTYPE_PINT_INT:
       nargs = 1;
@@ -29914,7 +30017,7 @@ rdrand_step:
       icode = CODE_FOR_avx2_gatherdiv8sf;
       goto gather_gen;
     case IX86_BUILTIN_GATHERALTSIV4DI:
-      icode = CODE_FOR_avx2_gathersiv4df;
+      icode = CODE_FOR_avx2_gathersiv4di;
       goto gather_gen;
     case IX86_BUILTIN_GATHERALTDIV8SI:
       icode = CODE_FOR_avx2_gatherdiv8si;
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index bc602532304..912c17229a2 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -115,6 +115,7 @@
   UNSPEC_FIX_NOTRUNC
   UNSPEC_MASKMOV
   UNSPEC_MOVMSK
+  UNSPEC_MOVNTQ
   UNSPEC_MOVNT
   UNSPEC_MOVU
   UNSPEC_RCP
diff --git a/gcc/config/i386/i386elf.h b/gcc/config/i386/i386elf.h
index 179c601738d..1bf3feb7479 100644
--- a/gcc/config/i386/i386elf.h
+++ b/gcc/config/i386/i386elf.h
@@ -20,10 +20,6 @@ You should have received a copy of the GNU General Public License
 along with GCC; see the file COPYING3.  If not see
 <http://www.gnu.org/licenses/>.  */
 
-/* Use stabs instead of DWARF debug format.  */
-#undef PREFERRED_DEBUGGING_TYPE
-#define PREFERRED_DEBUGGING_TYPE DBX_DEBUG
-
 /* The ELF ABI for the i386 says that records and unions are
    returned in memory.  */
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index f76834e5ab2..7fa072eb6fe 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -329,10 +329,10 @@
   DONE;
 })
 
-(define_insn "sse_movntdi"
+(define_insn "sse_movntq"
   [(set (match_operand:DI 0 "memory_operand" "=m")
         (unspec:DI [(match_operand:DI 1 "register_operand" "y")]
-                   UNSPEC_MOVNT))]
+                   UNSPEC_MOVNTQ))]
   "TARGET_SSE || TARGET_3DNOW_A"
   "movntq\t{%1, %0|%0, %1}"
   [(set_attr "type" "mmxmov")
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index b8e821de90e..89559966f0e 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -573,15 +573,15 @@
    (set_attr "prefix" "maybe_vex")
    (set_attr "mode" "<sseinsnmode>")])
 
-(define_insn "sse2_movntsi"
-  [(set (match_operand:SI 0 "memory_operand" "=m")
-        (unspec:SI [(match_operand:SI 1 "register_operand" "r")]
-                   UNSPEC_MOVNT))]
+(define_insn "sse2_movnti<mode>"
+  [(set (match_operand:SWI48 0 "memory_operand" "=m")
+        (unspec:SWI48 [(match_operand:SWI48 1 "register_operand" "r")]
+                      UNSPEC_MOVNT))]
   "TARGET_SSE2"
   "movnti\t{%1, %0|%0, %1}"
   [(set_attr "type" "ssemov")
    (set_attr "prefix_data16" "0")
-   (set_attr "mode" "V2DF")])
+   (set_attr "mode" "<MODE>")])
 
 (define_insn "<sse>_movnt<mode>"
   [(set (match_operand:VF 0 "memory_operand" "=m")
@@ -614,8 +614,9 @@
 
 ;; Modes handled by storent patterns.
 (define_mode_iterator STORENT_MODE
-  [(SI "TARGET_SSE2") (SF "TARGET_SSE4A") (DF "TARGET_SSE4A")
-   (V2DI "TARGET_SSE2")
+  [(DI "TARGET_SSE2 && TARGET_64BIT") (SI "TARGET_SSE2")
+   (SF "TARGET_SSE4A") (DF "TARGET_SSE4A")
+   (V4DI "TARGET_AVX") (V2DI "TARGET_SSE2")
    (V8SF "TARGET_AVX") V4SF
    (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")])
@@ -9962,17 +9963,32 @@
 {
   rtx tmp0, tmp1;
 
-  tmp0 = gen_reg_rtx (<MODE>mode);
-  tmp1 = gen_reg_rtx (<MODE>mode);
+  if (<MODE>mode == V2DFmode
+      && TARGET_AVX && !TARGET_PREFER_AVX128)
+    {
+      rtx tmp2 = gen_reg_rtx (V4DFmode);
 
-  emit_insn
-    (gen_<sse4_1>_round<ssemodesuffix><avxsizesuffix> (tmp0, operands[1],
-                                                       operands[3]));
-  emit_insn
-    (gen_<sse4_1>_round<ssemodesuffix><avxsizesuffix> (tmp1, operands[2],
-                                                       operands[3]));
-  emit_insn
-    (gen_vec_pack_sfix_trunc_<mode> (operands[0], tmp0, tmp1));
+      tmp0 = gen_reg_rtx (V4DFmode);
+      tmp1 = force_reg (V2DFmode, operands[1]);
+
+      emit_insn (gen_avx_vec_concatv4df (tmp0, tmp1, operands[2]));
+      emit_insn (gen_avx_roundpd256 (tmp2, tmp0, operands[3]));
+      emit_insn (gen_fix_truncv4dfv4si2 (operands[0], tmp2));
+    }
+  else
+    {
+      tmp0 = gen_reg_rtx (<MODE>mode);
+      tmp1 = gen_reg_rtx (<MODE>mode);
+
+      emit_insn
+        (gen_<sse4_1>_round<ssemodesuffix><avxsizesuffix> (tmp0, operands[1],
+                                                           operands[3]));
+      emit_insn
+        (gen_<sse4_1>_round<ssemodesuffix><avxsizesuffix> (tmp1, operands[2],
+                                                           operands[3]));
+      emit_insn
+        (gen_vec_pack_sfix_trunc_<mode> (operands[0], tmp0, tmp1));
+    }
   DONE;
 })
@@ -10053,14 +10069,29 @@
 {
   rtx tmp0, tmp1;
 
-  tmp0 = gen_reg_rtx (<MODE>mode);
-  tmp1 = gen_reg_rtx (<MODE>mode);
+  if (<MODE>mode == V2DFmode
+      && TARGET_AVX && !TARGET_PREFER_AVX128)
+    {
+      rtx tmp2 = gen_reg_rtx (V4DFmode);
 
-  emit_insn (gen_round<mode>2 (tmp0, operands[1]));
-  emit_insn (gen_round<mode>2 (tmp1, operands[2]));
+      tmp0 = gen_reg_rtx (V4DFmode);
+      tmp1 = force_reg (V2DFmode, operands[1]);
 
-  emit_insn
-    (gen_vec_pack_sfix_trunc_<mode> (operands[0], tmp0, tmp1));
+      emit_insn (gen_avx_vec_concatv4df (tmp0, tmp1, operands[2]));
+      emit_insn (gen_roundv4df2 (tmp2, tmp0));
+      emit_insn (gen_fix_truncv4dfv4si2 (operands[0], tmp2));
+    }
+  else
+    {
+      tmp0 = gen_reg_rtx (<MODE>mode);
+      tmp1 = gen_reg_rtx (<MODE>mode);
+
+      emit_insn (gen_round<mode>2 (tmp0, operands[1]));
+      emit_insn (gen_round<mode>2 (tmp1, operands[2]));
+
+      emit_insn
+        (gen_vec_pack_sfix_trunc_<mode> (operands[0], tmp0, tmp1));
+    }
   DONE;
 })
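
Note: among other things, the emmintrin.h hunk above adds an _mm_stream_si64 intrinsic (a 64-bit non-temporal store via movnti), available only under __x86_64__. A minimal usage sketch follows; it is not part of this commit, the helper name fill_stream64 is invented for illustration, and it assumes a 64-bit target built with SSE2 enabled.

#include <stddef.h>
#include <emmintrin.h>  /* SSE2 intrinsics; declares _mm_stream_si64 on x86-64.  */

/* Fill an 8-byte-aligned buffer of N long longs with VALUE using
   non-temporal 64-bit stores, so the written lines bypass the cache.  */
static void
fill_stream64 (long long *buf, long long value, size_t n)
{
  size_t i;
  for (i = 0; i < n; i++)
    _mm_stream_si64 (buf + i, value);
  _mm_sfence ();  /* Order the non-temporal stores before later accesses.  */
}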