Diffstat (limited to 'gcc/config/i386/i386.c')
 gcc/config/i386/i386.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++++----------------------------
 1 file changed, 49 insertions(+), 28 deletions(-)
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index e61d2c55ea5..5ab999db6a7 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -24621,9 +24621,10 @@ alg_usable_p (enum stringop_alg alg, bool memset)
static enum stringop_alg
decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
- bool memset, bool zero_memset, int *dynamic_check, bool *noalign)
+ bool memset, bool zero_memset, int *dynamic_check, bool *noalign,
+ bool recur)
{
- const struct stringop_algs * algs;
+ const struct stringop_algs *algs;
bool optimize_for_speed;
int max = 0;
const struct processor_costs *cost;
@@ -24657,7 +24658,7 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
any_alg_usable_p |= usable;
if (candidate != libcall && candidate && usable)
- max = algs->size[i].max;
+ max = algs->size[i].max;
}
/* If expected size is not known but max size is small enough
@@ -24667,7 +24668,7 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
&& expected_size == -1)
expected_size = min_size / 2 + max_size / 2;
- /* If user specified the algorithm, honnor it if possible. */
+ /* If user specified the algorithm, honor it if possible. */
if (ix86_stringop_alg != no_stringop
&& alg_usable_p (ix86_stringop_alg, memset))
return ix86_stringop_alg;
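
An aside on the expected_size fallback in the hunk above: computing the midpoint as min_size / 2 + max_size / 2 rather than (min_size + max_size) / 2 avoids wrapping when both unsigned bounds are large, at the cost of rounding down by one when both operands are odd. A minimal standalone sketch of that trade-off (hypothetical helper name, not GCC code):

#include <stdio.h>
#include <limits.h>

/* Hypothetical illustration: overflow-safe midpoint of two unsigned values.
   (a + b) / 2 can wrap if a + b exceeds the type's maximum;
   a / 2 + b / 2 cannot, though it rounds down by one when both are odd.  */
static unsigned long long
safe_midpoint (unsigned long long a, unsigned long long b)
{
  return a / 2 + b / 2;
}

int
main (void)
{
  unsigned long long big = ULLONG_MAX - 1;
  /* The naive (big + big) / 2 would wrap; the split form does not.  */
  printf ("%llu\n", safe_midpoint (big, big)); /* prints ULLONG_MAX - 1 */
  return 0;
}
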
@@ -24742,21 +24743,20 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
|| !alg_usable_p (algs->unknown_size, memset)))
{
enum stringop_alg alg;
+ HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
- /* If there aren't any usable algorithms, then recursing on
- smaller sizes isn't going to find anything. Just return the
- simple byte-at-a-time copy loop. */
- if (!any_alg_usable_p)
- {
- /* Pick something reasonable. */
- if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
- *dynamic_check = 128;
- return loop_1_byte;
- }
- if (max <= 0)
- max = 4096;
- alg = decide_alg (count, max / 2, min_size, max_size, memset,
- zero_memset, dynamic_check, noalign);
+ /* If there aren't any usable algorithms or if we are already recursing,
+ then recursing on smaller or equal sizes isn't going to
+ find anything.  Just return the simple byte-at-a-time copy loop. */
+ if (!any_alg_usable_p || recur)
+ {
+ /* Pick something reasonable. */
+ if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
+ *dynamic_check = 128;
+ return loop_1_byte;
+ }
+ alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
+ zero_memset, dynamic_check, noalign, true);
gcc_assert (*dynamic_check == -1);
if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
*dynamic_check = max;
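
The fix above is a one-shot recursion guard: the external caller passes recur = false, the single self-call passes true, and a recursive invocation that still finds nothing falls back to loop_1_byte instead of recursing again, turning a potentially unbounded self-call chain into at most one retry. A minimal sketch of the pattern with hypothetical names:

#include <stdio.h>

/* One-shot recursion guard, sketched with hypothetical names.
   choose () may retry once with an adjusted hint; the RECUR flag
   guarantees the retry itself cannot retry again.  */
static int
choose (int hint, int recur)
{
  if (hint > 0)
    return hint;                /* Usable answer on this attempt.  */
  if (recur)
    return 1;                   /* Already retried once: take the fallback.  */
  return choose (4096 / 2, 1);  /* Retry exactly once with a new hint.  */
}

int
main (void)
{
  printf ("%d\n", choose (-1, 0)); /* prints 2048 */
  return 0;
}
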
@@ -25012,7 +25012,7 @@ ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
alg = decide_alg (count, expected_size, min_size, probable_max_size,
issetmem,
issetmem && val_exp == const0_rtx,
- &dynamic_check, &noalign);
+ &dynamic_check, &noalign, false);
if (alg == libcall)
return false;
gcc_assert (alg != no_stringop);
@@ -30804,6 +30804,14 @@ def_builtin (HOST_WIDE_INT mask, const char *name,
{
ix86_builtins_isa[(int) code].isa = mask;
+ /* OPTION_MASK_ISA_AVX512VL has a special meaning.  Unlike the generic case,
+ where any set bit means the built-in is enabled, this bit must be *and-ed*
+ with another one.  E.g.: OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL
+ means that *both* cpuid bits must be set for the built-in to be available.
+ Handle this here. */
+ if (mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
+ mask &= ~OPTION_MASK_ISA_AVX512VL;
+
mask &= ~OPTION_MASK_ISA_64BIT;
if (mask == 0
|| (mask & ix86_isa_flags) != 0
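
To make the new AVX512VL handling concrete: when the VL bit is set in both the builtin's mask and the enabled ISA flags, it is stripped, so the availability test that follows hinges on the remaining bits alone. A self-contained sketch of that bit logic, using made-up flag values rather than the real OPTION_MASK_ISA_* constants:

#include <stdio.h>

/* Sketch of the AVX512VL "and" semantics with made-up bit values.
   When the VL bit is set in both the builtin mask and the enabled-ISA
   flags, strip it, so availability then hinges on the *other* bit.  */
#define ISA_AVX512DQ 0x1
#define ISA_AVX512VL 0x2

static int
builtin_available (unsigned mask, unsigned isa_flags)
{
  if (mask & isa_flags & ISA_AVX512VL)
    mask &= ~ISA_AVX512VL;
  return mask == 0 || (mask & isa_flags) != 0;
}

int
main (void)
{
  unsigned mask = ISA_AVX512DQ | ISA_AVX512VL;
  printf ("%d\n", builtin_available (mask, ISA_AVX512VL));                /* 0: DQ missing */
  printf ("%d\n", builtin_available (mask, ISA_AVX512DQ | ISA_AVX512VL)); /* 1: both set   */
  return 0;
}
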
@@ -32465,9 +32473,9 @@ static const struct builtin_description bdesc_args[] =
{ OPTION_MASK_ISA_AVX512BW | OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_permvarv16hi_mask, "__builtin_ia32_permvarhi256_mask", IX86_BUILTIN_VPERMVARHI256_MASK, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_V16HI_HI },
{ OPTION_MASK_ISA_AVX512BW | OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_permvarv8hi_mask, "__builtin_ia32_permvarhi128_mask", IX86_BUILTIN_VPERMVARHI128_MASK, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_V8HI_QI },
{ OPTION_MASK_ISA_AVX512BW | OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_vpermt2varv16hi3_mask, "__builtin_ia32_vpermt2varhi256_mask", IX86_BUILTIN_VPERMT2VARHI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_V16HI_HI },
- { OPTION_MASK_ISA_AVX512BW | OPTION_MASK_ISA_AVX512BW | OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_vpermt2varv16hi3_maskz, "__builtin_ia32_vpermt2varhi256_maskz", IX86_BUILTIN_VPERMT2VARHI256_MASKZ, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_V16HI_HI },
+ { OPTION_MASK_ISA_AVX512BW | OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_vpermt2varv16hi3_maskz, "__builtin_ia32_vpermt2varhi256_maskz", IX86_BUILTIN_VPERMT2VARHI256_MASKZ, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_V16HI_HI },
{ OPTION_MASK_ISA_AVX512BW | OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_vpermt2varv8hi3_mask, "__builtin_ia32_vpermt2varhi128_mask", IX86_BUILTIN_VPERMT2VARHI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_V8HI_QI },
- { OPTION_MASK_ISA_AVX512BW | OPTION_MASK_ISA_AVX512BW | OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_vpermt2varv8hi3_maskz, "__builtin_ia32_vpermt2varhi128_maskz", IX86_BUILTIN_VPERMT2VARHI128_MASKZ, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_V8HI_QI },
+ { OPTION_MASK_ISA_AVX512BW | OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_vpermt2varv8hi3_maskz, "__builtin_ia32_vpermt2varhi128_maskz", IX86_BUILTIN_VPERMT2VARHI128_MASKZ, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_V8HI_QI },
{ OPTION_MASK_ISA_AVX512BW | OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_vpermi2varv16hi3_mask, "__builtin_ia32_vpermi2varhi256_mask", IX86_BUILTIN_VPERMI2VARHI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_V16HI_HI },
{ OPTION_MASK_ISA_AVX512BW | OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_vpermi2varv8hi3_mask, "__builtin_ia32_vpermi2varhi128_mask", IX86_BUILTIN_VPERMI2VARHI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_V8HI_QI },
{ OPTION_MASK_ISA_AVX512VL, CODE_FOR_rcp14v4df_mask, "__builtin_ia32_rcp14pd256_mask", IX86_BUILTIN_RCP14PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_QI },
@@ -33182,9 +33190,9 @@ static const struct builtin_description bdesc_args[] =
{ OPTION_MASK_ISA_AVX512VBMI | OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_permvarv32qi_mask, "__builtin_ia32_permvarqi256_mask", IX86_BUILTIN_VPERMVARQI256_MASK, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI_SI },
{ OPTION_MASK_ISA_AVX512VBMI | OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_permvarv16qi_mask, "__builtin_ia32_permvarqi128_mask", IX86_BUILTIN_VPERMVARQI128_MASK, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI_HI },
{ OPTION_MASK_ISA_AVX512VBMI | OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_vpermt2varv32qi3_mask, "__builtin_ia32_vpermt2varqi256_mask", IX86_BUILTIN_VPERMT2VARQI256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI_SI },
- { OPTION_MASK_ISA_AVX512VBMI | OPTION_MASK_ISA_AVX512VBMI | OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_vpermt2varv32qi3_maskz, "__builtin_ia32_vpermt2varqi256_maskz", IX86_BUILTIN_VPERMT2VARQI256_MASKZ, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI_SI },
+ { OPTION_MASK_ISA_AVX512VBMI | OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_vpermt2varv32qi3_maskz, "__builtin_ia32_vpermt2varqi256_maskz", IX86_BUILTIN_VPERMT2VARQI256_MASKZ, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI_SI },
{ OPTION_MASK_ISA_AVX512VBMI | OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_vpermt2varv16qi3_mask, "__builtin_ia32_vpermt2varqi128_mask", IX86_BUILTIN_VPERMT2VARQI128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI_HI },
- { OPTION_MASK_ISA_AVX512VBMI | OPTION_MASK_ISA_AVX512VBMI | OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_vpermt2varv16qi3_maskz, "__builtin_ia32_vpermt2varqi128_maskz", IX86_BUILTIN_VPERMT2VARQI128_MASKZ, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI_HI },
+ { OPTION_MASK_ISA_AVX512VBMI | OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_vpermt2varv16qi3_maskz, "__builtin_ia32_vpermt2varqi128_maskz", IX86_BUILTIN_VPERMT2VARQI128_MASKZ, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI_HI },
{ OPTION_MASK_ISA_AVX512VBMI | OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_vpermi2varv32qi3_mask, "__builtin_ia32_vpermi2varqi256_mask", IX86_BUILTIN_VPERMI2VARQI256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI_SI },
{ OPTION_MASK_ISA_AVX512VBMI | OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_vpermi2varv16qi3_mask, "__builtin_ia32_vpermi2varqi128_mask", IX86_BUILTIN_VPERMI2VARQI128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI_HI },
};
@@ -45211,7 +45219,12 @@ half:
tmp = gen_reg_rtx (mode);
emit_insn (gen_rtx_SET (VOIDmode, tmp,
gen_rtx_VEC_DUPLICATE (mode, val)));
- emit_insn (gen_blendm (target, tmp, target,
+ /* The avx512*_blendm<mode> expanders have a different operand order
+ from VEC_MERGE.  In VEC_MERGE, the first input operand is used for
+ elements where the mask is set and the second input operand otherwise;
+ in {sse,avx}*_*blend* the first input operand is used for elements
+ where the mask is clear and the second input operand otherwise. */
+ emit_insn (gen_blendm (target, target, tmp,
force_reg (mmode,
gen_int_mode (1 << elt, mmode))));
}
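
The operand-order difference described in the new comment amounts to opposite mask polarity: in a VEC_MERGE, a set mask bit selects the element from the first input, while in the avx512*_blendm<mode> expanders a set mask bit selects from the second input, hence the swapped target/tmp arguments. A toy per-element model of the two conventions (hypothetical scalar functions, not rtl):

#include <stdio.h>

/* Per-element model of the two mask conventions the comment contrasts.
   vec_merge: a set mask bit selects from the FIRST input.
   blendm:    a set mask bit selects from the SECOND input.
   Hypothetical scalar helpers, one "element" per call.  */
static int
vec_merge_elt (int a, int b, int mask_bit)
{
  return mask_bit ? a : b;
}

static int
blendm_elt (int a, int b, int mask_bit)
{
  return mask_bit ? b : a;
}

int
main (void)
{
  /* Same inputs, same mask bit, opposite picks -- hence the swapped
     target/tmp order in the gen_blendm call above.  */
  printf ("%d %d\n", vec_merge_elt (10, 20, 1), blendm_elt (10, 20, 1)); /* 10 20 */
  return 0;
}
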
@@ -50028,16 +50041,24 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
{
/* For SSE2, we used a full interleave, so the desired
results are in the even elements. */
- for (i = 0; i < 64; ++i)
+ for (i = 0; i < d.nelt; ++i)
d.perm[i] = i * 2;
}
else
{
/* For AVX, the interleave used above was not cross-lane. So the
extraction is evens but with the second and third quarter swapped.
- Happily, that is even one insn shorter than even extraction. */
- for (i = 0; i < 64; ++i)
- d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);
+ Happily, that is even one insn shorter than even extraction.
+ For AVX512BW we have 4 lanes. We extract evens from within a lane,
+ always first from the first and then from the second source operand,
+ the index bits above the low 4 bits remain the same.
+ Thus, for d.nelt == 32 we want permutation
+ 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
+ and for d.nelt == 64 we want permutation
+ 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
+ 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
+ for (i = 0; i < d.nelt; ++i)
+ d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
}
ok = ix86_expand_vec_perm_const_1 (&d);
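
The closed-form index expression in the new loop can be sanity-checked by printing it for the two element counts the comment enumerates; a throwaway harness for that (plain C, assuming nelt is 32 or 64):

#include <stdio.h>

/* Throwaway check of the even-extraction permutation formula for the
   AVX512BW case: prints the index table for nelt == 32 and nelt == 64,
   eight entries per row, for comparison against the sequences listed
   in the comment above.  */
static void
print_perm (int nelt)
{
  printf ("nelt == %d:\n", nelt);
  for (int i = 0; i < nelt; ++i)
    printf ("%d%c", ((i * 2) & 14) + ((i & 8) ? nelt : 0) + (i & ~15),
            (i % 8 == 7) ? '\n' : ',');
}

int
main (void)
{
  print_perm (32);  /* 0,2,..14, 32,34,..46, 16,18,..30, 48,50,..62 */
  print_perm (64);
  return 0;
}
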