diff options
Diffstat (limited to 'gcc/config/i386/i386.c')
-rw-r--r-- | gcc/config/i386/i386.c | 77 |
1 files changed, 49 insertions, 28 deletions
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index e61d2c55ea5..5ab999db6a7 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -24621,9 +24621,10 @@ alg_usable_p (enum stringop_alg alg, bool memset) static enum stringop_alg decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size, - bool memset, bool zero_memset, int *dynamic_check, bool *noalign) + bool memset, bool zero_memset, int *dynamic_check, bool *noalign, + bool recur) { - const struct stringop_algs * algs; + const struct stringop_algs *algs; bool optimize_for_speed; int max = 0; const struct processor_costs *cost; @@ -24657,7 +24658,7 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, any_alg_usable_p |= usable; if (candidate != libcall && candidate && usable) - max = algs->size[i].max; + max = algs->size[i].max; } /* If expected size is not known but max size is small enough @@ -24667,7 +24668,7 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, && expected_size == -1) expected_size = min_size / 2 + max_size / 2; - /* If user specified the algorithm, honnor it if possible. */ + /* If user specified the algorithm, honor it if possible. */ if (ix86_stringop_alg != no_stringop && alg_usable_p (ix86_stringop_alg, memset)) return ix86_stringop_alg; @@ -24742,21 +24743,20 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, || !alg_usable_p (algs->unknown_size, memset))) { enum stringop_alg alg; + HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2; - /* If there aren't any usable algorithms, then recursing on - smaller sizes isn't going to find anything. Just return the - simple byte-at-a-time copy loop. */ - if (!any_alg_usable_p) - { - /* Pick something reasonable. */ - if (TARGET_INLINE_STRINGOPS_DYNAMICALLY) - *dynamic_check = 128; - return loop_1_byte; - } - if (max <= 0) - max = 4096; - alg = decide_alg (count, max / 2, min_size, max_size, memset, - zero_memset, dynamic_check, noalign); + /* If there aren't any usable algorithms or if recursing already, + then recursing on smaller sizes or same size isn't going to + find anything. Just return the simple byte-at-a-time copy loop. */ + if (!any_alg_usable_p || recur) + { + /* Pick something reasonable. */ + if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur) + *dynamic_check = 128; + return loop_1_byte; + } + alg = decide_alg (count, new_expected_size, min_size, max_size, memset, + zero_memset, dynamic_check, noalign, true); gcc_assert (*dynamic_check == -1); if (TARGET_INLINE_STRINGOPS_DYNAMICALLY) *dynamic_check = max; @@ -25012,7 +25012,7 @@ ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp, alg = decide_alg (count, expected_size, min_size, probable_max_size, issetmem, issetmem && val_exp == const0_rtx, - &dynamic_check, &noalign); + &dynamic_check, &noalign, false); if (alg == libcall) return false; gcc_assert (alg != no_stringop); @@ -30804,6 +30804,14 @@ def_builtin (HOST_WIDE_INT mask, const char *name, { ix86_builtins_isa[(int) code].isa = mask; + /* OPTION_MASK_ISA_AVX512VL has special meaning. Despite of generic case, + where any bit set means that built-in is enable, this bit must be *and-ed* + with another one. E.g.: OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL + means that *both* cpuid bits must be set for the built-in to be available. + Handle this here. */ + if (mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512VL) + mask &= ~OPTION_MASK_ISA_AVX512VL; + mask &= ~OPTION_MASK_ISA_64BIT; if (mask == 0 || (mask & ix86_isa_flags) != 0 @@ -32465,9 +32473,9 @@ static const struct builtin_description bdesc_args[] = { OPTION_MASK_ISA_AVX512BW | OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_permvarv16hi_mask, "__builtin_ia32_permvarhi256_mask", IX86_BUILTIN_VPERMVARHI256_MASK, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_V16HI_HI }, { OPTION_MASK_ISA_AVX512BW | OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_permvarv8hi_mask, "__builtin_ia32_permvarhi128_mask", IX86_BUILTIN_VPERMVARHI128_MASK, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_V8HI_QI }, { OPTION_MASK_ISA_AVX512BW | OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_vpermt2varv16hi3_mask, "__builtin_ia32_vpermt2varhi256_mask", IX86_BUILTIN_VPERMT2VARHI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_V16HI_HI }, - { OPTION_MASK_ISA_AVX512BW | OPTION_MASK_ISA_AVX512BW | OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_vpermt2varv16hi3_maskz, "__builtin_ia32_vpermt2varhi256_maskz", IX86_BUILTIN_VPERMT2VARHI256_MASKZ, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_V16HI_HI }, + { OPTION_MASK_ISA_AVX512BW | OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_vpermt2varv16hi3_maskz, "__builtin_ia32_vpermt2varhi256_maskz", IX86_BUILTIN_VPERMT2VARHI256_MASKZ, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_V16HI_HI }, { OPTION_MASK_ISA_AVX512BW | OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_vpermt2varv8hi3_mask, "__builtin_ia32_vpermt2varhi128_mask", IX86_BUILTIN_VPERMT2VARHI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_V8HI_QI }, - { OPTION_MASK_ISA_AVX512BW | OPTION_MASK_ISA_AVX512BW | OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_vpermt2varv8hi3_maskz, "__builtin_ia32_vpermt2varhi128_maskz", IX86_BUILTIN_VPERMT2VARHI128_MASKZ, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_V8HI_QI }, + { OPTION_MASK_ISA_AVX512BW | OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_vpermt2varv8hi3_maskz, "__builtin_ia32_vpermt2varhi128_maskz", IX86_BUILTIN_VPERMT2VARHI128_MASKZ, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_V8HI_QI }, { OPTION_MASK_ISA_AVX512BW | OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_vpermi2varv16hi3_mask, "__builtin_ia32_vpermi2varhi256_mask", IX86_BUILTIN_VPERMI2VARHI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_V16HI_HI }, { OPTION_MASK_ISA_AVX512BW | OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_vpermi2varv8hi3_mask, "__builtin_ia32_vpermi2varhi128_mask", IX86_BUILTIN_VPERMI2VARHI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_V8HI_QI }, { OPTION_MASK_ISA_AVX512VL, CODE_FOR_rcp14v4df_mask, "__builtin_ia32_rcp14pd256_mask", IX86_BUILTIN_RCP14PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_QI }, @@ -33182,9 +33190,9 @@ static const struct builtin_description bdesc_args[] = { OPTION_MASK_ISA_AVX512VBMI | OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_permvarv32qi_mask, "__builtin_ia32_permvarqi256_mask", IX86_BUILTIN_VPERMVARQI256_MASK, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI_SI }, { OPTION_MASK_ISA_AVX512VBMI | OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_permvarv16qi_mask, "__builtin_ia32_permvarqi128_mask", IX86_BUILTIN_VPERMVARQI128_MASK, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI_HI }, { OPTION_MASK_ISA_AVX512VBMI | OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_vpermt2varv32qi3_mask, "__builtin_ia32_vpermt2varqi256_mask", IX86_BUILTIN_VPERMT2VARQI256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI_SI }, - { OPTION_MASK_ISA_AVX512VBMI | OPTION_MASK_ISA_AVX512VBMI | OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_vpermt2varv32qi3_maskz, "__builtin_ia32_vpermt2varqi256_maskz", IX86_BUILTIN_VPERMT2VARQI256_MASKZ, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI_SI }, + { OPTION_MASK_ISA_AVX512VBMI | OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_vpermt2varv32qi3_maskz, "__builtin_ia32_vpermt2varqi256_maskz", IX86_BUILTIN_VPERMT2VARQI256_MASKZ, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI_SI }, { OPTION_MASK_ISA_AVX512VBMI | OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_vpermt2varv16qi3_mask, "__builtin_ia32_vpermt2varqi128_mask", IX86_BUILTIN_VPERMT2VARQI128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI_HI }, - { OPTION_MASK_ISA_AVX512VBMI | OPTION_MASK_ISA_AVX512VBMI | OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_vpermt2varv16qi3_maskz, "__builtin_ia32_vpermt2varqi128_maskz", IX86_BUILTIN_VPERMT2VARQI128_MASKZ, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI_HI }, + { OPTION_MASK_ISA_AVX512VBMI | OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_vpermt2varv16qi3_maskz, "__builtin_ia32_vpermt2varqi128_maskz", IX86_BUILTIN_VPERMT2VARQI128_MASKZ, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI_HI }, { OPTION_MASK_ISA_AVX512VBMI | OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_vpermi2varv32qi3_mask, "__builtin_ia32_vpermi2varqi256_mask", IX86_BUILTIN_VPERMI2VARQI256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI_SI }, { OPTION_MASK_ISA_AVX512VBMI | OPTION_MASK_ISA_AVX512VL, CODE_FOR_avx512vl_vpermi2varv16qi3_mask, "__builtin_ia32_vpermi2varqi128_mask", IX86_BUILTIN_VPERMI2VARQI128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI_HI }, }; @@ -45211,7 +45219,12 @@ half: tmp = gen_reg_rtx (mode); emit_insn (gen_rtx_SET (VOIDmode, tmp, gen_rtx_VEC_DUPLICATE (mode, val))); - emit_insn (gen_blendm (target, tmp, target, + /* The avx512*_blendm<mode> expanders have different operand order + from VEC_MERGE. In VEC_MERGE, the first input operand is used for + elements where the mask is set and second input operand otherwise, + in {sse,avx}*_*blend* the first input operand is used for elements + where the mask is clear and second input operand otherwise. */ + emit_insn (gen_blendm (target, target, tmp, force_reg (mmode, gen_int_mode (1 << elt, mmode)))); } @@ -50028,16 +50041,24 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2) { /* For SSE2, we used an full interleave, so the desired results are in the even elements. */ - for (i = 0; i < 64; ++i) + for (i = 0; i < d.nelt; ++i) d.perm[i] = i * 2; } else { /* For AVX, the interleave used above was not cross-lane. So the extraction is evens but with the second and third quarter swapped. - Happily, that is even one insn shorter than even extraction. */ - for (i = 0; i < 64; ++i) - d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0); + Happily, that is even one insn shorter than even extraction. + For AVX512BW we have 4 lanes. We extract evens from within a lane, + always first from the first and then from the second source operand, + the index bits above the low 4 bits remains the same. + Thus, for d.nelt == 32 we want permutation + 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62 + and for d.nelt == 64 we want permutation + 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94, + 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */ + for (i = 0; i < d.nelt; ++i) + d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15); } ok = ix86_expand_vec_perm_const_1 (&d); |