diff options
author | Richard Sandiford <richard.sandiford@linaro.org> | 2017-03-30 11:38:10 +0000 |
---|---|---|
committer | Richard Sandiford <richard.sandiford@linaro.org> | 2017-11-20 16:01:23 +0000 |
commit | c9ef88a25bd1e416d31af8a7e184dc07a8e67006 (patch) | |
tree | 7f95dbd5b7e18e654266932975e3c781bc358e85 | |
parent | 164804dbc17d5bf70634127b342e221cda938b6b (diff) | |
download | gcc-c9ef88a25bd1e416d31af8a7e184dc07a8e67006.tar.gz |
Support fused multiply-adds in fully-masked reductions
This patch adds support for fusing a conditional add or subtract
with a multiplication, so that we can use fused multiply-add and
multiply-subtract operations for fully-masked reductions.
All current conditional X operations have the form "do X or don't do X
to the first operand" (add/don't add to first operand, etc.). However,
the FMA optabs and functions are ordered so that the accumulator comes
last. There were two obvious ways of resolving this: break the
convention for conditional operators and have "add/don't add to the
final operand" or break the convention for FMA and put the accumulator
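first. The patch goes for the latter, but adds a _REV suffix to the new
optab and internal function names (cond_fma_rev/cond_fnma_rev and
IFN_COND_FMA_REV/IFN_COND_FNMA_REV) to make it obvious that the operands
are in a different order.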
-rw-r--r-- | gcc/config/aarch64/aarch64-sve.md | 16 | ||||
-rw-r--r-- | gcc/config/aarch64/iterators.md | 26 | ||||
-rw-r--r-- | gcc/doc/md.texi | 26 | ||||
-rw-r--r-- | gcc/genmatch.c | 4 | ||||
-rw-r--r-- | gcc/internal-fn.c | 5 | ||||
-rw-r--r-- | gcc/internal-fn.def | 6 | ||||
-rw-r--r-- | gcc/optabs.def | 2 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/sve_reduc_4.c | 18 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/sve_reduc_6.c | 17 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/sve_reduc_7.c | 17 | ||||
-rw-r--r-- | gcc/tree-ssa-math-opts.c | 156 |
11 files changed, 223 insertions, 70 deletions
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index ebf522726f4..7df6445e4ca 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -1629,7 +1629,7 @@ [(match_operand:<VPRED> 1 "register_operand" "Upl") (match_operand:SVE_I 2 "register_operand" "0") (match_operand:SVE_I 3 "register_operand" "w")] - SVE_COND_INT_OP))] + SVE_COND_INT2_OP))] "TARGET_SVE" "<sve_int_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>" ) @@ -2385,11 +2385,23 @@ [(match_operand:<VPRED> 1 "register_operand" "Upl") (match_operand:SVE_F 2 "register_operand" "0") (match_operand:SVE_F 3 "register_operand" "w")] - SVE_COND_FP_OP))] + SVE_COND_FP2_OP))] "TARGET_SVE" "<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>" ) +(define_insn "cond_<optab><mode>" + [(set (match_operand:SVE_F 0 "register_operand" "=w") + (unspec:SVE_F + [(match_operand:<VPRED> 1 "register_operand" "Upl") + (match_operand:SVE_F 2 "register_operand" "0") + (match_operand:SVE_F 3 "register_operand" "w") + (match_operand:SVE_F 4 "register_operand" "w")] + SVE_COND_FP3_OP))] + "TARGET_SVE" + "<sve_fp_op>\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>" +) + (define_insn "*<optab><mode>3_cond" [(set (match_operand:SVE_F 0 "register_operand" "=w") (sve_predicated_comm_fp_op:SVE_F diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index df5965d7fac..e48a6dd99d3 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -430,6 +430,8 @@ UNSPEC_COND_AND ; Used in aarch64-sve.md. UNSPEC_COND_ORR ; Used in aarch64-sve.md. UNSPEC_COND_EOR ; Used in aarch64-sve.md. + UNSPEC_COND_FMLA ; Used in aarch64-sve.md. + UNSPEC_COND_FMLS ; Used in aarch64-sve.md. UNSPEC_COND_LT ; Used in aarch64-sve.md. UNSPEC_COND_LE ; Used in aarch64-sve.md. UNSPEC_COND_EQ ; Used in aarch64-sve.md. 
@@ -1433,14 +1435,16 @@ (define_int_iterator UNPACK_UNSIGNED [UNSPEC_UNPACKULO UNSPEC_UNPACKUHI]) -(define_int_iterator SVE_COND_INT_OP [UNSPEC_COND_ADD UNSPEC_COND_SUB - UNSPEC_COND_SMAX UNSPEC_COND_UMAX - UNSPEC_COND_SMIN UNSPEC_COND_UMIN - UNSPEC_COND_AND - UNSPEC_COND_ORR - UNSPEC_COND_EOR]) +(define_int_iterator SVE_COND_INT2_OP [UNSPEC_COND_ADD UNSPEC_COND_SUB + UNSPEC_COND_SMAX UNSPEC_COND_UMAX + UNSPEC_COND_SMIN UNSPEC_COND_UMIN + UNSPEC_COND_AND + UNSPEC_COND_ORR + UNSPEC_COND_EOR]) -(define_int_iterator SVE_COND_FP_OP [UNSPEC_COND_ADD UNSPEC_COND_SUB]) +(define_int_iterator SVE_COND_FP2_OP [UNSPEC_COND_ADD UNSPEC_COND_SUB]) + +(define_int_iterator SVE_COND_FP3_OP [UNSPEC_COND_FMLA UNSPEC_COND_FMLS]) (define_int_iterator SVE_COND_INT_CMP [UNSPEC_COND_LT UNSPEC_COND_LE UNSPEC_COND_EQ UNSPEC_COND_NE @@ -1483,7 +1487,9 @@ (UNSPEC_COND_UMIN "umin") (UNSPEC_COND_AND "and") (UNSPEC_COND_ORR "ior") - (UNSPEC_COND_EOR "xor")]) + (UNSPEC_COND_EOR "xor") + (UNSPEC_COND_FMLA "fma_rev") + (UNSPEC_COND_FMLS "fnma_rev")]) (define_int_attr maxmin_uns [(UNSPEC_UMAXV "umax") (UNSPEC_UMINV "umin") @@ -1702,4 +1708,6 @@ (UNSPEC_COND_EOR "eor")]) (define_int_attr sve_fp_op [(UNSPEC_COND_ADD "fadd") - (UNSPEC_COND_SUB "fsub")]) + (UNSPEC_COND_SUB "fsub") + (UNSPEC_COND_FMLA "fmla") + (UNSPEC_COND_FMLS "fmls")]) diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi index 98cf869a09f..02248364359 100644 --- a/gcc/doc/md.texi +++ b/gcc/doc/md.texi @@ -6416,6 +6416,32 @@ be in a normal C @samp{?:} condition. Operands 0, 2 and 3 all have mode @var{m}, while operand 1 has the mode returned by @code{TARGET_VECTORIZE_GET_MASK_MODE}. +@cindex @code{cond_fma_rev@var{mode}} instruction pattern +@item @samp{cond_fma_rev@var{mode}} +Similar to @samp{cond_add@var{m}}, but compute: +@smallexample +op0 = op1 ? fma (op3, op4, op2) : op2; +@end smallexample +for scalars and: +@smallexample +op0[I] = op1[I] ? fma (op3[I], op4[I], op2[I]) : op2[I]; +@end smallexample +for vectors. 
The @samp{_rev} indicates that the addend (operand 2) +comes first. + +@cindex @code{cond_fnma_rev@var{mode}} instruction pattern +@item @samp{cond_fnma_rev@var{mode}} +Similar to @samp{cond_fma_rev@var{m}}, but negate operand 3 before +multiplying it. That is, the instruction performs: +@smallexample +op0 = op1 ? fma (-op3, op4, op2) : op2; +@end smallexample +for scalars and: +@smallexample +op0[I] = op1[I] ? fma (-op3[I], op4[I], op2[I]) : op2[I]; +@end smallexample +for vectors. + @cindex @code{neg@var{mode}cc} instruction pattern @item @samp{neg@var{mode}cc} Similar to @samp{mov@var{mode}cc} but for conditional negation. Conditionally diff --git a/gcc/genmatch.c b/gcc/genmatch.c index 06f94ee0dc1..ddbf4291479 100644 --- a/gcc/genmatch.c +++ b/gcc/genmatch.c @@ -485,6 +485,10 @@ commutative_op (id_base *id) case CFN_FNMS: return 0; + case CFN_COND_FMA_REV: + case CFN_COND_FNMA_REV: + return 2; + default: return -1; } diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c index d0e5919a760..a3c252511d0 100644 --- a/gcc/internal-fn.c +++ b/gcc/internal-fn.c @@ -93,6 +93,7 @@ init_internal_fns () #define ternary_direct { 0, 0, true } #define cond_unary_direct { 1, 1, true } #define cond_binary_direct { 1, 1, true } +#define cond_ternary_direct { 1, 1, true } #define while_direct { 0, 2, false } #define fold_extract_direct { 2, 2, false } #define firstfault_load_direct { -1, -1, false } @@ -2964,6 +2965,9 @@ expand_while_optab_fn (internal_fn, gcall *stmt, convert_optab optab) #define expand_cond_binary_optab_fn(FN, STMT, OPTAB) \ expand_direct_optab_fn (FN, STMT, OPTAB, 3) +#define expand_cond_ternary_optab_fn(FN, STMT, OPTAB) \ + expand_direct_optab_fn (FN, STMT, OPTAB, 4) + #define expand_fold_extract_optab_fn(FN, STMT, OPTAB) \ expand_direct_optab_fn (FN, STMT, OPTAB, 3) @@ -3043,6 +3047,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types, #define direct_ternary_optab_supported_p direct_optab_supported_p #define 
direct_cond_unary_optab_supported_p direct_optab_supported_p #define direct_cond_binary_optab_supported_p direct_optab_supported_p +#define direct_cond_ternary_optab_supported_p direct_optab_supported_p #define direct_mask_load_optab_supported_p direct_optab_supported_p #define direct_load_lanes_optab_supported_p multi_vector_optab_supported_p #define direct_mask_load_lanes_optab_supported_p multi_vector_optab_supported_p diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def index d1f8818bb00..742038627ba 100644 --- a/gcc/internal-fn.def +++ b/gcc/internal-fn.def @@ -58,7 +58,8 @@ along with GCC; see the file COPYING3. If not see - binary: a normal binary optab, such as vec_interleave_lo_<mode> - ternary: a normal ternary optab, such as fma<mode>4 - - cond_binary: a conditional binary optab, such as add<mode>cc + - cond_binary: a conditional binary optab, such as cond_add<mode> + - cond_ternary: a conditional ternary optab, such as cond_fma_rev<mode> DEF_INTERNAL_COND_OPTAB_FN defines a conditional function COND_<NAME>, with optab cond_<OPTAB> and type cond_<TYPE>. 
All these functions @@ -152,6 +153,9 @@ DEF_INTERNAL_OPTAB_FN (FMS, ECF_CONST, fms, ternary) DEF_INTERNAL_OPTAB_FN (FNMA, ECF_CONST, fnma, ternary) DEF_INTERNAL_OPTAB_FN (FNMS, ECF_CONST, fnms, ternary) +DEF_INTERNAL_OPTAB_FN (COND_FMA_REV, ECF_CONST, cond_fma_rev, cond_ternary) +DEF_INTERNAL_OPTAB_FN (COND_FNMA_REV, ECF_CONST, cond_fnma_rev, cond_ternary) + DEF_INTERNAL_COND_OPTAB_FN (ADD, ECF_CONST, add, binary) DEF_INTERNAL_COND_OPTAB_FN (SUB, ECF_CONST, sub, binary) DEF_INTERNAL_COND_OPTAB_FN (SMIN, ECF_CONST, smin, binary) diff --git a/gcc/optabs.def b/gcc/optabs.def index ee81ef7a34e..b507b2eb671 100644 --- a/gcc/optabs.def +++ b/gcc/optabs.def @@ -222,6 +222,8 @@ OPTAB_D (notcc_optab, "not$acc") OPTAB_D (movcc_optab, "mov$acc") OPTAB_D (cond_add_optab, "cond_add$a") OPTAB_D (cond_sub_optab, "cond_sub$a") +OPTAB_D (cond_fma_rev_optab, "cond_fma_rev$a") +OPTAB_D (cond_fnma_rev_optab, "cond_fnma_rev$a") OPTAB_D (cond_and_optab, "cond_and$a") OPTAB_D (cond_ior_optab, "cond_ior$a") OPTAB_D (cond_xor_optab, "cond_xor$a") diff --git a/gcc/testsuite/gcc.target/aarch64/sve_reduc_4.c b/gcc/testsuite/gcc.target/aarch64/sve_reduc_4.c new file mode 100644 index 00000000000..9e997adedca --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve_reduc_4.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -ffast-math -march=armv8-a+sve" } */ + +double +f (double *restrict a, double *restrict b, int *lookup) +{ + double res = 0.0; + for (int i = 0; i < 512; ++i) + res += a[lookup[i]] * b[i]; + return res; +} + +/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+.d, p[0-7]/m, } 2 } } */ +/* Check that the vector instructions are the only instructions. 
*/ +/* { dg-final { scan-assembler-times {\tfmla\t} 2 } } */ +/* { dg-final { scan-assembler-not {\tfadd\t} } } */ +/* { dg-final { scan-assembler-times {\tfaddv\td0,} 1 } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve_reduc_6.c b/gcc/testsuite/gcc.target/aarch64/sve_reduc_6.c new file mode 100644 index 00000000000..e1f72941de4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve_reduc_6.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -ffast-math -march=armv8-a+sve" } */ + +#define REDUC(TYPE) \ + TYPE reduc_##TYPE (TYPE *x, TYPE *y, int count) \ + { \ + TYPE sum = 0; \ + for (int i = 0; i < count; ++i) \ + sum += x[i] * y[i]; \ + return sum; \ + } + +REDUC (float) +REDUC (double) + +/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.s, p[0-7]/m} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve_reduc_7.c b/gcc/testsuite/gcc.target/aarch64/sve_reduc_7.c new file mode 100644 index 00000000000..851f52d2cbd --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve_reduc_7.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -ffast-math -march=armv8-a+sve" } */ + +#define REDUC(TYPE) \ + TYPE reduc_##TYPE (TYPE *x, TYPE *y, int count) \ + { \ + TYPE sum = 0; \ + for (int i = 0; i < count; ++i) \ + sum -= x[i] * y[i]; \ + return sum; \ + } + +REDUC (float) +REDUC (double) + +/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.s, p[0-7]/m} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.d, p[0-7]/m} 1 } } */ diff --git a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c index ca2e6e2e098..a600516ded8 100644 --- a/gcc/tree-ssa-math-opts.c +++ b/gcc/tree-ssa-math-opts.c @@ -3552,6 +3552,27 @@ convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple *stmt, return true; } +/* If STMT is a call to IFN_COND_{ADD,SUB}, return the 
equivalent + fused multiply-add/subtract function, otherwise return IFN_LAST. */ + +static internal_fn +fused_cond_internal_fn (gimple *stmt) +{ + gcall *call = dyn_cast <gcall *> (stmt); + if (!call || !gimple_call_internal_p (call)) + return IFN_LAST; + + switch (gimple_call_internal_fn (call)) + { + case IFN_COND_ADD: + return IFN_COND_FMA_REV; + case IFN_COND_SUB: + return IFN_COND_FNMA_REV; + default: + return IFN_LAST; + } +} + /* gimple_fold callback that "valueizes" everything. */ static tree @@ -3601,7 +3622,6 @@ convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2) as an addition. */ FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result) { - enum tree_code use_code; tree result = mul_result; bool negate_p = false; @@ -3622,13 +3642,9 @@ convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2) if (gimple_bb (use_stmt) != gimple_bb (mul_stmt)) return false; - if (!is_gimple_assign (use_stmt)) - return false; - - use_code = gimple_assign_rhs_code (use_stmt); - /* A negate on the multiplication leads to FNMA. */ - if (use_code == NEGATE_EXPR) + if (is_gimple_assign (use_stmt) + && gimple_assign_rhs_code (use_stmt) == NEGATE_EXPR) { ssa_op_iter iter; use_operand_p usep; @@ -3650,51 +3666,60 @@ convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2) use_stmt = neguse_stmt; if (gimple_bb (use_stmt) != gimple_bb (mul_stmt)) return false; - if (!is_gimple_assign (use_stmt)) - return false; - use_code = gimple_assign_rhs_code (use_stmt); negate_p = true; } - switch (use_code) - { - case MINUS_EXPR: - if (gimple_assign_rhs2 (use_stmt) == result) - negate_p = !negate_p; - break; - case PLUS_EXPR: - break; - default: - /* FMA can only be formed from PLUS and MINUS. */ - return false; - } - - /* If the subtrahend (gimple_assign_rhs2 (use_stmt)) is computed - by a MULT_EXPR that we'll visit later, we might be able to - get a more profitable match with fnma. - OTOH, if we don't, a negate / fma pair has likely lower latency - that a mult / subtract pair. 
*/ - if (use_code == MINUS_EXPR && !negate_p - && gimple_assign_rhs1 (use_stmt) == result - && !direct_internal_fn_supported_p (IFN_FMS, type, opt_type) - && direct_internal_fn_supported_p (IFN_FNMA, type, opt_type)) + if (gassign *assign = dyn_cast <gassign *> (use_stmt)) { - tree rhs2 = gimple_assign_rhs2 (use_stmt); - - if (TREE_CODE (rhs2) == SSA_NAME) + switch (gimple_assign_rhs_code (assign)) { - gimple *stmt2 = SSA_NAME_DEF_STMT (rhs2); - if (has_single_use (rhs2) - && is_gimple_assign (stmt2) - && gimple_assign_rhs_code (stmt2) == MULT_EXPR) + case MINUS_EXPR: + if (gimple_assign_rhs2 (use_stmt) == result) + negate_p = !negate_p; + /* If the subtrahend (gimple_assign_rhs2 (use_stmt)) is + computed by a MULT_EXPR that we'll visit later, we + might be able to get a more profitable match with fnma. + OTOH, if we don't, a negate / fma pair has likely lower + latency that a mult / subtract pair. */ + else if (!negate_p + && !direct_internal_fn_supported_p (IFN_FMS, type, + opt_type) + && direct_internal_fn_supported_p (IFN_FNMA, type, + opt_type)) + { + tree rhs2 = gimple_assign_rhs2 (use_stmt); + if (TREE_CODE (rhs2) == SSA_NAME) + { + gimple *stmt2 = SSA_NAME_DEF_STMT (rhs2); + if (has_single_use (rhs2) + && is_gimple_assign (stmt2) + && gimple_assign_rhs_code (stmt2) == MULT_EXPR) + return false; + } + } + break; + case PLUS_EXPR: + break; + default: + /* FMA can only be formed from PLUS and MINUS. */ return false; } - } - /* We can't handle a * b + a * b. */ - if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt)) - return false; + /* We can't handle a * b + a * b. 
*/ + if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt)) + return false; + } + else + { + internal_fn ifn = fused_cond_internal_fn (use_stmt); + if (ifn == IFN_LAST) + return false; + if (result != gimple_call_arg (use_stmt, 2)) + return false; + if (!direct_internal_fn_supported_p (ifn, type, opt_type)) + return false; + } /* While it is possible to validate whether or not the exact form that we've recognized is available in the backend, the assumption @@ -3709,7 +3734,6 @@ convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2) FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result) { gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt); - enum tree_code use_code; tree addop, mulop1 = op1, result = mul_result; bool negate_p = false; gimple_seq seq = NULL; @@ -3717,8 +3741,8 @@ convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2) if (is_gimple_debug (use_stmt)) continue; - use_code = gimple_assign_rhs_code (use_stmt); - if (use_code == NEGATE_EXPR) + if (is_gimple_assign (use_stmt) + && gimple_assign_rhs_code (use_stmt) == NEGATE_EXPR) { result = gimple_assign_lhs (use_stmt); single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt); @@ -3727,23 +3751,33 @@ convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2) use_stmt = neguse_stmt; gsi = gsi_for_stmt (use_stmt); - use_code = gimple_assign_rhs_code (use_stmt); negate_p = true; } - if (gimple_assign_rhs1 (use_stmt) == result) + internal_fn ifn; + if (gassign *assign = dyn_cast <gassign *> (use_stmt)) { - addop = gimple_assign_rhs2 (use_stmt); - /* a * b - c -> a * b + (-c) */ - if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR) - addop = gimple_build (&seq, NEGATE_EXPR, type, addop); + ifn = IFN_FMA; + if (gimple_assign_rhs1 (assign) == result) + { + addop = gimple_assign_rhs2 (assign); + /* a * b - c -> a * b + (-c) */ + if (gimple_assign_rhs_code (assign) == MINUS_EXPR) + addop = gimple_build (&seq, NEGATE_EXPR, type, addop); + } + else + { + addop = gimple_assign_rhs1 
(assign); + /* a - b * c -> (-b) * c + a */ + if (gimple_assign_rhs_code (assign) == MINUS_EXPR) + negate_p = !negate_p; + } } else { - addop = gimple_assign_rhs1 (use_stmt); - /* a - b * c -> (-b) * c + a */ - if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR) - negate_p = !negate_p; + ifn = fused_cond_internal_fn (use_stmt); + gcc_assert (ifn != IFN_LAST); + addop = gimple_call_arg (use_stmt, 1); } if (negate_p) @@ -3751,8 +3785,14 @@ convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2) if (seq) gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT); - fma_stmt = gimple_build_call_internal (IFN_FMA, 3, mulop1, op2, addop); - gimple_call_set_lhs (fma_stmt, gimple_assign_lhs (use_stmt)); + + if (ifn == IFN_FMA) + fma_stmt = gimple_build_call_internal (IFN_FMA, 3, mulop1, op2, addop); + else + fma_stmt = gimple_build_call_internal (ifn, 4, + gimple_call_arg (use_stmt, 0), + addop, mulop1, op2); + gimple_set_lhs (fma_stmt, gimple_get_lhs (use_stmt)); gimple_call_set_nothrow (fma_stmt, !stmt_can_throw_internal (use_stmt)); gsi_replace (&gsi, fma_stmt, true); /* Valueize aggressively so that we generate FMS, FNMA and FNMS |