diff options
Diffstat (limited to 'gcc/expmed.c')
-rw-r--r-- | gcc/expmed.c | 257 |
1 files changed, 184 insertions, 73 deletions
diff --git a/gcc/expmed.c b/gcc/expmed.c index 034e49a293d..10084e5ae8b 100644 --- a/gcc/expmed.c +++ b/gcc/expmed.c @@ -2153,8 +2153,37 @@ expand_shift (enum tree_code code, enum machine_mode mode, rtx shifted, enum alg_code { alg_zero, alg_m, alg_shift, alg_add_t_m2, alg_sub_t_m2, alg_add_factor, alg_sub_factor, - alg_add_t2_m, alg_sub_t2_m, - alg_add, alg_subtract, alg_factor, alg_shiftop }; + alg_add_t2_m, alg_sub_t2_m }; + +/* This structure holds the "cost" of a multiply sequence. The + "cost" field holds the total rtx_cost of every operator in the + synthetic multiplication sequence, hence cost(a op b) is defined + as rtx_cost(op) + cost(a) + cost(b), where cost(leaf) is zero. + The "latency" field holds the minimum possible latency of the + synthetic multiply, on a hypothetical infinitely parallel CPU. + This is the critical path, or the maximum height, of the expression + tree which is the sum of rtx_costs on the most expensive path from + any leaf to the root. Hence latency(a op b) is defined as zero for + leaves and rtx_cost(op) + max(latency(a), latency(b)) otherwise. */ + +struct mult_cost { + short cost; /* Total rtx_cost of the multiplication sequence. */ + short latency; /* The latency of the multiplication sequence. */ +}; + +/* This macro is used to compare a pointer to a mult_cost against an + single integer "rtx_cost" value. This is equivalent to the macro + CHEAPER_MULT_COST(X,Z) where Z = {Y,Y}. */ +#define MULT_COST_LESS(X,Y) ((X)->cost < (Y) \ + || ((X)->cost == (Y) && (X)->latency < (Y))) + +/* This macro is used to compare two pointers to mult_costs against + each other. The macro returns true if X is cheaper than Y. + Currently, the cheaper of two mult_costs is the one with the + lower "cost". If "cost"s are tied, the lower latency is cheaper. */ +#define CHEAPER_MULT_COST(X,Y) ((X)->cost < (Y)->cost \ + || ((X)->cost == (Y)->cost \ + && (X)->latency < (Y)->latency)) /* This structure records a sequence of operations. `ops' is the number of operations recorded. @@ -2177,7 +2206,7 @@ enum alg_code { alg_zero, alg_m, alg_shift, struct algorithm { - short cost; + struct mult_cost cost; short ops; /* The size of the OP and LOG fields are not directly related to the word size, but the worst-case algorithms will be if we have few @@ -2195,7 +2224,7 @@ struct algorithm enum mult_variant {basic_variant, negate_variant, add_variant}; static void synth_mult (struct algorithm *, unsigned HOST_WIDE_INT, - int, enum machine_mode mode); + const struct mult_cost *, enum machine_mode mode); static bool choose_mult_variant (enum machine_mode, HOST_WIDE_INT, struct algorithm *, enum mult_variant *, int); static rtx expand_mult_const (enum machine_mode, rtx, HOST_WIDE_INT, rtx, @@ -2215,19 +2244,22 @@ static rtx expand_mult_highpart_optab (enum machine_mode, rtx, rtx, rtx, static void synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t, - int cost_limit, enum machine_mode mode) + const struct mult_cost *cost_limit, enum machine_mode mode) { int m; struct algorithm *alg_in, *best_alg; - int cost; + struct mult_cost best_cost; + struct mult_cost new_limit; + int op_cost, op_latency; unsigned HOST_WIDE_INT q; int maxm = MIN (BITS_PER_WORD, GET_MODE_BITSIZE (mode)); /* Indicate that no algorithm is yet found. If no algorithm is found, this value will be returned and indicate failure. */ - alg_out->cost = cost_limit; + alg_out->cost.cost = cost_limit->cost + 1; - if (cost_limit <= 0) + if (cost_limit->cost < 0 + || (cost_limit->cost == 0 && cost_limit->latency <= 0)) return; /* Restrict the bits of "t" to the multiplication's mode. */ @@ -2237,7 +2269,8 @@ synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t, if (t == 1) { alg_out->ops = 1; - alg_out->cost = 0; + alg_out->cost.cost = 0; + alg_out->cost.latency = 0; alg_out->op[0] = alg_m; return; } @@ -2246,12 +2279,13 @@ synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t, fail now. */ if (t == 0) { - if (zero_cost >= cost_limit) + if (MULT_COST_LESS (cost_limit, zero_cost)) return; else { alg_out->ops = 1; - alg_out->cost = zero_cost; + alg_out->cost.cost = zero_cost; + alg_out->cost.latency = zero_cost; alg_out->op[0] = alg_zero; return; } @@ -2261,6 +2295,7 @@ synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t, alg_in = alloca (sizeof (struct algorithm)); best_alg = alloca (sizeof (struct algorithm)); + best_cost = *cost_limit; /* If we have a group of zero bits at the low-order part of T, try multiplying by the remaining bits and then doing a shift. */ @@ -2274,19 +2309,22 @@ synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t, /* The function expand_shift will choose between a shift and a sequence of additions, so the observed cost is given as MIN (m * add_cost[mode], shift_cost[mode][m]). */ - cost = m * add_cost[mode]; - if (shift_cost[mode][m] < cost) - cost = shift_cost[mode][m]; - synth_mult (alg_in, q, cost_limit - cost, mode); - - cost += alg_in->cost; - if (cost < cost_limit) + op_cost = m * add_cost[mode]; + if (shift_cost[mode][m] < op_cost) + op_cost = shift_cost[mode][m]; + new_limit.cost = best_cost.cost - op_cost; + new_limit.latency = best_cost.latency - op_cost; + synth_mult (alg_in, q, &new_limit, mode); + + alg_in->cost.cost += op_cost; + alg_in->cost.latency += op_cost; + if (CHEAPER_MULT_COST (&alg_in->cost, &best_cost)) { struct algorithm *x; + best_cost = alg_in->cost; x = alg_in, alg_in = best_alg, best_alg = x; best_alg->log[best_alg->ops] = m; best_alg->op[best_alg->ops] = alg_shift; - cost_limit = cost; } } } @@ -2311,34 +2349,40 @@ synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t, { /* T ends with ...111. Multiply by (T + 1) and subtract 1. */ - cost = add_cost[mode]; - synth_mult (alg_in, t + 1, cost_limit - cost, mode); + op_cost = add_cost[mode]; + new_limit.cost = best_cost.cost - op_cost; + new_limit.latency = best_cost.latency - op_cost; + synth_mult (alg_in, t + 1, &new_limit, mode); - cost += alg_in->cost; - if (cost < cost_limit) + alg_in->cost.cost += op_cost; + alg_in->cost.latency += op_cost; + if (CHEAPER_MULT_COST (&alg_in->cost, &best_cost)) { struct algorithm *x; + best_cost = alg_in->cost; x = alg_in, alg_in = best_alg, best_alg = x; best_alg->log[best_alg->ops] = 0; best_alg->op[best_alg->ops] = alg_sub_t_m2; - cost_limit = cost; } } else { /* T ends with ...01 or ...011. Multiply by (T - 1) and add 1. */ - cost = add_cost[mode]; - synth_mult (alg_in, t - 1, cost_limit - cost, mode); + op_cost = add_cost[mode]; + new_limit.cost = best_cost.cost - op_cost; + new_limit.latency = best_cost.latency - op_cost; + synth_mult (alg_in, t - 1, &new_limit, mode); - cost += alg_in->cost; - if (cost < cost_limit) + alg_in->cost.cost += op_cost; + alg_in->cost.latency += op_cost; + if (CHEAPER_MULT_COST (&alg_in->cost, &best_cost)) { struct algorithm *x; + best_cost = alg_in->cost; x = alg_in, alg_in = best_alg, best_alg = x; best_alg->log[best_alg->ops] = 0; best_alg->op[best_alg->ops] = alg_add_t_m2; - cost_limit = cost; } } } @@ -2360,19 +2404,36 @@ synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t, d = ((unsigned HOST_WIDE_INT) 1 << m) + 1; if (t % d == 0 && t > d && m < maxm) { - cost = add_cost[mode] + shift_cost[mode][m]; - if (shiftadd_cost[mode][m] < cost) - cost = shiftadd_cost[mode][m]; - synth_mult (alg_in, t / d, cost_limit - cost, mode); + /* If the target has a cheap shift-and-add instruction use + that in preference to a shift insn followed by an add insn. + Assume that the shift-and-add is "atomic" with a latency + equal to it's cost, otherwise assume that on superscalar + hardware the shift may be executed concurrently with the + earlier steps in the algorithm. */ + op_cost = add_cost[mode] + shift_cost[mode][m]; + if (shiftadd_cost[mode][m] < op_cost) + { + op_cost = shiftadd_cost[mode][m]; + op_latency = op_cost; + } + else + op_latency = add_cost[mode]; + + new_limit.cost = best_cost.cost - op_cost; + new_limit.latency = best_cost.latency - op_latency; + synth_mult (alg_in, t / d, &new_limit, mode); - cost += alg_in->cost; - if (cost < cost_limit) + alg_in->cost.cost += op_cost; + alg_in->cost.latency += op_latency; + if (alg_in->cost.latency < op_cost) + alg_in->cost.latency = op_cost; + if (CHEAPER_MULT_COST (&alg_in->cost, &best_cost)) { struct algorithm *x; + best_cost = alg_in->cost; x = alg_in, alg_in = best_alg, best_alg = x; best_alg->log[best_alg->ops] = m; best_alg->op[best_alg->ops] = alg_add_factor; - cost_limit = cost; } /* Other factors will have been taken care of in the recursion. */ break; @@ -2381,19 +2442,36 @@ synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t, d = ((unsigned HOST_WIDE_INT) 1 << m) - 1; if (t % d == 0 && t > d && m < maxm) { - cost = add_cost[mode] + shift_cost[mode][m]; - if (shiftsub_cost[mode][m] < cost) - cost = shiftsub_cost[mode][m]; - synth_mult (alg_in, t / d, cost_limit - cost, mode); + /* If the target has a cheap shift-and-subtract insn use + that in preference to a shift insn followed by a sub insn. + Assume that the shift-and-sub is "atomic" with a latency + equal to it's cost, otherwise assume that on superscalar + hardware the shift may be executed concurrently with the + earlier steps in the algorithm. */ + op_cost = add_cost[mode] + shift_cost[mode][m]; + if (shiftsub_cost[mode][m] < op_cost) + { + op_cost = shiftsub_cost[mode][m]; + op_latency = op_cost; + } + else + op_latency = add_cost[mode]; + + new_limit.cost = best_cost.cost - op_cost; + new_limit.cost = best_cost.cost - op_latency; + synth_mult (alg_in, t / d, &new_limit, mode); - cost += alg_in->cost; - if (cost < cost_limit) + alg_in->cost.cost += op_cost; + alg_in->cost.latency += op_latency; + if (alg_in->cost.latency < op_cost) + alg_in->cost.latency = op_cost; + if (CHEAPER_MULT_COST (&alg_in->cost, &best_cost)) { struct algorithm *x; + best_cost = alg_in->cost; x = alg_in, alg_in = best_alg, best_alg = x; best_alg->log[best_alg->ops] = m; best_alg->op[best_alg->ops] = alg_sub_factor; - cost_limit = cost; } break; } @@ -2408,17 +2486,20 @@ synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t, m = exact_log2 (q); if (m >= 0 && m < maxm) { - cost = shiftadd_cost[mode][m]; - synth_mult (alg_in, (t - 1) >> m, cost_limit - cost, mode); - - cost += alg_in->cost; - if (cost < cost_limit) + op_cost = shiftadd_cost[mode][m]; + new_limit.cost = best_cost.cost - op_cost; + new_limit.latency = best_cost.latency - op_cost; + synth_mult (alg_in, (t - 1) >> m, &new_limit, mode); + + alg_in->cost.cost += op_cost; + alg_in->cost.latency += op_cost; + if (CHEAPER_MULT_COST (&alg_in->cost, &best_cost)) { struct algorithm *x; + best_cost = alg_in->cost; x = alg_in, alg_in = best_alg, best_alg = x; best_alg->log[best_alg->ops] = m; best_alg->op[best_alg->ops] = alg_add_t2_m; - cost_limit = cost; } } @@ -2427,36 +2508,38 @@ synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t, m = exact_log2 (q); if (m >= 0 && m < maxm) { - cost = shiftsub_cost[mode][m]; - synth_mult (alg_in, (t + 1) >> m, cost_limit - cost, mode); - - cost += alg_in->cost; - if (cost < cost_limit) + op_cost = shiftsub_cost[mode][m]; + new_limit.cost = best_cost.cost - op_cost; + new_limit.latency = best_cost.latency - op_cost; + synth_mult (alg_in, (t + 1) >> m, &new_limit, mode); + + alg_in->cost.cost += op_cost; + alg_in->cost.latency += op_cost; + if (CHEAPER_MULT_COST (&alg_in->cost, &best_cost)) { struct algorithm *x; + best_cost = alg_in->cost; x = alg_in, alg_in = best_alg, best_alg = x; best_alg->log[best_alg->ops] = m; best_alg->op[best_alg->ops] = alg_sub_t2_m; - cost_limit = cost; } } } - /* If cost_limit has not decreased since we stored it in alg_out->cost, - we have not found any algorithm. */ - if (cost_limit == alg_out->cost) - return; - /* If we are getting a too long sequence for `struct algorithm' to record, make this search fail. */ if (best_alg->ops == MAX_BITS_PER_WORD) return; + /* If best_cost has not decreased, we have not found any algorithm. */ + if (!CHEAPER_MULT_COST (&best_cost, cost_limit)) + return; + /* Copy the algorithm from temporary space to the space at alg_out. We avoid using structure assignment because the majority of best_alg is normally undefined, and this is a critical function. */ alg_out->ops = best_alg->ops + 1; - alg_out->cost = cost_limit; + alg_out->cost = best_cost; memcpy (alg_out->op, best_alg->op, alg_out->ops * sizeof *alg_out->op); memcpy (alg_out->log, best_alg->log, @@ -2479,29 +2562,57 @@ choose_mult_variant (enum machine_mode mode, HOST_WIDE_INT val, int mult_cost) { struct algorithm alg2; + struct mult_cost limit; + int op_cost; *variant = basic_variant; - synth_mult (alg, val, mult_cost, mode); + limit.cost = mult_cost; + limit.latency = mult_cost; + synth_mult (alg, val, &limit, mode); /* This works only if the inverted value actually fits in an `unsigned int' */ if (HOST_BITS_PER_INT >= GET_MODE_BITSIZE (mode)) { - synth_mult (&alg2, -val, MIN (alg->cost, mult_cost) - neg_cost[mode], - mode); - alg2.cost += neg_cost[mode]; - if (alg2.cost < alg->cost) + op_cost = neg_cost[mode]; + if (MULT_COST_LESS (&alg->cost, mult_cost)) + { + limit.cost = alg->cost.cost - op_cost; + limit.latency = alg->cost.latency - op_cost; + } + else + { + limit.cost = mult_cost - op_cost; + limit.latency = mult_cost - op_cost; + } + + synth_mult (&alg2, -val, &limit, mode); + alg2.cost.cost += op_cost; + alg2.cost.latency += op_cost; + if (CHEAPER_MULT_COST (&alg2.cost, &alg->cost)) *alg = alg2, *variant = negate_variant; } /* This proves very useful for division-by-constant. */ - synth_mult (&alg2, val - 1, MIN (alg->cost, mult_cost) - add_cost[mode], - mode); - alg2.cost += add_cost[mode]; - if (alg2.cost < alg->cost) + op_cost = add_cost[mode]; + if (MULT_COST_LESS (&alg->cost, mult_cost)) + { + limit.cost = alg->cost.cost - op_cost; + limit.latency = alg->cost.latency - op_cost; + } + else + { + limit.cost = mult_cost - op_cost; + limit.latency = mult_cost - op_cost; + } + + synth_mult (&alg2, val - 1, &limit, mode); + alg2.cost.cost += op_cost; + alg2.cost.latency += op_cost; + if (CHEAPER_MULT_COST (&alg2.cost, &alg->cost)) *alg = alg2, *variant = add_variant; - return alg->cost < mult_cost; + return MULT_COST_LESS (&alg->cost, mult_cost); } /* A subroutine of expand_mult, used for constant multiplications. @@ -3074,8 +3185,8 @@ expand_mult_highpart (enum machine_mode mode, rtx op0, { /* See whether the specialized multiplication optabs are cheaper than the shift/add version. */ - tem = expand_mult_highpart_optab (mode, op0, op1, target, - unsignedp, alg.cost + extra_cost); + tem = expand_mult_highpart_optab (mode, op0, op1, target, unsignedp, + alg.cost.cost + extra_cost); if (tem) return tem; |