Diffstat (limited to 'gcc')
-rw-r--r-- | gcc/ChangeLog | 37
-rw-r--r-- | gcc/testsuite/ChangeLog | 11
-rw-r--r-- | gcc/testsuite/gcc.dg/vect/pr37027.c | 37
-rw-r--r-- | gcc/testsuite/gcc.dg/vect/slp-reduc-1.c | 49
-rw-r--r-- | gcc/testsuite/gcc.dg/vect/slp-reduc-2.c | 44
-rw-r--r-- | gcc/testsuite/gcc.dg/vect/slp-reduc-3.c | 62
-rw-r--r-- | gcc/testsuite/gcc.dg/vect/slp-reduc-4.c | 60
-rw-r--r-- | gcc/testsuite/gcc.dg/vect/slp-reduc-5.c | 49
-rw-r--r-- | gcc/testsuite/gcc.dg/vect/slp-reduc-6.c | 50
-rw-r--r-- | gcc/testsuite/lib/target-supports.exp | 19
-rw-r--r-- | gcc/tree-vect-loop.c | 978
-rw-r--r-- | gcc/tree-vect-patterns.c | 10
-rw-r--r-- | gcc/tree-vect-slp.c | 399
-rw-r--r-- | gcc/tree-vect-stmts.c | 13
-rw-r--r-- | gcc/tree-vectorizer.h | 9
15 files changed, 1363 insertions, 464 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 2da24e12ccb..98c80045f99 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,40 @@ +2010-04-19 Ira Rosen <irar@il.ibm.com> + + PR tree-optimization/37027 + * tree-vectorizer.h (struct _loop_vec_info): Add new field reductions + and macro to access it. + (vectorizable_reduction): Add argument. + (vect_get_slp_defs): Likewise. + * tree-vect-loop.c (vect_analyze_scalar_cycles_1): Collect reduction + statements for possible use in SLP. + (new_loop_vec_info): Initialize LOOP_VINFO_REDUCTIONS. + (destroy_loop_vec_info): Free LOOP_VINFO_REDUCTIONS. + (vect_create_epilog_for_reduction): Handle SLP. Modify documentation, + add new argument. + (vectorizable_reduction): Likewise. + * tree-vect-stmts.c (vect_get_vec_defs): Update call to + vect_get_slp_defs. + (vectorizable_type_demotion, vectorizable_type_promotion, + vectorizable_store): Likewise. + (vect_analyze_stmt): Update call to vectorizable_reduction. + (vect_transform_stmt): Likewise. + * tree-vect-slp.c (vect_get_and_check_slp_defs): Handle reduction. + (vect_build_slp_tree): Fix indentation. Check that there are no loads + from different interleaving chains in same node. + (vect_slp_rearrange_stmts): New function. + (vect_supported_load_permutation_p): Allow load permutations for + reductions. Call vect_slp_rearrange_stmts() to rearrange statements + inside SLP nodes if necessary. + (vect_analyze_slp_instance): Handle reductions. + (vect_analyze_slp): Try to build SLP instances originating from groups + of reductions. + (vect_detect_hybrid_slp_stmts): Skip reduction statements. + (vect_get_constant_vectors): Create initial vectors for reductions + according to reduction code. Add new argument. + (vect_get_slp_defs): Add new argument, pass it to + vect_get_constant_vectors. + (vect_schedule_slp_instance): Remove SLP tree root statements. + 2010-04-19 Jakub Jelinek <jakub@redhat.com> * tree.h (ENUM_IS_SCOPED): Define. diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 8ffc442caeb..868ce20d31e 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,14 @@ +2010-04-19 Ira Rosen <irar@il.ibm.com> + + PR tree-optimization/37027 + * lib/target-supports.exp + (check_effective_target_vect_widen_sum_hi_to_si_pattern): New. + * gcc.dg/vect/pr37027.c: New test. + * gcc.dg/vect/slp-reduc-1.c, gcc.dg/vect/slp-reduc-2.c, + gcc.dg/vect/slp-reduc-3.c, gcc.dg/vect/slp-reduc-4.c, + gcc.dg/vect/slp-reduc-5.c, gcc.dg/vect/slp-reduc-6.c, + gcc.dg/vect/vect-complex-6.c: Likewise. + 2010-04-19 Jakub Jelinek <jakub@redhat.com> * g++.dg/debug/dwarf2/enum1.C: New test. 
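To picture the transformation the ChangeLog above describes: loop-aware SLP takes a group of independent scalar reduction chains in one loop body and carries the whole group in a single vector accumulator, then combines the accumulator's lanes modulo the group size in the loop epilog. The sketch below is a hand-written illustration in GNU C vector extensions, not code from this patch; it assumes a 4-lane V4SI vector type and the vector-subscript syntax of later GCC releases.

typedef unsigned int v4si __attribute__ ((vector_size (16)));

unsigned int ub[16] __attribute__ ((aligned (16)));
unsigned int uc[16] __attribute__ ((aligned (16)));

/* Scalar form (cf. slp-reduc-2.c below): a group of two reduction
   chains in one loop body.  */
void
scalar_form (unsigned int *res0, unsigned int *res1)
{
  unsigned int udiff0 = 0, udiff1 = 0;
  int i;

  for (i = 0; i < 8; i++)
    {
      udiff0 += ub[2 * i] - uc[2 * i];
      udiff1 += ub[2 * i + 1] - uc[2 * i + 1];
    }

  *res0 = udiff0;
  *res1 = udiff1;
}

/* SLP-vectorized form: one vector accumulator carries both chains.
   Lanes 0 and 2 accumulate the even-indexed chain, lanes 1 and 3 the
   odd-indexed one, so the epilog combines the lanes modulo the group
   size (2) instead of reducing the whole vector to one scalar.  */
void
slp_form (unsigned int *res0, unsigned int *res1)
{
  v4si vdiff = { 0, 0, 0, 0 };
  int i;

  for (i = 0; i < 16; i += 4)
    vdiff += *(v4si *) &ub[i] - *(v4si *) &uc[i];

  *res0 = vdiff[0] + vdiff[2];
  *res1 = vdiff[1] + vdiff[3];
}

The new tests that follow exercise these shapes: pr37027.c, slp-reduc-1.c and slp-reduc-2.c use groups of addition/subtraction chains, slp-reduc-4.c and slp-reduc-5.c use maximum, and slp-reduc-6.c checks the rejection path for loads from different interleaving chains.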
diff --git a/gcc/testsuite/gcc.dg/vect/pr37027.c b/gcc/testsuite/gcc.dg/vect/pr37027.c new file mode 100644 index 00000000000..dcfed348d11 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/pr37027.c @@ -0,0 +1,37 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> + +struct mystr +{ + int f1; + int f2; +}; + +struct mystr a[16]; +struct mystr b[16]; +int res1, res2; + + +void +foo (void) +{ + int i; + int sum1; + int sum2; + + for (i = 0; i < 16; i++) + { + sum1 += a[i].f1 + b[i].f1; + sum2 += a[i].f2 + b[i].f2; + } + + res1 = sum1; + res2 = sum2; +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail vect_no_int_add } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail vect_no_int_add } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + diff --git a/gcc/testsuite/gcc.dg/vect/slp-reduc-1.c b/gcc/testsuite/gcc.dg/vect/slp-reduc-1.c new file mode 100644 index 00000000000..95faba8e9d4 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-reduc-1.c @@ -0,0 +1,49 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include <stdio.h> +#include "tree-vect.h" + +#define N 16 + +unsigned int ub[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45}; +unsigned int uc[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; + +/* Vectorization of reduction using loop-aware SLP. */ + +__attribute__ ((noinline)) +int main1 (int n, int res0, int res1, int res2, int res3) +{ + int i; + unsigned int udiff0 = 5, udiff1 = 10, udiff2 = 20, udiff3 = 30; + + for (i = 0; i < n; i++) { + udiff3 += (ub[4*i + 3] - uc[4*i + 3]); + udiff2 += (ub[4*i + 2] - uc[4*i + 2]); + udiff1 += (ub[4*i + 1] - uc[4*i + 1]); + udiff0 += (ub[4*i] - uc[4*i]); + } + + /* Check results: */ + if (udiff0 != res0 + || udiff1 != res1 + || udiff2 != res2 + || udiff3 != res3) + abort (); + + return 0; +} + +int main (void) +{ + check_vect (); + + main1 (N/4, 53, 66, 84, 102); + main1 (N/4 - 1, 29, 40, 56, 72); + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail vect_no_int_add } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail vect_no_int_add } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + diff --git a/gcc/testsuite/gcc.dg/vect/slp-reduc-2.c b/gcc/testsuite/gcc.dg/vect/slp-reduc-2.c new file mode 100644 index 00000000000..cb59c8c07ea --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-reduc-2.c @@ -0,0 +1,44 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include <stdio.h> +#include "tree-vect.h" + +#define N 16 + +unsigned int ub[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45}; +unsigned int uc[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; + +/* Vectorization of reduction using loop-aware SLP (with unrolling). 
*/ + +__attribute__ ((noinline)) +int main1 (int n, int res0, int res1, int res2, int res3) +{ + int i; + unsigned int udiff0 = 5, udiff1 = 10; + + for (i = 0; i < n; i++) { + udiff1 += (ub[2*i + 1] - uc[2*i + 1]); + udiff0 += (ub[2*i] - uc[2*i]); + } + + /* Check results: */ + if (udiff0 != res0 + || udiff1 != res1) + abort (); + + return 0; +} + +int main (void) +{ + check_vect (); + + main1 (N/2, 117, 138, 84, 102); + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail vect_no_int_add } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail vect_no_int_add } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + diff --git a/gcc/testsuite/gcc.dg/vect/slp-reduc-3.c b/gcc/testsuite/gcc.dg/vect/slp-reduc-3.c new file mode 100644 index 00000000000..3220d3912ba --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-reduc-3.c @@ -0,0 +1,62 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include "tree-vect.h" + +#define N 64 + +#define DOT1 21834 +#define DOT2 21876 + +unsigned short X[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__))); +unsigned short Y[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__))); + +/* short->short->int dot product. + Not detected as a dot-product pattern. + Requires support for non-widening multiplication and widening-summation. + Vectorized with loop-aware SLP. */ +__attribute__ ((noinline)) unsigned int +foo1(int len, int *result1, int *result2) +{ + int i; + unsigned int res1 = 10, res2 = 20; + unsigned short prod; + + for (i=0; i<len; i++) { + prod = X[2*i] * Y[2*i]; + res1 += prod; + prod = X[2*i+1] * Y[2*i+1]; + res2 += prod; + } + + *result1 = res1; + *result2 = res2; + + return 0; +} + +int main (void) +{ + unsigned int dot1, dot2; + unsigned short i; + + check_vect (); + + for (i=0; i<N; i++) { + X[i] = i; + Y[i] = 64-i; + } + + foo1 (N/2, &dot1, &dot2); + + if (dot1 != DOT1 || dot2 != DOT2) + abort (); + + return 0; +} + +/* The initialization loop in main also gets vectorized. */ +/* { dg-final { scan-tree-dump-times "vect_recog_dot_prod_pattern: detected" 1 "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { target { vect_short_mult && vect_widen_sum_hi_to_si } } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail { vect_widen_sum_hi_to_si_pattern } } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/slp-reduc-4.c b/gcc/testsuite/gcc.dg/vect/slp-reduc-4.c new file mode 100644 index 00000000000..ad5b3ce0700 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-reduc-4.c @@ -0,0 +1,60 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include <stdio.h> +#include "tree-vect.h" + +#define N 128 + +unsigned int uc[N]; + +/* Vectorization of reduction using loop-aware SLP. */ + +__attribute__ ((noinline)) +int main1 (int n, int res0, int res1, int res2, int res3, int res4, int res5, int res6, int res7) +{ + int i; + unsigned int max0 = 5, max1 = 10, max2 = 20, max3 = 30, max4 = 2, max5 = 13, max6 = 7, max7 = 313; + + for (i = 0; i < n; i++) { + max2 = max2 < uc[8*i+2] ? uc[8*i+2] : max2; + max3 = max3 < uc[8*i+3] ? uc[8*i+3] : max3; + max1 = max1 < uc[8*i+1] ? uc[8*i+1] : max1; + max7 = max7 < uc[8*i+7] ? uc[8*i+7] : max7; + max6 = max6 < uc[8*i+6] ? uc[8*i+6] : max6; + max0 = max0 < uc[8*i] ? uc[8*i] : max0; + max4 = max4 < uc[8*i+4] ? uc[8*i+4] : max4; + max5 = max5 < uc[8*i+5] ? 
uc[8*i+5] : max5; + } + + /* Check results: */ + if (max0 != res0 + || max1 != res1 + || max2 != res2 + || max3 != res3 + || max4 != res4 + || max5 != res5 + || max6 != res6 + || max7 != res7) + abort (); + + return 0; +} + +int main (void) +{ + int i; + + check_vect (); + + for (i = 0; i < N; i++) + uc[i] = i+3; + + main1 (N/8, 123, 124, 125, 126, 127, 128, 129, 313); + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail vect_no_int_max } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail vect_no_int_max } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + diff --git a/gcc/testsuite/gcc.dg/vect/slp-reduc-5.c b/gcc/testsuite/gcc.dg/vect/slp-reduc-5.c new file mode 100644 index 00000000000..0974b6642d8 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-reduc-5.c @@ -0,0 +1,49 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include <stdio.h> +#include "tree-vect.h" + +#define N 128 + +int c[N]; + +/* Vectorization of reduction using loop-aware SLP. */ + +__attribute__ ((noinline)) +int main1 (int n, int res0, int res1) +{ + int i; + int max0 = -100, max1 = -313; + + for (i = 0; i < n; i++) { + max1 = max1 < c[2*i+1] ? c[2*i+1] : max1; + max0 = max0 < c[2*i] ? c[2*i] : max0; + } + + /* Check results: */ + if (max0 != res0 + || max1 != res1) + abort (); + + return 0; +} + +int main (void) +{ + int i; + + check_vect (); + + for (i = 0; i < N; i++) + c[i] = (i+3) * -1; + + c[0] = c[1] = -100; + main1 (N/2, -5, -6); + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { xfail vect_no_int_max } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail vect_no_int_max } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + diff --git a/gcc/testsuite/gcc.dg/vect/slp-reduc-6.c b/gcc/testsuite/gcc.dg/vect/slp-reduc-6.c new file mode 100644 index 00000000000..c69251a76e2 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-reduc-6.c @@ -0,0 +1,50 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include <stdio.h> +#include "tree-vect.h" + +#define N 128 + +int a[N], b[N]; + +/* Vectorization of reduction. Loop-aware SLP is not possible, because of + different arrays. */ + +__attribute__ ((noinline)) +int main1 (int n, int res0, int res1) +{ + int i; + int sum0 = 0, sum1 = 0; + + for (i = 0; i < n; i++) { + sum1 += a[2*i]; + sum0 += b[2*i]; + } + + /* Check results: */ + if (sum0 != res0 + || sum1 != res1) + abort (); + + return 0; +} + +int main (void) +{ + int i; + + check_vect (); + + for (i = 0; i < N; i++) + a[i] = b[i] = i; + + main1 (N/2, 4032, 4032); + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { xfail vect_no_int_add } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" } } */ +/* { dg-final { scan-tree-dump-times "different interleaving chains in one node" 1 "vect" { target { ! vect_no_int_add } } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp index 126ae380fe7..e91c0331516 100644 --- a/gcc/testsuite/lib/target-supports.exp +++ b/gcc/testsuite/lib/target-supports.exp @@ -2105,6 +2105,25 @@ proc check_effective_target_vect_perm { } { return $et_vect_perm_saved } +# Return 1 if the target plus current options supports a vector +# widening summation of *short* args into *int* result, 0 otherwise. 
+# +# This won't change for different subtargets so cache the result. + +proc check_effective_target_vect_widen_sum_hi_to_si_pattern { } { + global et_vect_widen_sum_hi_to_si_pattern + + if [info exists et_vect_widen_sum_hi_to_si_pattern_saved] { + verbose "check_effective_target_vect_widen_sum_hi_to_si_pattern: using cached result" 2 + } else { + set et_vect_widen_sum_hi_to_si_pattern_saved 0 + if { [istarget powerpc*-*-*] } { + set et_vect_widen_sum_hi_to_si_pattern_saved 1 + } + } + verbose "check_effective_target_vect_widen_sum_hi_to_si_pattern: returning $et_vect_widen_sum_hi_to_si_pattern_saved" 2 + return $et_vect_widen_sum_hi_to_si_pattern_saved +} # Return 1 if the target plus current options supports a vector # widening summation of *short* args into *int* result, 0 otherwise. diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index 809f3e15a02..e6e9008ea37 100644 --- a/gcc/tree-vect-loop.c +++ b/gcc/tree-vect-loop.c @@ -545,6 +545,11 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop) STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def; STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) = vect_reduction_def; + /* Store the reduction cycles for possible vectorization in + loop-aware SLP. */ + VEC_safe_push (gimple, heap, + LOOP_VINFO_REDUCTIONS (loop_vinfo), + reduc_stmt); } } } @@ -745,6 +750,7 @@ new_loop_vec_info (struct loop *loop) VEC_alloc (ddr_p, heap, PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS)); LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10); + LOOP_VINFO_REDUCTIONS (res) = VEC_alloc (gimple, heap, 10); LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10); LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1; @@ -835,6 +841,7 @@ destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts) VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo)); VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo)); + VEC_free (gimple, heap, LOOP_VINFO_REDUCTIONS (loop_vinfo)); free (loop_vinfo); loop->aux = NULL; @@ -1223,7 +1230,6 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo) if ((STMT_VINFO_RELEVANT_P (stmt_info) || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))) && !PURE_SLP_STMT (stmt_info)) - /* STMT needs both SLP and loop-based vectorization. */ only_slp_in_loop = false; } @@ -2860,28 +2866,33 @@ get_initial_def_for_reduction (gimple stmt, tree init_val, /* Function vect_create_epilog_for_reduction Create code at the loop-epilog to finalize the result of a reduction - computation. - - VECT_DEF is a vector of partial results. - REDUC_CODE is the tree-code for the epilog reduction. + computation. + + VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector + reduction statements. + STMT is the scalar reduction stmt that is being vectorized. NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the number of elements that we can fit in a vectype (nunits). In this case we have to generate more than one vector stmt - i.e - we need to "unroll" the vector stmt by a factor VF/nunits. For more details see documentation in vectorizable_operation. - STMT is the scalar reduction stmt that is being vectorized. - REDUCTION_PHI is the phi-node that carries the reduction computation. - REDUC_INDEX is the index of the operand in the right hand side of the + REDUC_CODE is the tree-code for the epilog reduction. + REDUCTION_PHIS is a list of the phi-nodes that carry the reduction + computation. 
+ REDUC_INDEX is the index of the operand in the right hand side of the statement that is defined by REDUCTION_PHI. DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled. + SLP_NODE is an SLP node containing a group of reduction statements. The + first one in this group is STMT. This function: - 1. Creates the reduction def-use cycle: sets the arguments for - REDUCTION_PHI: + 1. Creates the reduction def-use cycles: sets the arguments for + REDUCTION_PHIS: The loop-entry argument is the vectorized initial-value of the reduction. - The loop-latch argument is VECT_DEF - the vector of partial sums. - 2. "Reduces" the vector of partial results VECT_DEF into a single result, - by applying the operation specified by REDUC_CODE if available, or by + The loop-latch argument is taken from VECT_DEFS - the vector of partial + sums. + 2. "Reduces" each vector of partial results VECT_DEFS into a single result, + by applying the operation specified by REDUC_CODE if available, or by other means (whole-vector shifts or a scalar loop). The function also creates a new phi node at the loop exit to preserve loop-closed form, as illustrated below. @@ -2914,12 +2925,11 @@ get_initial_def_for_reduction (gimple stmt, tree init_val, */ static void -vect_create_epilog_for_reduction (tree vect_def, gimple stmt, - int ncopies, - enum tree_code reduc_code, - gimple reduction_phi, - int reduc_index, - bool double_reduc) +vect_create_epilog_for_reduction (VEC (tree, heap) *vect_defs, gimple stmt, + int ncopies, enum tree_code reduc_code, + VEC (gimple, heap) *reduction_phis, + int reduc_index, bool double_reduc, + slp_tree slp_node) { stmt_vec_info stmt_info = vinfo_for_stmt (stmt); stmt_vec_info prev_phi_info; @@ -2933,32 +2943,37 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt, gimple new_phi = NULL, phi; gimple_stmt_iterator exit_gsi; tree vec_dest; - tree new_temp = NULL_TREE; - tree new_name; + tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest; gimple epilog_stmt = NULL; - tree new_scalar_dest, new_dest; + enum tree_code code = gimple_assign_rhs_code (stmt); gimple exit_phi; tree bitsize, bitpos; - enum tree_code code = gimple_assign_rhs_code (stmt); - tree adjustment_def; - tree vec_initial_def, def; - tree orig_name; + tree adjustment_def = NULL; + tree vec_initial_def = NULL; + tree reduction_op, expr, def; + tree orig_name, scalar_result; imm_use_iterator imm_iter; use_operand_p use_p; bool extract_scalar_result = false; - tree reduction_op, expr; - gimple orig_stmt; - gimple use_stmt; + gimple use_stmt, orig_stmt, reduction_phi = NULL; bool nested_in_vect_loop = false; - VEC(gimple,heap) *phis = NULL; + VEC (gimple, heap) *new_phis = NULL; enum vect_def_type dt = vect_unknown_def_type; int j, i; + VEC (tree, heap) *scalar_results = NULL; + int group_size = 1, k, ratio; + VEC (tree, heap) *vec_initial_defs = NULL; + VEC (gimple, heap) *phis; + + if (slp_node) + group_size = VEC_length (gimple, SLP_TREE_SCALAR_STMTS (slp_node)); if (nested_in_vect_loop_p (loop, stmt)) { outer_loop = loop; loop = loop->inner; nested_in_vect_loop = true; + gcc_assert (!slp_node); } switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt))) @@ -2983,47 +2998,80 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt, gcc_assert (vectype); mode = TYPE_MODE (vectype); - /*** 1. Create the reduction def-use cycle ***/ + /* 1. 
Create the reduction def-use cycle: + Set the arguments of REDUCTION_PHIS, i.e., transform + + loop: + vec_def = phi <null, null> # REDUCTION_PHI + VECT_DEF = vector_stmt # vectorized form of STMT + ... - /* For the case of reduction, vect_get_vec_def_for_operand returns - the scalar def before the loop, that defines the initial value - of the reduction variable. */ - vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt, - &adjustment_def); + into: - phi = reduction_phi; - def = vect_def; - for (j = 0; j < ncopies; j++) + loop: + vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI + VECT_DEF = vector_stmt # vectorized form of STMT + ... + + (in case of SLP, do it for all the phis). */ + + /* Get the loop-entry arguments. */ + if (slp_node) + vect_get_slp_defs (slp_node, &vec_initial_defs, NULL, reduc_index); + else { - /* 1.1 set the loop-entry arg of the reduction-phi: */ - add_phi_arg (phi, vec_initial_def, loop_preheader_edge (loop), - UNKNOWN_LOCATION); + vec_initial_defs = VEC_alloc (tree, heap, 1); + /* For the case of reduction, vect_get_vec_def_for_operand returns + the scalar def before the loop, that defines the initial value + of the reduction variable. */ + vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt, + &adjustment_def); + VEC_quick_push (tree, vec_initial_defs, vec_initial_def); + } - /* 1.2 set the loop-latch arg for the reduction-phi: */ - if (j > 0) - def = vect_get_vec_def_for_stmt_copy (dt, def); - add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION); + /* Set phi nodes arguments. */ + for (i = 0; VEC_iterate (gimple, reduction_phis, i, phi); i++) + { + tree vec_init_def = VEC_index (tree, vec_initial_defs, i); + tree def = VEC_index (tree, vect_defs, i); + for (j = 0; j < ncopies; j++) + { + /* Set the loop-entry arg of the reduction-phi. */ + add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop), + UNKNOWN_LOCATION); - if (vect_print_dump_info (REPORT_DETAILS)) - { - fprintf (vect_dump, "transform reduction: created def-use cycle: "); - print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM); - fprintf (vect_dump, "\n"); - print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0, TDF_SLIM); - } + /* Set the loop-latch arg for the reduction-phi. */ + if (j > 0) + def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def); - phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)); + add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION); + + if (vect_print_dump_info (REPORT_DETAILS)) + { + fprintf (vect_dump, "transform reduction: created def-use" + " cycle: "); + print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM); + fprintf (vect_dump, "\n"); + print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0, + TDF_SLIM); + } + + phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)); + } } - /*** 2. Create epilog code - The reduction epilog code operates across the elements of the vector - of partial results computed by the vectorized loop. - The reduction epilog code consists of: - step 1: compute the scalar result in a vector (v_out2) - step 2: extract the scalar result (s_out3) from the vector (v_out2) - step 3: adjust the scalar result (s_out3) if needed. + VEC_free (tree, heap, vec_initial_defs); + + /* 2. Create epilog code. + The reduction epilog code operates across the elements of the vector + of partial results computed by the vectorized loop. 
+ The reduction epilog code consists of: + + step 1: compute the scalar result in a vector (v_out2) + step 2: extract the scalar result (s_out3) from the vector (v_out2) + step 3: adjust the scalar result (s_out3) if needed. - Step 1 can be accomplished using one the following three schemes: + Step 1 can be accomplished using one the following three schemes: (scheme 1) using reduc_code, if available. (scheme 2) using whole-vector shifts, if available. (scheme 3) using a scalar loop. In this case steps 1+2 above are @@ -3038,29 +3086,33 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt, s_out4 = adjust_result <s_out3> # step 3 (step 3 is optional, and steps 1 and 2 may be combined). - Lastly, the uses of s_out0 are replaced by s_out4. + Lastly, the uses of s_out0 are replaced by s_out4. */ - ***/ - /* 2.1 Create new loop-exit-phi to preserve loop-closed form: - v_out1 = phi <v_loop> */ + /* 2.1 Create new loop-exit-phis to preserve loop-closed form: + v_out1 = phi <VECT_DEF> + Store them in NEW_PHIS. */ exit_bb = single_exit (loop)->dest; - def = vect_def; prev_phi_info = NULL; - for (j = 0; j < ncopies; j++) + new_phis = VEC_alloc (gimple, heap, VEC_length (tree, vect_defs)); + for (i = 0; VEC_iterate (tree, vect_defs, i, def); i++) { - phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb); - set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL)); - if (j == 0) - new_phi = phi; - else - { - def = vect_get_vec_def_for_stmt_copy (dt, def); - STMT_VINFO_RELATED_STMT (prev_phi_info) = phi; - } - SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def); - prev_phi_info = vinfo_for_stmt (phi); + for (j = 0; j < ncopies; j++) + { + phi = create_phi_node (SSA_NAME_VAR (def), exit_bb); + set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL)); + if (j == 0) + VEC_quick_push (gimple, new_phis, phi); + else + { + def = vect_get_vec_def_for_stmt_copy (dt, def); + STMT_VINFO_RELATED_STMT (prev_phi_info) = phi; + } + + SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def); + prev_phi_info = vinfo_for_stmt (phi); + } } exit_gsi = gsi_after_labels (exit_bb); @@ -3089,16 +3141,17 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt, } code = gimple_assign_rhs_code (orig_stmt); + /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore, + partial results are added and not subtracted. */ + if (code == MINUS_EXPR) + code = PLUS_EXPR; + scalar_dest = gimple_assign_lhs (orig_stmt); scalar_type = TREE_TYPE (scalar_dest); + scalar_results = VEC_alloc (tree, heap, group_size); new_scalar_dest = vect_create_destination_var (scalar_dest, NULL); bitsize = TYPE_SIZE (scalar_type); - /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore, - partial results are added and not subtracted. */ - if (code == MINUS_EXPR) - code = PLUS_EXPR; - /* In case this is a reduction in an inner-loop while vectorizing an outer loop - we don't need to extract a single scalar result at the end of the inner-loop (unless it is double reduction, i.e., the use of reduction is @@ -3108,28 +3161,21 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt, if (nested_in_vect_loop && !double_reduc) goto vect_finalize_reduction; - /* The epilogue is created for the outer-loop, i.e., for the loop being - vectorized. */ - if (double_reduc) - loop = outer_loop; - - /* FORNOW */ - gcc_assert (ncopies == 1); - /* 2.3 Create the reduction code, using one of the three schemes described - above. */ - - if (reduc_code != ERROR_MARK) + above. 
In SLP we simply need to extract all the elements from the + vector (without reducing them), so we use scalar shifts. */ + if (reduc_code != ERROR_MARK && !slp_node) { tree tmp; /*** Case 1: Create: - v_out2 = reduc_expr <v_out1> */ + v_out2 = reduc_expr <v_out1> */ if (vect_print_dump_info (REPORT_DETAILS)) - fprintf (vect_dump, "Reduce using direct vector reduction."); + fprintf (vect_dump, "Reduce using direct vector reduction."); vec_dest = vect_create_destination_var (scalar_dest, vectype); + new_phi = VEC_index (gimple, new_phis, 0); tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi)); epilog_stmt = gimple_build_assign (vec_dest, tmp); new_temp = make_ssa_name (vec_dest, epilog_stmt); @@ -3148,142 +3194,182 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt, tree vec_temp; if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing) - shift_code = VEC_RSHIFT_EXPR; + shift_code = VEC_RSHIFT_EXPR; else - have_whole_vector_shift = false; + have_whole_vector_shift = false; /* Regardless of whether we have a whole vector shift, if we're - emulating the operation via tree-vect-generic, we don't want - to use it. Only the first round of the reduction is likely - to still be profitable via emulation. */ + emulating the operation via tree-vect-generic, we don't want + to use it. Only the first round of the reduction is likely + to still be profitable via emulation. */ /* ??? It might be better to emit a reduction tree code here, so that - tree-vect-generic can expand the first round via bit tricks. */ + tree-vect-generic can expand the first round via bit tricks. */ if (!VECTOR_MODE_P (mode)) - have_whole_vector_shift = false; + have_whole_vector_shift = false; else - { - optab optab = optab_for_tree_code (code, vectype, optab_default); - if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing) - have_whole_vector_shift = false; - } - - if (have_whole_vector_shift) { - /*** Case 2: Create: - for (offset = VS/2; offset >= element_size; offset/=2) - { - Create: va' = vec_shift <va, offset> - Create: va = vop <va, va'> - } */ - - if (vect_print_dump_info (REPORT_DETAILS)) - fprintf (vect_dump, "Reduce using vector shifts"); + optab optab = optab_for_tree_code (code, vectype, optab_default); + if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing) + have_whole_vector_shift = false; + } - vec_dest = vect_create_destination_var (scalar_dest, vectype); - new_temp = PHI_RESULT (new_phi); + if (have_whole_vector_shift && !slp_node) + { + /*** Case 2: Create: + for (offset = VS/2; offset >= element_size; offset/=2) + { + Create: va' = vec_shift <va, offset> + Create: va = vop <va, va'> + } */ - for (bit_offset = vec_size_in_bits/2; - bit_offset >= element_bitsize; - bit_offset /= 2) - { - tree bitpos = size_int (bit_offset); - - epilog_stmt = gimple_build_assign_with_ops (shift_code, vec_dest, - new_temp, bitpos); - new_name = make_ssa_name (vec_dest, epilog_stmt); - gimple_assign_set_lhs (epilog_stmt, new_name); - gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); - - epilog_stmt = gimple_build_assign_with_ops (code, vec_dest, - new_name, new_temp); - new_temp = make_ssa_name (vec_dest, epilog_stmt); - gimple_assign_set_lhs (epilog_stmt, new_temp); - gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); - } + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "Reduce using vector shifts"); + + vec_dest = vect_create_destination_var (scalar_dest, vectype); + new_phi = VEC_index (gimple, new_phis, 0); + new_temp = PHI_RESULT 
(new_phi); + for (bit_offset = vec_size_in_bits/2; + bit_offset >= element_bitsize; + bit_offset /= 2) + { + tree bitpos = size_int (bit_offset); + + epilog_stmt = gimple_build_assign_with_ops (shift_code, + vec_dest, new_temp, bitpos); + new_name = make_ssa_name (vec_dest, epilog_stmt); + gimple_assign_set_lhs (epilog_stmt, new_name); + gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); + + epilog_stmt = gimple_build_assign_with_ops (code, vec_dest, + new_name, new_temp); + new_temp = make_ssa_name (vec_dest, epilog_stmt); + gimple_assign_set_lhs (epilog_stmt, new_temp); + gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); + } - extract_scalar_result = true; - } + extract_scalar_result = true; + } else { - tree rhs; - - /*** Case 3: Create: - s = extract_field <v_out2, 0> - for (offset = element_size; - offset < vector_size; - offset += element_size;) - { - Create: s' = extract_field <v_out2, offset> - Create: s = op <s, s'> - } */ + tree rhs; + + /*** Case 3: Create: + s = extract_field <v_out2, 0> + for (offset = element_size; + offset < vector_size; + offset += element_size;) + { + Create: s' = extract_field <v_out2, offset> + Create: s = op <s, s'> // For non SLP cases + } */ - if (vect_print_dump_info (REPORT_DETAILS)) - fprintf (vect_dump, "Reduce using scalar code. "); - - vec_temp = PHI_RESULT (new_phi); - vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1); - rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize, - bitsize_zero_node); - epilog_stmt = gimple_build_assign (new_scalar_dest, rhs); - new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); - gimple_assign_set_lhs (epilog_stmt, new_temp); - gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); - - for (bit_offset = element_bitsize; - bit_offset < vec_size_in_bits; - bit_offset += element_bitsize) - { - tree bitpos = bitsize_int (bit_offset); - tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize, - bitpos); - - epilog_stmt = gimple_build_assign (new_scalar_dest, rhs); - new_name = make_ssa_name (new_scalar_dest, epilog_stmt); - gimple_assign_set_lhs (epilog_stmt, new_name); - gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); - - epilog_stmt = gimple_build_assign_with_ops (code, - new_scalar_dest, - new_name, new_temp); - new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); - gimple_assign_set_lhs (epilog_stmt, new_temp); - gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); - } + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "Reduce using scalar code. "); - extract_scalar_result = false; - } + vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1); + for (i = 0; VEC_iterate (gimple, new_phis, i, new_phi); i++) + { + vec_temp = PHI_RESULT (new_phi); + rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize, + bitsize_zero_node); + epilog_stmt = gimple_build_assign (new_scalar_dest, rhs); + new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); + gimple_assign_set_lhs (epilog_stmt, new_temp); + gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); + + /* In SLP we don't need to apply reduction operation, so we just + collect s' values in SCALAR_RESULTS. 
*/ + if (slp_node) + VEC_safe_push (tree, heap, scalar_results, new_temp); + + for (bit_offset = element_bitsize; + bit_offset < vec_size_in_bits; + bit_offset += element_bitsize) + { + tree bitpos = bitsize_int (bit_offset); + tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, + bitsize, bitpos); + + epilog_stmt = gimple_build_assign (new_scalar_dest, rhs); + new_name = make_ssa_name (new_scalar_dest, epilog_stmt); + gimple_assign_set_lhs (epilog_stmt, new_name); + gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); + + if (slp_node) + { + /* In SLP we don't need to apply reduction operation, so + we just collect s' values in SCALAR_RESULTS. */ + new_temp = new_name; + VEC_safe_push (tree, heap, scalar_results, new_name); + } + else + { + epilog_stmt = gimple_build_assign_with_ops (code, + new_scalar_dest, new_name, new_temp); + new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); + gimple_assign_set_lhs (epilog_stmt, new_temp); + gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); + } + } + } + + /* The only case where we need to reduce scalar results in SLP, is + unrolling. If the size of SCALAR_RESULTS is greater than + GROUP_SIZE, we reduce them combining elements modulo + GROUP_SIZE. */ + if (slp_node) + { + tree res, first_res, new_res; + gimple new_stmt; + + /* Reduce multiple scalar results in case of SLP unrolling. */ + for (j = group_size; VEC_iterate (tree, scalar_results, j, res); + j++) + { + first_res = VEC_index (tree, scalar_results, j % group_size); + new_stmt = gimple_build_assign_with_ops (code, + new_scalar_dest, first_res, res); + new_res = make_ssa_name (new_scalar_dest, new_stmt); + gimple_assign_set_lhs (new_stmt, new_res); + gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT); + VEC_replace (tree, scalar_results, j % group_size, new_res); + } + } + else + /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */ + VEC_safe_push (tree, heap, scalar_results, new_temp); + + extract_scalar_result = false; + } } /* 2.4 Extract the final scalar result. Create: - s_out3 = extract_field <v_out2, bitpos> */ + s_out3 = extract_field <v_out2, bitpos> */ if (extract_scalar_result) { tree rhs; - gcc_assert (!nested_in_vect_loop || double_reduc); if (vect_print_dump_info (REPORT_DETAILS)) - fprintf (vect_dump, "extract scalar result"); + fprintf (vect_dump, "extract scalar result"); if (BYTES_BIG_ENDIAN) - bitpos = size_binop (MULT_EXPR, - bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1), - TYPE_SIZE (scalar_type)); + bitpos = size_binop (MULT_EXPR, + bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1), + TYPE_SIZE (scalar_type)); else - bitpos = bitsize_zero_node; + bitpos = bitsize_zero_node; rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos); epilog_stmt = gimple_build_assign (new_scalar_dest, rhs); new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); gimple_assign_set_lhs (epilog_stmt, new_temp); gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); + VEC_safe_push (tree, heap, scalar_results, new_temp); } - + vect_finalize_reduction: - if (double_reduc) - loop = loop->inner; - /* 2.5 Adjust the final result by the initial value of the reduction variable. (When such adjustment is not needed, then 'adjustment_def' is zero). 
For example, if code is PLUS we create: @@ -3291,14 +3377,17 @@ vect_finalize_reduction: if (adjustment_def) { + gcc_assert (!slp_node); if (nested_in_vect_loop) { + new_phi = VEC_index (gimple, new_phis, 0); gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE); expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def); new_dest = vect_create_destination_var (scalar_dest, vectype); } else { + new_temp = VEC_index (tree, scalar_results, 0); gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE); expr = build2 (code, scalar_type, new_temp, adjustment_def); new_dest = vect_create_destination_var (scalar_dest, scalar_type); @@ -3309,142 +3398,206 @@ vect_finalize_reduction: gimple_assign_set_lhs (epilog_stmt, new_temp); SSA_NAME_DEF_STMT (new_temp) = epilog_stmt; gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); + if (nested_in_vect_loop) + { + set_vinfo_for_stmt (epilog_stmt, + new_stmt_vec_info (epilog_stmt, loop_vinfo, + NULL)); + STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) = + STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi)); + + if (!double_reduc) + VEC_quick_push (tree, scalar_results, new_temp); + else + VEC_replace (tree, scalar_results, 0, new_temp); + } + else + VEC_replace (tree, scalar_results, 0, new_temp); + + VEC_replace (gimple, new_phis, 0, epilog_stmt); } + /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit + phis with new adjusted scalar results, i.e., replace use <s_out0> + with use <s_out4>. - /* 2.6 Handle the loop-exit phi */ + Transform: + loop_exit: + s_out0 = phi <s_loop> # (scalar) EXIT_PHI + v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI + v_out2 = reduce <v_out1> + s_out3 = extract_field <v_out2, 0> + s_out4 = adjust_result <s_out3> + use <s_out0> + use <s_out0> + + into: - /* Replace uses of s_out0 with uses of s_out3: - Find the loop-closed-use at the loop exit of the original scalar result. - (The reduction result is expected to have two immediate uses - one at the - latch block, and one at the loop exit). */ - phis = VEC_alloc (gimple, heap, 10); - FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest) + loop_exit: + s_out0 = phi <s_loop> # (scalar) EXIT_PHI + v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI + v_out2 = reduce <v_out1> + s_out3 = extract_field <v_out2, 0> + s_out4 = adjust_result <s_out3> + use <s_out4> */ + + /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in + case that GROUP_SIZE is greater than vectorization factor). Therefore, we + need to match SCALAR_RESULTS with corresponding statements. The first + (GROUP_SIZE / number of new vector stmts) scalar results correspond to + the first vector stmt, etc. + (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */ + ratio = group_size / VEC_length (gimple, new_phis); + gcc_assert (!(group_size % VEC_length (gimple, new_phis))); + + for (k = 0; k < group_size; k++) { - if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))) - { - exit_phi = USE_STMT (use_p); - VEC_quick_push (gimple, phis, exit_phi); - } - } + if (k % ratio == 0) + { + epilog_stmt = VEC_index (gimple, new_phis, k / ratio); + reduction_phi = VEC_index (gimple, reduction_phis, k / ratio); + } - /* We expect to have found an exit_phi because of loop-closed-ssa form. 
*/ - gcc_assert (!VEC_empty (gimple, phis)); + if (slp_node) + { + gimple current_stmt = VEC_index (gimple, + SLP_TREE_SCALAR_STMTS (slp_node), k); - for (i = 0; VEC_iterate (gimple, phis, i, exit_phi); i++) - { - if (nested_in_vect_loop) - { - stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi); - gimple vect_phi; - - /* FORNOW. Currently not supporting the case that an inner-loop - reduction is not used in the outer-loop (but only outside the - outer-loop), unless it is double reduction. */ - gcc_assert ((STMT_VINFO_RELEVANT_P (stmt_vinfo) - && !STMT_VINFO_LIVE_P (stmt_vinfo)) || double_reduc); - - epilog_stmt = adjustment_def ? epilog_stmt : new_phi; - STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt; - set_vinfo_for_stmt (epilog_stmt, - new_stmt_vec_info (epilog_stmt, loop_vinfo, - NULL)); - if (adjustment_def) - STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) = - STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi)); - - if (!double_reduc - || STMT_VINFO_DEF_TYPE (stmt_vinfo) != vect_double_reduction_def) - continue; - - /* Handle double reduction: - - stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop) - stmt2: s3 = phi <s1, s4> - (regular) reduction phi (inner loop) - stmt3: s4 = use (s3) - (regular) reduction stmt (inner loop) - stmt4: s2 = phi <s4> - double reduction stmt (outer loop) - - At that point the regular reduction (stmt2 and stmt3) is already - vectorized, as well as the exit phi node, stmt4. - Here we vectorize the phi node of double reduction, stmt1, and - update all relevant statements. */ - - /* Go through all the uses of s2 to find double reduction phi node, - i.e., stmt1 above. */ - orig_name = PHI_RESULT (exit_phi); - FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name) + orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt)); + /* SLP statements can't participate in patterns. */ + gcc_assert (!orig_stmt); + scalar_dest = gimple_assign_lhs (current_stmt); + } + + phis = VEC_alloc (gimple, heap, 3); + /* Find the loop-closed-use at the loop exit of the original scalar + result. (The reduction result is expected to have two immediate uses - + one at the latch block, and one at the loop exit). */ + FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest) + if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))) + VEC_safe_push (gimple, heap, phis, USE_STMT (use_p)); + + /* We expect to have found an exit_phi because of loop-closed-ssa + form. */ + gcc_assert (!VEC_empty (gimple, phis)); + + for (i = 0; VEC_iterate (gimple, phis, i, exit_phi); i++) + { + if (outer_loop) { - stmt_vec_info use_stmt_vinfo = vinfo_for_stmt (use_stmt); - stmt_vec_info new_phi_vinfo; - tree vect_phi_init, preheader_arg, vect_phi_res, init_def; - basic_block bb = gimple_bb (use_stmt); - gimple use; - - /* Check that USE_STMT is really double reduction phi node. */ - if (gimple_code (use_stmt) != GIMPLE_PHI - || gimple_phi_num_args (use_stmt) != 2 - || !use_stmt_vinfo - || STMT_VINFO_DEF_TYPE (use_stmt_vinfo) - != vect_double_reduction_def - || bb->loop_father != outer_loop) + stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi); + gimple vect_phi; + + /* FORNOW. Currently not supporting the case that an inner-loop + reduction is not used in the outer-loop (but only outside the + outer-loop), unless it is double reduction. 
*/ + gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo) + && !STMT_VINFO_LIVE_P (exit_phi_vinfo)) + || double_reduc); + + STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt; + if (!double_reduc + || STMT_VINFO_DEF_TYPE (exit_phi_vinfo) + != vect_double_reduction_def) continue; - /* Create vector phi node for double reduction: - vs1 = phi <vs0, vs2> - vs1 was created previously in this function by a call to - vect_get_vec_def_for_operand and is stored in vec_initial_def; - vs2 is defined by EPILOG_STMT, the vectorized EXIT_PHI; - vs0 is created here. */ + /* Handle double reduction: - /* Create vector phi node. */ - vect_phi = create_phi_node (vec_initial_def, bb); - new_phi_vinfo = new_stmt_vec_info (vect_phi, - loop_vec_info_for_loop (outer_loop), NULL); - set_vinfo_for_stmt (vect_phi, new_phi_vinfo); + stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop) + stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop) + stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop) + stmt4: s2 = phi <s4> - double reduction stmt (outer loop) - /* Create vs0 - initial def of the double reduction phi. */ - preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt, - loop_preheader_edge (outer_loop)); - init_def = get_initial_def_for_reduction (stmt, preheader_arg, - NULL); - vect_phi_init = vect_init_vector (use_stmt, init_def, vectype, - NULL); - - /* Update phi node arguments with vs0 and vs2. */ - add_phi_arg (vect_phi, vect_phi_init, - loop_preheader_edge (outer_loop), UNKNOWN_LOCATION); - add_phi_arg (vect_phi, PHI_RESULT (epilog_stmt), - loop_latch_edge (outer_loop), UNKNOWN_LOCATION); - if (vect_print_dump_info (REPORT_DETAILS)) - { - fprintf (vect_dump, "created double reduction phi node: "); - print_gimple_stmt (vect_dump, vect_phi, 0, TDF_SLIM); - } - - vect_phi_res = PHI_RESULT (vect_phi); + At that point the regular reduction (stmt2 and stmt3) is + already vectorized, as well as the exit phi node, stmt4. + Here we vectorize the phi node of double reduction, stmt1, and + update all relevant statements. */ - /* Replace the use, i.e., set the correct vs1 in the regular - reduction phi node. FORNOW, NCOPIES is always 1, so the loop - is redundant. */ - use = reduction_phi; - for (j = 0; j < ncopies; j++) + /* Go through all the uses of s2 to find double reduction phi + node, i.e., stmt1 above. */ + orig_name = PHI_RESULT (exit_phi); + FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name) { - edge pr_edge = loop_preheader_edge (loop); - SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res); - use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use)); + stmt_vec_info use_stmt_vinfo = vinfo_for_stmt (use_stmt); + stmt_vec_info new_phi_vinfo; + tree vect_phi_init, preheader_arg, vect_phi_res, init_def; + basic_block bb = gimple_bb (use_stmt); + gimple use; + + /* Check that USE_STMT is really double reduction phi + node. */ + if (gimple_code (use_stmt) != GIMPLE_PHI + || gimple_phi_num_args (use_stmt) != 2 + || !use_stmt_vinfo + || STMT_VINFO_DEF_TYPE (use_stmt_vinfo) + != vect_double_reduction_def + || bb->loop_father != outer_loop) + continue; + + /* Create vector phi node for double reduction: + vs1 = phi <vs0, vs2> + vs1 was created previously in this function by a call to + vect_get_vec_def_for_operand and is stored in + vec_initial_def; + vs2 is defined by EPILOG_STMT, the vectorized EXIT_PHI; + vs0 is created here. */ + + /* Create vector phi node. 
*/ + vect_phi = create_phi_node (vec_initial_def, bb); + new_phi_vinfo = new_stmt_vec_info (vect_phi, + loop_vec_info_for_loop (outer_loop), NULL); + set_vinfo_for_stmt (vect_phi, new_phi_vinfo); + + /* Create vs0 - initial def of the double reduction phi. */ + preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt, + loop_preheader_edge (outer_loop)); + init_def = get_initial_def_for_reduction (stmt, + preheader_arg, NULL); + vect_phi_init = vect_init_vector (use_stmt, init_def, + vectype, NULL); + + /* Update phi node arguments with vs0 and vs2. */ + add_phi_arg (vect_phi, vect_phi_init, + loop_preheader_edge (outer_loop), + UNKNOWN_LOCATION); + add_phi_arg (vect_phi, PHI_RESULT (epilog_stmt), + loop_latch_edge (outer_loop), UNKNOWN_LOCATION); + if (vect_print_dump_info (REPORT_DETAILS)) + { + fprintf (vect_dump, "created double reduction phi " + "node: "); + print_gimple_stmt (vect_dump, vect_phi, 0, TDF_SLIM); + } + + vect_phi_res = PHI_RESULT (vect_phi); + + /* Replace the use, i.e., set the correct vs1 in the regular + reduction phi node. FORNOW, NCOPIES is always 1, so the + loop is redundant. */ + use = reduction_phi; + for (j = 0; j < ncopies; j++) + { + edge pr_edge = loop_preheader_edge (loop); + SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res); + use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use)); + } } } - } - /* Replace the uses: */ - orig_name = PHI_RESULT (exit_phi); - FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name) - FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) - SET_USE (use_p, new_temp); + /* Replace the uses: */ + orig_name = PHI_RESULT (exit_phi); + scalar_result = VEC_index (tree, scalar_results, k); + FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name) + FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) + SET_USE (use_p, scalar_result); + } + + VEC_free (gimple, heap, phis); } - VEC_free (gimple, heap, phis); -} + VEC_free (tree, heap, scalar_results); + VEC_free (gimple, heap, new_phis); +} /* Function vectorizable_reduction. @@ -3489,7 +3642,7 @@ vect_finalize_reduction: bool vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi, - gimple *vec_stmt) + gimple *vec_stmt, slp_tree slp_node) { tree vec_dest; tree scalar_dest; @@ -3517,7 +3670,6 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi, int ncopies; int epilog_copies; stmt_vec_info prev_stmt_info, prev_phi_info; - gimple first_phi = NULL; bool single_defuse_cycle = false; tree reduc_def = NULL_TREE; gimple new_stmt = NULL; @@ -3532,6 +3684,10 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi, struct loop * def_stmt_loop, *outer_loop = NULL; tree def_arg; gimple def_arg_stmt; + VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL, *vect_defs = NULL; + VEC (gimple, heap) *phis = NULL; + int vec_num; + tree def0, def1; if (nested_in_vect_loop_p (loop, stmt)) { @@ -3540,10 +3696,6 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi, nested_cycle = true; } - /* FORNOW: SLP not supported. */ - if (STMT_SLP_TYPE (stmt_info)) - return false; - /* 1. Is vectorizable reduction? */ /* Not supportable if the reduction variable is used in the loop. 
*/ if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer) @@ -3676,9 +3828,12 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi, if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt))) return false; + if (slp_node) + ncopies = 1; + else + ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo) + / TYPE_VECTOR_SUBPARTS (vectype_in)); - ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo) - / TYPE_VECTOR_SUBPARTS (vectype_in)); gcc_assert (ncopies >= 1); vec_mode = TYPE_MODE (vectype_in); @@ -3897,23 +4052,48 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi, prev_stmt_info = NULL; prev_phi_info = NULL; + if (slp_node) + { + vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); + gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out) + == TYPE_VECTOR_SUBPARTS (vectype_in)); + } + else + { + vec_num = 1; + vec_oprnds0 = VEC_alloc (tree, heap, 1); + if (op_type == ternary_op) + vec_oprnds1 = VEC_alloc (tree, heap, 1); + } + + phis = VEC_alloc (gimple, heap, vec_num); + vect_defs = VEC_alloc (tree, heap, vec_num); + if (!slp_node) + VEC_quick_push (tree, vect_defs, NULL_TREE); + for (j = 0; j < ncopies; j++) { if (j == 0 || !single_defuse_cycle) { - /* Create the reduction-phi that defines the reduction-operand. */ - new_phi = create_phi_node (vec_dest, loop->header); - set_vinfo_for_stmt (new_phi, new_stmt_vec_info (new_phi, loop_vinfo, - NULL)); - /* Get the vector def for the reduction variable from the phi - node. */ - reduc_def = PHI_RESULT (new_phi); - } + for (i = 0; i < vec_num; i++) + { + /* Create the reduction-phi that defines the reduction + operand. */ + new_phi = create_phi_node (vec_dest, loop->header); + set_vinfo_for_stmt (new_phi, + new_stmt_vec_info (new_phi, loop_vinfo, + NULL)); + if (j == 0 || slp_node) + VEC_quick_push (gimple, phis, new_phi); + } + } if (code == COND_EXPR) { - first_phi = new_phi; - vectorizable_condition (stmt, gsi, vec_stmt, reduc_def, reduc_index); + gcc_assert (!slp_node); + vectorizable_condition (stmt, gsi, vec_stmt, + PHI_RESULT (VEC_index (gimple, phis, 0)), + reduc_index); /* Multiple types are not supported for condition. */ break; } @@ -3921,65 +4101,94 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi, /* Handle uses. */ if (j == 0) { - loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index], - stmt, NULL); - if (op_type == ternary_op) + if (slp_node) + vect_get_slp_defs (slp_node, &vec_oprnds0, &vec_oprnds1, -1); + else { - if (reduc_index == 0) - loop_vec_def1 = vect_get_vec_def_for_operand (ops[2], stmt, - NULL); - else - loop_vec_def1 = vect_get_vec_def_for_operand (ops[1], stmt, - NULL); + loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index], + stmt, NULL); + VEC_quick_push (tree, vec_oprnds0, loop_vec_def0); + if (op_type == ternary_op) + { + if (reduc_index == 0) + loop_vec_def1 = vect_get_vec_def_for_operand (ops[2], stmt, + NULL); + else + loop_vec_def1 = vect_get_vec_def_for_operand (ops[1], stmt, + NULL); + + VEC_quick_push (tree, vec_oprnds1, loop_vec_def1); + } } - - /* Get the vector def for the reduction variable from the phi - node. 
*/ - first_phi = new_phi; } else { - enum vect_def_type dt = vect_unknown_def_type; /* Dummy */ - loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0); - if (op_type == ternary_op) - loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1); + if (!slp_node) + { + enum vect_def_type dt = vect_unknown_def_type; /* Dummy */ + loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0); + VEC_replace (tree, vec_oprnds0, 0, loop_vec_def0); + if (op_type == ternary_op) + { + loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, + loop_vec_def1); + VEC_replace (tree, vec_oprnds1, 0, loop_vec_def1); + } + } - if (single_defuse_cycle) - reduc_def = gimple_assign_lhs (new_stmt); - else - reduc_def = PHI_RESULT (new_phi); + if (single_defuse_cycle) + reduc_def = gimple_assign_lhs (new_stmt); - STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi; + STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi; } - /* Arguments are ready. Create the new vector stmt. */ - if (op_type == binary_op) + for (i = 0; VEC_iterate (tree, vec_oprnds0, i, def0); i++) { - if (reduc_index == 0) - expr = build2 (code, vectype_out, reduc_def, loop_vec_def0); + if (slp_node) + reduc_def = PHI_RESULT (VEC_index (gimple, phis, i)); else - expr = build2 (code, vectype_out, loop_vec_def0, reduc_def); - } - else - { - if (reduc_index == 0) - expr = build3 (code, vectype_out, reduc_def, loop_vec_def0, - loop_vec_def1); + { + if (!single_defuse_cycle || j == 0) + reduc_def = PHI_RESULT (new_phi); + } + + def1 = ((op_type == ternary_op) + ? VEC_index (tree, vec_oprnds1, i) : NULL); + if (op_type == binary_op) + { + if (reduc_index == 0) + expr = build2 (code, vectype_out, reduc_def, def0); + else + expr = build2 (code, vectype_out, def0, reduc_def); + } else { - if (reduc_index == 1) - expr = build3 (code, vectype_out, loop_vec_def0, reduc_def, - loop_vec_def1); + if (reduc_index == 0) + expr = build3 (code, vectype_out, reduc_def, def0, def1); else - expr = build3 (code, vectype_out, loop_vec_def0, loop_vec_def1, - reduc_def); + { + if (reduc_index == 1) + expr = build3 (code, vectype_out, def0, reduc_def, def1); + else + expr = build3 (code, vectype_out, def0, def1, reduc_def); + } + } + + new_stmt = gimple_build_assign (vec_dest, expr); + new_temp = make_ssa_name (vec_dest, new_stmt); + gimple_assign_set_lhs (new_stmt, new_temp); + vect_finish_stmt_generation (stmt, new_stmt, gsi); + if (slp_node) + { + VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt); + VEC_quick_push (tree, vect_defs, new_temp); } + else + VEC_replace (tree, vect_defs, 0, new_temp); } - new_stmt = gimple_build_assign (vec_dest, expr); - new_temp = make_ssa_name (vec_dest, new_stmt); - gimple_assign_set_lhs (new_stmt, new_temp); - vect_finish_stmt_generation (stmt, new_stmt, gsi); + if (slp_node) + continue; if (j == 0) STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt; @@ -3992,12 +4201,21 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi, /* Finalize the reduction-phi (set its arguments) and create the epilog reduction code. 
*/ - if (!single_defuse_cycle || code == COND_EXPR) - new_temp = gimple_assign_lhs (*vec_stmt); + if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node) + { + new_temp = gimple_assign_lhs (*vec_stmt); + VEC_replace (tree, vect_defs, 0, new_temp); + } + + vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies, + epilog_reduc_code, phis, reduc_index, + double_reduc, slp_node); + + VEC_free (gimple, heap, phis); + VEC_free (tree, heap, vec_oprnds0); + if (vec_oprnds1) + VEC_free (tree, heap, vec_oprnds1); - vect_create_epilog_for_reduction (new_temp, stmt, epilog_copies, - epilog_reduc_code, first_phi, reduc_index, - double_reduc); return true; } diff --git a/gcc/tree-vect-patterns.c b/gcc/tree-vect-patterns.c index 55b9d50cca0..ea827559195 100644 --- a/gcc/tree-vect-patterns.c +++ b/gcc/tree-vect-patterns.c @@ -670,6 +670,8 @@ vect_pattern_recog_1 ( tree pattern_vectype; tree type_in, type_out; enum tree_code code; + int i; + gimple next; pattern_stmt = (* vect_recog_func) (stmt, &type_in, &type_out); if (!pattern_stmt) @@ -735,7 +737,13 @@ vect_pattern_recog_1 ( STMT_VINFO_IN_PATTERN_P (stmt_info) = true; STMT_VINFO_RELATED_STMT (stmt_info) = pattern_stmt; - return; + /* Patterns cannot be vectorized using SLP, because they change the order of + computation. */ + for (i = 0; VEC_iterate (gimple, LOOP_VINFO_REDUCTIONS (loop_vinfo), i, + next); + i++) + if (next == stmt) + VEC_ordered_remove (gimple, LOOP_VINFO_REDUCTIONS (loop_vinfo), i); } diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c index afc4f311078..99a865fee20 100644 --- a/gcc/tree-vect-slp.c +++ b/gcc/tree-vect-slp.c @@ -273,6 +273,7 @@ vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo, break; case vect_internal_def: + case vect_reduction_def: if (i == 0) VEC_safe_push (gimple, heap, *def_stmts0, def_stmt); else @@ -332,7 +333,7 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo, HOST_WIDE_INT dummy; bool permutation = false; unsigned int load_place; - gimple first_load; + gimple first_load, prev_first_load = NULL; /* For every stmt in NODE find its def stmt/s. */ for (i = 0; VEC_iterate (gimple, stmts, i, stmt); i++) @@ -485,42 +486,62 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo, &pattern0, &pattern1)) return false; } - else - { - /* Load. */ - /* FORNOW: Check that there is no gap between the loads. */ - if ((DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) == stmt - && DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 0) - || (DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) != stmt - && DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 1)) - { - if (vect_print_dump_info (REPORT_SLP)) - { - fprintf (vect_dump, "Build SLP failed: strided " - "loads have gaps "); - print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); - } + else + { + /* Load. */ + /* FORNOW: Check that there is no gap between the loads. */ + if ((DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) == stmt + && DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 0) + || (DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) != stmt + && DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 1)) + { + if (vect_print_dump_info (REPORT_SLP)) + { + fprintf (vect_dump, "Build SLP failed: strided " + "loads have gaps "); + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); + } - return false; - } - - /* Check that the size of interleaved loads group is not - greater than the SLP group size. 
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index afc4f311078..99a865fee20 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -273,6 +273,7 @@ vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
       break;

     case vect_internal_def:
+    case vect_reduction_def:
       if (i == 0)
         VEC_safe_push (gimple, heap, *def_stmts0, def_stmt);
       else
@@ -332,7 +333,7 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
   HOST_WIDE_INT dummy;
   bool permutation = false;
   unsigned int load_place;
-  gimple first_load;
+  gimple first_load, prev_first_load = NULL;

   /* For every stmt in NODE find its def stmt/s.  */
   for (i = 0; VEC_iterate (gimple, stmts, i, stmt); i++)
@@ -485,42 +486,62 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                                             &pattern0, &pattern1))
             return false;
         }
-      else
-        {
-          /* Load.  */
-          /* FORNOW: Check that there is no gap between the loads.  */
-          if ((DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) == stmt
-               && DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
-              || (DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) != stmt
-                  && DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 1))
-            {
-              if (vect_print_dump_info (REPORT_SLP))
-                {
-                  fprintf (vect_dump, "Build SLP failed: strided "
-                                      "loads have gaps ");
-                  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
-                }
+      else
+        {
+          /* Load.  */
+          /* FORNOW: Check that there is no gap between the loads.  */
+          if ((DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) == stmt
+               && DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
+              || (DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) != stmt
+                  && DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 1))
+            {
+              if (vect_print_dump_info (REPORT_SLP))
+                {
+                  fprintf (vect_dump, "Build SLP failed: strided "
+                                      "loads have gaps ");
+                  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+                }

-              return false;
-            }
-
-          /* Check that the size of interleaved loads group is not
-             greater than the SLP group size.  */
-          if (DR_GROUP_SIZE (vinfo_for_stmt (stmt))
-              > ncopies * group_size)
-            {
-              if (vect_print_dump_info (REPORT_SLP))
-                {
-                  fprintf (vect_dump, "Build SLP failed: the number of "
-                                      "interleaved loads is greater than"
-                                      " the SLP group size ");
-                  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
-                }
+              return false;
+            }

-              return false;
-            }
+          /* Check that the size of interleaved loads group is not
+             greater than the SLP group size.  */
+          if (DR_GROUP_SIZE (vinfo_for_stmt (stmt)) > ncopies * group_size)
+            {
+              if (vect_print_dump_info (REPORT_SLP))
+                {
+                  fprintf (vect_dump, "Build SLP failed: the number of "
+                                      "interleaved loads is greater than"
+                                      " the SLP group size ");
+                  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+                }

-          first_load = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt));
+              return false;
+            }
+
+          first_load = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt));
+          if (prev_first_load)
+            {
+              /* Check that there are no loads from different interleaving
+                 chains in the same node.  The only exception is complex
+                 numbers.  */
+              if (prev_first_load != first_load
+                  && rhs_code != REALPART_EXPR
+                  && rhs_code != IMAGPART_EXPR)
+                {
+                  if (vect_print_dump_info (REPORT_SLP))
+                    {
+                      fprintf (vect_dump, "Build SLP failed: different "
+                                          "interleaving chains in one node ");
+                      print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+                    }
+
+                  return false;
+                }
+            }
+          else
+            prev_first_load = first_load;

           if (first_load == stmt)
             {
@@ -787,6 +808,39 @@ vect_supported_slp_permutation_p (slp_instance instance)
 }


+/* Rearrange the statements of NODE according to PERMUTATION.  */
+
+static void
+vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
+                          VEC (int, heap) *permutation)
+{
+  gimple stmt;
+  VEC (gimple, heap) *tmp_stmts;
+  unsigned int index, i;
+
+  if (!node)
+    return;
+
+  vect_slp_rearrange_stmts (SLP_TREE_LEFT (node), group_size, permutation);
+  vect_slp_rearrange_stmts (SLP_TREE_RIGHT (node), group_size, permutation);
+
+  gcc_assert (group_size == VEC_length (gimple, SLP_TREE_SCALAR_STMTS (node)));
+  tmp_stmts = VEC_alloc (gimple, heap, group_size);
+
+  for (i = 0; i < group_size; i++)
+    VEC_safe_push (gimple, heap, tmp_stmts, NULL);
+
+  for (i = 0; VEC_iterate (gimple, SLP_TREE_SCALAR_STMTS (node), i, stmt); i++)
+    {
+      index = VEC_index (int, permutation, i);
+      VEC_replace (gimple, tmp_stmts, index, stmt);
+    }
+
+  VEC_free (gimple, heap, SLP_TREE_SCALAR_STMTS (node));
+  SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
+}
+
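
vect_slp_rearrange_stmts walks the SLP tree and, in every node, moves the
statement at position i to position permutation[i].  A standalone C sketch
of that single-node step (plain arrays instead of the VEC_* vectors):

    #include <stdio.h>
    #include <string.h>

    int
    main (void)
    {
      const char *stmts[4] = { "s0", "s1", "s2", "s3" };
      int permutation[4] = { 2, 0, 3, 1 };
      const char *tmp_stmts[4];

      /* The analogue of VEC_replace (gimple, tmp_stmts, index, stmt):
         old position i lands at position permutation[i].  */
      for (int i = 0; i < 4; i++)
        tmp_stmts[permutation[i]] = stmts[i];
      memcpy (stmts, tmp_stmts, sizeof stmts);

      for (int i = 0; i < 4; i++)
        printf ("%s ", stmts[i]);        /* prints s1 s3 s0 s2 */
      printf ("\n");
      return 0;
    }
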
 /* Check if the required load permutation is supported.
    LOAD_PERMUTATION contains a list of indices of the loads.
    In SLP this permutation is relative to the order of strided stores that are
@@ -796,9 +850,11 @@
 static bool
 vect_supported_load_permutation_p (slp_instance slp_instn, int group_size,
                                    VEC (int, heap) *load_permutation)
 {
-  int i = 0, j, prev = -1, next, k;
-  bool supported;
+  int i = 0, j, prev = -1, next, k, number_of_groups;
+  bool supported, bad_permutation = false;
   sbitmap load_index;
+  slp_tree node;
+  gimple stmt;

   /* FORNOW: permutations are only supported in SLP.  */
   if (!slp_instn)
     return false;
@@ -811,9 +867,72 @@ vect_supported_load_permutation_p (slp_instance slp_instn, int group_size,
         fprintf (vect_dump, "%d ", next);
     }

+  /* In case of reduction every load permutation is allowed, since the order
+     of the reduction statements is not important (as opposed to the case of
+     strided stores).  The only condition we need to check is that all the
+     load nodes are of the same size and have the same permutation (and then
+     rearrange all the nodes of the SLP instance according to this
+     permutation).  */
+
+  /* Check that all the load nodes are of the same size.  */
+  for (i = 0;
+       VEC_iterate (slp_tree, SLP_INSTANCE_LOADS (slp_instn), i, node);
+       i++)
+    if (VEC_length (gimple, SLP_TREE_SCALAR_STMTS (node))
+        != (unsigned) group_size)
+      return false;
+
+  node = SLP_INSTANCE_TREE (slp_instn);
+  stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (node), 0);
+  /* LOAD_PERMUTATION is a list of indices of all the loads of the SLP
+     instance, not all the loads belong to the same node or interleaving
+     group.  Hence, we need to divide them into groups according to
+     GROUP_SIZE.  */
+  number_of_groups = VEC_length (int, load_permutation) / group_size;
+
+  /* Reduction (there are no data-refs in the root).  */
+  if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
+    {
+      int first_group_load_index;
+
+      /* Compare all the permutation sequences to the first one.  */
+      for (i = 1; i < number_of_groups; i++)
+        {
+          k = 0;
+          for (j = i * group_size; j < i * group_size + group_size; j++)
+            {
+              next = VEC_index (int, load_permutation, j);
+              first_group_load_index = VEC_index (int, load_permutation, k);
+
+              if (next != first_group_load_index)
+                {
+                  bad_permutation = true;
+                  break;
+                }
+
+              k++;
+            }
+
+          if (bad_permutation)
+            break;
+        }
+
+      if (!bad_permutation)
+        {
+          /* This permutation is valid for reduction.  Since the order of the
+             statements in the nodes is not important unless they are memory
+             accesses, we can rearrange the statements in all the nodes
+             according to the order of the loads.  */
+          vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
+                                    load_permutation);
+          VEC_free (int, heap, SLP_INSTANCE_LOAD_PERMUTATION (slp_instn));
+          return true;
+        }
+    }
+
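
The reduction branch above accepts a load permutation only if every group
repeats the first group's index sequence.  A standalone C sketch of that
check, with the permutation laid out as NUMBER_OF_GROUPS back-to-back
sequences of GROUP_SIZE indices:

    #include <stdbool.h>
    #include <stdio.h>

    static bool
    same_permutation_p (const int *load_permutation, int number_of_groups,
                        int group_size)
    {
      for (int i = 1; i < number_of_groups; i++)
        for (int j = 0; j < group_size; j++)
          if (load_permutation[i * group_size + j] != load_permutation[j])
            return false;            /* the analogue of bad_permutation */
      return true;
    }

    int
    main (void)
    {
      int ok[6]  = { 1, 0, 1, 0, 1, 0 };   /* three groups of size 2 */
      int bad[6] = { 1, 0, 0, 1, 1, 0 };

      printf ("%d %d\n", same_permutation_p (ok, 3, 2),
              same_permutation_p (bad, 3, 2));   /* prints 1 0 */
      return 0;
    }
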
   /* FORNOW: the only supported permutation is 0..01..1.. of length equal to
      GROUP_SIZE and where each sequence of same drs is of GROUP_SIZE length as
-     well.  */
+     well (unless it's reduction).  */
   if (VEC_length (int, load_permutation)
       != (unsigned int) (group_size * group_size))
     return false;
@@ -896,17 +1015,28 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
   slp_tree node = XNEW (struct _slp_tree);
   unsigned int group_size = DR_GROUP_SIZE (vinfo_for_stmt (stmt));
   unsigned int unrolling_factor = 1, nunits;
-  tree vectype, scalar_type;
+  tree vectype, scalar_type = NULL_TREE;
   gimple next;
   unsigned int vectorization_factor = 0;
-  int inside_cost = 0, outside_cost = 0, ncopies_for_cost;
+  int inside_cost = 0, outside_cost = 0, ncopies_for_cost, i;
   unsigned int max_nunits = 0;
   VEC (int, heap) *load_permutation;
   VEC (slp_tree, heap) *loads;
+  struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
+
+  if (dr)
+    {
+      scalar_type = TREE_TYPE (DR_REF (dr));
+      vectype = get_vectype_for_scalar_type (scalar_type);
+      group_size = DR_GROUP_SIZE (vinfo_for_stmt (stmt));
+    }
+  else
+    {
+      gcc_assert (loop_vinfo);
+      vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
+      group_size = VEC_length (gimple, LOOP_VINFO_REDUCTIONS (loop_vinfo));
+    }

-  scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (
-                                       vinfo_for_stmt (stmt))));
-  vectype = get_vectype_for_scalar_type (scalar_type);
   if (!vectype)
     {
       if (vect_print_dump_info (REPORT_SLP))
@@ -914,6 +1044,7 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
           fprintf (vect_dump, "Build SLP failed: unsupported data-type ");
           print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
         }
+
       return false;
     }

@@ -938,11 +1069,29 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
   /* Create a node (a root of the SLP tree) for the packed strided stores.  */
   SLP_TREE_SCALAR_STMTS (node) = VEC_alloc (gimple, heap, group_size);
   next = stmt;
-  /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS.  */
-  while (next)
+  if (dr)
     {
-      VEC_safe_push (gimple, heap, SLP_TREE_SCALAR_STMTS (node), next);
-      next = DR_GROUP_NEXT_DR (vinfo_for_stmt (next));
+      /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS.  */
+      while (next)
+        {
+          VEC_safe_push (gimple, heap, SLP_TREE_SCALAR_STMTS (node), next);
+          next = DR_GROUP_NEXT_DR (vinfo_for_stmt (next));
+        }
+    }
+  else
+    {
+      /* Collect reduction statements.  */
+      for (i = 0; VEC_iterate (gimple, LOOP_VINFO_REDUCTIONS (loop_vinfo), i,
+                               next);
+           i++)
+        {
+          VEC_safe_push (gimple, heap, SLP_TREE_SCALAR_STMTS (node), next);
+          if (vect_print_dump_info (REPORT_DETAILS))
+            {
+              fprintf (vect_dump, "pushing reduction into node: ");
+              print_gimple_stmt (vect_dump, next, 0, TDF_SLIM);
+            }
+        }
     }

   SLP_TREE_VEC_STMTS (node) = NULL;
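
The root of the SLP instance is now filled from one of two sources: the
interleaving chain of a strided store, linked through DR_GROUP_NEXT_DR, or
the flat LOOP_VINFO_REDUCTIONS list.  A standalone C sketch of the two
collection modes (the stmt type and names below are hypothetical, not
GCC's data structures):

    #include <stdio.h>

    struct stmt { const char *name; struct stmt *next_in_chain; };

    /* Fill GROUP either by walking a store chain from FIRST or, when
       there is no chain, by copying the flat reduction list.  */
    static int
    collect_group (struct stmt *first, struct stmt **reductions,
                   int n_reductions, struct stmt **group)
    {
      int n = 0;

      if (first)
        for (struct stmt *s = first; s; s = s->next_in_chain)
          group[n++] = s;
      else
        for (int i = 0; i < n_reductions; i++)
          group[n++] = reductions[i];
      return n;
    }

    int
    main (void)
    {
      struct stmt s2 = { "store2", 0 }, s1 = { "store1", &s2 };
      struct stmt r1 = { "red1", 0 }, r2 = { "red2", 0 };
      struct stmt *reductions[2] = { &r1, &r2 }, *group[4];

      int n = collect_group (&s1, 0, 0, group);
      n += collect_group (0, reductions, 2, group + n);
      for (int i = 0; i < n; i++)
        printf ("%s ", group[i]->name);  /* store1 store2 red1 red2 */
      printf ("\n");
      return 0;
    }
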
@@ -1035,7 +1184,7 @@
 bool
 vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
 {
   unsigned int i;
-  VEC (gimple, heap) *strided_stores;
+  VEC (gimple, heap) *strided_stores, *reductions = NULL;
   gimple store;
   bool ok = false;

@@ -1043,10 +1192,14 @@ vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
     fprintf (vect_dump, "=== vect_analyze_slp ===");

   if (loop_vinfo)
-    strided_stores = LOOP_VINFO_STRIDED_STORES (loop_vinfo);
+    {
+      strided_stores = LOOP_VINFO_STRIDED_STORES (loop_vinfo);
+      reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
+    }
   else
     strided_stores = BB_VINFO_STRIDED_STORES (bb_vinfo);

+  /* Find SLP sequences starting from groups of strided stores.  */
   for (i = 0; VEC_iterate (gimple, strided_stores, i, store); i++)
     if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, store))
       ok = true;
@@ -1059,6 +1212,12 @@ vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
       return false;
     }

+  /* Find SLP sequences starting from groups of reductions.  */
+  if (loop_vinfo && VEC_length (gimple, LOOP_VINFO_REDUCTIONS (loop_vinfo))
+      && vect_analyze_slp_instance (loop_vinfo, bb_vinfo,
+                                    VEC_index (gimple, reductions, 0)))
+    ok = true;
+
   return true;
 }

@@ -1120,7 +1279,10 @@ vect_detect_hybrid_slp_stmts (slp_tree node)
         if ((stmt_vinfo = vinfo_for_stmt (use_stmt))
             && !STMT_SLP_TYPE (stmt_vinfo)
             && (STMT_VINFO_RELEVANT (stmt_vinfo)
-                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_vinfo))))
+                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_vinfo)))
+            && !(gimple_code (use_stmt) == GIMPLE_PHI
+                 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (use_stmt))
+                    == vect_reduction_def))
           vect_mark_slp_stmts (node, hybrid, i);

   vect_detect_hybrid_slp_stmts (SLP_TREE_LEFT (node));
@@ -1429,11 +1591,14 @@ vect_update_slp_costs_according_to_vf (loop_vec_info loop_vinfo)

 /* For constant and loop invariant defs of SLP_NODE this function returns
    (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
    OP_NUM determines if we gather defs for operand 0 or operand 1 of the scalar
-   stmts.  NUMBER_OF_VECTORS is the number of vector defs to create.  */
+   stmts.  NUMBER_OF_VECTORS is the number of vector defs to create.
+   REDUC_INDEX is the index of the reduction operand in the statements, unless
+   it is -1.  */

 static void
 vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
-                           unsigned int op_num, unsigned int number_of_vectors)
+                           unsigned int op_num, unsigned int number_of_vectors,
+                           int reduc_index)
 {
   VEC (gimple, heap) *stmts = SLP_TREE_SCALAR_STMTS (slp_node);
   gimple stmt = VEC_index (gimple, stmts, 0);
@@ -1449,6 +1614,50 @@ vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
   int number_of_copies = 1;
   VEC (tree, heap) *voprnds = VEC_alloc (tree, heap, number_of_vectors);
   bool constant_p, is_store;
+  tree neutral_op = NULL;
+
+  if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def)
+    {
+      enum tree_code code = gimple_assign_rhs_code (stmt);
+      if (reduc_index == -1)
+        {
+          VEC_free (tree, heap, *vec_oprnds);
+          return;
+        }
+
+      op_num = reduc_index - 1;
+      op = gimple_op (stmt, op_num + 1);
+      /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
+         we need either neutral operands or the original operands.  See
+         get_initial_def_for_reduction() for details.  */
+      switch (code)
+        {
+          case WIDEN_SUM_EXPR:
+          case DOT_PROD_EXPR:
+          case PLUS_EXPR:
+          case MINUS_EXPR:
+          case BIT_IOR_EXPR:
+          case BIT_XOR_EXPR:
+            if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
+              neutral_op = build_real (TREE_TYPE (op), dconst0);
+            else
+              neutral_op = build_int_cst (TREE_TYPE (op), 0);
+
+            break;
+
+          case MULT_EXPR:
+          case BIT_AND_EXPR:
+            if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
+              neutral_op = build_real (TREE_TYPE (op), dconst1);
+            else
+              neutral_op = build_int_cst (TREE_TYPE (op), 1);
+
+            break;
+
+          default:
+            neutral_op = NULL;
+        }
+    }
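
The switch above picks the reduction code's identity value as the neutral
operand: 0 for the additive codes, 1 for MULT_EXPR and BIT_AND_EXPR as
written here.  A standalone C sketch of why an accumulator seeded with the
neutral element can join the reduction without changing the result:

    #include <stdio.h>

    int
    main (void)
    {
      int a[4] = { 3, 5, 7, 9 };
      int sum = 0, extra_sum = 0;     /* 0 is neutral for addition */
      int prod = 1, extra_prod = 1;   /* 1 is neutral for multiplication */

      /* Two accumulators each take every other element, standing in
         for two vector lanes or copies.  */
      for (int i = 0; i < 4; i += 2)
        {
          sum += a[i];
          extra_sum += a[i + 1];
          prod *= a[i];
          extra_prod *= a[i + 1];
        }

      printf ("%d %d\n", sum + extra_sum, prod * extra_prod);
      /* prints 24 945, the same values one scalar accumulator yields */
      return 0;
    }
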
   if (STMT_VINFO_DATA_REF (stmt_vinfo))
     {
@@ -1499,6 +1708,19 @@ vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
           else
             op = gimple_op (stmt, op_num + 1);

+          if (reduc_index != -1)
+            {
+              struct loop *loop = (gimple_bb (stmt))->loop_father;
+              gimple def_stmt = SSA_NAME_DEF_STMT (op);
+
+              gcc_assert (loop);
+              /* Get the def before the loop.  */
+              op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
+                                          loop_preheader_edge (loop));
+              if (j != (number_of_copies - 1) && neutral_op)
+                op = neutral_op;
+            }
+
           /* Create 'vect_ = {op0,op1,...,opn}'.  */
           t = tree_cons (NULL_TREE, op, t);
@@ -1536,8 +1758,25 @@ vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
      to replicate the vectors.  */
   while (number_of_vectors > VEC_length (tree, *vec_oprnds))
     {
-      for (i = 0; VEC_iterate (tree, *vec_oprnds, i, vop) && i < vec_num; i++)
-        VEC_quick_push (tree, *vec_oprnds, vop);
+      tree neutral_vec = NULL;
+
+      if (neutral_op)
+        {
+          if (!neutral_vec)
+            {
+              t = NULL;
+              for (i = 0; i < (unsigned) nunits; i++)
+                t = tree_cons (NULL_TREE, neutral_op, t);
+              neutral_vec = build_vector (vector_type, t);
+            }
+
+          VEC_quick_push (tree, *vec_oprnds, neutral_vec);
+        }
+      else
+        {
+          for (i = 0; VEC_iterate (tree, *vec_oprnds, i, vop) && i < vec_num; i++)
+            VEC_quick_push (tree, *vec_oprnds, vop);
+        }
     }
 }

@@ -1576,7 +1815,7 @@ vect_get_slp_vect_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds)

 void
 vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0,
-                   VEC (tree,heap) **vec_oprnds1)
+                   VEC (tree,heap) **vec_oprnds1, int reduc_index)
 {
   gimple first_stmt;
   enum tree_code code;
@@ -1607,19 +1846,26 @@ vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0,
   *vec_oprnds0 = VEC_alloc (tree, heap, number_of_vects);

   /* SLP_NODE corresponds either to a group of stores or to a group of
-     unary/binary operations.  We don't call this function for loads.  */
-  if (SLP_TREE_LEFT (slp_node))
+     unary/binary operations.  We don't call this function for loads.
+     For reduction defs we call vect_get_constant_vectors(), since we are
+     looking for initial loop invariant values.  */
+  if (SLP_TREE_LEFT (slp_node) && reduc_index == -1)
     /* The defs are already vectorized.  */
     vect_get_slp_vect_defs (SLP_TREE_LEFT (slp_node), vec_oprnds0);
   else
     /* Build vectors from scalar defs.  */
-    vect_get_constant_vectors (slp_node, vec_oprnds0, 0, number_of_vects);
+    vect_get_constant_vectors (slp_node, vec_oprnds0, 0, number_of_vects,
+                               reduc_index);

   if (STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)))
     /* Since we don't call this function with loads, this is a group of
        stores.  */
     return;

+  /* For reductions, we only need initial values.  */
+  if (reduc_index != -1)
+    return;
+
   code = gimple_assign_rhs_code (first_stmt);
   if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS || !vec_oprnds1)
     return;
@@ -1638,7 +1884,7 @@ vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0,
     vect_get_slp_vect_defs (SLP_TREE_RIGHT (slp_node), vec_oprnds1);
   else
     /* Build vectors from scalar defs.  */
-    vect_get_constant_vectors (slp_node, vec_oprnds1, 1, number_of_vects);
+    vect_get_constant_vectors (slp_node, vec_oprnds1, 1, number_of_vects, -1);
 }
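
Putting the pieces together: the first vector copy carries the initial
value taken from the loop-preheader PHI argument, and every additional
copy is a vector of neutral elements, so the lane-wise combination in the
epilogue still reproduces the scalar result.  A standalone C sketch of
that layout for a summation, with 4-lane "vectors" modeled as arrays:

    #include <stdio.h>

    int
    main (void)
    {
      int a[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
      int init = 10;                    /* def from before the loop */

      int vec0[4] = { init, 0, 0, 0 };  /* initial value plus neutrals */
      int vec1[4] = { 0, 0, 0, 0 };     /* extra copy: all neutral */

      for (int i = 0; i < 4; i++)
        {
          vec0[i] += a[i];
          vec1[i] += a[i + 4];
        }

      int result = 0;                   /* epilogue: combine all lanes */
      for (int i = 0; i < 4; i++)
        result += vec0[i] + vec1[i];

      printf ("%d\n", result);          /* prints 46 = 10 + sum(1..8) */
      return 0;
    }
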
@@ -2027,22 +2273,7 @@ vect_schedule_slp_instance (slp_tree node, slp_instance instance,
   si = gsi_for_stmt (stmt);
   is_store = vect_transform_stmt (stmt, &si, &strided_store, node, instance);
-  if (is_store)
-    {
-      if (DR_GROUP_FIRST_DR (stmt_info))
-        /* If IS_STORE is TRUE, the vectorization of the
-           interleaving chain was completed - free all the stores in
-           the chain.  */
-        vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
-      else
-        /* FORNOW: SLP originates only from strided stores.  */
-        gcc_unreachable ();
-
-      return true;
-    }
-
-  /* FORNOW: SLP originates only from strided stores.  */
-  return false;
+  return is_store;
 }

@@ -2075,6 +2306,26 @@ vect_schedule_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
         fprintf (vect_dump, "vectorizing stmts using SLP.");
     }

+  for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
+    {
+      slp_tree root = SLP_INSTANCE_TREE (instance);
+      gimple store;
+      unsigned int j;
+      gimple_stmt_iterator gsi;
+
+      for (j = 0; VEC_iterate (gimple, SLP_TREE_SCALAR_STMTS (root), j, store)
+                  && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
+        {
+          if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
+            break;
+
+          /* Free the attached stmt_vec_info and remove the stmt.  */
+          gsi = gsi_for_stmt (store);
+          gsi_remove (&gsi, true);
+          free_stmt_vec_info (store);
+        }
+    }
+
   return is_store;
 }
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 4868f73e684..988749b792f 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -1134,7 +1134,7 @@ vect_get_vec_defs (tree op0, tree op1, gimple stmt,
                    slp_tree slp_node)
 {
   if (slp_node)
-    vect_get_slp_defs (slp_node, vec_oprnds0, vec_oprnds1);
+    vect_get_slp_defs (slp_node, vec_oprnds0, vec_oprnds1, -1);
   else
     {
       tree vec_oprnd;
@@ -2519,7 +2519,7 @@ vectorizable_type_demotion (gimple stmt, gimple_stmt_iterator *gsi,
     {
       /* Handle uses.  */
       if (slp_node)
-        vect_get_slp_defs (slp_node, &vec_oprnds0, NULL);
+        vect_get_slp_defs (slp_node, &vec_oprnds0, NULL, -1);
       else
         {
           VEC_free (tree, heap, vec_oprnds0);
@@ -2819,7 +2819,7 @@ vectorizable_type_promotion (gimple stmt, gimple_stmt_iterator *gsi,
       if (j == 0)
         {
           if (slp_node)
-            vect_get_slp_defs (slp_node, &vec_oprnds0, &vec_oprnds1);
+            vect_get_slp_defs (slp_node, &vec_oprnds0, &vec_oprnds1, -1);
           else
             {
               vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
@@ -3105,7 +3105,7 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
           if (slp)
             {
               /* Get vectorized arguments for SLP_NODE.  */
-              vect_get_slp_defs (slp_node, &vec_oprnds, NULL);
+              vect_get_slp_defs (slp_node, &vec_oprnds, NULL, -1);

               vec_oprnd = VEC_index (tree, vec_oprnds, 0);
             }
@@ -4049,7 +4049,7 @@ vect_analyze_stmt (gimple stmt, bool *need_to_vectorize, slp_tree node)
             || vectorizable_load (stmt, NULL, NULL, NULL, NULL)
             || vectorizable_call (stmt, NULL, NULL)
             || vectorizable_store (stmt, NULL, NULL, NULL)
-            || vectorizable_reduction (stmt, NULL, NULL)
+            || vectorizable_reduction (stmt, NULL, NULL, NULL)
             || vectorizable_condition (stmt, NULL, NULL, NULL, 0));
   else
     {
@@ -4201,8 +4201,7 @@ vect_transform_stmt (gimple stmt, gimple_stmt_iterator *gsi,
       break;

     case reduc_vec_info_type:
-      gcc_assert (!slp_node);
-      done = vectorizable_reduction (stmt, gsi, &vec_stmt);
+      done = vectorizable_reduction (stmt, gsi, &vec_stmt, slp_node);
       gcc_assert (done);
       break;

diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 52b2a7ec59f..bd43a4bc173 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -242,6 +242,9 @@ typedef struct _loop_vec_info {
   /* The unrolling factor needed to SLP the loop.  In case of that pure SLP
      is applied to the loop, i.e., no unrolling is needed, this is 1.  */
   unsigned slp_unrolling_factor;
+
+  /* Reduction cycles detected in the loop.  Used in loop-aware SLP.  */
+  VEC (gimple, heap) *reductions;
 } *loop_vec_info;

 /* Access Functions.  */
@@ -266,6 +269,7 @@ typedef struct _loop_vec_info {
 #define LOOP_VINFO_STRIDED_STORES(L)       (L)->strided_stores
 #define LOOP_VINFO_SLP_INSTANCES(L)        (L)->slp_instances
 #define LOOP_VINFO_SLP_UNROLLING_FACTOR(L) (L)->slp_unrolling_factor
+#define LOOP_VINFO_REDUCTIONS(L)           (L)->reductions

 #define LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT(L) \
   VEC_length (gimple, (L)->may_misalign_stmts) > 0
@@ -844,7 +848,8 @@ extern void vect_transform_loop (loop_vec_info);
 extern loop_vec_info vect_analyze_loop_form (struct loop *);
 extern bool vectorizable_live_operation (gimple, gimple_stmt_iterator *,
                                          gimple *);
-extern bool vectorizable_reduction (gimple, gimple_stmt_iterator *, gimple *);
+extern bool vectorizable_reduction (gimple, gimple_stmt_iterator *, gimple *,
+                                    slp_tree);
 extern bool vectorizable_induction (gimple, gimple_stmt_iterator *, gimple *);
 extern int vect_estimate_min_profitable_iters (loop_vec_info);
 extern tree get_initial_def_for_reduction (gimple, tree, tree *);
@@ -862,7 +867,7 @@ extern bool vect_analyze_slp (loop_vec_info, bb_vec_info);
 extern void vect_make_slp_decision (loop_vec_info);
 extern void vect_detect_hybrid_slp (loop_vec_info);
 extern void vect_get_slp_defs (slp_tree, VEC (tree,heap) **,
-                               VEC (tree,heap) **);
+                               VEC (tree,heap) **, int);
 extern LOC find_bb_location (basic_block);
 extern bb_vec_info vect_slp_analyze_bb (basic_block);
 extern void vect_slp_transform_bb (basic_block);