Diffstat (limited to 'gcc')
-rw-r--r--  gcc/ChangeLog                              |  37
-rw-r--r--  gcc/testsuite/ChangeLog                    |  11
-rw-r--r--  gcc/testsuite/gcc.dg/vect/pr37027.c        |  37
-rw-r--r--  gcc/testsuite/gcc.dg/vect/slp-reduc-1.c    |  49
-rw-r--r--  gcc/testsuite/gcc.dg/vect/slp-reduc-2.c    |  44
-rw-r--r--  gcc/testsuite/gcc.dg/vect/slp-reduc-3.c    |  62
-rw-r--r--  gcc/testsuite/gcc.dg/vect/slp-reduc-4.c    |  60
-rw-r--r--  gcc/testsuite/gcc.dg/vect/slp-reduc-5.c    |  49
-rw-r--r--  gcc/testsuite/gcc.dg/vect/slp-reduc-6.c    |  50
-rw-r--r--  gcc/testsuite/lib/target-supports.exp      |  19
-rw-r--r--  gcc/tree-vect-loop.c                       | 978
-rw-r--r--  gcc/tree-vect-patterns.c                   |  10
-rw-r--r--  gcc/tree-vect-slp.c                        | 399
-rw-r--r--  gcc/tree-vect-stmts.c                      |  13
-rw-r--r--  gcc/tree-vectorizer.h                      |   9
15 files changed, 1363 insertions, 464 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 2da24e12ccb..98c80045f99 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,40 @@
+2010-04-19 Ira Rosen <irar@il.ibm.com>
+
+ PR tree-optimization/37027
+ * tree-vectorizer.h (struct _loop_vec_info): Add new field reductions
+ and macro to access it.
+ (vectorizable_reduction): Add argument.
+ (vect_get_slp_defs): Likewise.
+ * tree-vect-loop.c (vect_analyze_scalar_cycles_1): Collect reduction
+ statements for possible use in SLP.
+ (new_loop_vec_info): Initialize LOOP_VINFO_REDUCTIONS.
+ (destroy_loop_vec_info): Free LOOP_VINFO_REDUCTIONS.
+ (vect_create_epilog_for_reduction): Handle SLP. Modify documentation,
+ add new argument.
+ (vectorizable_reduction): Likewise.
+ * tree-vect-stmts.c (vect_get_vec_defs): Update call to
+ vect_get_slp_defs.
+ (vectorizable_type_demotion, vectorizable_type_promotion,
+ vectorizable_store): Likewise.
+ (vect_analyze_stmt): Update call to vectorizable_reduction.
+ (vect_transform_stmt): Likewise.
+ * tree-vect-slp.c (vect_get_and_check_slp_defs): Handle reduction.
+ (vect_build_slp_tree): Fix indentation. Check that there are no loads
+ from different interleaving chains in same node.
+ (vect_slp_rearrange_stmts): New function.
+ (vect_supported_load_permutation_p): Allow load permutations for
+ reductions. Call vect_slp_rearrange_stmts() to rearrange statements
+ inside SLP nodes if necessary.
+ (vect_analyze_slp_instance): Handle reductions.
+ (vect_analyze_slp): Try to build SLP instances originating from groups
+ of reductions.
+ (vect_detect_hybrid_slp_stmts): Skip reduction statements.
+ (vect_get_constant_vectors): Create initial vectors for reductions
+ according to reduction code. Add new argument.
+ (vect_get_slp_defs): Add new argument, pass it to
+ vect_get_constant_vectors.
+ (vect_schedule_slp_instance): Remove SLP tree root statements.
+
2010-04-19 Jakub Jelinek <jakub@redhat.com>
* tree.h (ENUM_IS_SCOPED): Define.
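For orientation, a minimal sketch (not from the patch) of the loop shape this change targets: a small group of independent reductions fed from interleaved data, which loop-aware SLP can now advance with one vector statement per copy of the group. Assuming a 4-lane integer vector unit:

unsigned int in[64];

void
sum_pair (unsigned int *even_sum, unsigned int *odd_sum)
{
  unsigned int s0 = 0, s1 = 0;
  int i;

  /* Two independent reductions over a stride-2 (interleaved) access
     pattern: <s0, s1> forms one SLP group, and in[2*i] / in[2*i+1]
     form one interleaving chain that a single vector load can feed.  */
  for (i = 0; i < 32; i++)
    {
      s0 += in[2*i];
      s1 += in[2*i + 1];
    }

  *even_sum = s0;
  *odd_sum = s1;
}

The new tests below (pr37027.c and slp-reduc-1.c through slp-reduc-6.c) exercise this shape with sums, differences of two arrays, and max reductions.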
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 8ffc442caeb..868ce20d31e 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,14 @@
+2010-04-19 Ira Rosen <irar@il.ibm.com>
+
+ PR tree-optimization/37027
+ * lib/target-supports.exp
+ (check_effective_target_vect_widen_sum_hi_to_si_pattern): New.
+ * gcc.dg/vect/pr37027.c: New test.
+ * gcc.dg/vect/slp-reduc-1.c, gcc.dg/vect/slp-reduc-2.c,
+ gcc.dg/vect/slp-reduc-3.c, gcc.dg/vect/slp-reduc-4.c,
+ gcc.dg/vect/slp-reduc-5.c, gcc.dg/vect/slp-reduc-6.c,
+ gcc.dg/vect/vect-complex-6.c: Likewise.
+
2010-04-19 Jakub Jelinek <jakub@redhat.com>
* g++.dg/debug/dwarf2/enum1.C: New test.
diff --git a/gcc/testsuite/gcc.dg/vect/pr37027.c b/gcc/testsuite/gcc.dg/vect/pr37027.c
new file mode 100644
index 00000000000..dcfed348d11
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr37027.c
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+
+struct mystr
+{
+ int f1;
+ int f2;
+};
+
+struct mystr a[16];
+struct mystr b[16];
+int res1, res2;
+
+
+void
+foo (void)
+{
+ int i;
+ int sum1;
+ int sum2;
+
+ for (i = 0; i < 16; i++)
+ {
+ sum1 += a[i].f1 + b[i].f1;
+ sum2 += a[i].f2 + b[i].f2;
+ }
+
+ res1 = sum1;
+ res2 = sum2;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail vect_no_int_add } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail vect_no_int_add } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
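A note on why pr37027.c is an SLP candidate: the two int fields of struct mystr occupy alternating words (no padding), so the pair of sums is equivalent to a pair of stride-2 reductions over flat arrays, the same shape as the slp-reduc-* tests that follow. A hypothetical flattened version (identifiers invented for illustration):

int flat_a[32], flat_b[32];     /* same layout as struct mystr a[16], b[16] */
int flat_res1, flat_res2;

void
foo_flat (void)
{
  int i, sum1 = 0, sum2 = 0;

  for (i = 0; i < 16; i++)
    {
      sum1 += flat_a[2*i] + flat_b[2*i];           /* a[i].f1 + b[i].f1 */
      sum2 += flat_a[2*i + 1] + flat_b[2*i + 1];   /* a[i].f2 + b[i].f2 */
    }

  flat_res1 = sum1;
  flat_res2 = sum2;
}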
diff --git a/gcc/testsuite/gcc.dg/vect/slp-reduc-1.c b/gcc/testsuite/gcc.dg/vect/slp-reduc-1.c
new file mode 100644
index 00000000000..95faba8e9d4
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-reduc-1.c
@@ -0,0 +1,49 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define N 16
+
+unsigned int ub[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45};
+unsigned int uc[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
+
+/* Vectorization of reduction using loop-aware SLP. */
+
+__attribute__ ((noinline))
+int main1 (int n, int res0, int res1, int res2, int res3)
+{
+ int i;
+ unsigned int udiff0 = 5, udiff1 = 10, udiff2 = 20, udiff3 = 30;
+
+ for (i = 0; i < n; i++) {
+ udiff3 += (ub[4*i + 3] - uc[4*i + 3]);
+ udiff2 += (ub[4*i + 2] - uc[4*i + 2]);
+ udiff1 += (ub[4*i + 1] - uc[4*i + 1]);
+ udiff0 += (ub[4*i] - uc[4*i]);
+ }
+
+ /* Check results: */
+ if (udiff0 != res0
+ || udiff1 != res1
+ || udiff2 != res2
+ || udiff3 != res3)
+ abort ();
+
+ return 0;
+}
+
+int main (void)
+{
+ check_vect ();
+
+ main1 (N/4, 53, 66, 84, 102);
+ main1 (N/4 - 1, 29, 40, 56, 72);
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail vect_no_int_add } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail vect_no_int_add } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/testsuite/gcc.dg/vect/slp-reduc-2.c b/gcc/testsuite/gcc.dg/vect/slp-reduc-2.c
new file mode 100644
index 00000000000..cb59c8c07ea
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-reduc-2.c
@@ -0,0 +1,44 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define N 16
+
+unsigned int ub[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45};
+unsigned int uc[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
+
+/* Vectorization of reduction using loop-aware SLP (with unrolling). */
+
+__attribute__ ((noinline))
+int main1 (int n, int res0, int res1, int res2, int res3)
+{
+ int i;
+ unsigned int udiff0 = 5, udiff1 = 10;
+
+ for (i = 0; i < n; i++) {
+ udiff1 += (ub[2*i + 1] - uc[2*i + 1]);
+ udiff0 += (ub[2*i] - uc[2*i]);
+ }
+
+ /* Check results: */
+ if (udiff0 != res0
+ || udiff1 != res1)
+ abort ();
+
+ return 0;
+}
+
+int main (void)
+{
+ check_vect ();
+
+ main1 (N/2, 117, 138, 84, 102);
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail vect_no_int_add } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail vect_no_int_add } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
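A note on the "(with unrolling)" in the comment above: the group here has only two reduction statements, so with a 4-lane vector type one vector statement covers two scalar iterations; conceptually the scalar loop is unrolled by two before the group is vectorized. A sketch of that unrolling, assuming n is even (it is N/2 = 8 in this test) and using a hypothetical helper name:

#define N 16
extern unsigned int ub[N], uc[N];

void
main1_unrolled (int n, unsigned int *d0, unsigned int *d1)
{
  int i;
  unsigned int udiff0 = 5, udiff1 = 10;

  for (i = 0; i < n; i += 2)
    {
      /* First copy of the two-statement reduction group.  */
      udiff1 += ub[2*i + 1] - uc[2*i + 1];
      udiff0 += ub[2*i] - uc[2*i];
      /* Second copy, i.e. the next scalar iteration.  */
      udiff1 += ub[2*i + 3] - uc[2*i + 3];
      udiff0 += ub[2*i + 2] - uc[2*i + 2];
    }

  *d0 = udiff0;
  *d1 = udiff1;
}

The epilog then has four scalar partial results to fold back into two, which is the "combine elements modulo GROUP_SIZE" case handled in vect_create_epilog_for_reduction further down in this patch.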
diff --git a/gcc/testsuite/gcc.dg/vect/slp-reduc-3.c b/gcc/testsuite/gcc.dg/vect/slp-reduc-3.c
new file mode 100644
index 00000000000..3220d3912ba
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-reduc-3.c
@@ -0,0 +1,62 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 64
+
+#define DOT1 21834
+#define DOT2 21876
+
+unsigned short X[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
+unsigned short Y[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
+
+/* short->short->int dot product.
+ Not detected as a dot-product pattern.
+ Requires support for non-widening multiplication and widening-summation.
+ Vectorized with loop-aware SLP. */
+__attribute__ ((noinline)) unsigned int
+foo1(int len, unsigned int *result1, unsigned int *result2)
+{
+ int i;
+ unsigned int res1 = 10, res2 = 20;
+ unsigned short prod;
+
+ for (i=0; i<len; i++) {
+ prod = X[2*i] * Y[2*i];
+ res1 += prod;
+ prod = X[2*i+1] * Y[2*i+1];
+ res2 += prod;
+ }
+
+ *result1 = res1;
+ *result2 = res2;
+
+ return 0;
+}
+
+int main (void)
+{
+ unsigned int dot1, dot2;
+ unsigned short i;
+
+ check_vect ();
+
+ for (i=0; i<N; i++) {
+ X[i] = i;
+ Y[i] = 64-i;
+ }
+
+ foo1 (N/2, &dot1, &dot2);
+
+ if (dot1 != DOT1 || dot2 != DOT2)
+ abort ();
+
+ return 0;
+}
+
+/* The initialization loop in main also gets vectorized. */
+/* { dg-final { scan-tree-dump-times "vect_recog_dot_prod_pattern: detected" 1 "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { target { vect_short_mult && vect_widen_sum_hi_to_si } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail { vect_widen_sum_hi_to_si_pattern } } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-reduc-4.c b/gcc/testsuite/gcc.dg/vect/slp-reduc-4.c
new file mode 100644
index 00000000000..ad5b3ce0700
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-reduc-4.c
@@ -0,0 +1,60 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define N 128
+
+unsigned int uc[N];
+
+/* Vectorization of reduction using loop-aware SLP. */
+
+__attribute__ ((noinline))
+int main1 (int n, int res0, int res1, int res2, int res3, int res4, int res5, int res6, int res7)
+{
+ int i;
+ unsigned int max0 = 5, max1 = 10, max2 = 20, max3 = 30, max4 = 2, max5 = 13, max6 = 7, max7 = 313;
+
+ for (i = 0; i < n; i++) {
+ max2 = max2 < uc[8*i+2] ? uc[8*i+2] : max2;
+ max3 = max3 < uc[8*i+3] ? uc[8*i+3] : max3;
+ max1 = max1 < uc[8*i+1] ? uc[8*i+1] : max1;
+ max7 = max7 < uc[8*i+7] ? uc[8*i+7] : max7;
+ max6 = max6 < uc[8*i+6] ? uc[8*i+6] : max6;
+ max0 = max0 < uc[8*i] ? uc[8*i] : max0;
+ max4 = max4 < uc[8*i+4] ? uc[8*i+4] : max4;
+ max5 = max5 < uc[8*i+5] ? uc[8*i+5] : max5;
+ }
+
+ /* Check results: */
+ if (max0 != res0
+ || max1 != res1
+ || max2 != res2
+ || max3 != res3
+ || max4 != res4
+ || max5 != res5
+ || max6 != res6
+ || max7 != res7)
+ abort ();
+
+ return 0;
+}
+
+int main (void)
+{
+ int i;
+
+ check_vect ();
+
+ for (i = 0; i < N; i++)
+ uc[i] = i+3;
+
+ main1 (N/8, 123, 124, 125, 126, 127, 128, 129, 313);
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail vect_no_int_max } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail vect_no_int_max } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/testsuite/gcc.dg/vect/slp-reduc-5.c b/gcc/testsuite/gcc.dg/vect/slp-reduc-5.c
new file mode 100644
index 00000000000..0974b6642d8
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-reduc-5.c
@@ -0,0 +1,49 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define N 128
+
+int c[N];
+
+/* Vectorization of reduction using loop-aware SLP. */
+
+__attribute__ ((noinline))
+int main1 (int n, int res0, int res1)
+{
+ int i;
+ int max0 = -100, max1 = -313;
+
+ for (i = 0; i < n; i++) {
+ max1 = max1 < c[2*i+1] ? c[2*i+1] : max1;
+ max0 = max0 < c[2*i] ? c[2*i] : max0;
+ }
+
+ /* Check results: */
+ if (max0 != res0
+ || max1 != res1)
+ abort ();
+
+ return 0;
+}
+
+int main (void)
+{
+ int i;
+
+ check_vect ();
+
+ for (i = 0; i < N; i++)
+ c[i] = (i+3) * -1;
+
+ c[0] = c[1] = -100;
+ main1 (N/2, -5, -6);
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { xfail vect_no_int_max } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail vect_no_int_max } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/testsuite/gcc.dg/vect/slp-reduc-6.c b/gcc/testsuite/gcc.dg/vect/slp-reduc-6.c
new file mode 100644
index 00000000000..c69251a76e2
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-reduc-6.c
@@ -0,0 +1,50 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define N 128
+
+int a[N], b[N];
+
+/* Vectorization of reduction. Loop-aware SLP is not possible, because of
+ different arrays. */
+
+__attribute__ ((noinline))
+int main1 (int n, int res0, int res1)
+{
+ int i;
+ int sum0 = 0, sum1 = 0;
+
+ for (i = 0; i < n; i++) {
+ sum1 += a[2*i];
+ sum0 += b[2*i];
+ }
+
+ /* Check results: */
+ if (sum0 != res0
+ || sum1 != res1)
+ abort ();
+
+ return 0;
+}
+
+int main (void)
+{
+ int i;
+
+ check_vect ();
+
+ for (i = 0; i < N; i++)
+ a[i] = b[i] = i;
+
+ main1 (N/2, 4032, 4032);
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { xfail vect_no_int_add } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" } } */
+/* { dg-final { scan-tree-dump-times "different interleaving chains in one node" 1 "vect" { target { ! vect_no_int_add } } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 126ae380fe7..e91c0331516 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -2105,6 +2105,25 @@ proc check_effective_target_vect_perm { } {
return $et_vect_perm_saved
}
+# Return 1 if the target plus current options supports a vector
+# widening summation of *short* args into *int* result that the
+# vectorizer handles via the widen_sum pattern, 0 otherwise.
+#
+# This won't change for different subtargets so cache the result.
+
+proc check_effective_target_vect_widen_sum_hi_to_si_pattern { } {
+ global et_vect_widen_sum_hi_to_si_pattern
+
+ if [info exists et_vect_widen_sum_hi_to_si_pattern_saved] {
+ verbose "check_effective_target_vect_widen_sum_hi_to_si_pattern: using cached result" 2
+ } else {
+ set et_vect_widen_sum_hi_to_si_pattern_saved 0
+ if { [istarget powerpc*-*-*] } {
+ set et_vect_widen_sum_hi_to_si_pattern_saved 1
+ }
+ }
+ verbose "check_effective_target_vect_widen_sum_hi_to_si_pattern: returning $et_vect_widen_sum_hi_to_si_pattern_saved" 2
+ return $et_vect_widen_sum_hi_to_si_pattern_saved
+}
# Return 1 if the target plus current options supports a vector
# widening summation of *short* args into *int* result, 0 otherwise.
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 809f3e15a02..e6e9008ea37 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -545,6 +545,11 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
vect_reduction_def;
+ /* Store the reduction cycles for possible vectorization in
+ loop-aware SLP. */
+ VEC_safe_push (gimple, heap,
+ LOOP_VINFO_REDUCTIONS (loop_vinfo),
+ reduc_stmt);
}
}
}
@@ -745,6 +750,7 @@ new_loop_vec_info (struct loop *loop)
VEC_alloc (ddr_p, heap,
PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10);
+ LOOP_VINFO_REDUCTIONS (res) = VEC_alloc (gimple, heap, 10);
LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
@@ -835,6 +841,7 @@ destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));
+ VEC_free (gimple, heap, LOOP_VINFO_REDUCTIONS (loop_vinfo));
free (loop_vinfo);
loop->aux = NULL;
@@ -1223,7 +1230,6 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo)
if ((STMT_VINFO_RELEVANT_P (stmt_info)
|| VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
&& !PURE_SLP_STMT (stmt_info))
-
/* STMT needs both SLP and loop-based vectorization. */
only_slp_in_loop = false;
}
@@ -2860,28 +2866,33 @@ get_initial_def_for_reduction (gimple stmt, tree init_val,
/* Function vect_create_epilog_for_reduction
Create code at the loop-epilog to finalize the result of a reduction
- computation.
-
- VECT_DEF is a vector of partial results.
- REDUC_CODE is the tree-code for the epilog reduction.
+ computation.
+
+ VECT_DEFS is the list of vectors of partial results, i.e., the lhs's of the vector
+ reduction statements.
+ STMT is the scalar reduction stmt that is being vectorized.
NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
number of elements that we can fit in a vectype (nunits). In this case
we have to generate more than one vector stmt - i.e - we need to "unroll"
the vector stmt by a factor VF/nunits. For more details see documentation
in vectorizable_operation.
- STMT is the scalar reduction stmt that is being vectorized.
- REDUCTION_PHI is the phi-node that carries the reduction computation.
- REDUC_INDEX is the index of the operand in the right hand side of the
+ REDUC_CODE is the tree-code for the epilog reduction.
+ REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
+ computation.
+ REDUC_INDEX is the index of the operand in the right hand side of the
statement that is defined by REDUCTION_PHI.
DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
+ SLP_NODE is an SLP node containing a group of reduction statements. The
+ first one in this group is STMT.
This function:
- 1. Creates the reduction def-use cycle: sets the arguments for
- REDUCTION_PHI:
+ 1. Creates the reduction def-use cycles: sets the arguments for
+ REDUCTION_PHIS:
The loop-entry argument is the vectorized initial-value of the reduction.
- The loop-latch argument is VECT_DEF - the vector of partial sums.
- 2. "Reduces" the vector of partial results VECT_DEF into a single result,
- by applying the operation specified by REDUC_CODE if available, or by
+ The loop-latch argument is taken from VECT_DEFS - the vector of partial
+ sums.
+ 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
+ by applying the operation specified by REDUC_CODE if available, or by
other means (whole-vector shifts or a scalar loop).
The function also creates a new phi node at the loop exit to preserve
loop-closed form, as illustrated below.
@@ -2914,12 +2925,11 @@ get_initial_def_for_reduction (gimple stmt, tree init_val,
*/
static void
-vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
- int ncopies,
- enum tree_code reduc_code,
- gimple reduction_phi,
- int reduc_index,
- bool double_reduc)
+vect_create_epilog_for_reduction (VEC (tree, heap) *vect_defs, gimple stmt,
+ int ncopies, enum tree_code reduc_code,
+ VEC (gimple, heap) *reduction_phis,
+ int reduc_index, bool double_reduc,
+ slp_tree slp_node)
{
stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
stmt_vec_info prev_phi_info;
@@ -2933,32 +2943,37 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
gimple new_phi = NULL, phi;
gimple_stmt_iterator exit_gsi;
tree vec_dest;
- tree new_temp = NULL_TREE;
- tree new_name;
+ tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
gimple epilog_stmt = NULL;
- tree new_scalar_dest, new_dest;
+ enum tree_code code = gimple_assign_rhs_code (stmt);
gimple exit_phi;
tree bitsize, bitpos;
- enum tree_code code = gimple_assign_rhs_code (stmt);
- tree adjustment_def;
- tree vec_initial_def, def;
- tree orig_name;
+ tree adjustment_def = NULL;
+ tree vec_initial_def = NULL;
+ tree reduction_op, expr, def;
+ tree orig_name, scalar_result;
imm_use_iterator imm_iter;
use_operand_p use_p;
bool extract_scalar_result = false;
- tree reduction_op, expr;
- gimple orig_stmt;
- gimple use_stmt;
+ gimple use_stmt, orig_stmt, reduction_phi = NULL;
bool nested_in_vect_loop = false;
- VEC(gimple,heap) *phis = NULL;
+ VEC (gimple, heap) *new_phis = NULL;
enum vect_def_type dt = vect_unknown_def_type;
int j, i;
+ VEC (tree, heap) *scalar_results = NULL;
+ int group_size = 1, k, ratio;
+ VEC (tree, heap) *vec_initial_defs = NULL;
+ VEC (gimple, heap) *phis;
+
+ if (slp_node)
+ group_size = VEC_length (gimple, SLP_TREE_SCALAR_STMTS (slp_node));
if (nested_in_vect_loop_p (loop, stmt))
{
outer_loop = loop;
loop = loop->inner;
nested_in_vect_loop = true;
+ gcc_assert (!slp_node);
}
switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
@@ -2983,47 +2998,80 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
gcc_assert (vectype);
mode = TYPE_MODE (vectype);
- /*** 1. Create the reduction def-use cycle ***/
+ /* 1. Create the reduction def-use cycle:
+ Set the arguments of REDUCTION_PHIS, i.e., transform
+
+ loop:
+ vec_def = phi <null, null> # REDUCTION_PHI
+ VECT_DEF = vector_stmt # vectorized form of STMT
+ ...
- /* For the case of reduction, vect_get_vec_def_for_operand returns
- the scalar def before the loop, that defines the initial value
- of the reduction variable. */
- vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
- &adjustment_def);
+ into:
- phi = reduction_phi;
- def = vect_def;
- for (j = 0; j < ncopies; j++)
+ loop:
+ vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
+ VECT_DEF = vector_stmt # vectorized form of STMT
+ ...
+
+ (in case of SLP, do it for all the phis). */
+
+ /* Get the loop-entry arguments. */
+ if (slp_node)
+ vect_get_slp_defs (slp_node, &vec_initial_defs, NULL, reduc_index);
+ else
{
- /* 1.1 set the loop-entry arg of the reduction-phi: */
- add_phi_arg (phi, vec_initial_def, loop_preheader_edge (loop),
- UNKNOWN_LOCATION);
+ vec_initial_defs = VEC_alloc (tree, heap, 1);
+ /* For the case of reduction, vect_get_vec_def_for_operand returns
+ the scalar def before the loop, that defines the initial value
+ of the reduction variable. */
+ vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
+ &adjustment_def);
+ VEC_quick_push (tree, vec_initial_defs, vec_initial_def);
+ }
- /* 1.2 set the loop-latch arg for the reduction-phi: */
- if (j > 0)
- def = vect_get_vec_def_for_stmt_copy (dt, def);
- add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
+ /* Set phi nodes arguments. */
+ for (i = 0; VEC_iterate (gimple, reduction_phis, i, phi); i++)
+ {
+ tree vec_init_def = VEC_index (tree, vec_initial_defs, i);
+ tree def = VEC_index (tree, vect_defs, i);
+ for (j = 0; j < ncopies; j++)
+ {
+ /* Set the loop-entry arg of the reduction-phi. */
+ add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
+ UNKNOWN_LOCATION);
- if (vect_print_dump_info (REPORT_DETAILS))
- {
- fprintf (vect_dump, "transform reduction: created def-use cycle: ");
- print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
- fprintf (vect_dump, "\n");
- print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0, TDF_SLIM);
- }
+ /* Set the loop-latch arg for the reduction-phi. */
+ if (j > 0)
+ def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
- phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
+ add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
+
+ if (vect_print_dump_info (REPORT_DETAILS))
+ {
+ fprintf (vect_dump, "transform reduction: created def-use"
+ " cycle: ");
+ print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
+ fprintf (vect_dump, "\n");
+ print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0,
+ TDF_SLIM);
+ }
+
+ phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
+ }
}
- /*** 2. Create epilog code
- The reduction epilog code operates across the elements of the vector
- of partial results computed by the vectorized loop.
- The reduction epilog code consists of:
- step 1: compute the scalar result in a vector (v_out2)
- step 2: extract the scalar result (s_out3) from the vector (v_out2)
- step 3: adjust the scalar result (s_out3) if needed.
+ VEC_free (tree, heap, vec_initial_defs);
+
+ /* 2. Create epilog code.
+ The reduction epilog code operates across the elements of the vector
+ of partial results computed by the vectorized loop.
+ The reduction epilog code consists of:
+
+ step 1: compute the scalar result in a vector (v_out2)
+ step 2: extract the scalar result (s_out3) from the vector (v_out2)
+ step 3: adjust the scalar result (s_out3) if needed.
- Step 1 can be accomplished using one the following three schemes:
+ Step 1 can be accomplished using one of the following three schemes:
(scheme 1) using reduc_code, if available.
(scheme 2) using whole-vector shifts, if available.
(scheme 3) using a scalar loop. In this case steps 1+2 above are
@@ -3038,29 +3086,33 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
s_out4 = adjust_result <s_out3> # step 3
(step 3 is optional, and steps 1 and 2 may be combined).
- Lastly, the uses of s_out0 are replaced by s_out4.
+ Lastly, the uses of s_out0 are replaced by s_out4. */
- ***/
- /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
- v_out1 = phi <v_loop> */
+ /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
+ v_out1 = phi <VECT_DEF>
+ Store them in NEW_PHIS. */
exit_bb = single_exit (loop)->dest;
- def = vect_def;
prev_phi_info = NULL;
- for (j = 0; j < ncopies; j++)
+ new_phis = VEC_alloc (gimple, heap, VEC_length (tree, vect_defs));
+ for (i = 0; VEC_iterate (tree, vect_defs, i, def); i++)
{
- phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
- set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
- if (j == 0)
- new_phi = phi;
- else
- {
- def = vect_get_vec_def_for_stmt_copy (dt, def);
- STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
- }
- SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
- prev_phi_info = vinfo_for_stmt (phi);
+ for (j = 0; j < ncopies; j++)
+ {
+ phi = create_phi_node (SSA_NAME_VAR (def), exit_bb);
+ set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
+ if (j == 0)
+ VEC_quick_push (gimple, new_phis, phi);
+ else
+ {
+ def = vect_get_vec_def_for_stmt_copy (dt, def);
+ STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
+ }
+
+ SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
+ prev_phi_info = vinfo_for_stmt (phi);
+ }
}
exit_gsi = gsi_after_labels (exit_bb);
@@ -3089,16 +3141,17 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
}
code = gimple_assign_rhs_code (orig_stmt);
+ /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
+ partial results are added and not subtracted. */
+ if (code == MINUS_EXPR)
+ code = PLUS_EXPR;
+
scalar_dest = gimple_assign_lhs (orig_stmt);
scalar_type = TREE_TYPE (scalar_dest);
+ scalar_results = VEC_alloc (tree, heap, group_size);
new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
bitsize = TYPE_SIZE (scalar_type);
- /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
- partial results are added and not subtracted. */
- if (code == MINUS_EXPR)
- code = PLUS_EXPR;
-
/* In case this is a reduction in an inner-loop while vectorizing an outer
loop - we don't need to extract a single scalar result at the end of the
inner-loop (unless it is double reduction, i.e., the use of reduction is
@@ -3108,28 +3161,21 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
if (nested_in_vect_loop && !double_reduc)
goto vect_finalize_reduction;
- /* The epilogue is created for the outer-loop, i.e., for the loop being
- vectorized. */
- if (double_reduc)
- loop = outer_loop;
-
- /* FORNOW */
- gcc_assert (ncopies == 1);
-
/* 2.3 Create the reduction code, using one of the three schemes described
- above. */
-
- if (reduc_code != ERROR_MARK)
+ above. In SLP we simply need to extract all the elements from the
+ vector (without reducing them), so we use scalar extraction (case 3). */
+ if (reduc_code != ERROR_MARK && !slp_node)
{
tree tmp;
/*** Case 1: Create:
- v_out2 = reduc_expr <v_out1> */
+ v_out2 = reduc_expr <v_out1> */
if (vect_print_dump_info (REPORT_DETAILS))
- fprintf (vect_dump, "Reduce using direct vector reduction.");
+ fprintf (vect_dump, "Reduce using direct vector reduction.");
vec_dest = vect_create_destination_var (scalar_dest, vectype);
+ new_phi = VEC_index (gimple, new_phis, 0);
tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi));
epilog_stmt = gimple_build_assign (vec_dest, tmp);
new_temp = make_ssa_name (vec_dest, epilog_stmt);
@@ -3148,142 +3194,182 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
tree vec_temp;
if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
- shift_code = VEC_RSHIFT_EXPR;
+ shift_code = VEC_RSHIFT_EXPR;
else
- have_whole_vector_shift = false;
+ have_whole_vector_shift = false;
/* Regardless of whether we have a whole vector shift, if we're
- emulating the operation via tree-vect-generic, we don't want
- to use it. Only the first round of the reduction is likely
- to still be profitable via emulation. */
+ emulating the operation via tree-vect-generic, we don't want
+ to use it. Only the first round of the reduction is likely
+ to still be profitable via emulation. */
/* ??? It might be better to emit a reduction tree code here, so that
- tree-vect-generic can expand the first round via bit tricks. */
+ tree-vect-generic can expand the first round via bit tricks. */
if (!VECTOR_MODE_P (mode))
- have_whole_vector_shift = false;
+ have_whole_vector_shift = false;
else
- {
- optab optab = optab_for_tree_code (code, vectype, optab_default);
- if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing)
- have_whole_vector_shift = false;
- }
-
- if (have_whole_vector_shift)
{
- /*** Case 2: Create:
- for (offset = VS/2; offset >= element_size; offset/=2)
- {
- Create: va' = vec_shift <va, offset>
- Create: va = vop <va, va'>
- } */
-
- if (vect_print_dump_info (REPORT_DETAILS))
- fprintf (vect_dump, "Reduce using vector shifts");
+ optab optab = optab_for_tree_code (code, vectype, optab_default);
+ if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing)
+ have_whole_vector_shift = false;
+ }
- vec_dest = vect_create_destination_var (scalar_dest, vectype);
- new_temp = PHI_RESULT (new_phi);
+ if (have_whole_vector_shift && !slp_node)
+ {
+ /*** Case 2: Create:
+ for (offset = VS/2; offset >= element_size; offset/=2)
+ {
+ Create: va' = vec_shift <va, offset>
+ Create: va = vop <va, va'>
+ } */
- for (bit_offset = vec_size_in_bits/2;
- bit_offset >= element_bitsize;
- bit_offset /= 2)
- {
- tree bitpos = size_int (bit_offset);
-
- epilog_stmt = gimple_build_assign_with_ops (shift_code, vec_dest,
- new_temp, bitpos);
- new_name = make_ssa_name (vec_dest, epilog_stmt);
- gimple_assign_set_lhs (epilog_stmt, new_name);
- gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
-
- epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
- new_name, new_temp);
- new_temp = make_ssa_name (vec_dest, epilog_stmt);
- gimple_assign_set_lhs (epilog_stmt, new_temp);
- gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
- }
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "Reduce using vector shifts");
+
+ vec_dest = vect_create_destination_var (scalar_dest, vectype);
+ new_phi = VEC_index (gimple, new_phis, 0);
+ new_temp = PHI_RESULT (new_phi);
+ for (bit_offset = vec_size_in_bits/2;
+ bit_offset >= element_bitsize;
+ bit_offset /= 2)
+ {
+ tree bitpos = size_int (bit_offset);
+
+ epilog_stmt = gimple_build_assign_with_ops (shift_code,
+ vec_dest, new_temp, bitpos);
+ new_name = make_ssa_name (vec_dest, epilog_stmt);
+ gimple_assign_set_lhs (epilog_stmt, new_name);
+ gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+
+ epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
+ new_name, new_temp);
+ new_temp = make_ssa_name (vec_dest, epilog_stmt);
+ gimple_assign_set_lhs (epilog_stmt, new_temp);
+ gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+ }
- extract_scalar_result = true;
- }
+ extract_scalar_result = true;
+ }
else
{
- tree rhs;
-
- /*** Case 3: Create:
- s = extract_field <v_out2, 0>
- for (offset = element_size;
- offset < vector_size;
- offset += element_size;)
- {
- Create: s' = extract_field <v_out2, offset>
- Create: s = op <s, s'>
- } */
+ tree rhs;
+
+ /*** Case 3: Create:
+ s = extract_field <v_out2, 0>
+ for (offset = element_size;
+ offset < vector_size;
+ offset += element_size;)
+ {
+ Create: s' = extract_field <v_out2, offset>
+ Create: s = op <s, s'> // For non SLP cases
+ } */
- if (vect_print_dump_info (REPORT_DETAILS))
- fprintf (vect_dump, "Reduce using scalar code. ");
-
- vec_temp = PHI_RESULT (new_phi);
- vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
- rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
- bitsize_zero_node);
- epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
- new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
- gimple_assign_set_lhs (epilog_stmt, new_temp);
- gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
-
- for (bit_offset = element_bitsize;
- bit_offset < vec_size_in_bits;
- bit_offset += element_bitsize)
- {
- tree bitpos = bitsize_int (bit_offset);
- tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
- bitpos);
-
- epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
- new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
- gimple_assign_set_lhs (epilog_stmt, new_name);
- gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
-
- epilog_stmt = gimple_build_assign_with_ops (code,
- new_scalar_dest,
- new_name, new_temp);
- new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
- gimple_assign_set_lhs (epilog_stmt, new_temp);
- gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
- }
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "Reduce using scalar code. ");
- extract_scalar_result = false;
- }
+ vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
+ for (i = 0; VEC_iterate (gimple, new_phis, i, new_phi); i++)
+ {
+ vec_temp = PHI_RESULT (new_phi);
+ rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
+ bitsize_zero_node);
+ epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
+ new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
+ gimple_assign_set_lhs (epilog_stmt, new_temp);
+ gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+
+ /* In SLP we don't need to apply the reduction operation, so we just
+ collect s' values in SCALAR_RESULTS. */
+ if (slp_node)
+ VEC_safe_push (tree, heap, scalar_results, new_temp);
+
+ for (bit_offset = element_bitsize;
+ bit_offset < vec_size_in_bits;
+ bit_offset += element_bitsize)
+ {
+ tree bitpos = bitsize_int (bit_offset);
+ tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
+ bitsize, bitpos);
+
+ epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
+ new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
+ gimple_assign_set_lhs (epilog_stmt, new_name);
+ gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+
+ if (slp_node)
+ {
+ /* In SLP we don't need to apply the reduction operation, so
+ we just collect s' values in SCALAR_RESULTS. */
+ new_temp = new_name;
+ VEC_safe_push (tree, heap, scalar_results, new_name);
+ }
+ else
+ {
+ epilog_stmt = gimple_build_assign_with_ops (code,
+ new_scalar_dest, new_name, new_temp);
+ new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
+ gimple_assign_set_lhs (epilog_stmt, new_temp);
+ gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+ }
+ }
+ }
+
+ /* The only case where we need to reduce scalar results in SLP is
+ unrolling. If the size of SCALAR_RESULTS is greater than
+ GROUP_SIZE, we reduce them combining elements modulo
+ GROUP_SIZE. */
+ if (slp_node)
+ {
+ tree res, first_res, new_res;
+ gimple new_stmt;
+
+ /* Reduce multiple scalar results in case of SLP unrolling. */
+ for (j = group_size; VEC_iterate (tree, scalar_results, j, res);
+ j++)
+ {
+ first_res = VEC_index (tree, scalar_results, j % group_size);
+ new_stmt = gimple_build_assign_with_ops (code,
+ new_scalar_dest, first_res, res);
+ new_res = make_ssa_name (new_scalar_dest, new_stmt);
+ gimple_assign_set_lhs (new_stmt, new_res);
+ gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
+ VEC_replace (tree, scalar_results, j % group_size, new_res);
+ }
+ }
+ else
+ /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
+ VEC_safe_push (tree, heap, scalar_results, new_temp);
+
+ extract_scalar_result = false;
+ }
}
/* 2.4 Extract the final scalar result. Create:
- s_out3 = extract_field <v_out2, bitpos> */
+ s_out3 = extract_field <v_out2, bitpos> */
if (extract_scalar_result)
{
tree rhs;
- gcc_assert (!nested_in_vect_loop || double_reduc);
if (vect_print_dump_info (REPORT_DETAILS))
- fprintf (vect_dump, "extract scalar result");
+ fprintf (vect_dump, "extract scalar result");
if (BYTES_BIG_ENDIAN)
- bitpos = size_binop (MULT_EXPR,
- bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
- TYPE_SIZE (scalar_type));
+ bitpos = size_binop (MULT_EXPR,
+ bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
+ TYPE_SIZE (scalar_type));
else
- bitpos = bitsize_zero_node;
+ bitpos = bitsize_zero_node;
rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
gimple_assign_set_lhs (epilog_stmt, new_temp);
gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+ VEC_safe_push (tree, heap, scalar_results, new_temp);
}
-
+
vect_finalize_reduction:
- if (double_reduc)
- loop = loop->inner;
-
/* 2.5 Adjust the final result by the initial value of the reduction
variable. (When such adjustment is not needed, then
'adjustment_def' is zero). For example, if code is PLUS we create:
@@ -3291,14 +3377,17 @@ vect_finalize_reduction:
if (adjustment_def)
{
+ gcc_assert (!slp_node);
if (nested_in_vect_loop)
{
+ new_phi = VEC_index (gimple, new_phis, 0);
gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
new_dest = vect_create_destination_var (scalar_dest, vectype);
}
else
{
+ new_temp = VEC_index (tree, scalar_results, 0);
gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
expr = build2 (code, scalar_type, new_temp, adjustment_def);
new_dest = vect_create_destination_var (scalar_dest, scalar_type);
@@ -3309,142 +3398,206 @@ vect_finalize_reduction:
gimple_assign_set_lhs (epilog_stmt, new_temp);
SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+ if (nested_in_vect_loop)
+ {
+ set_vinfo_for_stmt (epilog_stmt,
+ new_stmt_vec_info (epilog_stmt, loop_vinfo,
+ NULL));
+ STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
+ STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
+
+ if (!double_reduc)
+ VEC_quick_push (tree, scalar_results, new_temp);
+ else
+ VEC_replace (tree, scalar_results, 0, new_temp);
+ }
+ else
+ VEC_replace (tree, scalar_results, 0, new_temp);
+
+ VEC_replace (gimple, new_phis, 0, epilog_stmt);
}
+ /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
+ phis with new adjusted scalar results, i.e., replace use <s_out0>
+ with use <s_out4>.
- /* 2.6 Handle the loop-exit phi */
+ Transform:
+ loop_exit:
+ s_out0 = phi <s_loop> # (scalar) EXIT_PHI
+ v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
+ v_out2 = reduce <v_out1>
+ s_out3 = extract_field <v_out2, 0>
+ s_out4 = adjust_result <s_out3>
+ use <s_out0>
+ use <s_out0>
+
+ into:
- /* Replace uses of s_out0 with uses of s_out3:
- Find the loop-closed-use at the loop exit of the original scalar result.
- (The reduction result is expected to have two immediate uses - one at the
- latch block, and one at the loop exit). */
- phis = VEC_alloc (gimple, heap, 10);
- FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
+ loop_exit:
+ s_out0 = phi <s_loop> # (scalar) EXIT_PHI
+ v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
+ v_out2 = reduce <v_out1>
+ s_out3 = extract_field <v_out2, 0>
+ s_out4 = adjust_result <s_out3>
+ use <s_out4> */
+
+ /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
+ case GROUP_SIZE is greater than the vectorization factor). Therefore, we
+ need to match SCALAR_RESULTS with corresponding statements. The first
+ (GROUP_SIZE / number of new vector stmts) scalar results correspond to
+ the first vector stmt, etc.
+ (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
+ ratio = group_size / VEC_length (gimple, new_phis);
+ gcc_assert (!(group_size % VEC_length (gimple, new_phis)));
+
+ for (k = 0; k < group_size; k++)
{
- if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
- {
- exit_phi = USE_STMT (use_p);
- VEC_quick_push (gimple, phis, exit_phi);
- }
- }
+ if (k % ratio == 0)
+ {
+ epilog_stmt = VEC_index (gimple, new_phis, k / ratio);
+ reduction_phi = VEC_index (gimple, reduction_phis, k / ratio);
+ }
- /* We expect to have found an exit_phi because of loop-closed-ssa form. */
- gcc_assert (!VEC_empty (gimple, phis));
+ if (slp_node)
+ {
+ gimple current_stmt = VEC_index (gimple,
+ SLP_TREE_SCALAR_STMTS (slp_node), k);
- for (i = 0; VEC_iterate (gimple, phis, i, exit_phi); i++)
- {
- if (nested_in_vect_loop)
- {
- stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
- gimple vect_phi;
-
- /* FORNOW. Currently not supporting the case that an inner-loop
- reduction is not used in the outer-loop (but only outside the
- outer-loop), unless it is double reduction. */
- gcc_assert ((STMT_VINFO_RELEVANT_P (stmt_vinfo)
- && !STMT_VINFO_LIVE_P (stmt_vinfo)) || double_reduc);
-
- epilog_stmt = adjustment_def ? epilog_stmt : new_phi;
- STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt;
- set_vinfo_for_stmt (epilog_stmt,
- new_stmt_vec_info (epilog_stmt, loop_vinfo,
- NULL));
- if (adjustment_def)
- STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
- STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
-
- if (!double_reduc
- || STMT_VINFO_DEF_TYPE (stmt_vinfo) != vect_double_reduction_def)
- continue;
-
- /* Handle double reduction:
-
- stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
- stmt2: s3 = phi <s1, s4> - (regular) reduction phi (inner loop)
- stmt3: s4 = use (s3) - (regular) reduction stmt (inner loop)
- stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
-
- At that point the regular reduction (stmt2 and stmt3) is already
- vectorized, as well as the exit phi node, stmt4.
- Here we vectorize the phi node of double reduction, stmt1, and
- update all relevant statements. */
-
- /* Go through all the uses of s2 to find double reduction phi node,
- i.e., stmt1 above. */
- orig_name = PHI_RESULT (exit_phi);
- FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
+ orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
+ /* SLP statements can't participate in patterns. */
+ gcc_assert (!orig_stmt);
+ scalar_dest = gimple_assign_lhs (current_stmt);
+ }
+
+ phis = VEC_alloc (gimple, heap, 3);
+ /* Find the loop-closed-use at the loop exit of the original scalar
+ result. (The reduction result is expected to have two immediate uses -
+ one at the latch block, and one at the loop exit). */
+ FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
+ if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
+ VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
+
+ /* We expect to have found an exit_phi because of loop-closed-ssa
+ form. */
+ gcc_assert (!VEC_empty (gimple, phis));
+
+ for (i = 0; VEC_iterate (gimple, phis, i, exit_phi); i++)
+ {
+ if (outer_loop)
{
- stmt_vec_info use_stmt_vinfo = vinfo_for_stmt (use_stmt);
- stmt_vec_info new_phi_vinfo;
- tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
- basic_block bb = gimple_bb (use_stmt);
- gimple use;
-
- /* Check that USE_STMT is really double reduction phi node. */
- if (gimple_code (use_stmt) != GIMPLE_PHI
- || gimple_phi_num_args (use_stmt) != 2
- || !use_stmt_vinfo
- || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
- != vect_double_reduction_def
- || bb->loop_father != outer_loop)
+ stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
+ gimple vect_phi;
+
+ /* FORNOW. Currently not supporting the case that an inner-loop
+ reduction is not used in the outer-loop (but only outside the
+ outer-loop), unless it is double reduction. */
+ gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
+ && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
+ || double_reduc);
+
+ STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
+ if (!double_reduc
+ || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
+ != vect_double_reduction_def)
continue;
- /* Create vector phi node for double reduction:
- vs1 = phi <vs0, vs2>
- vs1 was created previously in this function by a call to
- vect_get_vec_def_for_operand and is stored in vec_initial_def;
- vs2 is defined by EPILOG_STMT, the vectorized EXIT_PHI;
- vs0 is created here. */
+ /* Handle double reduction:
- /* Create vector phi node. */
- vect_phi = create_phi_node (vec_initial_def, bb);
- new_phi_vinfo = new_stmt_vec_info (vect_phi,
- loop_vec_info_for_loop (outer_loop), NULL);
- set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
+ stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
+ stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
+ stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
+ stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
- /* Create vs0 - initial def of the double reduction phi. */
- preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
- loop_preheader_edge (outer_loop));
- init_def = get_initial_def_for_reduction (stmt, preheader_arg,
- NULL);
- vect_phi_init = vect_init_vector (use_stmt, init_def, vectype,
- NULL);
-
- /* Update phi node arguments with vs0 and vs2. */
- add_phi_arg (vect_phi, vect_phi_init,
- loop_preheader_edge (outer_loop), UNKNOWN_LOCATION);
- add_phi_arg (vect_phi, PHI_RESULT (epilog_stmt),
- loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
- if (vect_print_dump_info (REPORT_DETAILS))
- {
- fprintf (vect_dump, "created double reduction phi node: ");
- print_gimple_stmt (vect_dump, vect_phi, 0, TDF_SLIM);
- }
-
- vect_phi_res = PHI_RESULT (vect_phi);
+ At that point the regular reduction (stmt2 and stmt3) is
+ already vectorized, as well as the exit phi node, stmt4.
+ Here we vectorize the phi node of double reduction, stmt1, and
+ update all relevant statements. */
- /* Replace the use, i.e., set the correct vs1 in the regular
- reduction phi node. FORNOW, NCOPIES is always 1, so the loop
- is redundant. */
- use = reduction_phi;
- for (j = 0; j < ncopies; j++)
+ /* Go through all the uses of s2 to find double reduction phi
+ node, i.e., stmt1 above. */
+ orig_name = PHI_RESULT (exit_phi);
+ FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
{
- edge pr_edge = loop_preheader_edge (loop);
- SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
- use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
+ stmt_vec_info use_stmt_vinfo = vinfo_for_stmt (use_stmt);
+ stmt_vec_info new_phi_vinfo;
+ tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
+ basic_block bb = gimple_bb (use_stmt);
+ gimple use;
+
+ /* Check that USE_STMT is really double reduction phi
+ node. */
+ if (gimple_code (use_stmt) != GIMPLE_PHI
+ || gimple_phi_num_args (use_stmt) != 2
+ || !use_stmt_vinfo
+ || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
+ != vect_double_reduction_def
+ || bb->loop_father != outer_loop)
+ continue;
+
+ /* Create vector phi node for double reduction:
+ vs1 = phi <vs0, vs2>
+ vs1 was created previously in this function by a call to
+ vect_get_vec_def_for_operand and is stored in
+ vec_initial_def;
+ vs2 is defined by EPILOG_STMT, the vectorized EXIT_PHI;
+ vs0 is created here. */
+
+ /* Create vector phi node. */
+ vect_phi = create_phi_node (vec_initial_def, bb);
+ new_phi_vinfo = new_stmt_vec_info (vect_phi,
+ loop_vec_info_for_loop (outer_loop), NULL);
+ set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
+
+ /* Create vs0 - initial def of the double reduction phi. */
+ preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
+ loop_preheader_edge (outer_loop));
+ init_def = get_initial_def_for_reduction (stmt,
+ preheader_arg, NULL);
+ vect_phi_init = vect_init_vector (use_stmt, init_def,
+ vectype, NULL);
+
+ /* Update phi node arguments with vs0 and vs2. */
+ add_phi_arg (vect_phi, vect_phi_init,
+ loop_preheader_edge (outer_loop),
+ UNKNOWN_LOCATION);
+ add_phi_arg (vect_phi, PHI_RESULT (epilog_stmt),
+ loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
+ if (vect_print_dump_info (REPORT_DETAILS))
+ {
+ fprintf (vect_dump, "created double reduction phi "
+ "node: ");
+ print_gimple_stmt (vect_dump, vect_phi, 0, TDF_SLIM);
+ }
+
+ vect_phi_res = PHI_RESULT (vect_phi);
+
+ /* Replace the use, i.e., set the correct vs1 in the regular
+ reduction phi node. FORNOW, NCOPIES is always 1, so the
+ loop is redundant. */
+ use = reduction_phi;
+ for (j = 0; j < ncopies; j++)
+ {
+ edge pr_edge = loop_preheader_edge (loop);
+ SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
+ use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
+ }
}
}
- }
- /* Replace the uses: */
- orig_name = PHI_RESULT (exit_phi);
- FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
- FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
- SET_USE (use_p, new_temp);
+ /* Replace the uses: */
+ orig_name = PHI_RESULT (exit_phi);
+ scalar_result = VEC_index (tree, scalar_results, k);
+ FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
+ FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
+ SET_USE (use_p, scalar_result);
+ }
+
+ VEC_free (gimple, heap, phis);
}
- VEC_free (gimple, heap, phis);
-}
+ VEC_free (tree, heap, scalar_results);
+ VEC_free (gimple, heap, new_phis);
+}
/* Function vectorizable_reduction.
@@ -3489,7 +3642,7 @@ vect_finalize_reduction:
bool
vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
- gimple *vec_stmt)
+ gimple *vec_stmt, slp_tree slp_node)
{
tree vec_dest;
tree scalar_dest;
@@ -3517,7 +3670,6 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
int ncopies;
int epilog_copies;
stmt_vec_info prev_stmt_info, prev_phi_info;
- gimple first_phi = NULL;
bool single_defuse_cycle = false;
tree reduc_def = NULL_TREE;
gimple new_stmt = NULL;
@@ -3532,6 +3684,10 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
struct loop * def_stmt_loop, *outer_loop = NULL;
tree def_arg;
gimple def_arg_stmt;
+ VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL, *vect_defs = NULL;
+ VEC (gimple, heap) *phis = NULL;
+ int vec_num;
+ tree def0, def1;
if (nested_in_vect_loop_p (loop, stmt))
{
@@ -3540,10 +3696,6 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
nested_cycle = true;
}
- /* FORNOW: SLP not supported. */
- if (STMT_SLP_TYPE (stmt_info))
- return false;
-
/* 1. Is vectorizable reduction? */
/* Not supportable if the reduction variable is used in the loop. */
if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
@@ -3676,9 +3828,12 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
return false;
+ if (slp_node)
+ ncopies = 1;
+ else
+ ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
+ / TYPE_VECTOR_SUBPARTS (vectype_in));
- ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
- / TYPE_VECTOR_SUBPARTS (vectype_in));
gcc_assert (ncopies >= 1);
vec_mode = TYPE_MODE (vectype_in);
@@ -3897,23 +4052,48 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
prev_stmt_info = NULL;
prev_phi_info = NULL;
+ if (slp_node)
+ {
+ vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
+ gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
+ == TYPE_VECTOR_SUBPARTS (vectype_in));
+ }
+ else
+ {
+ vec_num = 1;
+ vec_oprnds0 = VEC_alloc (tree, heap, 1);
+ if (op_type == ternary_op)
+ vec_oprnds1 = VEC_alloc (tree, heap, 1);
+ }
+
+ phis = VEC_alloc (gimple, heap, vec_num);
+ vect_defs = VEC_alloc (tree, heap, vec_num);
+ if (!slp_node)
+ VEC_quick_push (tree, vect_defs, NULL_TREE);
+
for (j = 0; j < ncopies; j++)
{
if (j == 0 || !single_defuse_cycle)
{
- /* Create the reduction-phi that defines the reduction-operand. */
- new_phi = create_phi_node (vec_dest, loop->header);
- set_vinfo_for_stmt (new_phi, new_stmt_vec_info (new_phi, loop_vinfo,
- NULL));
- /* Get the vector def for the reduction variable from the phi
- node. */
- reduc_def = PHI_RESULT (new_phi);
- }
+ for (i = 0; i < vec_num; i++)
+ {
+ /* Create the reduction-phi that defines the reduction
+ operand. */
+ new_phi = create_phi_node (vec_dest, loop->header);
+ set_vinfo_for_stmt (new_phi,
+ new_stmt_vec_info (new_phi, loop_vinfo,
+ NULL));
+ if (j == 0 || slp_node)
+ VEC_quick_push (gimple, phis, new_phi);
+ }
+ }
if (code == COND_EXPR)
{
- first_phi = new_phi;
- vectorizable_condition (stmt, gsi, vec_stmt, reduc_def, reduc_index);
+ gcc_assert (!slp_node);
+ vectorizable_condition (stmt, gsi, vec_stmt,
+ PHI_RESULT (VEC_index (gimple, phis, 0)),
+ reduc_index);
/* Multiple types are not supported for condition. */
break;
}
@@ -3921,65 +4101,94 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
/* Handle uses. */
if (j == 0)
{
- loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
- stmt, NULL);
- if (op_type == ternary_op)
+ if (slp_node)
+ vect_get_slp_defs (slp_node, &vec_oprnds0, &vec_oprnds1, -1);
+ else
{
- if (reduc_index == 0)
- loop_vec_def1 = vect_get_vec_def_for_operand (ops[2], stmt,
- NULL);
- else
- loop_vec_def1 = vect_get_vec_def_for_operand (ops[1], stmt,
- NULL);
+ loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
+ stmt, NULL);
+ VEC_quick_push (tree, vec_oprnds0, loop_vec_def0);
+ if (op_type == ternary_op)
+ {
+ if (reduc_index == 0)
+ loop_vec_def1 = vect_get_vec_def_for_operand (ops[2], stmt,
+ NULL);
+ else
+ loop_vec_def1 = vect_get_vec_def_for_operand (ops[1], stmt,
+ NULL);
+
+ VEC_quick_push (tree, vec_oprnds1, loop_vec_def1);
+ }
}
-
- /* Get the vector def for the reduction variable from the phi
- node. */
- first_phi = new_phi;
}
else
{
- enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
- loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
- if (op_type == ternary_op)
- loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);
+ if (!slp_node)
+ {
+ enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
+ loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
+ VEC_replace (tree, vec_oprnds0, 0, loop_vec_def0);
+ if (op_type == ternary_op)
+ {
+ loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
+ loop_vec_def1);
+ VEC_replace (tree, vec_oprnds1, 0, loop_vec_def1);
+ }
+ }
- if (single_defuse_cycle)
- reduc_def = gimple_assign_lhs (new_stmt);
- else
- reduc_def = PHI_RESULT (new_phi);
+ if (single_defuse_cycle)
+ reduc_def = gimple_assign_lhs (new_stmt);
- STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
+ STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
}
- /* Arguments are ready. Create the new vector stmt. */
- if (op_type == binary_op)
+ for (i = 0; VEC_iterate (tree, vec_oprnds0, i, def0); i++)
{
- if (reduc_index == 0)
- expr = build2 (code, vectype_out, reduc_def, loop_vec_def0);
+ if (slp_node)
+ reduc_def = PHI_RESULT (VEC_index (gimple, phis, i));
else
- expr = build2 (code, vectype_out, loop_vec_def0, reduc_def);
- }
- else
- {
- if (reduc_index == 0)
- expr = build3 (code, vectype_out, reduc_def, loop_vec_def0,
- loop_vec_def1);
+ {
+ if (!single_defuse_cycle || j == 0)
+ reduc_def = PHI_RESULT (new_phi);
+ }
+
+ def1 = ((op_type == ternary_op)
+ ? VEC_index (tree, vec_oprnds1, i) : NULL);
+ if (op_type == binary_op)
+ {
+ if (reduc_index == 0)
+ expr = build2 (code, vectype_out, reduc_def, def0);
+ else
+ expr = build2 (code, vectype_out, def0, reduc_def);
+ }
else
{
- if (reduc_index == 1)
- expr = build3 (code, vectype_out, loop_vec_def0, reduc_def,
- loop_vec_def1);
+ if (reduc_index == 0)
+ expr = build3 (code, vectype_out, reduc_def, def0, def1);
else
- expr = build3 (code, vectype_out, loop_vec_def0, loop_vec_def1,
- reduc_def);
+ {
+ if (reduc_index == 1)
+ expr = build3 (code, vectype_out, def0, reduc_def, def1);
+ else
+ expr = build3 (code, vectype_out, def0, def1, reduc_def);
+ }
+ }
+
+ new_stmt = gimple_build_assign (vec_dest, expr);
+ new_temp = make_ssa_name (vec_dest, new_stmt);
+ gimple_assign_set_lhs (new_stmt, new_temp);
+ vect_finish_stmt_generation (stmt, new_stmt, gsi);
+ if (slp_node)
+ {
+ VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
+ VEC_quick_push (tree, vect_defs, new_temp);
}
+ else
+ VEC_replace (tree, vect_defs, 0, new_temp);
}
- new_stmt = gimple_build_assign (vec_dest, expr);
- new_temp = make_ssa_name (vec_dest, new_stmt);
- gimple_assign_set_lhs (new_stmt, new_temp);
- vect_finish_stmt_generation (stmt, new_stmt, gsi);
+ if (slp_node)
+ continue;
if (j == 0)
STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
@@ -3992,12 +4201,21 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
/* Finalize the reduction-phi (set its arguments) and create the
epilog reduction code. */
- if (!single_defuse_cycle || code == COND_EXPR)
- new_temp = gimple_assign_lhs (*vec_stmt);
+ if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
+ {
+ new_temp = gimple_assign_lhs (*vec_stmt);
+ VEC_replace (tree, vect_defs, 0, new_temp);
+ }
+
+ vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
+ epilog_reduc_code, phis, reduc_index,
+ double_reduc, slp_node);
+
+ VEC_free (gimple, heap, phis);
+ VEC_free (tree, heap, vec_oprnds0);
+ if (vec_oprnds1)
+ VEC_free (tree, heap, vec_oprnds1);
- vect_create_epilog_for_reduction (new_temp, stmt, epilog_copies,
- epilog_reduc_code, first_phi, reduc_index,
- double_reduc);
return true;
}
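
With the SLP handling above, vectorizable_reduction builds one vector statement per
operand collected in vec_oprnds0 and records it in SLP_TREE_VEC_STMTS instead of
assuming a single reduction chain.  As a rough illustration (hypothetical source,
not taken from the patch or its testsuite), a loop with several independent
accumulators is the kind of reduction group this enables:

    int
    sum_two_chains (const int *a, int n)
    {
      int s0 = 0, s1 = 0;

      /* Two independent reduction chains form an SLP group of size 2;
         n is assumed to be even for simplicity.  */
      for (int i = 0; i < n; i += 2)
        {
          s0 += a[i];
          s1 += a[i + 1];
        }

      return s0 + s1;
    }
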
diff --git a/gcc/tree-vect-patterns.c b/gcc/tree-vect-patterns.c
index 55b9d50cca0..ea827559195 100644
--- a/gcc/tree-vect-patterns.c
+++ b/gcc/tree-vect-patterns.c
@@ -670,6 +670,8 @@ vect_pattern_recog_1 (
tree pattern_vectype;
tree type_in, type_out;
enum tree_code code;
+ int i;
+ gimple next;
pattern_stmt = (* vect_recog_func) (stmt, &type_in, &type_out);
if (!pattern_stmt)
@@ -735,7 +737,13 @@ vect_pattern_recog_1 (
STMT_VINFO_IN_PATTERN_P (stmt_info) = true;
STMT_VINFO_RELATED_STMT (stmt_info) = pattern_stmt;
- return;
+ /* Patterns cannot be vectorized using SLP, because they change the order of
+ computation. */
+ for (i = 0; VEC_iterate (gimple, LOOP_VINFO_REDUCTIONS (loop_vinfo), i,
+ next);
+ i++)
+ if (next == stmt)
+ VEC_ordered_remove (gimple, LOOP_VINFO_REDUCTIONS (loop_vinfo), i);
}
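
Once a reduction statement has been replaced by a pattern statement, the loop above
drops it from LOOP_VINFO_REDUCTIONS, because patterns change the order of computation
and therefore cannot be SLP-vectorized.  As a hedged illustration (hypothetical
source, not from the patch), a widening summation is a reduction that the pattern
recognizer typically claims and that consequently never becomes an SLP candidate:

    int
    widen_sum (const short *a, int n)
    {
      int sum = 0;

      /* Likely recognized as a widening-sum pattern (short elements summed
         into an int accumulator), so it is removed from the SLP reduction
         candidates.  */
      for (int i = 0; i < n; i++)
        sum += a[i];

      return sum;
    }
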
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index afc4f311078..99a865fee20 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -273,6 +273,7 @@ vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
break;
case vect_internal_def:
+ case vect_reduction_def:
if (i == 0)
VEC_safe_push (gimple, heap, *def_stmts0, def_stmt);
else
@@ -332,7 +333,7 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
HOST_WIDE_INT dummy;
bool permutation = false;
unsigned int load_place;
- gimple first_load;
+ gimple first_load, prev_first_load = NULL;
/* For every stmt in NODE find its def stmt/s. */
for (i = 0; VEC_iterate (gimple, stmts, i, stmt); i++)
@@ -485,42 +486,62 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
&pattern0, &pattern1))
return false;
}
- else
- {
- /* Load. */
- /* FORNOW: Check that there is no gap between the loads. */
- if ((DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) == stmt
- && DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
- || (DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) != stmt
- && DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 1))
- {
- if (vect_print_dump_info (REPORT_SLP))
- {
- fprintf (vect_dump, "Build SLP failed: strided "
- "loads have gaps ");
- print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
- }
+ else
+ {
+ /* Load. */
+ /* FORNOW: Check that there is no gap between the loads. */
+ if ((DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) == stmt
+ && DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
+ || (DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) != stmt
+ && DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 1))
+ {
+ if (vect_print_dump_info (REPORT_SLP))
+ {
+ fprintf (vect_dump, "Build SLP failed: strided "
+ "loads have gaps ");
+ print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+ }
- return false;
- }
-
- /* Check that the size of interleaved loads group is not
- greater than the SLP group size. */
- if (DR_GROUP_SIZE (vinfo_for_stmt (stmt))
- > ncopies * group_size)
- {
- if (vect_print_dump_info (REPORT_SLP))
- {
- fprintf (vect_dump, "Build SLP failed: the number of "
- "interleaved loads is greater than"
- " the SLP group size ");
- print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
- }
+ return false;
+ }
- return false;
- }
+ /* Check that the size of interleaved loads group is not
+ greater than the SLP group size. */
+ if (DR_GROUP_SIZE (vinfo_for_stmt (stmt)) > ncopies * group_size)
+ {
+ if (vect_print_dump_info (REPORT_SLP))
+ {
+ fprintf (vect_dump, "Build SLP failed: the number of "
+ "interleaved loads is greater than"
+ " the SLP group size ");
+ print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+ }
- first_load = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt));
+ return false;
+ }
+
+ first_load = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt));
+ if (prev_first_load)
+ {
+ /* Check that there are no loads from different interleaving
+ chains in the same node. The only exception is complex
+ numbers. */
+ if (prev_first_load != first_load
+ && rhs_code != REALPART_EXPR
+ && rhs_code != IMAGPART_EXPR)
+ {
+ if (vect_print_dump_info (REPORT_SLP))
+ {
+ fprintf (vect_dump, "Build SLP failed: different "
+ "interleaving chains in one node ");
+ print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+ }
+
+ return false;
+ }
+ }
+ else
+ prev_first_load = first_load;
if (first_load == stmt)
{
@@ -787,6 +808,39 @@ vect_supported_slp_permutation_p (slp_instance instance)
}
+/* Rearrange the statements of NODE according to PERMUTATION. */
+
+static void
+vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
+ VEC (int, heap) *permutation)
+{
+ gimple stmt;
+ VEC (gimple, heap) *tmp_stmts;
+ unsigned int index, i;
+
+ if (!node)
+ return;
+
+ vect_slp_rearrange_stmts (SLP_TREE_LEFT (node), group_size, permutation);
+ vect_slp_rearrange_stmts (SLP_TREE_RIGHT (node), group_size, permutation);
+
+ gcc_assert (group_size == VEC_length (gimple, SLP_TREE_SCALAR_STMTS (node)));
+ tmp_stmts = VEC_alloc (gimple, heap, group_size);
+
+ for (i = 0; i < group_size; i++)
+ VEC_safe_push (gimple, heap, tmp_stmts, NULL);
+
+ for (i = 0; VEC_iterate (gimple, SLP_TREE_SCALAR_STMTS (node), i, stmt); i++)
+ {
+ index = VEC_index (int, permutation, i);
+ VEC_replace (gimple, tmp_stmts, index, stmt);
+ }
+
+ VEC_free (gimple, heap, SLP_TREE_SCALAR_STMTS (node));
+ SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
+}
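
vect_slp_rearrange_stmts scatters each scalar statement to the slot named by the
permutation, recursing over both children of the node.  A minimal standalone sketch
of the same idiom in plain C (hypothetical helper, not part of the patch):

    #include <stdlib.h>
    #include <string.h>

    /* Move element I of STMTS to slot PERMUTATION[I], as the recursion above
       does for SLP_TREE_SCALAR_STMTS.  */
    static void
    rearrange_by_permutation (void **stmts, const int *permutation, int n)
    {
      void **tmp = (void **) calloc (n, sizeof (void *));

      for (int i = 0; i < n; i++)
        tmp[permutation[i]] = stmts[i];

      memcpy (stmts, tmp, n * sizeof (void *));
      free (tmp);
    }
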
+
+
/* Check if the required load permutation is supported.
LOAD_PERMUTATION contains a list of indices of the loads.
In SLP this permutation is relative to the order of strided stores that are
@@ -796,9 +850,11 @@ static bool
vect_supported_load_permutation_p (slp_instance slp_instn, int group_size,
VEC (int, heap) *load_permutation)
{
- int i = 0, j, prev = -1, next, k;
- bool supported;
+ int i = 0, j, prev = -1, next, k, number_of_groups;
+ bool supported, bad_permutation = false;
sbitmap load_index;
+ slp_tree node;
+ gimple stmt;
/* FORNOW: permutations are only supported in SLP. */
if (!slp_instn)
@@ -811,9 +867,72 @@ vect_supported_load_permutation_p (slp_instance slp_instn, int group_size,
fprintf (vect_dump, "%d ", next);
}
+ /* In case of a reduction, every load permutation is allowed, since the order
+ of the reduction statements is not important (as opposed to the case of
+ strided stores). The only condition we need to check is that all the
+ load nodes are of the same size and have the same permutation (and then
+ rearrange all the nodes of the SLP instance according to this
+ permutation). */
+
+ /* Check that all the load nodes are of the same size. */
+ for (i = 0;
+ VEC_iterate (slp_tree, SLP_INSTANCE_LOADS (slp_instn), i, node);
+ i++)
+ if (VEC_length (gimple, SLP_TREE_SCALAR_STMTS (node))
+ != (unsigned) group_size)
+ return false;
+
+ node = SLP_INSTANCE_TREE (slp_instn);
+ stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (node), 0);
+ /* LOAD_PERMUTATION is a list of indices of all the loads of the SLP
+ instance; not all of the loads belong to the same node or interleaving
+ group. Hence, we need to divide them into groups according to
+ GROUP_SIZE. */
+ number_of_groups = VEC_length (int, load_permutation) / group_size;
+
+ /* Reduction (there are no data-refs in the root). */
+ if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
+ {
+ int first_group_load_index;
+
+ /* Compare all the permutation sequences to the first one. */
+ for (i = 1; i < number_of_groups; i++)
+ {
+ k = 0;
+ for (j = i * group_size; j < i * group_size + group_size; j++)
+ {
+ next = VEC_index (int, load_permutation, j);
+ first_group_load_index = VEC_index (int, load_permutation, k);
+
+ if (next != first_group_load_index)
+ {
+ bad_permutation = true;
+ break;
+ }
+
+ k++;
+ }
+
+ if (bad_permutation)
+ break;
+ }
+
+ if (!bad_permutation)
+ {
+ /* This permutation is valid for reduction. Since the order of the
+ statements in the nodes is not important unless they are memory
+ accesses, we can rearrange the statements in all the nodes
+ according to the order of the loads. */
+ vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
+ load_permutation);
+ VEC_free (int, heap, SLP_INSTANCE_LOAD_PERMUTATION (slp_instn));
+ return true;
+ }
+ }
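
For a reduction the only requirement checked above is that every GROUP_SIZE-long
slice of LOAD_PERMUTATION repeats the first slice.  A minimal standalone sketch of
that check (hypothetical helper using a plain array instead of a VEC):

    /* Return 1 if every group of GROUP_SIZE indices in LOAD_PERMUTATION is
       identical to the first group, 0 otherwise.  */
    static int
    same_permutation_in_all_groups (const int *load_permutation,
                                    int group_size, int number_of_groups)
    {
      for (int i = 1; i < number_of_groups; i++)
        for (int j = 0; j < group_size; j++)
          if (load_permutation[i * group_size + j] != load_permutation[j])
            return 0;

      return 1;
    }
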
+
/* FORNOW: the only supported permutation is 0..01..1.. of length equal to
GROUP_SIZE and where each sequence of same drs is of GROUP_SIZE length as
- well. */
+ well (unless it's a reduction). */
if (VEC_length (int, load_permutation)
!= (unsigned int) (group_size * group_size))
return false;
@@ -896,17 +1015,28 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
slp_tree node = XNEW (struct _slp_tree);
unsigned int group_size = DR_GROUP_SIZE (vinfo_for_stmt (stmt));
unsigned int unrolling_factor = 1, nunits;
- tree vectype, scalar_type;
+ tree vectype, scalar_type = NULL_TREE;
gimple next;
unsigned int vectorization_factor = 0;
- int inside_cost = 0, outside_cost = 0, ncopies_for_cost;
+ int inside_cost = 0, outside_cost = 0, ncopies_for_cost, i;
unsigned int max_nunits = 0;
VEC (int, heap) *load_permutation;
VEC (slp_tree, heap) *loads;
+ struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
+
+ if (dr)
+ {
+ scalar_type = TREE_TYPE (DR_REF (dr));
+ vectype = get_vectype_for_scalar_type (scalar_type);
+ group_size = DR_GROUP_SIZE (vinfo_for_stmt (stmt));
+ }
+ else
+ {
+ gcc_assert (loop_vinfo);
+ vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
+ group_size = VEC_length (gimple, LOOP_VINFO_REDUCTIONS (loop_vinfo));
+ }
- scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (
- vinfo_for_stmt (stmt))));
- vectype = get_vectype_for_scalar_type (scalar_type);
if (!vectype)
{
if (vect_print_dump_info (REPORT_SLP))
@@ -914,6 +1044,7 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
fprintf (vect_dump, "Build SLP failed: unsupported data-type ");
print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
}
+
return false;
}
@@ -938,11 +1069,29 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
/* Create a node (a root of the SLP tree) for the packed strided stores. */
SLP_TREE_SCALAR_STMTS (node) = VEC_alloc (gimple, heap, group_size);
next = stmt;
- /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
- while (next)
+ if (dr)
{
- VEC_safe_push (gimple, heap, SLP_TREE_SCALAR_STMTS (node), next);
- next = DR_GROUP_NEXT_DR (vinfo_for_stmt (next));
+ /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
+ while (next)
+ {
+ VEC_safe_push (gimple, heap, SLP_TREE_SCALAR_STMTS (node), next);
+ next = DR_GROUP_NEXT_DR (vinfo_for_stmt (next));
+ }
+ }
+ else
+ {
+ /* Collect reduction statements. */
+ for (i = 0; VEC_iterate (gimple, LOOP_VINFO_REDUCTIONS (loop_vinfo), i,
+ next);
+ i++)
+ {
+ VEC_safe_push (gimple, heap, SLP_TREE_SCALAR_STMTS (node), next);
+ if (vect_print_dump_info (REPORT_DETAILS))
+ {
+ fprintf (vect_dump, "pushing reduction into node: ");
+ print_gimple_stmt (vect_dump, next, 0, TDF_SLIM);
+ }
+ }
}
SLP_TREE_VEC_STMTS (node) = NULL;
@@ -1035,7 +1184,7 @@ bool
vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
{
unsigned int i;
- VEC (gimple, heap) *strided_stores;
+ VEC (gimple, heap) *strided_stores, *reductions = NULL;
gimple store;
bool ok = false;
@@ -1043,10 +1192,14 @@ vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
fprintf (vect_dump, "=== vect_analyze_slp ===");
if (loop_vinfo)
- strided_stores = LOOP_VINFO_STRIDED_STORES (loop_vinfo);
+ {
+ strided_stores = LOOP_VINFO_STRIDED_STORES (loop_vinfo);
+ reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
+ }
else
strided_stores = BB_VINFO_STRIDED_STORES (bb_vinfo);
+ /* Find SLP sequences starting from groups of strided stores. */
for (i = 0; VEC_iterate (gimple, strided_stores, i, store); i++)
if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, store))
ok = true;
@@ -1059,6 +1212,12 @@ vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
return false;
}
+ /* Find SLP sequences starting from groups of reductions. */
+ if (loop_vinfo && VEC_length (gimple, LOOP_VINFO_REDUCTIONS (loop_vinfo))
+ && vect_analyze_slp_instance (loop_vinfo, bb_vinfo,
+ VEC_index (gimple, reductions, 0)))
+ ok = true;
+
return true;
}
@@ -1120,7 +1279,10 @@ vect_detect_hybrid_slp_stmts (slp_tree node)
if ((stmt_vinfo = vinfo_for_stmt (use_stmt))
&& !STMT_SLP_TYPE (stmt_vinfo)
&& (STMT_VINFO_RELEVANT (stmt_vinfo)
- || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_vinfo))))
+ || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_vinfo)))
+ && !(gimple_code (use_stmt) == GIMPLE_PHI
+ && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (use_stmt))
+ == vect_reduction_def))
vect_mark_slp_stmts (node, hybrid, i);
vect_detect_hybrid_slp_stmts (SLP_TREE_LEFT (node));
@@ -1429,11 +1591,14 @@ vect_update_slp_costs_according_to_vf (loop_vec_info loop_vinfo)
/* For constant and loop invariant defs of SLP_NODE this function returns
(vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
OP_NUM determines if we gather defs for operand 0 or operand 1 of the scalar
- stmts. NUMBER_OF_VECTORS is the number of vector defs to create. */
+ stmts. NUMBER_OF_VECTORS is the number of vector defs to create.
+ REDUC_INDEX is the index of the reduction operand in the statements, or -1
+ if there is no reduction operand. */
static void
vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
- unsigned int op_num, unsigned int number_of_vectors)
+ unsigned int op_num, unsigned int number_of_vectors,
+ int reduc_index)
{
VEC (gimple, heap) *stmts = SLP_TREE_SCALAR_STMTS (slp_node);
gimple stmt = VEC_index (gimple, stmts, 0);
@@ -1449,6 +1614,50 @@ vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
int number_of_copies = 1;
VEC (tree, heap) *voprnds = VEC_alloc (tree, heap, number_of_vectors);
bool constant_p, is_store;
+ tree neutral_op = NULL;
+
+ if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def)
+ {
+ enum tree_code code = gimple_assign_rhs_code (stmt);
+ if (reduc_index == -1)
+ {
+ VEC_free (tree, heap, *vec_oprnds);
+ return;
+ }
+
+ op_num = reduc_index - 1;
+ op = gimple_op (stmt, op_num + 1);
+ /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
+ we need either neutral operands or the original operands. See
+ get_initial_def_for_reduction() for details. */
+ switch (code)
+ {
+ case WIDEN_SUM_EXPR:
+ case DOT_PROD_EXPR:
+ case PLUS_EXPR:
+ case MINUS_EXPR:
+ case BIT_IOR_EXPR:
+ case BIT_XOR_EXPR:
+ if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
+ neutral_op = build_real (TREE_TYPE (op), dconst0);
+ else
+ neutral_op = build_int_cst (TREE_TYPE (op), 0);
+
+ break;
+
+ case MULT_EXPR:
+ case BIT_AND_EXPR:
+ if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
+ neutral_op = build_real (TREE_TYPE (op), dconst1);
+ else
+ neutral_op = build_int_cst (TREE_TYPE (op), 1);
+
+ break;
+
+ default:
+ neutral_op = NULL;
+ }
+ }
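
The neutral operand computed above is what the additional vector copies are filled
with, so that only one lane ever carries the original initial value.  A hedged
scalar model of why that preserves the result for a PLUS_EXPR reduction (hypothetical
code, not part of the patch; assumes N is a multiple of the lane count):

    /* Lane 0 starts from INIT, the other lanes start from the neutral 0, and
       the final fold reproduces the scalar sum.  */
    static int
    lanewise_sum (const int *a, int n, int init)
    {
      int lane[4] = { init, 0, 0, 0 };

      for (int i = 0; i < n; i += 4)
        for (int l = 0; l < 4; l++)
          lane[l] += a[i + l];

      return lane[0] + lane[1] + lane[2] + lane[3];
    }
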
if (STMT_VINFO_DATA_REF (stmt_vinfo))
{
@@ -1499,6 +1708,19 @@ vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
else
op = gimple_op (stmt, op_num + 1);
+ if (reduc_index != -1)
+ {
+ struct loop *loop = (gimple_bb (stmt))->loop_father;
+ gimple def_stmt = SSA_NAME_DEF_STMT (op);
+
+ gcc_assert (loop);
+ /* Get the def before the loop. */
+ op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
+ loop_preheader_edge (loop));
+ if (j != (number_of_copies - 1) && neutral_op)
+ op = neutral_op;
+ }
+
/* Create 'vect_ = {op0,op1,...,opn}'. */
t = tree_cons (NULL_TREE, op, t);
@@ -1536,8 +1758,25 @@ vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
to replicate the vectors. */
while (number_of_vectors > VEC_length (tree, *vec_oprnds))
{
- for (i = 0; VEC_iterate (tree, *vec_oprnds, i, vop) && i < vec_num; i++)
- VEC_quick_push (tree, *vec_oprnds, vop);
+ tree neutral_vec = NULL;
+
+ if (neutral_op)
+ {
+ if (!neutral_vec)
+ {
+ t = NULL;
+ for (i = 0; i < (unsigned) nunits; i++)
+ t = tree_cons (NULL_TREE, neutral_op, t);
+ neutral_vec = build_vector (vector_type, t);
+ }
+
+ VEC_quick_push (tree, *vec_oprnds, neutral_vec);
+ }
+ else
+ {
+ for (i = 0; VEC_iterate (tree, *vec_oprnds, i, vop) && i < vec_num; i++)
+ VEC_quick_push (tree, *vec_oprnds, vop);
+ }
}
}
@@ -1576,7 +1815,7 @@ vect_get_slp_vect_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds)
void
vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0,
- VEC (tree,heap) **vec_oprnds1)
+ VEC (tree,heap) **vec_oprnds1, int reduc_index)
{
gimple first_stmt;
enum tree_code code;
@@ -1607,19 +1846,26 @@ vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0,
*vec_oprnds0 = VEC_alloc (tree, heap, number_of_vects);
/* SLP_NODE corresponds either to a group of stores or to a group of
- unary/binary operations. We don't call this function for loads. */
- if (SLP_TREE_LEFT (slp_node))
+ unary/binary operations. We don't call this function for loads.
+ For reduction defs we call vect_get_constant_vectors(), since we are
+ looking for the initial loop-invariant values. */
+ if (SLP_TREE_LEFT (slp_node) && reduc_index == -1)
/* The defs are already vectorized. */
vect_get_slp_vect_defs (SLP_TREE_LEFT (slp_node), vec_oprnds0);
else
/* Build vectors from scalar defs. */
- vect_get_constant_vectors (slp_node, vec_oprnds0, 0, number_of_vects);
+ vect_get_constant_vectors (slp_node, vec_oprnds0, 0, number_of_vects,
+ reduc_index);
if (STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)))
/* Since we don't call this function with loads, this is a group of
stores. */
return;
+ /* For reductions, we only need initial values. */
+ if (reduc_index != -1)
+ return;
+
code = gimple_assign_rhs_code (first_stmt);
if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS || !vec_oprnds1)
return;
@@ -1638,7 +1884,7 @@ vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0,
vect_get_slp_vect_defs (SLP_TREE_RIGHT (slp_node), vec_oprnds1);
else
/* Build vectors from scalar defs. */
- vect_get_constant_vectors (slp_node, vec_oprnds1, 1, number_of_vects);
+ vect_get_constant_vectors (slp_node, vec_oprnds1, 1, number_of_vects, -1);
}
@@ -2027,22 +2273,7 @@ vect_schedule_slp_instance (slp_tree node, slp_instance instance,
si = gsi_for_stmt (stmt);
is_store = vect_transform_stmt (stmt, &si, &strided_store, node, instance);
- if (is_store)
- {
- if (DR_GROUP_FIRST_DR (stmt_info))
- /* If IS_STORE is TRUE, the vectorization of the
- interleaving chain was completed - free all the stores in
- the chain. */
- vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
- else
- /* FORNOW: SLP originates only from strided stores. */
- gcc_unreachable ();
-
- return true;
- }
-
- /* FORNOW: SLP originates only from strided stores. */
- return false;
+ return is_store;
}
@@ -2075,6 +2306,26 @@ vect_schedule_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
fprintf (vect_dump, "vectorizing stmts using SLP.");
}
+ for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
+ {
+ slp_tree root = SLP_INSTANCE_TREE (instance);
+ gimple store;
+ unsigned int j;
+ gimple_stmt_iterator gsi;
+
+ for (j = 0; VEC_iterate (gimple, SLP_TREE_SCALAR_STMTS (root), j, store)
+ && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
+ {
+ if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
+ break;
+
+ /* Free the attached stmt_vec_info and remove the stmt. */
+ gsi = gsi_for_stmt (store);
+ gsi_remove (&gsi, true);
+ free_stmt_vec_info (store);
+ }
+ }
+
return is_store;
}
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 4868f73e684..988749b792f 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -1134,7 +1134,7 @@ vect_get_vec_defs (tree op0, tree op1, gimple stmt,
slp_tree slp_node)
{
if (slp_node)
- vect_get_slp_defs (slp_node, vec_oprnds0, vec_oprnds1);
+ vect_get_slp_defs (slp_node, vec_oprnds0, vec_oprnds1, -1);
else
{
tree vec_oprnd;
@@ -2519,7 +2519,7 @@ vectorizable_type_demotion (gimple stmt, gimple_stmt_iterator *gsi,
{
/* Handle uses. */
if (slp_node)
- vect_get_slp_defs (slp_node, &vec_oprnds0, NULL);
+ vect_get_slp_defs (slp_node, &vec_oprnds0, NULL, -1);
else
{
VEC_free (tree, heap, vec_oprnds0);
@@ -2819,7 +2819,7 @@ vectorizable_type_promotion (gimple stmt, gimple_stmt_iterator *gsi,
if (j == 0)
{
if (slp_node)
- vect_get_slp_defs (slp_node, &vec_oprnds0, &vec_oprnds1);
+ vect_get_slp_defs (slp_node, &vec_oprnds0, &vec_oprnds1, -1);
else
{
vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
@@ -3105,7 +3105,7 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
if (slp)
{
/* Get vectorized arguments for SLP_NODE. */
- vect_get_slp_defs (slp_node, &vec_oprnds, NULL);
+ vect_get_slp_defs (slp_node, &vec_oprnds, NULL, -1);
vec_oprnd = VEC_index (tree, vec_oprnds, 0);
}
@@ -4049,7 +4049,7 @@ vect_analyze_stmt (gimple stmt, bool *need_to_vectorize, slp_tree node)
|| vectorizable_load (stmt, NULL, NULL, NULL, NULL)
|| vectorizable_call (stmt, NULL, NULL)
|| vectorizable_store (stmt, NULL, NULL, NULL)
- || vectorizable_reduction (stmt, NULL, NULL)
+ || vectorizable_reduction (stmt, NULL, NULL, NULL)
|| vectorizable_condition (stmt, NULL, NULL, NULL, 0));
else
{
@@ -4201,8 +4201,7 @@ vect_transform_stmt (gimple stmt, gimple_stmt_iterator *gsi,
break;
case reduc_vec_info_type:
- gcc_assert (!slp_node);
- done = vectorizable_reduction (stmt, gsi, &vec_stmt);
+ done = vectorizable_reduction (stmt, gsi, &vec_stmt, slp_node);
gcc_assert (done);
break;
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 52b2a7ec59f..bd43a4bc173 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -242,6 +242,9 @@ typedef struct _loop_vec_info {
/* The unrolling factor needed to SLP the loop. In case of that pure SLP is
applied to the loop, i.e., no unrolling is needed, this is 1. */
unsigned slp_unrolling_factor;
+
+ /* Reduction cycles detected in the loop. Used in loop-aware SLP. */
+ VEC (gimple, heap) *reductions;
} *loop_vec_info;
/* Access Functions. */
@@ -266,6 +269,7 @@ typedef struct _loop_vec_info {
#define LOOP_VINFO_STRIDED_STORES(L) (L)->strided_stores
#define LOOP_VINFO_SLP_INSTANCES(L) (L)->slp_instances
#define LOOP_VINFO_SLP_UNROLLING_FACTOR(L) (L)->slp_unrolling_factor
+#define LOOP_VINFO_REDUCTIONS(L) (L)->reductions
#define LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT(L) \
VEC_length (gimple, (L)->may_misalign_stmts) > 0
@@ -844,7 +848,8 @@ extern void vect_transform_loop (loop_vec_info);
extern loop_vec_info vect_analyze_loop_form (struct loop *);
extern bool vectorizable_live_operation (gimple, gimple_stmt_iterator *,
gimple *);
-extern bool vectorizable_reduction (gimple, gimple_stmt_iterator *, gimple *);
+extern bool vectorizable_reduction (gimple, gimple_stmt_iterator *, gimple *,
+ slp_tree);
extern bool vectorizable_induction (gimple, gimple_stmt_iterator *, gimple *);
extern int vect_estimate_min_profitable_iters (loop_vec_info);
extern tree get_initial_def_for_reduction (gimple, tree, tree *);
@@ -862,7 +867,7 @@ extern bool vect_analyze_slp (loop_vec_info, bb_vec_info);
extern void vect_make_slp_decision (loop_vec_info);
extern void vect_detect_hybrid_slp (loop_vec_info);
extern void vect_get_slp_defs (slp_tree, VEC (tree,heap) **,
- VEC (tree,heap) **);
+ VEC (tree,heap) **, int);
extern LOC find_bb_location (basic_block);
extern bb_vec_info vect_slp_analyze_bb (basic_block);
extern void vect_slp_transform_bb (basic_block);