summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- gcc/ChangeLog 66
-rw-r--r-- gcc/Makefile.in 2
-rw-r--r-- gcc/testsuite/ChangeLog 26
-rw-r--r-- gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-slp-12.c 120
-rw-r--r-- gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-slp-33.c 45
-rw-r--r-- gcc/testsuite/gcc.dg/vect/costmodel/ppc/ppc-costmodel-vect.exp 2
-rw-r--r-- gcc/testsuite/gcc.dg/vect/fast-math-slp-27.c 17
-rw-r--r-- gcc/testsuite/gcc.dg/vect/no-math-errno-slp-32.c 17
-rw-r--r-- gcc/testsuite/gcc.dg/vect/no-scevccp-slp-30.c 58
-rw-r--r-- gcc/testsuite/gcc.dg/vect/no-scevccp-slp-31.c 58
-rw-r--r-- gcc/testsuite/gcc.dg/vect/no-tree-pre-slp-29.c 79
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-1.c 124
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-10.c 114
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-11.c 113
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-12a.c 105
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-12b.c 51
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-13.c 134
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-14.c 118
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-15.c 117
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-16.c 70
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-17.c 56
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-18.c 97
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-19.c 155
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-2.c 146
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-20.c 116
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-21.c 208
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-22.c 135
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-23.c 113
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-24.c 82
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-25.c 59
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-26.c 53
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-28.c 86
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-3.c 147
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-33.c 112
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-34.c 61
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-35.c 73
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-36.c 75
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-37.c 67
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-4.c 128
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-5.c 128
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-6.c 122
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-7.c 127
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-8.c 45
-rw-r--r-- gcc/testsuite/gcc.dg/vect/slp-9.c 47
-rw-r--r-- gcc/testsuite/gcc.dg/vect/vect-vfa-03.c 12
-rw-r--r-- gcc/testsuite/gcc.dg/vect/vect-vfa-slp.c 56
-rw-r--r-- gcc/testsuite/gcc.dg/vect/vect.exp 22
-rw-r--r-- gcc/testsuite/lib/target-supports.exp 20
-rw-r--r-- gcc/tree-vect-analyze.c 1060
-rw-r--r-- gcc/tree-vect-transform.c 900
-rw-r--r-- gcc/tree-vectorizer.c 11
-rw-r--r-- gcc/tree-vectorizer.h 135
52 files changed, 5780 insertions, 310 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 5b93ed1e9f1..de8ecc2b79d 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,69 @@
+2007-09-09 Ira Rosen <irar@il.ibm.com>
+
+ * tree-vectorizer.h (enum vect_def_type): Start enumeration from 1.
+ (struct _slp_tree, struct _slp_instance): Define new data structures
+ along with macros for their access.
+ (struct _loop_vec_info): Define new fields: strided_stores,
+ slp_instances, and slp_unrolling_factor along with macros for their access.
+ (enum slp_vect_type): New.
+ (struct _stmt_vec_info): Define new field, slp_type, and macros for its
+ access.
+ (STMT_VINFO_STRIDED_ACCESS): New macro.
+ (vect_free_slp_tree): Declare.
+ (vectorizable_load): Add an argument of type slp_tree.
+ (vectorizable_store, vectorizable_operation, vectorizable_conversion,
+ vectorizable_assignment): Likewise.
+ (vect_model_simple_cost, vect_model_store_cost, vect_model_load_cost):
+ Declare (make extern).
+ * tree-vectorizer.c (new_stmt_vec_info): Initialize the new field.
+ (new_loop_vec_info): Likewise.
+ (destroy_loop_vec_info): Free memory allocated for SLP structures.
+ * tree-vect-analyze.c: Include recog.h.
+ (vect_update_slp_costs_according_to_vf): New.
+ (vect_analyze_operations): Add argument for calls to vectorizable_ ()
+ functions. For not pure SLP stmts with strided access check that the
+ group size is power of 2. Update the vectorization factor according to
+ SLP. Call vect_update_slp_costs_according_to_vf.
+ (vect_analyze_group_access): New.
+ (vect_analyze_data_ref_access): Call vect_analyze_group_access.
+ (vect_free_slp_tree): New function.
+ (vect_get_and_check_slp_defs, vect_build_slp_tree, vect_print_slp_tree,
+ vect_mark_slp_stmts, vect_analyze_slp_instance, vect_analyze_slp,
+ vect_make_slp_decision, vect_detect_hybrid_slp_stmts,
+ vect_detect_hybrid_slp): Likewise.
+ (vect_analyze_loop): Call vect_analyze_slp, vect_make_slp_decision
+ and vect_detect_hybrid_slp.
+ * tree-vect-transform.c (vect_estimate_min_profitable_iters): Take
+ SLP costs into account.
+ (vect_get_cost_fields): New function.
+ (vect_model_simple_cost): Make extern, add SLP parameter and handle
+ SLP.
+ (vect_model_store_cost, vect_model_load_cost): Likewise.
+ (vect_get_constant_vectors): New function.
+ (vect_get_slp_vect_defs, vect_get_slp_defs,
+ vect_get_vec_defs_for_stmt_copy,
+ vect_get_vec_defs): Likewise.
+ (vectorizable_reduction): Don't handle SLP for now.
+ (vectorizable_call): Don't handle SLP for now. Add argument to
+ vect_model_simple_cost.
+ (vectorizable_conversion): Handle SLP (call vect_get_vec_defs to
+ get SLPed and vectorized defs). Fix indentation and spacing.
+ (vectorizable_assignment): Handle SLP.
+ (vectorizable_induction): Don't handle SLP for now.
+ (vectorizable_operation): Handle SLP.
+ (vectorizable_type_demotion): Add argument to
+ vect_model_simple_cost.
+ (vectorizable_type_promotion): Likewise.
+ (vectorizable_store, vectorizable_load): Handle SLP.
+ (vectorizable_condition): Don't handle SLP for now.
+ (vect_transform_stmt): Add a new argument for SLP. Check that there is
+ no SLP transformation required for unsupported cases. Add SLP
+ argument for supported cases.
+ (vect_remove_stores): New function.
+ (vect_schedule_slp_instance, vect_schedule_slp): Likewise.
+ (vect_transform_loop): Schedule SLP instances.
+ * Makefile.in (tree-vect-analyze.o): Depend on recog.h.
+
2007-09-09 Andrew Haley <aph@redhat.com>
* optabs.c (sign_expand_binop): Set libcall_gen = NULL in the
diff --git a/gcc/Makefile.in b/gcc/Makefile.in
index 724c452cab8..348dbc4b774 100644
--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@@ -2246,7 +2246,7 @@ tree-data-ref.o: tree-data-ref.c $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \
$(TREE_FLOW_H) $(TREE_DUMP_H) $(TIMEVAR_H) $(CFGLOOP_H) \
$(TREE_DATA_REF_H) $(SCEV_H) tree-pass.h tree-chrec.h langhooks.h
tree-vect-analyze.o: tree-vect-analyze.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
- $(TM_H) $(GGC_H) $(OPTABS_H) $(TREE_H) $(BASIC_BLOCK_H) \
+ $(TM_H) $(GGC_H) $(OPTABS_H) $(TREE_H) $(RECOG_H) $(BASIC_BLOCK_H) \
$(DIAGNOSTIC_H) $(TREE_FLOW_H) $(TREE_DUMP_H) $(TIMEVAR_H) $(CFGLOOP_H) \
tree-vectorizer.h $(TREE_DATA_REF_H) $(SCEV_H) $(EXPR_H) tree-chrec.h
tree-vect-patterns.o: tree-vect-patterns.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 06d1e728f1f..4d294e10fac 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,29 @@
+2007-09-09 Ira Rosen <irar@il.ibm.com>
+
+ * gcc.dg/vect/vect.exp: Compile tests starting with slp-.
+ Remove "vect" part from test names for -ffast-math, -fno-math-errno,
+ -fwrapv, -ftrapv tests. Add -fno-tree-scev-cprop for slp- tests.
+ Compile tests with -fno-tree-pre.
+ * gcc.dg/vect/costmodel/ppc/ppc-costmodel-vect.exp: Run SLP tests.
+ * lib/target-supports.exp (check_effective_target_vect_strided): New.
+ * gcc.dg/vect/slp-1.c, gcc.dg/vect/slp-2.c, gcc.dg/vect/slp-3.c,
+ gcc.dg/vect/slp-4.c, gcc.dg/vect/slp-5.c, gcc.dg/vect/slp-6.c,
+ gcc.dg/vect/slp-7.c, gcc.dg/vect/slp-8.c, gcc.dg/vect/slp-9.c,
+ gcc.dg/vect/slp-10.c, gcc.dg/vect/slp-11.c, gcc.dg/vect/slp-12a.c, gcc.dg/vect/slp-12b.c,
+ gcc.dg/vect/slp-13.c, gcc.dg/vect/slp-14.c, gcc.dg/vect/slp-15.c,
+ gcc.dg/vect/slp-16.c, gcc.dg/vect/slp-17.c, gcc.dg/vect/slp-18.c,
+ gcc.dg/vect/slp-19.c, gcc.dg/vect/slp-20.c, gcc.dg/vect/slp-21.c,
+ gcc.dg/vect/slp-22.c, gcc.dg/vect/slp-23.c, gcc.dg/vect/slp-24.c,
+ gcc.dg/vect/slp-25.c, gcc.dg/vect/slp-26.c, gcc.dg/vect/slp-28.c,
+ gcc.dg/vect/fast-math-slp-27.c, gcc.dg/vect/no-tree-pre-slp-29.c,
+ gcc.dg/vect/no-scevccp-slp-30.c, gcc.dg/vect/no-scevccp-slp-31.c,
+ gcc.dg/vect/no-math-errno-slp-32.c, gcc.dg/vect/slp-33.c,
+ gcc.dg/vect/slp-34.c, gcc.dg/vect/slp-35.c, gcc.dg/vect/slp-36.c,
+ gcc.dg/vect/slp-37.c, gcc.dg/vect/vect-vfa-slp.c,
+ gcc.dg/vect/costmodel/ppc/costmodel-slp-12.c,
+ gcc.dg/vect/costmodel/ppc/costmodel-slp-33.c: New testcases.
+ * gcc.dg/vect/vect-vfa-03.c: Change the test to prevent SLP.
+
2007-09-09 Joseph Myers <joseph@codesourcery.com>
* lib/file-format.exp (gcc_target_object_format): Use remote_exec
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-slp-12.c b/gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-slp-12.c
new file mode 100644
index 00000000000..752c4f61390
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-slp-12.c
@@ -0,0 +1,120 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "../../tree-vect.h"
+
+#define N 8
+
+int
+main1 ()
+{
+ int i;
+ unsigned int out[N*8], a0, a1, a2, a3, a4, a5, a6, a7, b1, b0, b2, b3, b4, b5, b6, b7;
+ unsigned int in[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63};
+ float out2[N*8], fa[N*4];
+ unsigned int ia[N], ib[N*2];
+
+ for (i = 0; i < N; i++)
+ {
+
+ a0 = in[i*8] + 5;
+ a1 = in[i*8 + 1] + 6;
+ a2 = in[i*8 + 2] + 7;
+ a3 = in[i*8 + 3] + 8;
+ a4 = in[i*8 + 4] + 9;
+ a5 = in[i*8 + 5] + 10;
+ a6 = in[i*8 + 6] + 11;
+ a7 = in[i*8 + 7] + 12;
+
+ b0 = a0 * 3;
+ b1 = a1 * 2;
+ b2 = a2 * 12;
+ b3 = a3 * 5;
+ b4 = a4 * 8;
+ b5 = a5 * 4;
+ b6 = a6 * 3;
+ b7 = a7 * 2;
+
+ out[i*8] = b0 - 2;
+ out[i*8 + 1] = b1 - 3;
+ out[i*8 + 2] = b2 - 2;
+ out[i*8 + 3] = b3 - 1;
+ out[i*8 + 4] = b4 - 8;
+ out[i*8 + 5] = b5 - 7;
+ out[i*8 + 6] = b6 - 3;
+ out[i*8 + 7] = b7 - 7;
+
+ ia[i] = b6;
+ }
+
+ /* check results: */
+ for (i = 0; i < N; i++)
+ {
+ if (out[i*8] != (in[i*8] + 5) * 3 - 2
+ || out[i*8 + 1] != (in[i*8 + 1] + 6) * 2 - 3
+ || out[i*8 + 2] != (in[i*8 + 2] + 7) * 12 - 2
+ || out[i*8 + 3] != (in[i*8 + 3] + 8) * 5 - 1
+ || out[i*8 + 4] != (in[i*8 + 4] + 9) * 8 - 8
+ || out[i*8 + 5] != (in[i*8 + 5] + 10) * 4 - 7
+ || out[i*8 + 6] != (in[i*8 + 6] + 11) * 3 - 3
+ || out[i*8 + 7] != (in[i*8 + 7] + 12) * 2 - 7
+ || ia[i] != (in[i*8 + 6] + 11) * 3)
+ abort ();
+ }
+
+ for (i = 0; i < N*2; i++)
+ {
+ out[i*4] = (in[i*4] + 2) * 3;
+ out[i*4 + 1] = (in[i*4 + 1] + 2) * 7;
+ out[i*4 + 2] = (in[i*4 + 2] + 7) * 3;
+ out[i*4 + 3] = (in[i*4 + 3] + 7) * 7;
+
+ ib[i] = 7;
+ }
+
+ /* check results: */
+ for (i = 0; i < N*2; i++)
+ {
+ if (out[i*4] != (in[i*4] + 2) * 3
+ || out[i*4 + 1] != (in[i*4 + 1] + 2) * 7
+ || out[i*4 + 2] != (in[i*4 + 2] + 7) * 3
+ || out[i*4 + 3] != (in[i*4 + 3] + 7) * 7
+ || ib[i] != 7)
+ abort ();
+ }
+
+ for (i = 0; i < N*4; i++)
+ {
+ out2[i*2] = (float) (in[i*2] * 2 + 11) ;
+ out2[i*2 + 1] = (float) (in[i*2 + 1] * 3 + 7);
+
+ fa[i] = (float) in[i*2+1];
+ }
+
+ /* check results: */
+ for (i = 0; i < N*4; i++)
+ {
+ if (out2[i*2] != (float) (in[i*2] * 2 + 11)
+ || out2[i*2 + 1] != (float) (in[i*2 + 1] * 3 + 7)
+ || fa[i] != (float) in[i*2+1])
+ abort ();
+ }
+
+
+ return 0;
+}
+
+int main (void)
+{
+ check_vect ();
+
+ main1 ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" {target { vect_strided && vect_int_mult } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" {target { vect_strided && vect_int_mult } } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-slp-33.c b/gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-slp-33.c
new file mode 100644
index 00000000000..9cae12fdbb3
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-slp-33.c
@@ -0,0 +1,45 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include "../../tree-vect.h"
+
+#define N 32
+
+struct s{
+ short a; /* aligned */
+ char b[N-1]; /* unaligned (offset 2B) */
+};
+
+int main1 ()
+{
+ int i;
+ struct s tmp;
+
+ /* unaligned */
+ for (i = 0; i < N/4; i++)
+ {
+ tmp.b[2*i] = 5;
+ tmp.b[2*i+1] = 15;
+ }
+
+ /* check results: */
+ for (i = 0; i <N/4; i++)
+ {
+ if (tmp.b[2*i] != 5
+ || tmp.b[2*i+1] != 15)
+ abort ();
+ }
+
+ return 0;
+}
+
+int main (void)
+{
+ check_vect ();
+
+ return main1 ();
+}
+
+/* { dg-final { scan-tree-dump-times "vectorization not profitable" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 0 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/ppc/ppc-costmodel-vect.exp b/gcc/testsuite/gcc.dg/vect/costmodel/ppc/ppc-costmodel-vect.exp
index 63f5349ee84..4f710634e8a 100644
--- a/gcc/testsuite/gcc.dg/vect/costmodel/ppc/ppc-costmodel-vect.exp
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/ppc/ppc-costmodel-vect.exp
@@ -64,6 +64,8 @@ dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/costmodel-pr*.\[cS\]]] \
"" $DEFAULT_VECTCFLAGS
dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/costmodel-vect-*.\[cS\]]] \
"" $DEFAULT_VECTCFLAGS
+dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/costmodel-slp-*.\[cS\]]] \
+ "" $DEFAULT_VECTCFLAGS
#### Tests with special options
global SAVED_DEFAULT_VECTCFLAGS
diff --git a/gcc/testsuite/gcc.dg/vect/fast-math-slp-27.c b/gcc/testsuite/gcc.dg/vect/fast-math-slp-27.c
new file mode 100644
index 00000000000..d4c7d19925b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/fast-math-slp-27.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_float } */
+
+float x[256];
+
+void foo(void)
+{
+ int i;
+ for (i=0; i<128; ++i)
+ {
+ x[2*i] = x[2*i] * x[2*i];
+ x[2*i+1] = x[2*i+1] * x[2*i+1];
+ }
+}
+
+/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" { target vect_strided } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/no-math-errno-slp-32.c b/gcc/testsuite/gcc.dg/vect/no-math-errno-slp-32.c
new file mode 100644
index 00000000000..c952e7f1b11
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/no-math-errno-slp-32.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_double } */
+
+double x[256];
+
+void foo(void)
+{
+ int i;
+ for (i=0; i<128; ++i)
+ {
+ x[2*i] = __builtin_pow (x[2*i], 0.5);
+ x[2*i+1] = __builtin_pow (x[2*i+1], 0.5);
+ }
+}
+
+/* { dg-final { scan-tree-dump "pattern recognized" "vect" { xfail spu*-*-* } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/no-scevccp-slp-30.c b/gcc/testsuite/gcc.dg/vect/no-scevccp-slp-30.c
new file mode 100644
index 00000000000..30cb947bed0
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/no-scevccp-slp-30.c
@@ -0,0 +1,58 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define N 128
+
+int
+main1 ()
+{
+ int i, j;
+ unsigned short out[N*8], a[N];
+
+ for (j = 0; j < N; j++)
+ {
+ for (i = 0; i < N; i++)
+ {
+ out[i*4] = 8;
+ out[i*4 + 1] = 18;
+ out[i*4 + 2] = 28;
+ out[i*4 + 3] = 38;
+ }
+ a[j] = 8;
+ }
+
+ /* check results: */
+ for (j = 0; j < N; j++)
+ {
+ for (i = 0; i < N; i++)
+ {
+ if (out[i*4] != 8
+ || out[i*4 + 1] != 18
+ || out[i*4 + 2] != 28
+ || out[i*4 + 3] != 38)
+ abort();
+ }
+
+ if (a[j] != 8)
+ abort ();
+ }
+
+ return 0;
+}
+
+int main (void)
+{
+ check_vect ();
+
+ main1 ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/testsuite/gcc.dg/vect/no-scevccp-slp-31.c b/gcc/testsuite/gcc.dg/vect/no-scevccp-slp-31.c
new file mode 100644
index 00000000000..2e43db13601
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/no-scevccp-slp-31.c
@@ -0,0 +1,58 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define N 128
+
+int
+main1 ()
+{
+ int i, j;
+ unsigned short out[N*8], a[N][N];
+
+ for (i = 0; i < N; i++)
+ {
+ for (j = 0; j < N; j++)
+ {
+ a[i][j] = 8;
+ }
+ out[i*4] = 8;
+ out[i*4 + 1] = 18;
+ out[i*4 + 2] = 28;
+ out[i*4 + 3] = 38;
+ }
+
+ /* check results: */
+ for (i = 0; i < N; i++)
+ {
+ for (j = 0; j < N; j++)
+ {
+ if (a[i][j] != 8)
+ abort ();
+ }
+ if (out[i*4] != 8
+ || out[i*4 + 1] != 18
+ || out[i*4 + 2] != 28
+ || out[i*4 + 3] != 38)
+ abort();
+ }
+
+
+ return 0;
+}
+
+int main (void)
+{
+ check_vect ();
+
+ main1 ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/testsuite/gcc.dg/vect/no-tree-pre-slp-29.c b/gcc/testsuite/gcc.dg/vect/no-tree-pre-slp-29.c
new file mode 100644
index 00000000000..9c75e9dc5b0
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/no-tree-pre-slp-29.c
@@ -0,0 +1,79 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define N 8
+
+unsigned short in2[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63};
+
+int
+main1 (unsigned short *in)
+{
+ int i;
+ unsigned short out[N*8];
+
+ for (i = 0; i < N; i++)
+ {
+ out[i*4] = in[i*4];
+ out[i*4 + 1] = in[i*4 + 1];
+ out[i*4 + 2] = in[i*4 + 2];
+ out[i*4 + 3] = in[i*4 + 3];
+ }
+
+ /* check results: */
+ for (i = 0; i < N; i++)
+ {
+ if (out[i*4] != in[i*4]
+ || out[i*4 + 1] != in[i*4 + 1]
+ || out[i*4 + 2] != in[i*4 + 2]
+ || out[i*4 + 3] != in[i*4 + 3])
+ abort ();
+ }
+
+ return 0;
+}
+
+int
+main2 (unsigned short * __restrict__ in, unsigned short * __restrict__ out)
+{
+ int i;
+
+ for (i = 0; i < N; i++)
+ {
+ out[i*4] = in[i*4];
+ out[i*4 + 1] = in[i*4 + 1];
+ out[i*4 + 2] = in[i*4 + 2];
+ out[i*4 + 3] = in[i*4 + 3];
+ }
+
+ /* check results: */
+ for (i = 0; i < N; i++)
+ {
+ if (out[i*4] != in[i*4]
+ || out[i*4 + 1] != in[i*4 + 1]
+ || out[i*4 + 2] != in[i*4 + 2]
+ || out[i*4 + 3] != in[i*4 + 3])
+ abort ();
+ }
+
+ return 0;
+}
+
+int main (void)
+{
+ unsigned short out[N*8];
+
+ check_vect ();
+
+ main1 (&in2[5]);
+ main2 (&in2[3], &out[3]);
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { xfail vect_no_align } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { xfail vect_no_align } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/testsuite/gcc.dg/vect/slp-1.c b/gcc/testsuite/gcc.dg/vect/slp-1.c
new file mode 100644
index 00000000000..f7e20973818
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-1.c
@@ -0,0 +1,124 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define N 128
+
+int
+main1 ()
+{
+ int i;
+ unsigned short out[N*8];
+
+ for (i = 0; i < N; i++)
+ {
+ out[i*4] = 8;
+ out[i*4 + 1] = 18;
+ out[i*4 + 2] = 28;
+ out[i*4 + 3] = 38;
+ }
+
+ /* check results: */
+ for (i = 0; i < N; i++)
+ {
+ if (out[i*4] != 8
+ || out[i*4 + 1] != 18
+ || out[i*4 + 2] != 28
+ || out[i*4 + 3] != 38)
+ abort ();
+ }
+
+ for (i = 0; i < N; i++)
+ {
+ out[i*8] = 8;
+ out[i*8 + 1] = 7;
+ out[i*8 + 2] = 81;
+ out[i*8 + 3] = 28;
+ out[i*8 + 4] = 18;
+ out[i*8 + 5] = 85;
+ out[i*8 + 6] = 5;
+ out[i*8 + 7] = 4;
+ }
+
+ /* check results: */
+ for (i = 0; i < N; i++)
+ {
+ if (out[i*8] != 8
+ || out[i*8 + 1] != 7
+ || out[i*8 + 2] != 81
+ || out[i*8 + 3] != 28
+ || out[i*8 + 4] != 18
+ || out[i*8 + 5] != 85
+ || out[i*8 + 6] != 5
+ || out[i*8 + 7] != 4)
+ abort ();
+ }
+
+ /* SLP with unrolling by 8. */
+ for (i = 0; i < N; i++)
+ {
+ out[i*5] = 8;
+ out[i*5 + 1] = 7;
+ out[i*5 + 2] = 81;
+ out[i*5 + 3] = 28;
+ out[i*5 + 4] = 18;
+ }
+
+ /* check results: */
+ for (i = 0; i < N; i++)
+ {
+ if (out[i*5] != 8
+ || out[i*5 + 1] != 7
+ || out[i*5 + 2] != 81
+ || out[i*5 + 3] != 28
+ || out[i*5 + 4] != 18)
+ abort ();
+ }
+
+ /* SLP with unrolling by 8. */
+ for (i = 0; i < N/2; i++)
+ {
+ out[i*9] = 8;
+ out[i*9 + 1] = 7;
+ out[i*9 + 2] = 81;
+ out[i*9 + 3] = 28;
+ out[i*9 + 4] = 18;
+ out[i*9 + 5] = 85;
+ out[i*9 + 6] = 5;
+ out[i*9 + 7] = 4;
+ out[i*9 + 8] = 14;
+ }
+
+ /* check results: */
+ for (i = 0; i < N/2; i++)
+ {
+ if (out[i*9] != 8
+ || out[i*9 + 1] != 7
+ || out[i*9 + 2] != 81
+ || out[i*9 + 3] != 28
+ || out[i*9 + 4] != 18
+ || out[i*9 + 5] != 85
+ || out[i*9 + 6] != 5
+ || out[i*9 + 7] != 4
+ || out[i*9 + 8] != 14)
+ abort ();
+ }
+
+ return 0;
+}
+
+int main (void)
+{
+ check_vect ();
+
+ main1 ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 4 loops" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/testsuite/gcc.dg/vect/slp-10.c b/gcc/testsuite/gcc.dg/vect/slp-10.c
new file mode 100644
index 00000000000..737e1e7860d
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-10.c
@@ -0,0 +1,114 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define N 8
+
+int
+main1 ()
+{
+ int i;
+ unsigned int out[N*8], a0, a1, a2, a3, a4, a5, a6, a7, b1, b0, b2, b3, b4, b5, b6, b7;
+ unsigned int in[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63};
+ float out2[N*8];
+
+ for (i = 0; i < N; i++)
+ {
+
+ a0 = in[i*8] + 5;
+ a1 = in[i*8 + 1] + 6;
+ a2 = in[i*8 + 2] + 7;
+ a3 = in[i*8 + 3] + 8;
+ a4 = in[i*8 + 4] + 9;
+ a5 = in[i*8 + 5] + 10;
+ a6 = in[i*8 + 6] + 11;
+ a7 = in[i*8 + 7] + 12;
+
+ b0 = a0 * 3;
+ b1 = a1 * 2;
+ b2 = a2 * 12;
+ b3 = a3 * 5;
+ b4 = a4 * 8;
+ b5 = a5 * 4;
+ b6 = a6 * 3;
+ b7 = a7 * 2;
+
+ out[i*8] = b0 - 2;
+ out[i*8 + 1] = b1 - 3;
+ out[i*8 + 2] = b2 - 2;
+ out[i*8 + 3] = b3 - 1;
+ out[i*8 + 4] = b4 - 8;
+ out[i*8 + 5] = b5 - 7;
+ out[i*8 + 6] = b6 - 3;
+ out[i*8 + 7] = b7 - 7;
+ }
+
+ /* check results: */
+ for (i = 0; i < N; i++)
+ {
+ if (out[i*8] != (in[i*8] + 5) * 3 - 2
+ || out[i*8 + 1] != (in[i*8 + 1] + 6) * 2 - 3
+ || out[i*8 + 2] != (in[i*8 + 2] + 7) * 12 - 2
+ || out[i*8 + 3] != (in[i*8 + 3] + 8) * 5 - 1
+ || out[i*8 + 4] != (in[i*8 + 4] + 9) * 8 - 8
+ || out[i*8 + 5] != (in[i*8 + 5] + 10) * 4 - 7
+ || out[i*8 + 6] != (in[i*8 + 6] + 11) * 3 - 3
+ || out[i*8 + 7] != (in[i*8 + 7] + 12) * 2 - 7)
+ abort ();
+ }
+
+ for (i = 0; i < N*2; i++)
+ {
+ out[i*4] = (in[i*4] + 2) * 3;
+ out[i*4 + 1] = (in[i*4 + 1] + 2) * 7;
+ out[i*4 + 2] = (in[i*4 + 2] + 7) * 3;
+ out[i*4 + 3] = (in[i*4 + 3] + 7) * 7;
+ }
+
+ /* check results: */
+ for (i = 0; i < N*2; i++)
+ {
+ if (out[i*4] != (in[i*4] + 2) * 3
+ || out[i*4 + 1] != (in[i*4 + 1] + 2) * 7
+ || out[i*4 + 2] != (in[i*4 + 2] + 7) * 3
+ || out[i*4 + 3] != (in[i*4 + 3] + 7) * 7)
+ abort ();
+ }
+
+ for (i = 0; i < N*4; i++)
+ {
+ out2[i*2] = (float) (in[i*2] * 2 + 5) ;
+ out2[i*2 + 1] = (float) (in[i*2 + 1] * 3 + 7);
+ }
+
+ /* check results: */
+ for (i = 0; i < N*4; i++)
+ {
+ if (out2[i*2] != (float) (in[i*2] * 2 + 5)
+ || out2[i*2 + 1] != (float) (in[i*2 + 1] * 3 + 7))
+ abort ();
+ }
+
+
+ return 0;
+}
+
+int main (void)
+{
+ check_vect ();
+
+ main1 ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" {target {vect_intfloat_cvt && vect_int_mult} } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" {target {{! { vect_intfloat_cvt}} && vect_int_mult} } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" {target {{! { vect_intfloat_cvt}} && {!{vect_int_mult}}} } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" {target {vect_intfloat_cvt && vect_int_mult} } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" {target {{! { vect_intfloat_cvt}} && vect_int_mult} } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" {target {{! { vect_intfloat_cvt}} && {!{vect_int_mult}}} } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/testsuite/gcc.dg/vect/slp-11.c b/gcc/testsuite/gcc.dg/vect/slp-11.c
new file mode 100644
index 00000000000..118818c97bd
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-11.c
@@ -0,0 +1,113 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define N 8
+
+int
+main1 ()
+{
+ int i;
+ unsigned int out[N*8], a0, a1, a2, a3, a4, a5, a6, a7, b1, b0, b2, b3, b4, b5, b6, b7;
+ unsigned int in[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63};
+ float out2[N*8];
+
+ /* Different operations - not SLPable. */
+ for (i = 0; i < N; i++)
+ {
+ a0 = in[i*8] + 5;
+ a1 = in[i*8 + 1] * 6;
+ a2 = in[i*8 + 2] + 7;
+ a3 = in[i*8 + 3] + 8;
+ a4 = in[i*8 + 4] + 9;
+ a5 = in[i*8 + 5] + 10;
+ a6 = in[i*8 + 6] + 11;
+ a7 = in[i*8 + 7] + 12;
+
+ b0 = a0 * 3;
+ b1 = a1 * 2;
+ b2 = a2 * 12;
+ b3 = a3 * 5;
+ b4 = a4 * 8;
+ b5 = a5 * 4;
+ b6 = a6 * 3;
+ b7 = a7 * 2;
+
+ out[i*8] = b0 - 2;
+ out[i*8 + 1] = b1 - 3;
+ out[i*8 + 2] = b2 - 2;
+ out[i*8 + 3] = b3 - 1;
+ out[i*8 + 4] = b4 - 8;
+ out[i*8 + 5] = b5 - 7;
+ out[i*8 + 6] = b6 - 3;
+ out[i*8 + 7] = b7 - 7;
+ }
+
+ /* check results: */
+ for (i = 0; i < N; i++)
+ {
+ if (out[i*8] != (in[i*8] + 5) * 3 - 2
+ || out[i*8 + 1] != (in[i*8 + 1] * 6) * 2 - 3
+ || out[i*8 + 2] != (in[i*8 + 2] + 7) * 12 - 2
+ || out[i*8 + 3] != (in[i*8 + 3] + 8) * 5 - 1
+ || out[i*8 + 4] != (in[i*8 + 4] + 9) * 8 - 8
+ || out[i*8 + 5] != (in[i*8 + 5] + 10) * 4 - 7
+ || out[i*8 + 6] != (in[i*8 + 6] + 11) * 3 - 3
+ || out[i*8 + 7] != (in[i*8 + 7] + 12) * 2 - 7)
+ abort ();
+ }
+
+ /* Requires permutation - not SLPable. */
+ for (i = 0; i < N*2; i++)
+ {
+ out[i*4] = (in[i*4] + 2) * 3;
+ out[i*4 + 1] = (in[i*4 + 2] + 2) * 7;
+ out[i*4 + 2] = (in[i*4 + 1] + 7) * 3;
+ out[i*4 + 3] = (in[i*4 + 3] + 3) * 4;
+ }
+
+ /* check results: */
+ for (i = 0; i < N*2; i++)
+ {
+ if (out[i*4] != (in[i*4] + 2) * 3
+ || out[i*4 + 1] != (in[i*4 + 2] + 2) * 7
+ || out[i*4 + 2] != (in[i*4 + 1] + 7) * 3
+ || out[i*4 + 3] != (in[i*4 + 3] + 3) * 4)
+ abort ();
+ }
+
+ /* Different operations - not SLPable. */
+ for (i = 0; i < N*4; i++)
+ {
+ out2[i*2] = ((float) in[i*2] * 2 + 6) ;
+ out2[i*2 + 1] = (float) (in[i*2 + 1] * 3 + 7);
+ }
+
+ /* check results: */
+ for (i = 0; i < N*4; i++)
+ {
+ if (out2[i*2] != ((float) in[i*2] * 2 + 6)
+ || out2[i*2 + 1] != (float) (in[i*2 + 1] * 3 + 7))
+ abort ();
+ }
+
+
+ return 0;
+}
+
+int main (void)
+{
+ check_vect ();
+
+ main1 ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { target { vect_strided && vect_int_mult } } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" {target { ! { vect_int_mult && vect_strided } } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/testsuite/gcc.dg/vect/slp-12a.c b/gcc/testsuite/gcc.dg/vect/slp-12a.c
new file mode 100644
index 00000000000..066bf7ff9a3
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-12a.c
@@ -0,0 +1,105 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define N 8
+
+int
+main1 ()
+{
+ int i;
+ unsigned int out[N*8], a0, a1, a2, a3, a4, a5, a6, a7, b1, b0, b2, b3, b4, b5, b6, b7;
+ unsigned int in[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63};
+ unsigned int ia[N], ib[N*2];
+
+ for (i = 0; i < N; i++)
+ {
+
+ a0 = in[i*8] + 5;
+ a1 = in[i*8 + 1] + 6;
+ a2 = in[i*8 + 2] + 7;
+ a3 = in[i*8 + 3] + 8;
+ a4 = in[i*8 + 4] + 9;
+ a5 = in[i*8 + 5] + 10;
+ a6 = in[i*8 + 6] + 11;
+ a7 = in[i*8 + 7] + 12;
+
+ b0 = a0 * 3;
+ b1 = a1 * 2;
+ b2 = a2 * 12;
+ b3 = a3 * 5;
+ b4 = a4 * 8;
+ b5 = a5 * 4;
+ b6 = a6 * 3;
+ b7 = a7 * 2;
+
+ out[i*8] = b0 - 2;
+ out[i*8 + 1] = b1 - 3;
+ out[i*8 + 2] = b2 - 2;
+ out[i*8 + 3] = b3 - 1;
+ out[i*8 + 4] = b4 - 8;
+ out[i*8 + 5] = b5 - 7;
+ out[i*8 + 6] = b6 - 3;
+ out[i*8 + 7] = b7 - 7;
+
+ ia[i] = b6;
+ }
+
+ /* check results: */
+ for (i = 0; i < N; i++)
+ {
+ if (out[i*8] != (in[i*8] + 5) * 3 - 2
+ || out[i*8 + 1] != (in[i*8 + 1] + 6) * 2 - 3
+ || out[i*8 + 2] != (in[i*8 + 2] + 7) * 12 - 2
+ || out[i*8 + 3] != (in[i*8 + 3] + 8) * 5 - 1
+ || out[i*8 + 4] != (in[i*8 + 4] + 9) * 8 - 8
+ || out[i*8 + 5] != (in[i*8 + 5] + 10) * 4 - 7
+ || out[i*8 + 6] != (in[i*8 + 6] + 11) * 3 - 3
+ || out[i*8 + 7] != (in[i*8 + 7] + 12) * 2 - 7
+ || ia[i] != (in[i*8 + 6] + 11) * 3)
+ abort ();
+ }
+
+ for (i = 0; i < N*2; i++)
+ {
+ out[i*4] = (in[i*4] + 2) * 3;
+ out[i*4 + 1] = (in[i*4 + 1] + 2) * 7;
+ out[i*4 + 2] = (in[i*4 + 2] + 7) * 3;
+ out[i*4 + 3] = (in[i*4 + 3] + 7) * 7;
+
+ ib[i] = 7;
+ }
+
+ /* check results: */
+ for (i = 0; i < N*2; i++)
+ {
+ if (out[i*4] != (in[i*4] + 2) * 3
+ || out[i*4 + 1] != (in[i*4 + 1] + 2) * 7
+ || out[i*4 + 2] != (in[i*4 + 2] + 7) * 3
+ || out[i*4 + 3] != (in[i*4 + 3] + 7) * 7
+ || ib[i] != 7)
+ abort ();
+ }
+
+ return 0;
+}
+
+int main (void)
+{
+ check_vect ();
+
+ main1 ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" {target { vect_strided && vect_int_mult} } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" {target { {! {vect_strided}} && vect_int_mult } } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" {target { ! vect_int_mult } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" {target { vect_strided && vect_int_mult } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" {target { {! {vect_strided}} && vect_int_mult } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" {target { ! vect_int_mult } } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/testsuite/gcc.dg/vect/slp-12b.c b/gcc/testsuite/gcc.dg/vect/slp-12b.c
new file mode 100644
index 00000000000..39570016f38
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-12b.c
@@ -0,0 +1,51 @@
+/* { dg-require-effective-target vect_intfloat_cvt } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define N 64
+
+int
+main1 ()
+{
+ int i;
+ unsigned int in[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63};
+ float out2[N*8], fa[N*4];
+
+ for (i = 0; i < N; i++)
+ {
+ out2[i*2] = (float) (in[i*2] * 2 + 11) ;
+ out2[i*2 + 1] = (float) (in[i*2 + 1] * 3 + 7);
+
+ fa[i] = (float) in[i*2+1];
+ }
+
+ /* check results: */
+ for (i = 0; i < N; i++)
+ {
+ if (out2[i*2] != (float) (in[i*2] * 2 + 11)
+ || out2[i*2 + 1] != (float) (in[i*2 + 1] * 3 + 7)
+ || fa[i] != (float) in[i*2+1])
+ abort ();
+ }
+
+
+ return 0;
+}
+
+int main (void)
+{
+ check_vect ();
+
+ main1 ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" {target { vect_strided && vect_int_mult } } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" {target { { ! { vect_int_mult }} || { ! {vect_strided}}} } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" {target { vect_strided && vect_int_mult } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" {target { { ! { vect_int_mult }} || { ! {vect_strided}}} } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/testsuite/gcc.dg/vect/slp-13.c b/gcc/testsuite/gcc.dg/vect/slp-13.c
new file mode 100644
index 00000000000..0041526b972
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-13.c
@@ -0,0 +1,134 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define N 8
+
+int
+main1 ()
+{
+ int i;
+ unsigned short out[N*8];
+ unsigned short in[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63};
+ unsigned int in2[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63};
+ unsigned int out2[N*8];
+
+ /* Induction is not SLPable yet. */
+ for (i = 0; i < N; i++)
+ {
+ out[i*8] = in[i*8] + i;
+ out[i*8 + 1] = in[i*8 + 1] + i;
+ out[i*8 + 2] = in[i*8 + 2] + i;
+ out[i*8 + 3] = in[i*8 + 3] + i;
+ out[i*8 + 4] = in[i*8 + 4] + i;
+ out[i*8 + 5] = in[i*8 + 5] + i;
+ out[i*8 + 6] = in[i*8 + 6] + i;
+ out[i*8 + 7] = in[i*8 + 7] + i;
+ }
+
+ /* check results: */
+ for (i = 0; i < N; i++)
+ {
+ if (out[i*8] != in[i*8] + i
+ || out[i*8 + 1] != in[i*8 + 1] + i
+ || out[i*8 + 2] != in[i*8 + 2] + i
+ || out[i*8 + 3] != in[i*8 + 3] + i
+ || out[i*8 + 4] != in[i*8 + 4] + i
+ || out[i*8 + 5] != in[i*8 + 5] + i
+ || out[i*8 + 6] != in[i*8 + 6] + i
+ || out[i*8 + 7] != in[i*8 + 7] + i)
+ abort ();
+ }
+
+ /* Induction is not SLPable yet and strided group size must be a power of 2
+ to get vectorized. */
+ for (i = 0; i < N/2; i++)
+ {
+ out2[i*12] = in2[i*12] + i;
+ out2[i*12 + 1] = in2[i*12 + 1] + i;
+ out2[i*12 + 2] = in2[i*12 + 2] + i;
+ out2[i*12 + 3] = in2[i*12 + 3] + i;
+ out2[i*12 + 4] = in2[i*12 + 4] + i;
+ out2[i*12 + 5] = in2[i*12 + 5] + i;
+ out2[i*12 + 6] = in2[i*12 + 6] + i;
+ out2[i*12 + 7] = in2[i*12 + 7] + i;
+ out2[i*12 + 8] = in2[i*12 + 8] + i;
+ out2[i*12 + 9] = in2[i*12 + 9] + i;
+ out2[i*12 + 10] = in2[i*12 + 10] + i;
+ out2[i*12 + 11] = in2[i*12 + 11] + i;
+ }
+
+ /* check results: */
+ for (i = 0; i < N/2; i++)
+ {
+ if (out2[i*12] != in2[i*12] + i
+ || out2[i*12 + 1] != in2[i*12 + 1] + i
+ || out2[i*12 + 2] != in2[i*12 + 2] + i
+ || out2[i*12 + 3] != in2[i*12 + 3] + i
+ || out2[i*12 + 4] != in2[i*12 + 4] + i
+ || out2[i*12 + 5] != in2[i*12 + 5] + i
+ || out2[i*12 + 6] != in2[i*12 + 6] + i
+ || out2[i*12 + 7] != in2[i*12 + 7] + i
+ || out2[i*12 + 8] != in2[i*12 + 8] + i
+ || out2[i*12 + 9] != in2[i*12 + 9] + i
+ || out2[i*12 + 10] != in2[i*12 + 10] + i
+ || out2[i*12 + 11] != in2[i*12 + 11] + i)
+ abort ();
+ }
+
+ /* Not power of 2 but SLPable. */
+ for (i = 0; i < N/2; i++)
+ {
+ out2[i*12] = in2[i*12] + 1;
+ out2[i*12 + 1] = in2[i*12 + 1] + 2;
+ out2[i*12 + 2] = in2[i*12 + 2] + 3;
+ out2[i*12 + 3] = in2[i*12 + 3] + 4;
+ out2[i*12 + 4] = in2[i*12 + 4] + 5;
+ out2[i*12 + 5] = in2[i*12 + 5] + 6;
+ out2[i*12 + 6] = in2[i*12 + 6] + 7;
+ out2[i*12 + 7] = in2[i*12 + 7] + 8;
+ out2[i*12 + 8] = in2[i*12 + 8] + 9;
+ out2[i*12 + 9] = in2[i*12 + 9] + 10;
+ out2[i*12 + 10] = in2[i*12 + 10] + 11;
+ out2[i*12 + 11] = in2[i*12 + 11] + 12;
+ }
+
+ /* check results: */
+ for (i = 0; i < N/2; i++)
+ {
+ if (out2[i*12] != in2[i*12] + 1
+ || out2[i*12 + 1] != in2[i*12 + 1] + 2
+ || out2[i*12 + 2] != in2[i*12 + 2] + 3
+ || out2[i*12 + 3] != in2[i*12 + 3] + 4
+ || out2[i*12 + 4] != in2[i*12 + 4] + 5
+ || out2[i*12 + 5] != in2[i*12 + 5] + 6
+ || out2[i*12 + 6] != in2[i*12 + 6] + 7
+ || out2[i*12 + 7] != in2[i*12 + 7] + 8
+ || out2[i*12 + 8] != in2[i*12 + 8] + 9
+ || out2[i*12 + 9] != in2[i*12 + 9] + 10
+ || out2[i*12 + 10] != in2[i*12 + 10] + 11
+ || out2[i*12 + 11] != in2[i*12 + 11] + 12)
+ abort ();
+ }
+
+
+
+ return 0;
+}
+
+int main (void)
+{
+ check_vect ();
+
+ main1 ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { target { vect_interleave && vect_extract_even_odd } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/testsuite/gcc.dg/vect/slp-14.c b/gcc/testsuite/gcc.dg/vect/slp-14.c
new file mode 100644
index 00000000000..62610dc0233
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-14.c
@@ -0,0 +1,118 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define N 8
+
+int
+main1 (int n)
+{
+ int i;
+ unsigned int out[N*8], a0, a1, a2, a3, a4, a5, a6, a7, b1, b0, b2, b3, b4, b5, b6, b7;
+ unsigned int in[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63};
+ unsigned short in2[N*16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63};
+ unsigned short out2[N*16];
+
+ /* Multiple types are not SLPable yet. */
+ for (i = 0; i < n; i++)
+ {
+ a0 = in[i*8] + 5;
+ a1 = in[i*8 + 1] + 6;
+ a2 = in[i*8 + 2] + 7;
+ a3 = in[i*8 + 3] + 8;
+ a4 = in[i*8 + 4] + 9;
+ a5 = in[i*8 + 5] + 10;
+ a6 = in[i*8 + 6] + 11;
+ a7 = in[i*8 + 7] + 12;
+
+ b0 = a0 * 3;
+ b1 = a1 * 2;
+ b2 = a2 * 12;
+ b3 = a3 * 5;
+ b4 = a4 * 8;
+ b5 = a5 * 4;
+ b6 = a6 * 3;
+ b7 = a7 * 2;
+
+ out[i*8] = b0 - 2;
+ out[i*8 + 1] = b1 - 3;
+ out[i*8 + 2] = b2 - 2;
+ out[i*8 + 3] = b3 - 1;
+ out[i*8 + 4] = b4 - 8;
+ out[i*8 + 5] = b5 - 7;
+ out[i*8 + 6] = b6 - 3;
+ out[i*8 + 7] = b7 - 7;
+
+ out2[i*16] = in2[i*16] + 2;
+ out2[i*16 + 1] = in2[i*16 + 1] + 3;
+ out2[i*16 + 2] = in2[i*16 + 2] + 4;
+ out2[i*16 + 3] = in2[i*16 + 3] + 3;
+ out2[i*16 + 4] = in2[i*16 + 4] + 2;
+ out2[i*16 + 5] = in2[i*16 + 5] + 3;
+ out2[i*16 + 6] = in2[i*16 + 6] + 2;
+ out2[i*16 + 7] = in2[i*16 + 7] + 4;
+ out2[i*16 + 8] = in2[i*16 + 8] + 2;
+ out2[i*16 + 9] = in2[i*16 + 9] + 5;
+ out2[i*16 + 10] = in2[i*16 + 10] + 2;
+ out2[i*16 + 11] = in2[i*16 + 11] + 3;
+ out2[i*16 + 12] = in2[i*16 + 12] + 4;
+ out2[i*16 + 13] = in2[i*16 + 13] + 4;
+ out2[i*16 + 14] = in2[i*16 + 14] + 3;
+ out2[i*16 + 15] = in2[i*16 + 15] + 2;
+}
+
+ /* check results: */
+ for (i = 0; i < n; i++)
+ {
+ if (out[i*8] != (in[i*8] + 5) * 3 - 2
+ || out[i*8 + 1] != (in[i*8 + 1] + 6) * 2 - 3
+ || out[i*8 + 2] != (in[i*8 + 2] + 7) * 12 - 2
+ || out[i*8 + 3] != (in[i*8 + 3] + 8) * 5 - 1
+ || out[i*8 + 4] != (in[i*8 + 4] + 9) * 8 - 8
+ || out[i*8 + 5] != (in[i*8 + 5] + 10) * 4 - 7
+ || out[i*8 + 6] != (in[i*8 + 6] + 11) * 3 - 3
+ || out[i*8 + 7] != (in[i*8 + 7] + 12) * 2 - 7)
+ abort ();
+
+ if (out2[i*16] != in2[i*16] + 2
+ || out2[i*16 + 1] != in2[i*16 + 1] + 3
+ || out2[i*16 + 2] != in2[i*16 + 2] + 4
+ || out2[i*16 + 3] != in2[i*16 + 3] + 3
+ || out2[i*16 + 4] != in2[i*16 + 4] + 2
+ || out2[i*16 + 5] != in2[i*16 + 5] + 3
+ || out2[i*16 + 6] != in2[i*16 + 6] + 2
+ || out2[i*16 + 7] != in2[i*16 + 7] + 4
+ || out2[i*16 + 8] != in2[i*16 + 8] + 2
+ || out2[i*16 + 9] != in2[i*16 + 9] + 5
+ || out2[i*16 + 10] != in2[i*16 + 10] + 2
+ || out2[i*16 + 11] != in2[i*16 + 11] + 3
+ || out2[i*16 + 12] != in2[i*16 + 12] + 4
+ || out2[i*16 + 13] != in2[i*16 + 13] + 4
+ || out2[i*16 + 14] != in2[i*16 + 14] + 3
+ || out2[i*16 + 15] != in2[i*16 + 15] + 2)
+ abort ();
+
+ }
+
+
+
+ return 0;
+}
+
+int main (void)
+{
+ check_vect ();
+
+ main1 (N);
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { vect_strided && vect_int_mult } } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" {target { ! { vect_strided && vect_int_mult } } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/testsuite/gcc.dg/vect/slp-15.c b/gcc/testsuite/gcc.dg/vect/slp-15.c
new file mode 100644
index 00000000000..6f04e6a3784
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-15.c
@@ -0,0 +1,117 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define N 8
+
+int
+main1 (int n)
+{
+ int i;
+ unsigned int out[N*8], a0, a1, a2, a3, a4, a5, a6, a7, b1, b0, b2, b3, b4, b5, b6, b7;
+ unsigned int in[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63};
+ unsigned int in2[N*16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63};
+ unsigned int out2[N*16];
+
+ for (i = 0; i < n; i++)
+ {
+ a0 = in[i*8] + 5;
+ a1 = in[i*8 + 1] + 6;
+ a2 = in[i*8 + 2] + 7;
+ a3 = in[i*8 + 3] + 8;
+ a4 = in[i*8 + 4] + 9;
+ a5 = in[i*8 + 5] + 10;
+ a6 = in[i*8 + 6] + 11;
+ a7 = in[i*8 + 7] + 12;
+
+ b0 = a0 * 3;
+ b1 = a1 * 2;
+ b2 = a2 * 12;
+ b3 = a3 * 5;
+ b4 = a4 * 8;
+ b5 = a5 * 4;
+ b6 = a6 * 3;
+ b7 = a7 * 2;
+
+ out[i*8] = b0 - 2;
+ out[i*8 + 1] = b1 - 3;
+ out[i*8 + 2] = b2 - 2;
+ out[i*8 + 3] = b3 - 1;
+ out[i*8 + 4] = b4 - 8;
+ out[i*8 + 5] = b5 - 7;
+ out[i*8 + 6] = b6 - 3;
+ out[i*8 + 7] = b7 - 7;
+
+ out2[i*16] = in2[i*16] * 2;
+ out2[i*16 + 1] = in2[i*16 + 1] * 3;
+ out2[i*16 + 2] = in2[i*16 + 2] * 4;
+ out2[i*16 + 3] = in2[i*16 + 3] * 3;
+ out2[i*16 + 4] = in2[i*16 + 4] * 2;
+ out2[i*16 + 5] = in2[i*16 + 5] * 3;
+ out2[i*16 + 6] = in2[i*16 + 6] * 2;
+ out2[i*16 + 7] = in2[i*16 + 7] * 4;
+ out2[i*16 + 8] = in2[i*16 + 8] * 2;
+ out2[i*16 + 9] = in2[i*16 + 9] * 5;
+ out2[i*16 + 10] = in2[i*16 + 10] * 2;
+ out2[i*16 + 11] = in2[i*16 + 11] * 3;
+ out2[i*16 + 12] = in2[i*16 + 12] * 4;
+ out2[i*16 + 13] = in2[i*16 + 13] * 4;
+ out2[i*16 + 14] = in2[i*16 + 14] * 3;
+ out2[i*16 + 15] = in2[i*16 + 15] * 2;
+}
+
+ /* check results: */
+ for (i = 0; i < n; i++)
+ {
+ if (out[i*8] != (in[i*8] + 5) * 3 - 2
+ || out[i*8 + 1] != (in[i*8 + 1] + 6) * 2 - 3
+ || out[i*8 + 2] != (in[i*8 + 2] + 7) * 12 - 2
+ || out[i*8 + 3] != (in[i*8 + 3] + 8) * 5 - 1
+ || out[i*8 + 4] != (in[i*8 + 4] + 9) * 8 - 8
+ || out[i*8 + 5] != (in[i*8 + 5] + 10) * 4 - 7
+ || out[i*8 + 6] != (in[i*8 + 6] + 11) * 3 - 3
+ || out[i*8 + 7] != (in[i*8 + 7] + 12) * 2 - 7)
+ abort ();
+
+ if (out2[i*16] != in2[i*16] * 2
+ || out2[i*16 + 1] != in2[i*16 + 1] * 3
+ || out2[i*16 + 2] != in2[i*16 + 2] * 4
+ || out2[i*16 + 3] != in2[i*16 + 3] * 3
+ || out2[i*16 + 4] != in2[i*16 + 4] * 2
+ || out2[i*16 + 5] != in2[i*16 + 5] * 3
+ || out2[i*16 + 6] != in2[i*16 + 6] * 2
+ || out2[i*16 + 7] != in2[i*16 + 7] * 4
+ || out2[i*16 + 8] != in2[i*16 + 8] * 2
+ || out2[i*16 + 9] != in2[i*16 + 9] * 5
+ || out2[i*16 + 10] != in2[i*16 + 10] * 2
+ || out2[i*16 + 11] != in2[i*16 + 11] * 3
+ || out2[i*16 + 12] != in2[i*16 + 12] * 4
+ || out2[i*16 + 13] != in2[i*16 + 13] * 4
+ || out2[i*16 + 14] != in2[i*16 + 14] * 3
+ || out2[i*16 + 15] != in2[i*16 + 15] * 2)
+ abort ();
+
+ }
+
+
+
+ return 0;
+}
+
+int main (void)
+{
+ check_vect ();
+
+ main1 (N);
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" {target vect_int_mult } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" {target { ! { vect_int_mult } } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" {target vect_int_mult } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" {target { ! { vect_int_mult } } } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/testsuite/gcc.dg/vect/slp-16.c b/gcc/testsuite/gcc.dg/vect/slp-16.c
new file mode 100644
index 00000000000..cbc47cd1f2b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-16.c
@@ -0,0 +1,70 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define N 8
+
+int
+main1 ()
+{
+ int i;
+ unsigned int out[N*8], a0, a1, a2, a3, a4, a5, a6, a7, b1, b0, b2, b3, b4, b5, b6, b7;
+ unsigned int in[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63};
+ unsigned int in2[N*16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63};
+ unsigned int out2[N*16];
+
+ /* SLP group of size that is not a multiple of vector size.
+ Unrolling by 2. */
+ for (i = 0; i < N; i++)
+ {
+ a0 = in[i*2] + 5;
+ a1 = in[i*2 + 1] + 6;
+
+ b0 = a0 * 3;
+ b1 = a1 * 2;
+
+ out[i*2] = b0 - 2;
+ out[i*2 + 1] = b1 - 3;
+
+ out2[i*6] = in2[i*6] * 2;
+ out2[i*6 + 1] = in2[i*6 + 1] * 3;
+ out2[i*6 + 2] = in2[i*6 + 2] * 4;
+ out2[i*6 + 3] = in2[i*6 + 3] * 2;
+ out2[i*6 + 4] = in2[i*6 + 4] * 4;
+ out2[i*6 + 5] = in2[i*6 + 5] * 3;
+ }
+
+ /* check results: */
+ for (i = 0; i < N; i++)
+ {
+ if (out[i*2] != (in[i*2] + 5) * 3 - 2
+ || out[i*2 + 1] != (in[i*2 + 1] + 6) * 2 - 3
+ || out2[i*6] != in2[i*6] * 2
+ || out2[i*6 + 1] != in2[i*6 + 1] * 3
+ || out2[i*6 + 2] != in2[i*6 + 2] * 4
+ || out2[i*6 + 3] != in2[i*6 + 3] * 2
+ || out2[i*6 + 4] != in2[i*6 + 4] * 4
+ || out2[i*6 + 5] != in2[i*6 + 5] * 3)
+ abort ();
+ }
+
+
+
+ return 0;
+}
+
+int main (void)
+{
+ check_vect ();
+
+ main1 ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_int_mult } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target vect_int_mult } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/testsuite/gcc.dg/vect/slp-17.c b/gcc/testsuite/gcc.dg/vect/slp-17.c
new file mode 100644
index 00000000000..0a760bf2c68
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-17.c
@@ -0,0 +1,56 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define N 8
+
+int
+main1 ()
+{
+ int i;
+ unsigned short out[N*8];
+ unsigned short in[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63};
+ unsigned short in2[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63};
+ unsigned short out2[N*8];
+
+ for (i = 0; i < N*2; i++)
+ {
+ out[i*2] = in[i*2] + 5;
+ out[i*2 + 1] = in[i*2 + 1] + 6;
+
+ out2[i*4] = in2[i*4] + 2;
+ out2[i*4 + 1] = in2[i*4 + 1] + 2;
+ out2[i*4 + 2] = in2[i*4 + 2] + 1;
+ out2[i*4 + 3] = in2[i*4 + 3] + 3;
+ }
+
+ /* check results: */
+ for (i = 0; i < N*2; i++)
+ {
+ if (out[i*2] != in[i*2] + 5
+ || out[i*2 + 1] != in[i*2 + 1] + 6
+ || out2[i*4] != in2[i*4] + 2
+ || out2[i*4 + 1] != in2[i*4 + 1] + 2
+ || out2[i*4 + 2] != in2[i*4 + 2] + 1
+ || out2[i*4 + 3] != in2[i*4 + 3] + 3)
+ abort ();
+ }
+
+ return 0;
+}
+
+int main (void)
+{
+ check_vect ();
+
+ main1 ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/testsuite/gcc.dg/vect/slp-18.c b/gcc/testsuite/gcc.dg/vect/slp-18.c
new file mode 100644
index 00000000000..b8e122c6cfa
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-18.c
@@ -0,0 +1,97 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define N 8
+
+int
+main1 ()
+{
+ int i;
+ unsigned int out[N*8], a0, a1, a2, a3, a4, a5, a6, a7, b1, b0, b2, b3, b4, b5, b6, b7;
+ unsigned int in[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63};
+ float out2[N*8];
+
+ for (i = 0; i < N; i++)
+ {
+
+ a0 = in[i*8] + 5;
+ a1 = in[i*8 + 1] + 6;
+ a2 = in[i*8 + 2] + 7;
+ a3 = in[i*8 + 3] + 8;
+ a4 = in[i*8 + 4] + 9;
+ a5 = in[i*8 + 5] + 10;
+ a6 = in[i*8 + 6] + 11;
+ a7 = in[i*8 + 7] + 12;
+
+ b0 = a0 * 3;
+ b1 = a1 * 2;
+ b2 = a2 * 12;
+ b3 = a3 * 5;
+ b4 = a4 * 8;
+ b5 = a5 * 4;
+ b6 = a6 * 3;
+ b7 = a7 * 2;
+
+ out[i*8] = b0 - 2;
+ out[i*8 + 1] = b1 - 3;
+ out[i*8 + 2] = b2 - 2;
+ out[i*8 + 3] = b3 - 1;
+ out[i*8 + 4] = b4 - 8;
+ out[i*8 + 5] = b5 - 7;
+ out[i*8 + 6] = b6 - 3;
+ out[i*8 + 7] = b7 - 7;
+
+
+ out2[i*8] = (float) b0;
+ out2[i*8 + 1] = (float) b1;
+ out2[i*8 + 2] = (float) b2;
+ out2[i*8 + 3] = (float) b3;
+ out2[i*8 + 4] = (float) b4;
+ out2[i*8 + 5] = (float) b5;
+ out2[i*8 + 6] = (float) b6;
+ out2[i*8 + 7] = (float) b7;
+ }
+
+ /* check results: */
+ for (i = 0; i < N; i++)
+ {
+ if (out[i*8] != (in[i*8] + 5) * 3 - 2
+ || out[i*8 + 1] != (in[i*8 + 1] + 6) * 2 - 3
+ || out[i*8 + 2] != (in[i*8 + 2] + 7) * 12 - 2
+ || out[i*8 + 3] != (in[i*8 + 3] + 8) * 5 - 1
+ || out[i*8 + 4] != (in[i*8 + 4] + 9) * 8 - 8
+ || out[i*8 + 5] != (in[i*8 + 5] + 10) * 4 - 7
+ || out[i*8 + 6] != (in[i*8 + 6] + 11) * 3 - 3
+ || out[i*8 + 7] != (in[i*8 + 7] + 12) * 2 - 7)
+ abort ();
+
+ if (out2[i*8] != (float) ((in[i*8] + 5) * 3)
+ || out2[i*8 + 1] != (float) ((in[i*8 + 1] + 6) * 2)
+ || out2[i*8 + 2] != (float) ((in[i*8 + 2] + 7) * 12)
+ || out2[i*8 + 3] != (float) ((in[i*8 + 3] + 8) * 5)
+ || out2[i*8 + 4] != (float) ((in[i*8 + 4] + 9) * 8)
+ || out2[i*8 + 5] != (float) ((in[i*8 + 5] + 10) * 4)
+ || out2[i*8 + 6] != (float) ((in[i*8 + 6] + 11) * 3)
+ || out2[i*8 + 7] != (float) ((in[i*8 + 7] + 12) * 2))
+ abort ();
+ }
+
+ return 0;
+}
+
+int main (void)
+{
+ check_vect ();
+
+ main1 ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { vect_strided } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target { vect_strided } } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/testsuite/gcc.dg/vect/slp-19.c b/gcc/testsuite/gcc.dg/vect/slp-19.c
new file mode 100644
index 00000000000..d9a68cd69d4
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-19.c
@@ -0,0 +1,155 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define N 16
+
+int
+main1 ()
+{
+ unsigned int i;
+ unsigned int out[N*8];
+ unsigned int in[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63};
+ unsigned int ia[N*2], a0, a1, a2, a3;
+
+ for (i = 0; i < N; i++)
+ {
+ out[i*8] = in[i*8];
+ out[i*8 + 1] = in[i*8 + 1];
+ out[i*8 + 2] = in[i*8 + 2];
+ out[i*8 + 3] = in[i*8 + 3];
+ out[i*8 + 4] = in[i*8 + 4];
+ out[i*8 + 5] = in[i*8 + 5];
+ out[i*8 + 6] = in[i*8 + 6];
+ out[i*8 + 7] = in[i*8 + 7];
+
+ ia[i] = in[i*8 + 2];
+ }
+
+ /* check results: */
+ for (i = 0; i < N; i++)
+ {
+ if (out[i*8] != in[i*8]
+ || out[i*8 + 1] != in[i*8 + 1]
+ || out[i*8 + 2] != in[i*8 + 2]
+ || out[i*8 + 3] != in[i*8 + 3]
+ || out[i*8 + 4] != in[i*8 + 4]
+ || out[i*8 + 5] != in[i*8 + 5]
+ || out[i*8 + 6] != in[i*8 + 6]
+ || out[i*8 + 7] != in[i*8 + 7]
+ || ia[i] != in[i*8 + 2])
+ abort ();
+ }
+
+ for (i = 0; i < N*2; i++)
+ {
+ a0 = in[i*4] + 1;
+ a1 = in[i*4 + 1] + 2;
+ a2 = in[i*4 + 2] + 3;
+ a3 = in[i*4 + 3] + 4;
+
+ out[i*4] = a0;
+ out[i*4 + 1] = a1;
+ out[i*4 + 2] = a2;
+ out[i*4 + 3] = a3;
+
+ ia[i] = a2;
+ }
+
+ /* check results: */
+ for (i = 0; i < N*2; i++)
+ {
+ if (out[i*4] != in[i*4] + 1
+ || out[i*4 + 1] != in[i*4 + 1] + 2
+ || out[i*4 + 2] != in[i*4 + 2] + 3
+ || out[i*4 + 3] != in[i*4 + 3] + 4
+ || ia[i] != in[i*4 + 2] + 3)
+ abort ();
+ }
+
+ /* The last stmt requires interleaving of not power of 2 size - not
+ vectorizable. */
+ for (i = 0; i < N/2; i++)
+ {
+ out[i*12] = in[i*12];
+ out[i*12 + 1] = in[i*12 + 1];
+ out[i*12 + 2] = in[i*12 + 2];
+ out[i*12 + 3] = in[i*12 + 3];
+ out[i*12 + 4] = in[i*12 + 4];
+ out[i*12 + 5] = in[i*12 + 5];
+ out[i*12 + 6] = in[i*12 + 6];
+ out[i*12 + 7] = in[i*12 + 7];
+ out[i*12 + 8] = in[i*12 + 8];
+ out[i*12 + 9] = in[i*12 + 9];
+ out[i*12 + 10] = in[i*12 + 10];
+ out[i*12 + 11] = in[i*12 + 11];
+
+ ia[i] = in[i*12 + 7];
+ }
+
+ /* check results: */
+ for (i = 0; i < N/2; i++)
+ {
+ if (out[i*12] != in[i*12]
+ || out[i*12 + 1] != in[i*12 + 1]
+ || out[i*12 + 2] != in[i*12 + 2]
+ || out[i*12 + 3] != in[i*12 + 3]
+ || out[i*12 + 4] != in[i*12 + 4]
+ || out[i*12 + 5] != in[i*12 + 5]
+ || out[i*12 + 6] != in[i*12 + 6]
+ || out[i*12 + 7] != in[i*12 + 7]
+ || out[i*12 + 8] != in[i*12 + 8]
+ || out[i*12 + 9] != in[i*12 + 9]
+ || out[i*12 + 10] != in[i*12 + 10]
+ || out[i*12 + 11] != in[i*12 + 11]
+ || ia[i] != in[i*12 + 7])
+ abort ();
+ }
+
+ /* Hybrid SLP with unrolling by 2. */
+ for (i = 0; i < N; i++)
+ {
+ out[i*6] = in[i*6];
+ out[i*6 + 1] = in[i*6 + 1];
+ out[i*6 + 2] = in[i*6 + 2];
+ out[i*6 + 3] = in[i*6 + 3];
+ out[i*6 + 4] = in[i*6 + 4];
+ out[i*6 + 5] = in[i*6 + 5];
+
+ ia[i] = i;
+ }
+
+ /* check results for all N iterations written by the loop above: */
+ for (i = 0; i < N; i++)
+ {
+ if (out[i*6] != in[i*6]
+ || out[i*6 + 1] != in[i*6 + 1]
+ || out[i*6 + 2] != in[i*6 + 2]
+ || out[i*6 + 3] != in[i*6 + 3]
+ || out[i*6 + 4] != in[i*6 + 4]
+ || out[i*6 + 5] != in[i*6 + 5]
+ || ia[i] != i)
+ abort ();
+ }
+
+
+ return 0;
+}
+
+int main (void)
+{
+ check_vect ();
+
+ main1 ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { target vect_strided } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { ! { vect_strided } } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { target vect_strided } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { ! { vect_strided } } } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/testsuite/gcc.dg/vect/slp-2.c b/gcc/testsuite/gcc.dg/vect/slp-2.c
new file mode 100644
index 00000000000..2731747dbcf
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-2.c
@@ -0,0 +1,146 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define N 128
+
+int
+main1 (unsigned short a0, unsigned short a1, unsigned short a2,
+ unsigned short a3, unsigned short a4, unsigned short a5,
+ unsigned short a6, unsigned short a7, unsigned short a8,
+ unsigned short a9, unsigned short a10, unsigned short a11,
+ unsigned short a12, unsigned short a13, unsigned short a14,
+ unsigned short a15)
+{
+ int i;
+ unsigned short out[N*16];
+
+ for (i = 0; i < N; i++)
+ {
+ out[i*4] = a8;
+ out[i*4 + 1] = a1;
+ out[i*4 + 2] = a2;
+ out[i*4 + 3] = a3;
+ }
+
+ /* check results: */
+ for (i = 0; i < N; i++)
+ {
+ if (out[i*4] != a8
+ || out[i*4 + 1] != a1
+ || out[i*4 + 2] != a2
+ || out[i*4 + 3] != a3)
+ abort ();
+ }
+
+ for (i = 0; i < N; i++)
+ {
+ out[i*16] = a8;
+ out[i*16 + 1] = a7;
+ out[i*16 + 2] = a1;
+ out[i*16 + 3] = a2;
+ out[i*16 + 4] = a8;
+ out[i*16 + 5] = a5;
+ out[i*16 + 6] = a5;
+ out[i*16 + 7] = a4;
+ out[i*16 + 8] = a12;
+ out[i*16 + 9] = a13;
+ out[i*16 + 10] = a14;
+ out[i*16 + 11] = a15;
+ out[i*16 + 12] = a6;
+ out[i*16 + 13] = a9;
+ out[i*16 + 14] = a0;
+ out[i*16 + 15] = a7;
+ }
+
+ /* check results: */
+ for (i = 0; i < N; i++)
+ {
+ if (out[i*16] != a8
+ || out[i*16 + 1] != a7
+ || out[i*16 + 2] != a1
+ || out[i*16 + 3] != a2
+ || out[i*16 + 4] != a8
+ || out[i*16 + 5] != a5
+ || out[i*16 + 6] != a5
+ || out[i*16 + 7] != a4
+ || out[i*16 + 8] != a12
+ || out[i*16 + 9] != a13
+ || out[i*16 + 10] != a14
+ || out[i*16 + 11] != a15
+ || out[i*16 + 12] != a6
+ || out[i*16 + 13] != a9
+ || out[i*16 + 14] != a0
+ || out[i*16 + 15] != a7)
+ abort ();
+ }
+
+ /* SLP with unrolling by 8. */
+ for (i = 0; i < N; i++)
+ {
+ out[i*3] = a8;
+ out[i*3 + 1] = a1;
+ out[i*3 + 2] = a2;
+ }
+
+ /* check results: */
+ for (i = 0; i < N; i++)
+ {
+ if (out[i*3] != a8
+ || out[i*3 + 1] != a1
+ || out[i*3 + 2] != a2)
+ abort ();
+ }
+
+ /* SLP with unrolling by 8. */
+ for (i = 0; i < N; i++)
+ {
+ out[i*11] = a8;
+ out[i*11 + 1] = a7;
+ out[i*11 + 2] = a1;
+ out[i*11 + 3] = a2;
+ out[i*11 + 4] = a8;
+ out[i*11 + 5] = a5;
+ out[i*11 + 6] = a5;
+ out[i*11 + 7] = a4;
+ out[i*11 + 8] = a12;
+ out[i*11 + 9] = a13;
+ out[i*11 + 10] = a14;
+ }
+
+ /* check results: */
+ for (i = 0; i < N; i++)
+ {
+ if (out[i*11] != a8
+ || out[i*11 + 1] != a7
+ || out[i*11 + 2] != a1
+ || out[i*11 + 3] != a2
+ || out[i*11 + 4] != a8
+ || out[i*11 + 5] != a5
+ || out[i*11 + 6] != a5
+ || out[i*11 + 7] != a4
+ || out[i*11 + 8] != a12
+ || out[i*11 + 9] != a13
+ || out[i*11 + 10] != a14)
+ abort ();
+ }
+
+
+ return 0;
+}
+
+int main (void)
+{
+ check_vect ();
+
+ main1 (15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 4 loops" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/testsuite/gcc.dg/vect/slp-20.c b/gcc/testsuite/gcc.dg/vect/slp-20.c
new file mode 100644
index 00000000000..86d3927a42c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-20.c
@@ -0,0 +1,116 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define N 128
+
+int
+main1 (unsigned short a0, unsigned short a1, unsigned short a2,
+ unsigned short a3, unsigned short a4, unsigned short a5,
+ unsigned short a6, unsigned short a7, unsigned short a8)
+{
+ int i;
+ unsigned short out[N*8], out2[N*8], b0, b1, b2, b3, b4, b5, b6, b7, b8;
+
+ for (i = 0; i < N; i++)
+ {
+ b0 = a0 + 8;
+ b1 = a1 + 7;
+ b2 = a2 + 6;
+ b3 = a3 + 5;
+ b4 = a4 + 4;
+ b5 = a5 + 3;
+
+ out[i*4] = b0;
+ out[i*4 + 1] = b1;
+ out[i*4 + 2] = b2;
+ out[i*4 + 3] = b3;
+
+ out2[i*4] = b0;
+ out2[i*4 + 1] = b1;
+ out2[i*4 + 2] = b4;
+ out2[i*4 + 3] = b5;
+ }
+
+ /* check results: */
+ for (i = 0; i < N; i++)
+ {
+ if (out[i*4] != b0
+ || out[i*4 + 1] != b1
+ || out[i*4 + 2] != b2
+ || out[i*4 + 3] != b3)
+ abort ();
+
+ if (out2[i*4] != b0
+ || out2[i*4 + 1] != b1
+ || out2[i*4 + 2] != b4
+ || out2[i*4 + 3] != b5)
+ abort ();
+ }
+
+ for (i = 0; i < N; i++)
+ {
+ b0 = a0 + 8;
+ b1 = a1 + 7;
+ b2 = a2 + 6;
+ b3 = a3 + 5;
+ b4 = a4 + 4;
+ b5 = a5 + 3;
+ b6 = a6 + 2;
+ b7 = a7 + 1;
+ b8 = a8 + 9;
+
+ out[i*4] = b0;
+ out[i*4 + 1] = b1;
+ out[i*4 + 2] = b2;
+ out[i*4 + 3] = b3;
+
+ out2[i*8] = b0;
+ out2[i*8 + 1] = b1;
+ out2[i*8 + 2] = b4;
+ out2[i*8 + 3] = b5;
+ out2[i*8 + 4] = b6;
+ out2[i*8 + 5] = b2;
+ out2[i*8 + 6] = b7;
+ out2[i*8 + 7] = b8;
+ }
+
+ /* check results: */
+ for (i = 0; i < N; i++)
+ {
+ if (out[i*4] != b0
+ || out[i*4 + 1] != b1
+ || out[i*4 + 2] != b2
+ || out[i*4 + 3] != b3)
+ abort ();
+
+ if (out2[i*8] != b0
+ || out2[i*8 + 1] != b1
+ || out2[i*8 + 2] != b4
+ || out2[i*8 + 3] != b5
+ || out2[i*8 + 4] != b6
+ || out2[i*8 + 5] != b2
+ || out2[i*8 + 6] != b7
+ || out2[i*8 + 7] != b8)
+ abort ();
+ }
+
+
+ return 0;
+}
+
+int main (void)
+{
+ check_vect ();
+
+ main1 (8,7,6,5,4,3,2,1,0);
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/testsuite/gcc.dg/vect/slp-21.c b/gcc/testsuite/gcc.dg/vect/slp-21.c
new file mode 100644
index 00000000000..327045e4789
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-21.c
@@ -0,0 +1,208 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define N 128
+
+int
+main1 ()
+{
+ unsigned short i;
+ unsigned short out[N*8], out2[N*8], b0, b1, b2, b3, b4, a0, a1, a2, a3, b5;
+ unsigned short in[N*8];
+
+ for (i = 0; i < N*8; i++)
+ {
+ in[i] = i;
+ }
+
+ /* Different operations in both cases - vectorization with interleaving. */
+ for (i = 0; i < N; i++)
+ {
+ a0 = in[i*4];
+ a1 = in[i*4 + 1];
+ a2 = in[i*4 + 2];
+ a3 = in[i*4 + 3];
+
+ b0 = a0 * 8;
+ b1 = a1 + 7;
+ b2 = a2 + 6;
+ b3 = a3 * 5;
+
+ b4 = a2 + 4;
+ b5 = a3 + 3;
+
+ out[i*4] = b0;
+ out[i*4 + 1] = b1;
+ out[i*4 + 2] = b2;
+ out[i*4 + 3] = b3;
+
+ out2[i*4] = b0;
+ out2[i*4 + 1] = b1;
+ out2[i*4 + 2] = b4;
+ out2[i*4 + 3] = b5;
+ }
+
+ /* check results: */
+ for (i = 0; i < N; i++)
+ {
+ a0 = in[i*4];
+ a1 = in[i*4 + 1];
+ a2 = in[i*4 + 2];
+ a3 = in[i*4 + 3];
+
+ b0 = a0 * 8;
+ b1 = a1 + 7;
+ b2 = a2 + 6;
+ b3 = a3 * 5;
+
+ b4 = a2 + 4;
+ b5 = a3 + 3;
+
+ if (out[i*4] != b0
+ || out[i*4 + 1] != b1
+ || out[i*4 + 2] != b2
+ || out[i*4 + 3] != b3)
+ abort ();
+
+ if (out2[i*4] != b0
+ || out2[i*4 + 1] != b1
+ || out2[i*4 + 2] != b4
+ || out2[i*4 + 3] != b5)
+ abort ();
+ }
+
+ /* Different operations in the first case - vectorization with interleaving. */
+ for (i = 0; i < N; i++)
+ {
+ a0 = in[i*4];
+ a1 = in[i*4 + 1];
+ a2 = in[i*4 + 2];
+ a3 = in[i*4 + 3];
+
+ b0 = a0 + 8;
+ b1 = a1 + 7;
+ b2 = a2 + 6;
+ b3 = a3 * 5;
+
+ b4 = a2 + 4;
+ b5 = a3 + 3;
+
+ out[i*4] = b0;
+ out[i*4 + 1] = b1;
+ out[i*4 + 2] = b2;
+ out[i*4 + 3] = b3;
+
+ out2[i*4] = b0;
+ out2[i*4 + 1] = b1;
+ out2[i*4 + 2] = b4;
+ out2[i*4 + 3] = b5;
+ }
+
+ /* check results: */
+ for (i = 0; i < N; i++)
+ {
+ a0 = in[i*4];
+ a1 = in[i*4 + 1];
+ a2 = in[i*4 + 2];
+ a3 = in[i*4 + 3];
+
+ b0 = a0 + 8;
+ b1 = a1 + 7;
+ b2 = a2 + 6;
+ b3 = a3 * 5;
+
+ b4 = a2 + 4;
+ b5 = a3 + 3;
+
+ if (out[i*4] != b0
+ || out[i*4 + 1] != b1
+ || out[i*4 + 2] != b2
+ || out[i*4 + 3] != b3)
+ abort ();
+
+ if (out2[i*4] != b0
+ || out2[i*4 + 1] != b1
+ || out2[i*4 + 2] != b4
+ || out2[i*4 + 3] != b5)
+ abort ();
+ }
+
+
+ /* Different operations in the second case - vectorization with interleaving. */
+ for (i = 0; i < N; i++)
+ {
+ a0 = in[i*4];
+ a1 = in[i*4 + 1];
+ a2 = in[i*4 + 2];
+ a3 = in[i*4 + 3];
+
+ b0 = a0 + 8;
+ b1 = a1 + 7;
+ b2 = a2 + 6;
+ b3 = a3 + 5;
+
+ b4 = a2 * 4;
+ b5 = a3 + 3;
+
+ out[i*4] = b0;
+ out[i*4 + 1] = b1;
+ out[i*4 + 2] = b2;
+ out[i*4 + 3] = b3;
+
+ out2[i*4] = b0;
+ out2[i*4 + 1] = b1;
+ out2[i*4 + 2] = b4;
+ out2[i*4 + 3] = b5;
+ }
+
+ /* check results: */
+ for (i = 0; i < N; i++)
+ {
+ a0 = in[i*4];
+ a1 = in[i*4 + 1];
+ a2 = in[i*4 + 2];
+ a3 = in[i*4 + 3];
+
+ b0 = a0 + 8;
+ b1 = a1 + 7;
+ b2 = a2 + 6;
+ b3 = a3 + 5;
+
+ b4 = a2 * 4;
+ b5 = a3 + 3;
+
+ if (out[i*4] != b0
+ || out[i*4 + 1] != b1
+ || out[i*4 + 2] != b2
+ || out[i*4 + 3] != b3)
+ abort ();
+
+ if (out2[i*4] != b0
+ || out2[i*4 + 1] != b1
+ || out2[i*4 + 2] != b4
+ || out2[i*4 + 3] != b5)
+ abort ();
+ }
+
+
+ return 0;
+}
+
+int main (void)
+{
+ check_vect ();
+
+ main1 ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 4 loops" 1 "vect" { target vect_strided } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { ! { vect_strided } } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target vect_strided } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target { ! { vect_strided } } } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/testsuite/gcc.dg/vect/slp-22.c b/gcc/testsuite/gcc.dg/vect/slp-22.c
new file mode 100644
index 00000000000..18df4269a6c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-22.c
@@ -0,0 +1,135 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define N 128
+
+int
+main1 (unsigned short a0, unsigned short a1, unsigned short a2,
+ unsigned short a3, unsigned short a4, unsigned short a5,
+ unsigned short a6, unsigned short a7, unsigned short a8)
+{ /* Stores loop-invariant values derived from a0..a8 in two loops and verifies them; returns 0, aborts on mismatch.  */
+ int i;
+ unsigned short out[N*8], out2[N*8], out3[N*8], b0, b1, b2, b3, b4, b5, b6, b7, b8;
+ /* Loop 1: three store groups (out, out2, out3) of four consecutive elements each.  */
+ for (i = 0; i < N; i++)
+ {
+ b0 = a0 + 8;
+ b1 = a1 + 7;
+ b2 = a2 + 6;
+ b3 = a3 + 5;
+ b4 = a4 + 4;
+ b5 = a5 + 3;
+
+ out[i*4] = b0;
+ out[i*4 + 1] = b1;
+ out[i*4 + 2] = b2;
+ out[i*4 + 3] = b3;
+
+ out2[i*4] = b0;
+ out2[i*4 + 1] = b1;
+ out2[i*4 + 2] = b4;
+ out2[i*4 + 3] = b5;
+
+ out3[i*4] = b2;
+ out3[i*4 + 1] = b1;
+ out3[i*4 + 2] = b4;
+ out3[i*4 + 3] = b5;
+ }
+
+ /* Check results of loop 1; b0..b5 still hold the values assigned in that loop.  */
+ for (i = 0; i < N; i++)
+ {
+ if (out[i*4] != b0
+ || out[i*4 + 1] != b1
+ || out[i*4 + 2] != b2
+ || out[i*4 + 3] != b3)
+ abort ();
+
+
+ if (out2[i*4] != b0
+ || out2[i*4 + 1] != b1
+ || out2[i*4 + 2] != b4
+ || out2[i*4 + 3] != b5)
+ abort ();
+
+ if (out3[i*4] != b2
+ || out3[i*4 + 1] != b1
+ || out3[i*4 + 2] != b4
+ || out3[i*4 + 3] != b5)
+ abort ();
+ }
+ /* Loop 2: store groups of four (out), eight (out2) and two (out3) elements in the same loop.  */
+ for (i = 0; i < N; i++)
+ {
+ b0 = a0 + 8;
+ b1 = a1 + 7;
+ b2 = a2 + 6;
+ b3 = a3 + 5;
+ b4 = a4 + 4;
+ b5 = a5 + 3;
+ b6 = a6 + 2;
+ b7 = a7 + 1;
+ b8 = a8 + 9;
+
+ out[i*4] = b0;
+ out[i*4 + 1] = b1;
+ out[i*4 + 2] = b2;
+ out[i*4 + 3] = b3;
+
+ out2[i*8] = b0;
+ out2[i*8 + 1] = b1;
+ out2[i*8 + 2] = b4;
+ out2[i*8 + 3] = b5;
+ out2[i*8 + 4] = b6;
+ out2[i*8 + 5] = b2;
+ out2[i*8 + 6] = b7;
+ out2[i*8 + 7] = b8;
+
+ out3[2*i + 1] = a0; /* The pair is written high element first.  */
+ out3[2*i] = b8;
+ }
+
+ /* Check results of loop 2.  */
+ for (i = 0; i < N; i++)
+ {
+ if (out[i*4] != b0
+ || out[i*4 + 1] != b1
+ || out[i*4 + 2] != b2
+ || out[i*4 + 3] != b3)
+ abort ();
+
+ if (out2[i*8] != b0
+ || out2[i*8 + 1] != b1
+ || out2[i*8 + 2] != b4
+ || out2[i*8 + 3] != b5
+ || out2[i*8 + 4] != b6
+ || out2[i*8 + 5] != b2
+ || out2[i*8 + 6] != b7
+ || out2[i*8 + 7] != b8)
+ abort ();
+
+ if (out3[2*i] != b8
+ || out3[2*i+1] != a0)
+ abort();
+ }
+
+
+ return 0;
+}
+
+int main (void) /* Test driver: calls check_vect (), then runs main1 with fixed arguments.  */
+{
+ check_vect (); /* Declared in tree-vect.h (not shown); presumably bails out when the target cannot run vector code -- confirm.  */
+
+ main1 (8,7,6,5,4,3,2,1,0); /* Return value ignored; mismatches are reported through abort () inside main1.  */
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 6 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/testsuite/gcc.dg/vect/slp-23.c b/gcc/testsuite/gcc.dg/vect/slp-23.c
new file mode 100644
index 00000000000..2bba580271d
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-23.c
@@ -0,0 +1,113 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 128
+
+typedef struct { /* Record of eight int fields; main1 reads and writes all eight per array element.  */
+ int a;
+ int b;
+ int c;
+ int d;
+ int e;
+ int f;
+ int g;
+ int h;
+} s;
+
+int
+main1 (s *arr)
+{ /* Doubles the fields of arr[0..N-1] into res in two passes and verifies each pass; returns 0, aborts on mismatch.  */
+ int i;
+ s *ptr = arr;
+ s res[N];
+ /* Pass 1: every field of res[i] is twice the same field of arr[i]; the stores are not in field order.  */
+ for (i = 0; i < N; i++)
+ {
+ res[i].c = ptr->c + ptr->c;
+ res[i].a = ptr->a + ptr->a;
+ res[i].d = ptr->d + ptr->d;
+ res[i].b = ptr->b + ptr->b;
+ res[i].f = ptr->f + ptr->f;
+ res[i].e = ptr->e + ptr->e;
+ res[i].h = ptr->h + ptr->h;
+ res[i].g = ptr->g + ptr->g;
+ ptr++;
+ }
+
+ /* Check results of pass 1.  */
+ for (i = 0; i < N; i++)
+ {
+ if (res[i].c != arr[i].c + arr[i].c
+ || res[i].a != arr[i].a + arr[i].a
+ || res[i].d != arr[i].d + arr[i].d
+ || res[i].b != arr[i].b + arr[i].b
+ || res[i].f != arr[i].f + arr[i].f
+ || res[i].e != arr[i].e + arr[i].e
+ || res[i].h != arr[i].h + arr[i].h
+ || res[i].g != arr[i].g + arr[i].g)
+ abort();
+ }
+ /* Pass 2: as pass 1, except that res[i].h is computed from field e, so field e is read for two results.  */
+ ptr = arr;
+ for (i = 0; i < N; i++)
+ {
+ res[i].c = ptr->c + ptr->c;
+ res[i].a = ptr->a + ptr->a;
+ res[i].d = ptr->d + ptr->d;
+ res[i].b = ptr->b + ptr->b;
+ res[i].f = ptr->f + ptr->f;
+ res[i].e = ptr->e + ptr->e;
+ res[i].h = ptr->e + ptr->e;
+ res[i].g = ptr->g + ptr->g;
+ ptr++;
+ }
+
+ /* Check results of pass 2 (h is compared against e + e).  */
+ for (i = 0; i < N; i++)
+ {
+ if (res[i].c != arr[i].c + arr[i].c
+ || res[i].a != arr[i].a + arr[i].a
+ || res[i].d != arr[i].d + arr[i].d
+ || res[i].b != arr[i].b + arr[i].b
+ || res[i].f != arr[i].f + arr[i].f
+ || res[i].e != arr[i].e + arr[i].e
+ || res[i].h != arr[i].e + arr[i].e
+ || res[i].g != arr[i].g + arr[i].g)
+ abort();
+ }
+ return 0; /* The function is declared int; do not fall off the end without a value.  */
+}
+
+int main (void) /* Test driver: builds the input array and hands it to main1.  */
+{
+ int i;
+ s arr[N];
+
+ check_vect (); /* Declared in tree-vect.h (not shown); presumably bails out when the target cannot run vector code -- confirm.  */
+
+ for (i = 0; i < N; i++)
+ {
+ arr[i].a = i;
+ arr[i].b = i * 2;
+ arr[i].c = 17;
+ arr[i].d = i+34;
+ arr[i].e = i * 3 + 5;
+ arr[i].f = i * 5;
+ arr[i].g = i - 3;
+ arr[i].h = 56;
+ if (arr[i].a == 178) /* Never true for i < N (128); presumably keeps this setup loop from being vectorized and counted -- confirm.  */
+ abort();
+ }
+
+ main1 (arr); /* Return value ignored; mismatches are reported through abort () inside main1.  */
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { target { vect_strided && { ! { vect_no_align} } } } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { ! { vect_strided || vect_no_align} } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail vect_no_align } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/testsuite/gcc.dg/vect/slp-24.c b/gcc/testsuite/gcc.dg/vect/slp-24.c
new file mode 100644
index 00000000000..b3bf0735b02
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-24.c
@@ -0,0 +1,82 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define N 16
+#define DIFF 242
+
+typedef struct { /* Record of four unsigned char fields; main1 reads and writes all four per array element.  */
+ unsigned char a;
+ unsigned char b;
+ unsigned char c;
+ unsigned char d;
+} s;
+
+void
+main1 (unsigned char x, unsigned char max_result, unsigned char min_result, s *arr)
+{ /* One loop mixing a reduction, a two-element copy group and a four-field struct group; aborts on mismatch.  max_result and min_result are not used.  */
+ int i;
+ unsigned char ub[N*2] = {1,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,1,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45};
+ unsigned char uc[N] = {1,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
+ unsigned char udiff = 2;
+ unsigned char umax = x; /* Initialized from x but never read afterwards.  */
+ unsigned char umin = x; /* Initialized from x but never read afterwards.  */
+ unsigned char ua1[N*2];
+ s *pIn = arr;
+ s out[N];
+ /* Reduction into udiff, pairwise copy ub -> ua1, and per-field subtraction arr -> out.  */
+ for (i = 0; i < N; i++) {
+ udiff += (unsigned char)(ub[i] - uc[i]);
+
+ ua1[2*i+1] = ub[2*i+1];
+ ua1[2*i] = ub[2*i];
+
+ out[i].d = pIn->d - 1;
+ out[i].b = pIn->b - 4;
+ out[i].c = pIn->c - 8;
+ out[i].a = pIn->a - 3;
+
+ pIn++;
+ }
+ /* Check the copied pairs and the struct fields.  */
+ for (i = 0; i < N; i++) {
+ if (ua1[2*i] != ub[2*i]
+ || ua1[2*i+1] != ub[2*i+1]
+ || out[i].a != arr[i].a - 3
+ || out[i].b != arr[i].b - 4
+ || out[i].c != arr[i].c - 8
+ || out[i].d != arr[i].d - 1)
+ abort();
+ }
+
+ /* Check the reduction: 2 plus the sum of ub[i] - uc[i] over the first N elements equals DIFF (242).  */
+ if (udiff != DIFF)
+ abort ();
+}
+
+int main (void) /* Test driver: builds the input array and runs main1 twice with different scalar arguments.  */
+{
+ int i;
+ s arr[N];
+
+ for (i = 0; i < N; i++)
+ {
+ arr[i].a = i + 9;
+ arr[i].b = i * 2 + 10;
+ arr[i].c = 17;
+ arr[i].d = i+34;
+ if (arr[i].a == 178) /* Never true for i < N (16); presumably keeps this setup loop from being vectorized and counted -- confirm.  */
+ abort();
+ }
+ check_vect (); /* Declared in tree-vect.h (not shown); presumably bails out when the target cannot run vector code -- confirm.  */
+
+ main1 (100, 100, 1, arr);
+ main1 (0, 15, 0, arr);
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail vect_no_align } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { xfail vect_no_align } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-25.c b/gcc/testsuite/gcc.dg/vect/slp-25.c
new file mode 100644
index 00000000000..21f1900dc20
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-25.c
@@ -0,0 +1,59 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 128
+
+/* Unaligned stores. */
+
+int main1 (int n) /* Stores pairs starting at index 2 of an int and a short array and verifies them; returns 0, aborts on mismatch.  n must not exceed N.  */
+{
+ int i;
+ int ia[N+2]; /* The highest index written is 2*(N/2) + 1 == N+1, so N+2 elements are required.  */
+ short sa[N+2]; /* Same bound for n == N, which is what main passes.  */
+ /* Known trip count: pairs are stored from index 2 onwards.  */
+ for (i = 1; i <= N/2; i++)
+ {
+ ia[2*i] = 25;
+ ia[2*i + 1] = 5;
+ }
+
+ /* check results: */
+ for (i = 1; i <= N/2; i++)
+ {
+ if (ia[2*i] != 25
+ || ia[2*i + 1] != 5)
+ abort ();
+ }
+ /* Same store pattern with a trip count that depends on the parameter n.  */
+ for (i = 1; i <= n/2; i++)
+ {
+ sa[2*i] = 25;
+ sa[2*i + 1] = 5;
+ }
+
+ /* check results: */
+ for (i = 1; i <= n/2; i++)
+ {
+ if (sa[2*i] != 25
+ || sa[2*i + 1] != 5)
+ abort ();
+ }
+
+
+ return 0;
+}
+
+int main (void) /* Test driver: calls check_vect (), then returns the result of main1 (N).  */
+{
+
+ check_vect (); /* Declared in tree-vect.h (not shown); presumably bails out when the target cannot run vector code -- confirm.  */
+
+ return main1 (N);
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 0 "vect" } } */
+/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 2 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-26.c b/gcc/testsuite/gcc.dg/vect/slp-26.c
new file mode 100644
index 00000000000..14be68bd289
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-26.c
@@ -0,0 +1,53 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define N 8
+
+int
+main1 ()
+{ /* One loop combining a four-element copy group with a single-element division store; returns 0, aborts on mismatch.  */
+ int i;
+ unsigned short in[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63};
+ unsigned short out[N*8], a[N], b[N] = {3,6,9,12,15,18,21,24};
+
+ /* Partial SLP is not supported: the a[i] store below is not part of the four-element group.  */
+ for (i = 0; i < N; i++)
+ {
+ out[i*4] = in[i*4];
+ out[i*4 + 1] = in[i*4 + 1];
+ out[i*4 + 2] = in[i*4 + 2];
+ out[i*4 + 3] = in[i*4 + 3];
+
+ a[i] = b[i] / 3;
+ }
+
+ /* Check both the copied group and the division results.  */
+ for (i = 0; i < N; i++)
+ {
+ if (out[i*4] != in[i*4]
+ || out[i*4 + 1] != in[i*4 + 1]
+ || out[i*4 + 2] != in[i*4 + 2]
+ || out[i*4 + 3] != in[i*4 + 3]
+ || a[i] != b[i] / 3)
+ abort ();
+ }
+
+ return 0;
+}
+
+int main (void) /* Test driver: calls check_vect (), then runs main1.  */
+{
+ check_vect (); /* Declared in tree-vect.h (not shown); presumably bails out when the target cannot run vector code -- confirm.  */
+
+ main1 (); /* Return value ignored; mismatches are reported through abort () inside main1.  */
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/testsuite/gcc.dg/vect/slp-28.c b/gcc/testsuite/gcc.dg/vect/slp-28.c
new file mode 100644
index 00000000000..069116ac58e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-28.c
@@ -0,0 +1,86 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define N 32
+
+int
+main1 ()
+{ /* Three in-place update loops with dependence distances 0, 1 and 3 iterations; returns 0, aborts on mismatch.  */
+ int i;
+ unsigned short in[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
+ unsigned short in2[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
+ unsigned short in3[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
+ unsigned short check[N] = {0,1,2,3,5,6,7,8,10,11,12,13,15,16,17,18,20,21,22,23,25,26,27,28,30,31,32,33,35,36,37,38};
+ unsigned short check3[N] = {0,1,2,3,4,5,6,7,8,9,10,11,5,6,7,8,9,10,11,12,13,14,15,16,10,11,12,13,14,15,16,17};
+ /* Each group of four elements is read and rewritten within the same iteration.  */
+ for (i = 0; i < N/4; i++)
+ {
+ in[i*4] = in[i*4] + 5;
+ in[i*4 + 1] = in[i*4 + 1] + 5;
+ in[i*4 + 2] = in[i*4 + 2] + 5;
+ in[i*4 + 3] = in[i*4 + 3] + 5;
+
+ }
+
+ /* Check results: in[] was initialized to 0..N-1, so every element must now equal its index plus 5.  */
+ for (i = 0; i < N; i++)
+ {
+ if (in[i] != i+5)
+ abort ();
+ }
+
+ /* Not vectorizable because of data dependencies: each group reads the group written one iteration earlier. */
+ for (i = 1; i < N/4; i++)
+ {
+ in2[i*4] = in2[(i-1)*4] + 5;
+ in2[i*4 + 1] = in2[(i-1)*4 + 1] + 5;
+ in2[i*4 + 2] = in2[(i-1)*4 + 2] + 5;
+ in2[i*4 + 3] = in2[(i-1)*4 + 3] + 5;
+
+ }
+
+ /* Check results against the precomputed table; the first group (indices 0..3) is never written.  */
+ for (i = 4; i < N; i++)
+ {
+ if (in2[i] != check[i])
+ abort ();
+ }
+
+ /* Not vectorizable because of data dependencies: distance 3 is greater than
+ the actual VF with SLP (2), but the analysis fails to detect that for now. */
+ for (i = 3; i < N/4; i++)
+ {
+ in3[i*4] = in3[(i-3)*4] + 5;
+ in3[i*4 + 1] = in3[(i-3)*4 + 1] + 5;
+ in3[i*4 + 2] = in3[(i-3)*4 + 2] + 5;
+ in3[i*4 + 3] = in3[(i-3)*4 + 3] + 5;
+
+ }
+
+ /* Check results against the precomputed table; the first three groups (indices 0..11) are never written.  */
+ for (i = 12; i < N; i++)
+ {
+ if (in3[i] != check3[i])
+ abort ();
+ }
+
+
+ return 0;
+}
+
+int main (void) /* Test driver: calls check_vect (), then runs main1.  */
+{
+ check_vect (); /* Declared in tree-vect.h (not shown); presumably bails out when the target cannot run vector code -- confirm.  */
+
+ main1 (); /* Return value ignored; mismatches are reported through abort () inside main1.  */
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/testsuite/gcc.dg/vect/slp-3.c b/gcc/testsuite/gcc.dg/vect/slp-3.c
new file mode 100644
index 00000000000..474bfe8285f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-3.c
@@ -0,0 +1,147 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define N 8
+
+int
+main1 ()
+{ /* Copies in[] to out[] in groups of 8, 4, 16 and 9 consecutive elements, verifying after each loop; returns 0, aborts on mismatch.  */
+ int i;
+ unsigned short out[N*8];
+ unsigned short in[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63};
+ /* Group of eight elements per iteration.  */
+ for (i = 0; i < N; i++)
+ {
+ out[i*8] = in[i*8];
+ out[i*8 + 1] = in[i*8 + 1];
+ out[i*8 + 2] = in[i*8 + 2];
+ out[i*8 + 3] = in[i*8 + 3];
+ out[i*8 + 4] = in[i*8 + 4];
+ out[i*8 + 5] = in[i*8 + 5];
+ out[i*8 + 6] = in[i*8 + 6];
+ out[i*8 + 7] = in[i*8 + 7];
+ }
+
+ /* check results: */
+ for (i = 0; i < N; i++)
+ {
+ if (out[i*8] != in[i*8]
+ || out[i*8 + 1] != in[i*8 + 1]
+ || out[i*8 + 2] != in[i*8 + 2]
+ || out[i*8 + 3] != in[i*8 + 3]
+ || out[i*8 + 4] != in[i*8 + 4]
+ || out[i*8 + 5] != in[i*8 + 5]
+ || out[i*8 + 6] != in[i*8 + 6]
+ || out[i*8 + 7] != in[i*8 + 7])
+ abort ();
+ }
+ /* Group of four elements per iteration, twice as many iterations.  */
+ for (i = 0; i < N*2; i++)
+ {
+ out[i*4] = in[i*4];
+ out[i*4 + 1] = in[i*4 + 1];
+ out[i*4 + 2] = in[i*4 + 2];
+ out[i*4 + 3] = in[i*4 + 3];
+ }
+
+ /* check results: */
+ for (i = 0; i < N*2; i++)
+ {
+ if (out[i*4] != in[i*4]
+ || out[i*4 + 1] != in[i*4 + 1]
+ || out[i*4 + 2] != in[i*4 + 2]
+ || out[i*4 + 3] != in[i*4 + 3])
+ abort ();
+ }
+ /* Group of sixteen elements per iteration, half as many iterations.  */
+ for (i = 0; i < N/2; i++)
+ {
+ out[i*16] = in[i*16];
+ out[i*16 + 1] = in[i*16 + 1];
+ out[i*16 + 2] = in[i*16 + 2];
+ out[i*16 + 3] = in[i*16 + 3];
+ out[i*16 + 4] = in[i*16 + 4];
+ out[i*16 + 5] = in[i*16 + 5];
+ out[i*16 + 6] = in[i*16 + 6];
+ out[i*16 + 7] = in[i*16 + 7];
+ out[i*16 + 8] = in[i*16 + 8];
+ out[i*16 + 9] = in[i*16 + 9];
+ out[i*16 + 10] = in[i*16 + 10];
+ out[i*16 + 11] = in[i*16 + 11];
+ out[i*16 + 12] = in[i*16 + 12];
+ out[i*16 + 13] = in[i*16 + 13];
+ out[i*16 + 14] = in[i*16 + 14];
+ out[i*16 + 15] = in[i*16 + 15];
+ }
+
+ /* check results: */
+ for (i = 0; i < N/2; i++)
+ {
+ if (out[i*16] != in[i*16]
+ || out[i*16 + 1] != in[i*16 + 1]
+ || out[i*16 + 2] != in[i*16 + 2]
+ || out[i*16 + 3] != in[i*16 + 3]
+ || out[i*16 + 4] != in[i*16 + 4]
+ || out[i*16 + 5] != in[i*16 + 5]
+ || out[i*16 + 6] != in[i*16 + 6]
+ || out[i*16 + 7] != in[i*16 + 7]
+ || out[i*16 + 8] != in[i*16 + 8]
+ || out[i*16 + 9] != in[i*16 + 9]
+ || out[i*16 + 10] != in[i*16 + 10]
+ || out[i*16 + 11] != in[i*16 + 11]
+ || out[i*16 + 12] != in[i*16 + 12]
+ || out[i*16 + 13] != in[i*16 + 13]
+ || out[i*16 + 14] != in[i*16 + 14]
+ || out[i*16 + 15] != in[i*16 + 15])
+ abort ();
+ }
+
+ /* SLP with unrolling by 8: the group size here is nine elements. */
+ for (i = 0; i < N/2; i++)
+ {
+ out[i*9] = in[i*9];
+ out[i*9 + 1] = in[i*9 + 1];
+ out[i*9 + 2] = in[i*9 + 2];
+ out[i*9 + 3] = in[i*9 + 3];
+ out[i*9 + 4] = in[i*9 + 4];
+ out[i*9 + 5] = in[i*9 + 5];
+ out[i*9 + 6] = in[i*9 + 6];
+ out[i*9 + 7] = in[i*9 + 7];
+ out[i*9 + 8] = in[i*9 + 8];
+ }
+
+ /* check results: */
+ for (i = 0; i < N/2; i++)
+ {
+ if (out[i*9] != in[i*9]
+ || out[i*9 + 1] != in[i*9 + 1]
+ || out[i*9 + 2] != in[i*9 + 2]
+ || out[i*9 + 3] != in[i*9 + 3]
+ || out[i*9 + 4] != in[i*9 + 4]
+ || out[i*9 + 5] != in[i*9 + 5]
+ || out[i*9 + 6] != in[i*9 + 6]
+ || out[i*9 + 7] != in[i*9 + 7]
+ || out[i*9 + 8] != in[i*9 + 8])
+ abort ();
+ }
+
+
+ return 0;
+}
+
+int main (void) /* Test driver: calls check_vect (), then runs main1.  */
+{
+ check_vect (); /* Declared in tree-vect.h (not shown); presumably bails out when the target cannot run vector code -- confirm.  */
+
+ main1 (); /* Return value ignored; mismatches are reported through abort () inside main1.  */
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/testsuite/gcc.dg/vect/slp-33.c b/gcc/testsuite/gcc.dg/vect/slp-33.c
new file mode 100644
index 00000000000..86a641cfdf5
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-33.c
@@ -0,0 +1,112 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define N 8
+
+int
+main1 ()
+{ /* Arithmetic on groups of seven and three elements, with integer and float results; returns 0, aborts on mismatch.  */
+ int i;
+ unsigned int out[N*8], a0, a1, a2, a3, a4, a5, a6, a7, b1, b0, b2, b3, b4, b5, b6, b7; /* a7 and b7 are declared but never used.  */
+ unsigned int in[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63};
+ float out2[N*8];
+
+ /* SLP with unrolling by 4: group of seven elements, each going through add, multiply, subtract. */
+ for (i = 0; i < N; i++)
+ {
+ a0 = in[i*7] + 5;
+ a1 = in[i*7 + 1] + 6;
+ a2 = in[i*7 + 2] + 7;
+ a3 = in[i*7 + 3] + 8;
+ a4 = in[i*7 + 4] + 9;
+ a5 = in[i*7 + 5] + 10;
+ a6 = in[i*7 + 6] + 11;
+
+ b0 = a0 * 3;
+ b1 = a1 * 2;
+ b2 = a2 * 12;
+ b3 = a3 * 5;
+ b4 = a4 * 8;
+ b5 = a5 * 4;
+ b6 = a6 * 3;
+
+ out[i*7] = b0 - 2;
+ out[i*7 + 1] = b1 - 3;
+ out[i*7 + 2] = b2 - 2;
+ out[i*7 + 3] = b3 - 1;
+ out[i*7 + 4] = b4 - 8;
+ out[i*7 + 5] = b5 - 7;
+ out[i*7 + 6] = b6 - 3;
+ }
+
+ /* Check results by recomputing each expression from in[].  */
+ for (i = 0; i < N; i++)
+ {
+ if (out[i*7] != (in[i*7] + 5) * 3 - 2
+ || out[i*7 + 1] != (in[i*7 + 1] + 6) * 2 - 3
+ || out[i*7 + 2] != (in[i*7 + 2] + 7) * 12 - 2
+ || out[i*7 + 3] != (in[i*7 + 3] + 8) * 5 - 1
+ || out[i*7 + 4] != (in[i*7 + 4] + 9) * 8 - 8
+ || out[i*7 + 5] != (in[i*7 + 5] + 10) * 4 - 7
+ || out[i*7 + 6] != (in[i*7 + 6] + 11) * 3 - 3)
+ abort ();
+ }
+
+ /* SLP with unrolling by 4: group of three elements, add then multiply. */
+ for (i = 0; i < N*2; i++)
+ {
+ out[i*3] = (in[i*3] + 2) * 3;
+ out[i*3 + 1] = (in[i*3 + 1] + 2) * 7;
+ out[i*3 + 2] = (in[i*3 + 2] + 7) * 3;
+ }
+
+ /* check results: */
+ for (i = 0; i < N*2; i++)
+ {
+ if (out[i*3] != (in[i*3] + 2) * 3
+ || out[i*3 + 1] != (in[i*3 + 1] + 2) * 7
+ || out[i*3 + 2] != (in[i*3 + 2] + 7) * 3)
+ abort ();
+ }
+
+ /* SLP with unrolling by 4: group of three elements, unsigned int results converted to float. */
+ for (i = 0; i < N*2; i++)
+ {
+ out2[i*3] = (float) (in[i*3] * 2 + 5) ;
+ out2[i*3 + 1] = (float) (in[i*3 + 1] * 3 + 7);
+ out2[i*3 + 2] = (float) (in[i*3 + 2] * 5 + 4);
+ }
+
+ /* check results: */
+ for (i = 0; i < N*2; i++)
+ {
+ if (out2[i*3] != (float) (in[i*3] * 2 + 5)
+ || out2[i*3 + 1] != (float) (in[i*3 + 1] * 3 + 7)
+ || out2[i*3 + 2] != (float) (in[i*3 + 2] * 5 + 4))
+ abort ();
+ }
+
+
+ return 0;
+}
+
+int main (void) /* Test driver: calls check_vect (), then runs main1.  */
+{
+ check_vect (); /* Declared in tree-vect.h (not shown); presumably bails out when the target cannot run vector code -- confirm.  */
+
+ main1 (); /* Return value ignored; mismatches are reported through abort () inside main1.  */
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" {target {vect_intfloat_cvt && vect_int_mult} } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" {target {{! { vect_intfloat_cvt}} && vect_int_mult} } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" {target {{! { vect_intfloat_cvt}} && {! {vect_int_mult}}} } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" {target {vect_intfloat_cvt && vect_int_mult} } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" {target {{! { vect_intfloat_cvt}} && vect_int_mult} } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" {target {{! { vect_intfloat_cvt}} && {! {vect_int_mult}}} } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/testsuite/gcc.dg/vect/slp-34.c b/gcc/testsuite/gcc.dg/vect/slp-34.c
new file mode 100644
index 00000000000..d25eef02101
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-34.c
@@ -0,0 +1,61 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define N 8
+
+int
+main1 ()
+{ /* One loop with two independent store groups of three and five elements; returns 0, aborts on mismatch.  */
+ int i;
+ unsigned short out[N*8];
+ unsigned short in[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63};
+ unsigned short in2[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63};
+ unsigned short out2[N*8];
+
+ /* SLP with unrolling by 8: in -> out uses groups of three, in2 -> out2 groups of five. */
+ for (i = 0; i < N; i++)
+ {
+ out[i*3] = in[i*3] + 5;
+ out[i*3 + 1] = in[i*3 + 1] + 6;
+ out[i*3 + 2] = in[i*3 + 2] + 16;
+
+ out2[i*5] = in2[i*5] + 2;
+ out2[i*5 + 1] = in2[i*5 + 1] + 2;
+ out2[i*5 + 2] = in2[i*5 + 2] + 1;
+ out2[i*5 + 3] = in2[i*5 + 3] + 3;
+ out2[i*5 + 4] = in2[i*5 + 4] + 13;
+ }
+
+ /* Check both groups by recomputing each sum from the inputs.  */
+ for (i = 0; i < N; i++)
+ {
+ if (out[i*3] != in[i*3] + 5
+ || out[i*3 + 1] != in[i*3 + 1] + 6
+ || out[i*3 + 2] != in[i*3 + 2] + 16
+ || out2[i*5] != in2[i*5] + 2
+ || out2[i*5 + 1] != in2[i*5 + 1] + 2
+ || out2[i*5 + 2] != in2[i*5 + 2] + 1
+ || out2[i*5 + 3] != in2[i*5 + 3] + 3
+ || out2[i*5 + 4] != in2[i*5 + 4] + 13)
+ abort ();
+ }
+
+ return 0;
+}
+
+int main (void) /* Test driver: calls check_vect (), then runs main1.  */
+{
+ check_vect (); /* Declared in tree-vect.h (not shown); presumably bails out when the target cannot run vector code -- confirm.  */
+
+ main1 (); /* Return value ignored; mismatches are reported through abort () inside main1.  */
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/testsuite/gcc.dg/vect/slp-35.c b/gcc/testsuite/gcc.dg/vect/slp-35.c
new file mode 100644
index 00000000000..39a7089ae7b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-35.c
@@ -0,0 +1,73 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 128
+
+typedef struct { /* Record of five int fields; main1 reads and writes all five per array element.  */
+ int a;
+ int b;
+ int c;
+ int d;
+ int e;
+} s;
+
+int
+main1 (s *arr)
+{ /* Doubles every field of arr[0..N-1] into res and verifies the result; returns 0, aborts on mismatch.  */
+ int i;
+ s *ptr = arr;
+ s res[N];
+
+ /* SLP with unrolling by 4: five int fields per element, stored out of field order. */
+ for (i = 0; i < N; i++)
+ {
+ res[i].c = ptr->c + ptr->c;
+ res[i].a = ptr->a + ptr->a;
+ res[i].d = ptr->d + ptr->d;
+ res[i].b = ptr->b + ptr->b;
+ res[i].e = ptr->e + ptr->e;
+ ptr++;
+ }
+
+ /* check results: */
+ for (i = 0; i < N; i++)
+ {
+ if (res[i].c != arr[i].c + arr[i].c
+ || res[i].a != arr[i].a + arr[i].a
+ || res[i].d != arr[i].d + arr[i].d
+ || res[i].b != arr[i].b + arr[i].b
+ || res[i].e != arr[i].e + arr[i].e)
+ abort();
+ }
+ return 0; /* The function is declared int; do not fall off the end without a value.  */
+}
+
+int main (void) /* Test driver: builds the input array and hands it to main1.  */
+{
+ int i;
+ s arr[N];
+
+ check_vect (); /* Declared in tree-vect.h (not shown); presumably bails out when the target cannot run vector code -- confirm.  */
+
+ for (i = 0; i < N; i++)
+ {
+ arr[i].a = i;
+ arr[i].b = i * 2;
+ arr[i].c = 17;
+ arr[i].d = i+34;
+ arr[i].e = i * 3 + 5;
+ if (arr[i].a == 178) /* Never true for i < N (128); presumably keeps this setup loop from being vectorized and counted -- confirm.  */
+ abort();
+ }
+
+ main1 (arr); /* Return value ignored; mismatches are reported through abort () inside main1.  */
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail vect_no_align } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail vect_no_align } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/testsuite/gcc.dg/vect/slp-36.c b/gcc/testsuite/gcc.dg/vect/slp-36.c
new file mode 100644
index 00000000000..98d1473419e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-36.c
@@ -0,0 +1,75 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_shift } */
+
+#define N 32
+
+/* All the loops are vectorizable on platforms with vector shift argument. */
+
+void
+test_1 (void)
+{ /* Left shifts of adjacent element pairs by constant amounts.  */
+ static unsigned int bm[N];
+ static unsigned int cm[N];
+ int j;
+
+ /* Both elements of the pair use the same constant: vectorizable on platforms with scalar shift argument. */
+ for (j = 0; j < N/2; j++)
+ {
+ bm[2*j] <<= 8;
+ bm[2*j+1] <<= 8;
+ }
+
+ /* The two elements use different constants: not vectorizable on platforms with scalar shift argument. */
+ for (j = 0; j < N/2; j++)
+ {
+ cm[2*j] <<= 8;
+ cm[2*j+1] <<= 7;
+ }
+}
+
+void
+test_2 (int a, int b)
+{ /* Left shifts of adjacent element pairs by the loop-invariant parameters a and b.  */
+ static unsigned int bm[N];
+ static unsigned int cm[N];
+ int j;
+
+ /* Both elements of the pair are shifted by a: vectorizable on platforms with scalar shift argument. */
+ for (j = 0; j < N/2; j++)
+ {
+ bm[2*j] <<= a;
+ bm[2*j+1] <<= a;
+ }
+
+ /* One element is shifted by a, the other by b: not vectorizable on platforms with scalar shift argument. */
+ for (j = 0; j < N/2; j++)
+ {
+ cm[2*j] <<= a;
+ cm[2*j+1] <<= b;
+ }
+}
+
+void
+test_3 (void)
+{ /* Left shifts of adjacent element pairs by amounts loaded from the array am.  */
+ static unsigned int bm[N];
+ int am[N]; /* Never initialized; the file is marked dg-do compile, so this code is only compiled.  */
+ int j;
+
+ /* Both elements of the pair are shifted by am[j], which changes every iteration: not vectorizable on platforms with scalar shift argument. */
+ for (j = 0; j < N/2; j++)
+ {
+ bm[2*j] <<= am[j];
+ bm[2*j+1] <<= am[j];
+ }
+
+ /* Each element has its own shift amount: not vectorizable on platforms with scalar shift argument. */
+ for (j = 0; j < N/2; j++)
+ {
+ bm[2*j] <<= am[2*j];
+ bm[2*j+1] <<= am[2*j+1];
+ }
+
+}
+
+/* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-37.c b/gcc/testsuite/gcc.dg/vect/slp-37.c
new file mode 100644
index 00000000000..48642db96e7
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-37.c
@@ -0,0 +1,67 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdlib.h>
+#include "tree-vect.h"
+
+#define N 128
+
+typedef struct { /* Record mixing two int fields with a pointer field; foo1 stores a constant to each.  */
+ int a;
+ int b;
+ void *c;
+} s1;
+
+int
+foo1 (s1 *arr)
+{ /* Stores constants to all three fields of arr[0..N-1] and verifies them; returns 0, aborts on mismatch.  */
+ int i;
+ s1 *ptr = arr;
+
+ /* Different constant types - not SLPable. The group size is not power of 2,
+ interleaving is not supported either. */
+ for (i = 0; i < N; i++)
+ {
+ ptr->a = 6;
+ ptr->b = 7;
+ ptr->c = NULL;
+ ptr++;
+ }
+ /* check results: */
+ for (i = 0; i < N; i++)
+ {
+ if (arr[i].a != 6
+ || arr[i].b != 7
+ || arr[i].c != NULL)
+ abort();
+ }
+ return 0; /* The function is declared int; do not fall off the end without a value.  */
+}
+
+int main (void) /* Test driver: fills the array with non-constant values, then lets foo1 overwrite and check it.  */
+{
+ int i;
+ s1 arr1[N];
+
+ check_vect (); /* Declared in tree-vect.h (not shown); presumably bails out when the target cannot run vector code -- confirm.  */
+
+ for (i = 0; i < N; i++)
+ {
+ arr1[i].a = i;
+ arr1[i].b = i * 2;
+ arr1[i].c = (void *)arr1;
+
+ if (arr1[i].a == 178) /* Never true for i < N (128); presumably keeps this setup loop from being vectorized and counted -- confirm.  */
+ abort();
+ }
+
+
+ foo1 (arr1); /* Return value ignored; mismatches are reported through abort () inside foo1.  */
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 0 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/testsuite/gcc.dg/vect/slp-4.c b/gcc/testsuite/gcc.dg/vect/slp-4.c
new file mode 100644
index 00000000000..e1353584fd2
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-4.c
@@ -0,0 +1,128 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define N 16
+
+int
+main1 ()
+{ /* Copies unsigned short groups of 8, 4 and 16 elements, each loop also storing one unsigned int per iteration; returns 0, aborts on mismatch.  */
+ int i;
+ unsigned short out[N*8];
+ unsigned short in[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63};
+ unsigned int ia[N*2];
+ /* Group of eight unsigned short copies plus one unsigned int constant store.  */
+ for (i = 0; i < N; i++)
+ {
+ out[i*8] = in[i*8];
+ out[i*8 + 1] = in[i*8 + 1];
+ out[i*8 + 2] = in[i*8 + 2];
+ out[i*8 + 3] = in[i*8 + 3];
+ out[i*8 + 4] = in[i*8 + 4];
+ out[i*8 + 5] = in[i*8 + 5];
+ out[i*8 + 6] = in[i*8 + 6];
+ out[i*8 + 7] = in[i*8 + 7];
+
+ ia[i] = 7;
+ }
+
+ /* check results: */
+ for (i = 0; i < N; i++)
+ {
+ if (out[i*8] != in[i*8]
+ || out[i*8 + 1] != in[i*8 + 1]
+ || out[i*8 + 2] != in[i*8 + 2]
+ || out[i*8 + 3] != in[i*8 + 3]
+ || out[i*8 + 4] != in[i*8 + 4]
+ || out[i*8 + 5] != in[i*8 + 5]
+ || out[i*8 + 6] != in[i*8 + 6]
+ || out[i*8 + 7] != in[i*8 + 7]
+ || ia[i] != 7)
+ abort ();
+ }
+ /* Group of four unsigned short copies plus one unsigned int constant store.  */
+ for (i = 0; i < N*2; i++)
+ {
+ out[i*4] = in[i*4];
+ out[i*4 + 1] = in[i*4 + 1];
+ out[i*4 + 2] = in[i*4 + 2];
+ out[i*4 + 3] = in[i*4 + 3];
+
+ ia[i] = 12;
+ }
+
+ /* check results: */
+ for (i = 0; i < N*2; i++)
+ {
+ if (out[i*4] != in[i*4]
+ || out[i*4 + 1] != in[i*4 + 1]
+ || out[i*4 + 2] != in[i*4 + 2]
+ || out[i*4 + 3] != in[i*4 + 3]
+ || ia[i] != 12)
+ abort ();
+ }
+ /* Group of sixteen unsigned short copies plus one unsigned int constant store.  */
+ for (i = 0; i < N/2; i++)
+ {
+ out[i*16] = in[i*16];
+ out[i*16 + 1] = in[i*16 + 1];
+ out[i*16 + 2] = in[i*16 + 2];
+ out[i*16 + 3] = in[i*16 + 3];
+ out[i*16 + 4] = in[i*16 + 4];
+ out[i*16 + 5] = in[i*16 + 5];
+ out[i*16 + 6] = in[i*16 + 6];
+ out[i*16 + 7] = in[i*16 + 7];
+ out[i*16 + 8] = in[i*16 + 8];
+ out[i*16 + 9] = in[i*16 + 9];
+ out[i*16 + 10] = in[i*16 + 10];
+ out[i*16 + 11] = in[i*16 + 11];
+ out[i*16 + 12] = in[i*16 + 12];
+ out[i*16 + 13] = in[i*16 + 13];
+ out[i*16 + 14] = in[i*16 + 14];
+ out[i*16 + 15] = in[i*16 + 15];
+
+ ia[i] = 21;
+ }
+
+ /* check results: */
+ for (i = 0; i < N/2; i++)
+ {
+ if (out[i*16] != in[i*16]
+ || out[i*16 + 1] != in[i*16 + 1]
+ || out[i*16 + 2] != in[i*16 + 2]
+ || out[i*16 + 3] != in[i*16 + 3]
+ || out[i*16 + 4] != in[i*16 + 4]
+ || out[i*16 + 5] != in[i*16 + 5]
+ || out[i*16 + 6] != in[i*16 + 6]
+ || out[i*16 + 7] != in[i*16 + 7]
+ || out[i*16 + 8] != in[i*16 + 8]
+ || out[i*16 + 9] != in[i*16 + 9]
+ || out[i*16 + 10] != in[i*16 + 10]
+ || out[i*16 + 11] != in[i*16 + 11]
+ || out[i*16 + 12] != in[i*16 + 12]
+ || out[i*16 + 13] != in[i*16 + 13]
+ || out[i*16 + 14] != in[i*16 + 14]
+ || out[i*16 + 15] != in[i*16 + 15]
+ || ia[i] != 21)
+ abort ();
+ }
+
+
+ return 0;
+}
+
+int main (void) /* Test driver: calls check_vect (), then runs main1.  */
+{
+ check_vect (); /* Declared in tree-vect.h (not shown); presumably bails out when the target cannot run vector code -- confirm.  */
+
+ main1 (); /* Return value ignored; mismatches are reported through abort () inside main1.  */
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/testsuite/gcc.dg/vect/slp-5.c b/gcc/testsuite/gcc.dg/vect/slp-5.c
new file mode 100644
index 00000000000..0f9c2eefb21
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-5.c
@@ -0,0 +1,128 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define N 16
+
+int
+main1 ()
+{ /* Copies unsigned int groups of 8, 4 and 16 elements; the first two loops also store one extra element per iteration; returns 0, aborts on mismatch.  */
+ int i;
+ unsigned int out[N*8];
+ unsigned int in[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63};
+ unsigned short ia[N];
+ unsigned int ib[N*2];
+
+ /* Not SLPable for now: multiple types with SLP of the smaller type (ia is unsigned short, out is unsigned int). */
+ for (i = 0; i < N; i++)
+ {
+ out[i*8] = in[i*8];
+ out[i*8 + 1] = in[i*8 + 1];
+ out[i*8 + 2] = in[i*8 + 2];
+ out[i*8 + 3] = in[i*8 + 3];
+ out[i*8 + 4] = in[i*8 + 4];
+ out[i*8 + 5] = in[i*8 + 5];
+ out[i*8 + 6] = in[i*8 + 6];
+ out[i*8 + 7] = in[i*8 + 7];
+
+ ia[i] = 7;
+ }
+
+ /* check results: */
+ for (i = 0; i < N; i++)
+ {
+ if (out[i*8] != in[i*8]
+ || out[i*8 + 1] != in[i*8 + 1]
+ || out[i*8 + 2] != in[i*8 + 2]
+ || out[i*8 + 3] != in[i*8 + 3]
+ || out[i*8 + 4] != in[i*8 + 4]
+ || out[i*8 + 5] != in[i*8 + 5]
+ || out[i*8 + 6] != in[i*8 + 6]
+ || out[i*8 + 7] != in[i*8 + 7]
+ || ia[i] != 7)
+ abort ();
+ }
+ /* Group of four copies plus one store to ib, which has the same element type as out.  */
+ for (i = 0; i < N*2; i++)
+ {
+ out[i*4] = in[i*4];
+ out[i*4 + 1] = in[i*4 + 1];
+ out[i*4 + 2] = in[i*4 + 2];
+ out[i*4 + 3] = in[i*4 + 3];
+
+ ib[i] = 12;
+ }
+
+ /* check results: */
+ for (i = 0; i < N*2; i++)
+ {
+ if (out[i*4] != in[i*4]
+ || out[i*4 + 1] != in[i*4 + 1]
+ || out[i*4 + 2] != in[i*4 + 2]
+ || out[i*4 + 3] != in[i*4 + 3]
+ || ib[i] != 12)
+ abort ();
+ }
+ /* Group of sixteen copies with no additional store.  */
+ for (i = 0; i < N/2; i++)
+ {
+ out[i*16] = in[i*16];
+ out[i*16 + 1] = in[i*16 + 1];
+ out[i*16 + 2] = in[i*16 + 2];
+ out[i*16 + 3] = in[i*16 + 3];
+ out[i*16 + 4] = in[i*16 + 4];
+ out[i*16 + 5] = in[i*16 + 5];
+ out[i*16 + 6] = in[i*16 + 6];
+ out[i*16 + 7] = in[i*16 + 7];
+ out[i*16 + 8] = in[i*16 + 8];
+ out[i*16 + 9] = in[i*16 + 9];
+ out[i*16 + 10] = in[i*16 + 10];
+ out[i*16 + 11] = in[i*16 + 11];
+ out[i*16 + 12] = in[i*16 + 12];
+ out[i*16 + 13] = in[i*16 + 13];
+ out[i*16 + 14] = in[i*16 + 14];
+ out[i*16 + 15] = in[i*16 + 15];
+ }
+
+ /* check results: */
+ for (i = 0; i < N/2; i++)
+ {
+ if (out[i*16] != in[i*16]
+ || out[i*16 + 1] != in[i*16 + 1]
+ || out[i*16 + 2] != in[i*16 + 2]
+ || out[i*16 + 3] != in[i*16 + 3]
+ || out[i*16 + 4] != in[i*16 + 4]
+ || out[i*16 + 5] != in[i*16 + 5]
+ || out[i*16 + 6] != in[i*16 + 6]
+ || out[i*16 + 7] != in[i*16 + 7]
+ || out[i*16 + 8] != in[i*16 + 8]
+ || out[i*16 + 9] != in[i*16 + 9]
+ || out[i*16 + 10] != in[i*16 + 10]
+ || out[i*16 + 11] != in[i*16 + 11]
+ || out[i*16 + 12] != in[i*16 + 12]
+ || out[i*16 + 13] != in[i*16 + 13]
+ || out[i*16 + 14] != in[i*16 + 14]
+ || out[i*16 + 15] != in[i*16 + 15])
+ abort ();
+ }
+
+
+ return 0;
+}
+
+int main (void) /* Test driver: calls check_vect (), then runs main1.  */
+{
+ check_vect (); /* Declared in tree-vect.h (not shown); presumably bails out when the target cannot run vector code -- confirm.  */
+
+ main1 (); /* Return value ignored; mismatches are reported through abort () inside main1.  */
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { target { vect_strided } } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { target { ! { vect_strided } } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/testsuite/gcc.dg/vect/slp-6.c b/gcc/testsuite/gcc.dg/vect/slp-6.c
new file mode 100644
index 00000000000..5e86410588a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-6.c
@@ -0,0 +1,122 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define N 8
+
+int
+main1 ()
+{
+ int i;
+ unsigned short out[N*8];
+ unsigned short in[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63};
+ unsigned int in2[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63};
+ unsigned int out2[N*8];
+
+ for (i = 0; i < N; i++)
+ {
+ out[i*8] = in[i*8] + 5;
+ out[i*8 + 1] = in[i*8 + 1] + 6;
+ out[i*8 + 2] = in[i*8 + 2] + 7;
+ out[i*8 + 3] = in[i*8 + 3] + 8;
+ out[i*8 + 4] = in[i*8 + 4] + 9;
+ out[i*8 + 5] = in[i*8 + 5] + 10;
+ out[i*8 + 6] = in[i*8 + 6] + 11;
+ out[i*8 + 7] = in[i*8 + 7] + 12;
+ }
+
+ /* check results: */
+ for (i = 0; i < N; i++)
+ {
+ if (out[i*8] != in[i*8] + 5
+ || out[i*8 + 1] != in[i*8 + 1] + 6
+ || out[i*8 + 2] != in[i*8 + 2] + 7
+ || out[i*8 + 3] != in[i*8 + 3] + 8
+ || out[i*8 + 4] != in[i*8 + 4] + 9
+ || out[i*8 + 5] != in[i*8 + 5] + 10
+ || out[i*8 + 6] != in[i*8 + 6] + 11
+ || out[i*8 + 7] != in[i*8 + 7] + 12)
+ abort ();
+ }
+
+ for (i = 0; i < N*2; i++)
+ {
+ out[i*4] = in[i*4] + 2;
+ out[i*4 + 1] = in[i*4 + 1] + 2;
+ out[i*4 + 2] = in[i*4 + 2] + 1;
+ out[i*4 + 3] = in[i*4 + 3] + 3;
+ }
+
+ /* check results: */
+ for (i = 0; i < N*2; i++)
+ {
+ if (out[i*4] != in[i*4] + 2
+ || out[i*4 + 1] != in[i*4 + 1] + 2
+ || out[i*4 + 2] != in[i*4 + 2] + 1
+ || out[i*4 + 3] != in[i*4 + 3] + 3)
+ abort ();
+ }
+
+ for (i = 0; i < N/2; i++)
+ {
+ out2[i*16] = in2[i*16] * 2;
+ out2[i*16 + 1] = in2[i*16 + 1] * 3;
+ out2[i*16 + 2] = in2[i*16 + 2] * 4;
+ out2[i*16 + 3] = in2[i*16 + 3] * 3;
+ out2[i*16 + 4] = in2[i*16 + 4] * 2;
+ out2[i*16 + 5] = in2[i*16 + 5] * 3;
+ out2[i*16 + 6] = in2[i*16 + 6] * 2;
+ out2[i*16 + 7] = in2[i*16 + 7] * 4;
+ out2[i*16 + 8] = in2[i*16 + 8] * 2;
+ out2[i*16 + 9] = in2[i*16 + 9] * 5;
+ out2[i*16 + 10] = in2[i*16 + 10] * 2;
+ out2[i*16 + 11] = in2[i*16 + 11] * 3;
+ out2[i*16 + 12] = in2[i*16 + 12] * 4;
+ out2[i*16 + 13] = in2[i*16 + 13] * 4;
+ out2[i*16 + 14] = in2[i*16 + 14] * 3;
+ out2[i*16 + 15] = in2[i*16 + 15] * 2;
+ }
+
+ /* check results: */
+ for (i = 0; i < N/2; i++)
+ {
+ if (out2[i*16] != in2[i*16] * 2
+ || out2[i*16 + 1] != in2[i*16 + 1] * 3
+ || out2[i*16 + 2] != in2[i*16 + 2] * 4
+ || out2[i*16 + 3] != in2[i*16 + 3] * 3
+ || out2[i*16 + 4] != in2[i*16 + 4] * 2
+ || out2[i*16 + 5] != in2[i*16 + 5] * 3
+ || out2[i*16 + 6] != in2[i*16 + 6] * 2
+ || out2[i*16 + 7] != in2[i*16 + 7] * 4
+ || out2[i*16 + 8] != in2[i*16 + 8] * 2
+ || out2[i*16 + 9] != in2[i*16 + 9] * 5
+ || out2[i*16 + 10] != in2[i*16 + 10] * 2
+ || out2[i*16 + 11] != in2[i*16 + 11] * 3
+ || out2[i*16 + 12] != in2[i*16 + 12] * 4
+ || out2[i*16 + 13] != in2[i*16 + 13] * 4
+ || out2[i*16 + 14] != in2[i*16 + 14] * 3
+ || out2[i*16 + 15] != in2[i*16 + 15] * 2)
+ abort ();
+ }
+
+
+ return 0;
+}
+
+int main (void)
+{
+ check_vect ();
+
+ main1 ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" {target vect_int_mult} } } */
+/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" {target { ! { vect_int_mult } } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" {target vect_int_mult } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" {target { ! { vect_int_mult } } } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/testsuite/gcc.dg/vect/slp-7.c b/gcc/testsuite/gcc.dg/vect/slp-7.c
new file mode 100644
index 00000000000..4ee7029af0e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-7.c
@@ -0,0 +1,127 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define N 8
+
+int
+main1 ()
+{
+ int i;
+ unsigned int out[N*8], ia[N*2];
+ unsigned int in[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63};
+ unsigned short in2[N*16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63};
+ unsigned short sa[N], out2[N*16];
+
+ for (i = 0; i < N; i++)
+ {
+ out[i*8] = in[i*8] + 5;
+ out[i*8 + 1] = in[i*8 + 1] + 6;
+ out[i*8 + 2] = in[i*8 + 2] + 7;
+ out[i*8 + 3] = in[i*8 + 3] + 8;
+ out[i*8 + 4] = in[i*8 + 4] + 9;
+ out[i*8 + 5] = in[i*8 + 5] + 10;
+ out[i*8 + 6] = in[i*8 + 6] + 11;
+ out[i*8 + 7] = in[i*8 + 7] + 12;
+
+ ia[i] = in[i];
+ }
+
+ /* check results: */
+ for (i = 0; i < N; i++)
+ {
+ if (out[i*8] != in[i*8] + 5
+ || out[i*8 + 1] != in[i*8 + 1] + 6
+ || out[i*8 + 2] != in[i*8 + 2] + 7
+ || out[i*8 + 3] != in[i*8 + 3] + 8
+ || out[i*8 + 4] != in[i*8 + 4] + 9
+ || out[i*8 + 5] != in[i*8 + 5] + 10
+ || out[i*8 + 6] != in[i*8 + 6] + 11
+ || out[i*8 + 7] != in[i*8 + 7] + 12
+ || ia[i] != in[i])
+ abort ();
+ }
+
+ for (i = 0; i < N*2; i++)
+ {
+ out[i*4] = in[i*4] + 1;
+ out[i*4 + 1] = in[i*4 + 1] + 2;
+ out[i*4 + 2] = in[i*4 + 2] + 3;
+ out[i*4 + 3] = in[i*4 + 3] + 4;
+
+ ia[i] = in[i];
+ }
+
+ /* check results: */
+ for (i = 0; i < N*2; i++)
+ {
+ if (out[i*4] != in[i*4] + 1
+ || out[i*4 + 1] != in[i*4 + 1] + 2
+ || out[i*4 + 2] != in[i*4 + 2] + 3
+ || out[i*4 + 3] != in[i*4 + 3] + 4
+ || ia[i] != in[i])
+ abort ();
+ }
+
+ for (i = 0; i < N; i++)
+ {
+ out2[i*16] = in2[i*16] * 2;
+ out2[i*16 + 1] = in2[i*16 + 1] * 3;
+ out2[i*16 + 2] = in2[i*16 + 2] * 4;
+ out2[i*16 + 3] = in2[i*16 + 3] * 3;
+ out2[i*16 + 4] = in2[i*16 + 4] * 2;
+ out2[i*16 + 5] = in2[i*16 + 5] * 3;
+ out2[i*16 + 6] = in2[i*16 + 6] * 2;
+ out2[i*16 + 7] = in2[i*16 + 7] * 4;
+ out2[i*16 + 8] = in2[i*16 + 8] * 2;
+ out2[i*16 + 9] = in2[i*16 + 9] * 5;
+ out2[i*16 + 10] = in2[i*16 + 10] * 2;
+ out2[i*16 + 11] = in2[i*16 + 11] * 3;
+ out2[i*16 + 12] = in2[i*16 + 12] * 4;
+ out2[i*16 + 13] = in2[i*16 + 13] * 4;
+ out2[i*16 + 14] = in2[i*16 + 14] * 3;
+ out2[i*16 + 15] = in2[i*16 + 15] * 2;
+ }
+
+ /* check results: */
+ for (i = 0; i < N; i++)
+ {
+ if (out2[i*16] != in2[i*16] * 2
+ || out2[i*16 + 1] != in2[i*16 + 1] * 3
+ || out2[i*16 + 2] != in2[i*16 + 2] * 4
+ || out2[i*16 + 3] != in2[i*16 + 3] * 3
+ || out2[i*16 + 4] != in2[i*16 + 4] * 2
+ || out2[i*16 + 5] != in2[i*16 + 5] * 3
+ || out2[i*16 + 6] != in2[i*16 + 6] * 2
+ || out2[i*16 + 7] != in2[i*16 + 7] * 4
+ || out2[i*16 + 8] != in2[i*16 + 8] * 2
+ || out2[i*16 + 9] != in2[i*16 + 9] * 5
+ || out2[i*16 + 10] != in2[i*16 + 10] * 2
+ || out2[i*16 + 11] != in2[i*16 + 11] * 3
+ || out2[i*16 + 12] != in2[i*16 + 12] * 4
+ || out2[i*16 + 13] != in2[i*16 + 13] * 4
+ || out2[i*16 + 14] != in2[i*16 + 14] * 3
+ || out2[i*16 + 15] != in2[i*16 + 15] * 2)
+ abort ();
+ }
+
+
+ return 0;
+}
+
+int main (void)
+{
+ check_vect ();
+
+ main1 ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" {target { vect_strided && vect_int_mult } } } }*/
+/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" {target { ! { vect_strided && vect_int_mult } } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/testsuite/gcc.dg/vect/slp-8.c b/gcc/testsuite/gcc.dg/vect/slp-8.c
new file mode 100644
index 00000000000..1260ddce504
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-8.c
@@ -0,0 +1,45 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 32
+
+int main1 ()
+{
+ int i;
+ int ib[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45};
+ float fa[N];
+
+ /* int -> float */
+ for (i = 0; i < N/4; i++)
+ {
+ fa[4*i] = (float) ib[4*i];
+ fa[4*i + 1] = (float) ib[4*i + 1];
+ fa[4*i + 2] = (float) ib[4*i + 2];
+ fa[4*i + 3] = (float) ib[4*i + 3];
+ }
+
+ /* check results: */
+ for (i = 0; i < N/4; i++)
+ {
+ if (fa[4*i] != (float) ib[4*i]
+ || fa[4*i + 1] != (float) ib[4*i + 1]
+ || fa[4*i + 2] != (float) ib[4*i + 2]
+ || fa[4*i + 3] != (float) ib[4*i + 3])
+ abort ();
+ }
+
+ return 0;
+}
+
+int main (void)
+{
+ check_vect ();
+
+ return main1 ();
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target powerpc*-*-* i?86-*-* x86_64-*-* } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target powerpc*-*-* i?86-*-* x86_64-*-* } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-9.c b/gcc/testsuite/gcc.dg/vect/slp-9.c
new file mode 100644
index 00000000000..cfb30bd718b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-9.c
@@ -0,0 +1,47 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 64
+
+short X[N] __attribute__ ((__aligned__(16)));
+short Y[N] __attribute__ ((__aligned__(16)));
+int result[N];
+
+/* short->int widening-mult */
+int
+foo1(int len) {
+ int i;
+
+ for (i=0; i<len/2; i++) {
+ result[2*i] = X[2*i] * Y[2*i];
+ result[2*i+1] = X[2*i+1] * Y[2*i+1];
+ }
+}
+
+int main (void)
+{
+ int i;
+
+ check_vect ();
+
+ for (i=0; i<N; i++) {
+ X[i] = i;
+ Y[i] = 64-i;
+ }
+
+ foo1 (N);
+
+ for (i=0; i<N; i++) {
+ if (result[i] != X[i] * Y[i])
+ abort ();
+ }
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { vect_strided && vect_widen_mult_hi_to_si } } } }*/
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/testsuite/gcc.dg/vect/vect-vfa-03.c b/gcc/testsuite/gcc.dg/vect/vect-vfa-03.c
index 53d781677ae..7d684aa1ff0 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-vfa-03.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-vfa-03.c
@@ -10,9 +10,9 @@ struct S
unsigned short b;
};
-struct S result[N] = {12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18,
- 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24,
- 24, 25, 25, 26, 26, 27, 27, 28};
+struct S result[N] = {20, 13, 22, 14, 24, 15, 26, 16, 28, 17, 30, 18,
+ 32, 19, 34, 20, 36, 21, 38, 22, 40, 23, 42, 24,
+ 44, 25, 46, 26, 48, 27, 50, 28};
struct S X[N] = {10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16,
16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22,
23, 23, 24, 24, 25, 25};
@@ -25,7 +25,7 @@ foo (struct S * in, struct S * out)
for (i = 0; i < N; i++)
{
- out[i].a = in[i].a + 2;
+ out[i].a = in[i].a * 2;
out[i].b = in[i].b + 3;
}
}
@@ -42,10 +42,10 @@ main (void)
/* check results: */
for (i = 0; i < N; i++)
{
- if (Y[i].a != result[i].a)
+ if (Y[i].a != result[i].a)
abort ();
- if (Y[i].b != result[i].b)
+ if (Y[i].b != result[i].b)
abort ();
}
diff --git a/gcc/testsuite/gcc.dg/vect/vect-vfa-slp.c b/gcc/testsuite/gcc.dg/vect/vect-vfa-slp.c
new file mode 100644
index 00000000000..27560c72d9d
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-vfa-slp.c
@@ -0,0 +1,56 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 16
+struct S
+{
+ unsigned short a;
+ unsigned short b;
+};
+
+struct S result[N] = {12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18,
+ 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24,
+ 24, 25, 25, 26, 26, 27, 27, 28};
+struct S X[N] = {10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16,
+ 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22,
+ 23, 23, 24, 24, 25, 25};
+struct S Y[N] = {};
+
+__attribute__ ((noinline)) void
+foo (struct S * in, struct S * out)
+{
+ int i;
+
+ for (i = 0; i < N; i++)
+ {
+ out[i].a = in[i].a + 2;
+ out[i].b = in[i].b + 3;
+ }
+}
+
+int
+main (void)
+{
+ int i;
+
+ check_vect ();
+
+ foo (X, Y);
+
+ /* check results: */
+ for (i = 0; i < N; i++)
+ {
+ if (Y[i].a != result[i].a)
+ abort ();
+
+ if (Y[i].b != result[i].b)
+ abort ();
+
+ }
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect.exp b/gcc/testsuite/gcc.dg/vect/vect.exp
index fba5aa8f591..e9be793621e 100644
--- a/gcc/testsuite/gcc.dg/vect/vect.exp
+++ b/gcc/testsuite/gcc.dg/vect/vect.exp
@@ -108,6 +108,8 @@ dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/pr*.\[cS\]]] \
"" $DEFAULT_VECTCFLAGS
dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/vect-*.\[cS\]]] \
"" $DEFAULT_VECTCFLAGS
+dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/slp-*.\[cS\]]] \
+ "" $DEFAULT_VECTCFLAGS
#### Tests with special options
global SAVED_DEFAULT_VECTCFLAGS
@@ -122,25 +124,25 @@ dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-vfa-*.\[cS\]]] \
# -ffast-math tests
set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS
lappend DEFAULT_VECTCFLAGS "-ffast-math"
-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/fast-math-vect*.\[cS\]]] \
+dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/fast-math-*.\[cS\]]] \
"" $DEFAULT_VECTCFLAGS
# -fno-math-errno tests
set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS
lappend DEFAULT_VECTCFLAGS "-fno-math-errno"
-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-math-errno-vect*.\[cS\]]] \
+dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-math-errno-*.\[cS\]]] \
"" $DEFAULT_VECTCFLAGS
# -fwrapv tests
set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS
lappend DEFAULT_VECTCFLAGS "-fwrapv"
-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/wrapv-vect*.\[cS\]]] \
+dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/wrapv-*.\[cS\]]] \
"" $DEFAULT_VECTCFLAGS
# -ftrapv tests
set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS
lappend DEFAULT_VECTCFLAGS "-ftrapv"
-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/trapv-vect*.\[cS\]]] \
+dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/trapv-*.\[cS\]]] \
"" $DEFAULT_VECTCFLAGS
# -fdump-tree-dceloop-details tests
@@ -197,12 +199,24 @@ lappend DEFAULT_VECTCFLAGS "-fno-tree-scev-cprop" "-fno-tree-reassoc"
dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-scevccp-noreassoc-*.\[cS\]]] \
"" $DEFAULT_VECTCFLAGS
+# -fno-tree-scev-cprop
+set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS
+lappend DEFAULT_VECTCFLAGS "-fno-tree-scev-cprop"
+dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-scevccp-slp-*.\[cS\]]] \
+ "" $DEFAULT_VECTCFLAGS
+
# -fno-tree-dominator-opts
set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS
lappend DEFAULT_VECTCFLAGS "-fno-tree-dominator-opts"
dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-tree-dom-*.\[cS\]]] \
"" $DEFAULT_VECTCFLAGS
+# -fno-tree-pre
+set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS
+lappend DEFAULT_VECTCFLAGS "-fno-tree-pre"
+dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-tree-pre-*.\[cS\]]] \
+ "" $DEFAULT_VECTCFLAGS
+
# With -Os
set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS
lappend DEFAULT_VECTCFLAGS "-Os"
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index fdf25444f9d..55e2a8a4c30 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -2043,7 +2043,7 @@ proc check_effective_target_vect_no_align { } {
set et_vect_no_align_saved 0
if { [istarget mipsisa64*-*-*]
|| [istarget sparc*-*-*]
- || [istarget ia64-*-*] } {
+ || [istarget ia64-*-*] } {
set et_vect_no_align_saved 1
}
}
@@ -2255,6 +2255,24 @@ proc check_effective_target_vect_interleave { } {
return $et_vect_interleave_saved
}
+# Return 1 if the target supports vector interleaving and extract even/odd, 0 otherwise.
+proc check_effective_target_vect_strided { } {
+ global et_vect_strided_saved
+
+ if [info exists et_vect_strided_saved] {
+ verbose "check_effective_target_vect_strided: using cached result" 2
+ } else {
+ set et_vect_strided_saved 0
+ if { [check_effective_target_vect_interleave]
+ && [check_effective_target_vect_extract_even_odd] } {
+ set et_vect_strided_saved 1
+ }
+ }
+
+ verbose "check_effective_target_vect_strided: returning $et_vect_strided_saved" 2
+ return $et_vect_strided_saved
+}
+
# Return 1 if the target supports section-anchors
proc check_effective_target_section_anchors { } {
diff --git a/gcc/tree-vect-analyze.c b/gcc/tree-vect-analyze.c
index a37fcf4395b..684d12dfcb4 100644
--- a/gcc/tree-vect-analyze.c
+++ b/gcc/tree-vect-analyze.c
@@ -39,6 +39,7 @@ along with GCC; see the file COPYING3. If not see
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "toplev.h"
+#include "recog.h"
/* Main analysis functions. */
static loop_vec_info vect_analyze_loop_form (struct loop *);
@@ -300,6 +301,30 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
}
+/* SLP costs are calculated according to SLP instance unrolling factor (i.e.,
+ the number of created vector stmts depends on the unrolling factor). However,
+ the actual number of vector stmts for every SLP node depends on VF which is
+ set later in vect_analyze_operations(). Hence, SLP costs should be updated.
+ In this function we assume that the inside costs calculated in
+ vect_model_xxx_cost are linear in ncopies. */
+
+static void
+vect_update_slp_costs_according_to_vf (loop_vec_info loop_vinfo)
+{
+ unsigned int i, vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+ VEC (slp_instance, heap) *slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
+ slp_instance instance;
+
+ if (vect_print_dump_info (REPORT_SLP))
+ fprintf (vect_dump, "=== vect_update_slp_costs_according_to_vf ===");
+
+ for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
+ /* We assume that costs are linear in ncopies. */
+ SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance) *= vf
+ / SLP_INSTANCE_UNROLLING_FACTOR (instance);
+}
+
+
/* Function vect_analyze_operations.
Scan the loop stmts and make sure they are all vectorizable. */
@@ -320,6 +345,7 @@ vect_analyze_operations (loop_vec_info loop_vinfo)
int min_profitable_iters;
int min_scalar_loop_bound;
unsigned int th;
+ bool only_slp_in_loop = true;
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "=== vect_analyze_operations ===");
@@ -456,12 +482,12 @@ vect_analyze_operations (loop_vec_info loop_vinfo)
ok = (vectorizable_type_promotion (stmt, NULL, NULL)
|| vectorizable_type_demotion (stmt, NULL, NULL)
- || vectorizable_conversion (stmt, NULL, NULL)
- || vectorizable_operation (stmt, NULL, NULL)
- || vectorizable_assignment (stmt, NULL, NULL)
- || vectorizable_load (stmt, NULL, NULL)
+ || vectorizable_conversion (stmt, NULL, NULL, NULL)
+ || vectorizable_operation (stmt, NULL, NULL, NULL)
+ || vectorizable_assignment (stmt, NULL, NULL, NULL)
+ || vectorizable_load (stmt, NULL, NULL, NULL)
|| vectorizable_call (stmt, NULL, NULL)
- || vectorizable_store (stmt, NULL, NULL)
+ || vectorizable_store (stmt, NULL, NULL, NULL)
|| vectorizable_condition (stmt, NULL, NULL)
|| vectorizable_reduction (stmt, NULL, NULL));
@@ -480,6 +506,30 @@ vect_analyze_operations (loop_vec_info loop_vinfo)
}
return false;
}
+
+ if (!PURE_SLP_STMT (stmt_info))
+ {
+ /* STMT needs loop-based vectorization. */
+ only_slp_in_loop = false;
+
+ /* Groups of strided accesses whose size is not a power of 2 are
+ not vectorizable yet using loop-vectorization. Therefore, if
+ this stmt feeds non-SLP-able stmts (i.e., this stmt has to be
+ both SLPed and loop-based vectorized), the loop cannot be
+ vectorized. */
+ if (STMT_VINFO_STRIDED_ACCESS (stmt_info)
+ && exact_log2 (DR_GROUP_SIZE (vinfo_for_stmt (
+ DR_GROUP_FIRST_DR (stmt_info)))) == -1)
+ {
+ if (vect_print_dump_info (REPORT_DETAILS))
+ {
+ fprintf (vect_dump, "not vectorized: the size of group "
+ "of strided accesses is not a power of 2");
+ print_generic_expr (vect_dump, stmt, TDF_SLIM);
+ }
+ return false;
+ }
+ }
} /* stmts in bb */
} /* bbs */
@@ -499,6 +549,18 @@ vect_analyze_operations (loop_vec_info loop_vinfo)
return false;
}
+ /* If all the stmts in the loop can be SLPed, we perform only SLP, and
+ vectorization factor of the loop is the unrolling factor required by the
+ SLP instances. If that unrolling factor is 1, we say that we perform
+ pure SLP on the loop - cross-iteration parallelism is not exploited. */
+ if (only_slp_in_loop)
+ vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
+ else
+ vectorization_factor = least_common_multiple (vectorization_factor,
+ LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
+
+ LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
+
if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
&& vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump,
@@ -518,6 +580,10 @@ vect_analyze_operations (loop_vec_info loop_vinfo)
/* Analyze cost. Decide if worth while to vectorize. */
+ /* Once VF is set, SLP costs should be updated since the number of created
+ vector stmts depends on VF. */
+ vect_update_slp_costs_according_to_vf (loop_vinfo);
+
min_profitable_iters = vect_estimate_min_profitable_iters (loop_vinfo);
LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
if (min_profitable_iters < 0)
@@ -1462,9 +1528,9 @@ vect_update_misalignment_for_peel (struct data_reference *dr,
/* For interleaved data accesses the step in the loop must be multiplied by
the size of the interleaving group. */
- if (DR_GROUP_FIRST_DR (stmt_info))
+ if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
dr_size *= DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_DR (stmt_info)));
- if (DR_GROUP_FIRST_DR (peel_stmt_info))
+ if (STMT_VINFO_STRIDED_ACCESS (peel_stmt_info))
dr_peel_size *= DR_GROUP_SIZE (peel_stmt_info);
/* It can be assumed that the data refs with the same alignment as dr_peel
@@ -1516,7 +1582,7 @@ vect_verify_datarefs_alignment (loop_vec_info loop_vinfo)
stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
/* For interleaving, only the alignment of the first access matters. */
- if (DR_GROUP_FIRST_DR (stmt_info)
+ if (STMT_VINFO_STRIDED_ACCESS (stmt_info)
&& DR_GROUP_FIRST_DR (stmt_info) != stmt)
continue;
@@ -1554,7 +1620,7 @@ vector_alignment_reachable_p (struct data_reference *dr)
stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
tree vectype = STMT_VINFO_VECTYPE (stmt_info);
- if (DR_GROUP_FIRST_DR (stmt_info))
+ if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
{
/* For interleaved access we peel only if number of iterations in
the prolog loop ({VF - misalignment}), is a multiple of the
@@ -1768,7 +1834,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
/* For interleaving, only the alignment of the first access
matters. */
- if (DR_GROUP_FIRST_DR (stmt_info)
+ if (STMT_VINFO_STRIDED_ACCESS (stmt_info)
&& DR_GROUP_FIRST_DR (stmt_info) != stmt)
continue;
@@ -1818,7 +1884,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
members of the group, therefore we divide the number of iterations
by the group size. */
stmt_info = vinfo_for_stmt (DR_STMT (dr0));
- if (DR_GROUP_FIRST_DR (stmt_info))
+ if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
npeel /= DR_GROUP_SIZE (stmt_info);
if (vect_print_dump_info (REPORT_DETAILS))
@@ -1837,7 +1903,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
stmt_info = vinfo_for_stmt (stmt);
/* For interleaving, only the alignment of the first access
matters. */
- if (DR_GROUP_FIRST_DR (stmt_info)
+ if (STMT_VINFO_STRIDED_ACCESS (stmt_info)
&& DR_GROUP_FIRST_DR (stmt_info) != stmt)
continue;
@@ -1907,7 +1973,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
/* For interleaving, only the alignment of the first access
matters. */
if (aligned_access_p (dr)
- || (DR_GROUP_FIRST_DR (stmt_info)
+ || (STMT_VINFO_STRIDED_ACCESS (stmt_info)
&& DR_GROUP_FIRST_DR (stmt_info) != stmt))
continue;
@@ -2019,13 +2085,13 @@ vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
}
-/* Function vect_analyze_data_ref_access.
-
- Analyze the access pattern of the data-reference DR. For now, a data access
- has to be consecutive to be considered vectorizable. */
+/* Analyze groups of strided accesses: check that DR belongs to a group of
+ strided accesses of legal size, step, etc. Detect gaps, single element
+ interleaving, and other special cases. Set strided access info.
+ Collect groups of strided stores for further use in SLP analysis. */
static bool
-vect_analyze_data_ref_access (struct data_reference *dr)
+vect_analyze_group_access (struct data_reference *dr)
{
tree step = DR_STEP (dr);
tree scalar_type = TREE_TYPE (DR_REF (dr));
@@ -2033,50 +2099,14 @@ vect_analyze_data_ref_access (struct data_reference *dr)
tree stmt = DR_STMT (dr);
stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
- struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
HOST_WIDE_INT stride;
+ bool slp_impossible = false;
- /* Don't allow invariant accesses. */
- if (dr_step == 0)
- return false;
-
- if (nested_in_vect_loop_p (loop, stmt))
- {
- /* For the rest of the analysis we use the outer-loop step. */
- step = STMT_VINFO_DR_STEP (stmt_info);
- dr_step = TREE_INT_CST_LOW (step);
-
- if (dr_step == 0)
- {
- if (vect_print_dump_info (REPORT_ALIGNMENT))
- fprintf (vect_dump, "zero step in outer loop.");
- if (DR_IS_READ (dr))
- return true;
- else
- return false;
- }
- }
-
/* For interleaving, STRIDE is STEP counted in elements, i.e., the size of the
interleaving group (including gaps). */
stride = dr_step / type_size;
- /* Consecutive? */
- if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type)))
- {
- /* Mark that it is not interleaving. */
- DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) = NULL_TREE;
- return true;
- }
-
- if (nested_in_vect_loop_p (loop, stmt))
- {
- if (vect_print_dump_info (REPORT_ALIGNMENT))
- fprintf (vect_dump, "strided access in outer loop.");
- return false;
- }
-
/* Not consecutive access is possible only if it is a part of interleaving. */
if (!DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)))
{
@@ -2119,99 +2149,105 @@ vect_analyze_data_ref_access (struct data_reference *dr)
HOST_WIDE_INT diff, count_in_bytes;
while (next)
- {
- /* Skip same data-refs. In case that two or more stmts share data-ref
- (supported only for loads), we vectorize only the first stmt, and
- the rest get their vectorized loads from the first one. */
- if (!tree_int_cst_compare (DR_INIT (data_ref),
- DR_INIT (STMT_VINFO_DATA_REF (
- vinfo_for_stmt (next)))))
- {
+ {
+ /* Skip same data-refs. In case that two or more stmts share data-ref
+ (supported only for loads), we vectorize only the first stmt, and
+ the rest get their vectorized loads from the first one. */
+ if (!tree_int_cst_compare (DR_INIT (data_ref),
+ DR_INIT (STMT_VINFO_DATA_REF (
+ vinfo_for_stmt (next)))))
+ {
if (!DR_IS_READ (data_ref))
- {
+ {
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "Two store stmts share the same dr.");
- return false;
+ return false;
}
- /* Check that there is no load-store dependencies for this loads
+ /* Check that there is no load-store dependencies for this loads
to prevent a case of load-store-load to the same location. */
if (DR_GROUP_READ_WRITE_DEPENDENCE (vinfo_for_stmt (next))
|| DR_GROUP_READ_WRITE_DEPENDENCE (vinfo_for_stmt (prev)))
{
if (vect_print_dump_info (REPORT_DETAILS))
- fprintf (vect_dump,
+ fprintf (vect_dump,
"READ_WRITE dependence in interleaving.");
return false;
}
- /* For load use the same data-ref load. */
- DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
+ /* For load use the same data-ref load. */
+ DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
- prev = next;
- next = DR_GROUP_NEXT_DR (vinfo_for_stmt (next));
- continue;
- }
- prev = next;
+ prev = next;
+ next = DR_GROUP_NEXT_DR (vinfo_for_stmt (next));
+ continue;
+ }
+ prev = next;
- /* Check that all the accesses have the same STEP. */
- next_step = DR_STEP (STMT_VINFO_DATA_REF (vinfo_for_stmt (next)));
- if (tree_int_cst_compare (step, next_step))
- {
- if (vect_print_dump_info (REPORT_DETAILS))
- fprintf (vect_dump, "not consecutive access in interleaving");
- return false;
- }
+ /* Check that all the accesses have the same STEP. */
+ next_step = DR_STEP (STMT_VINFO_DATA_REF (vinfo_for_stmt (next)));
+ if (tree_int_cst_compare (step, next_step))
+ {
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "not consecutive access in interleaving");
+ return false;
+ }
- data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
- /* Check that the distance between two accesses is equal to the type
- size. Otherwise, we have gaps. */
- diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
- - TREE_INT_CST_LOW (prev_init)) / type_size;
- if (!DR_IS_READ (data_ref) && diff != 1)
+ data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
+ /* Check that the distance between two accesses is equal to the type
+ size. Otherwise, we have gaps. */
+ diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
+ - TREE_INT_CST_LOW (prev_init)) / type_size;
+ if (diff != 1)
{
- if (vect_print_dump_info (REPORT_DETAILS))
- fprintf (vect_dump, "interleaved store with gaps");
- return false;
+ /* FORNOW: SLP of accesses with gaps is not supported. */
+ slp_impossible = true;
+ if (!DR_IS_READ (data_ref))
+ {
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "interleaved store with gaps");
+ return false;
+ }
}
- /* Store the gap from the previous member of the group. If there is no
+
+ /* Store the gap from the previous member of the group. If there is no
gap in the access, DR_GROUP_GAP is always 1. */
- DR_GROUP_GAP (vinfo_for_stmt (next)) = diff;
+ DR_GROUP_GAP (vinfo_for_stmt (next)) = diff;
- prev_init = DR_INIT (data_ref);
- next = DR_GROUP_NEXT_DR (vinfo_for_stmt (next));
- /* Count the number of data-refs in the chain. */
- count++;
- }
+ prev_init = DR_INIT (data_ref);
+ next = DR_GROUP_NEXT_DR (vinfo_for_stmt (next));
+ /* Count the number of data-refs in the chain. */
+ count++;
+ }
- /* COUNT is the number of accesses found, we multiply it by the size of
- the type to get COUNT_IN_BYTES. */
+ /* COUNT is the number of accesses found, we multiply it by the size of
+ the type to get COUNT_IN_BYTES. */
count_in_bytes = type_size * count;
/* Check that the size of the interleaving is not greater than STEP. */
- if (dr_step < count_in_bytes)
- {
- if (vect_print_dump_info (REPORT_DETAILS))
- {
- fprintf (vect_dump, "interleaving size is greater than step for ");
- print_generic_expr (vect_dump, DR_REF (dr), TDF_SLIM);
- }
- return false;
- }
+ if (dr_step < count_in_bytes)
+ {
+ if (vect_print_dump_info (REPORT_DETAILS))
+ {
+ fprintf (vect_dump, "interleaving size is greater than step for ");
+ print_generic_expr (vect_dump, DR_REF (dr), TDF_SLIM);
+ }
+ return false;
+ }
- /* Check that the size of the interleaving is equal to STEP for stores,
- i.e., that there are no gaps. */
- if (!DR_IS_READ (dr) && dr_step != count_in_bytes)
- {
- if (vect_print_dump_info (REPORT_DETAILS))
- fprintf (vect_dump, "interleaved store with gaps");
- return false;
- }
+ /* Check that the size of the interleaving is equal to STEP for stores,
+ i.e., that there are no gaps. */
+ if (!DR_IS_READ (dr) && dr_step != count_in_bytes)
+ {
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "interleaved store with gaps");
+ return false;
+ }
/* Check that STEP is a multiple of type size. */
if ((dr_step % type_size) != 0)
- {
- if (vect_print_dump_info (REPORT_DETAILS))
+ {
+ if (vect_print_dump_info (REPORT_DETAILS))
{
fprintf (vect_dump, "step is not a multiple of type size: step ");
print_generic_expr (vect_dump, step, TDF_SLIM);
@@ -2219,22 +2255,98 @@ vect_analyze_data_ref_access (struct data_reference *dr)
print_generic_expr (vect_dump, TYPE_SIZE_UNIT (scalar_type),
TDF_SLIM);
}
- return false;
- }
+ return false;
+ }
- /* FORNOW: we handle only interleaving that is a power of 2. */
+ /* FORNOW: we handle only interleaving that is a power of 2.
+ We don't fail here if it may be still possible to vectorize the
+ group using SLP. If not, the size of the group will be checked in
+ vect_analyze_operations, and the vectorization will fail. */
if (exact_log2 (stride) == -1)
{
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "interleaving is not a power of 2");
- return false;
+
+ if (slp_impossible)
+ return false;
}
DR_GROUP_SIZE (vinfo_for_stmt (stmt)) = stride;
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "Detected interleaving of size %d", (int)stride);
+
+ /* SLP: create an SLP data structure for every interleaving group of
+ stores for further analysis in vect_analyze_slp. */
+ if (!DR_IS_READ (dr) && !slp_impossible)
+ VEC_safe_push (tree, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo), stmt);
}
+
return true;
}
+/* Analyze the access pattern of the data-reference DR.
+   In case of non-consecutive accesses call vect_analyze_group_access() to
+   analyze groups of strided accesses.
+
+   Return TRUE for a consecutive access, for a zero-step read with respect
+   to the outer loop, or when vect_analyze_group_access accepts the
+   interleaving group.  Return FALSE when DR has no step, is invariant, is
+   a zero-step store or a strided access in an outer loop, or when the
+   group analysis fails.  */
+
+static bool
+vect_analyze_data_ref_access (struct data_reference *dr)
+{
+  tree step = DR_STEP (dr);
+  tree scalar_type = TREE_TYPE (DR_REF (dr));
+  tree stmt = DR_STMT (dr);
+  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
+  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  HOST_WIDE_INT dr_step;
+
+  if (!step)
+    {
+      if (vect_print_dump_info (REPORT_DETAILS))
+        fprintf (vect_dump, "bad data-ref access");
+      return false;
+    }
+
+  /* Read the constant only now that STEP is known to be non-NULL.  */
+  dr_step = TREE_INT_CST_LOW (step);
+
+  /* Don't allow invariant accesses.  */
+  if (dr_step == 0)
+    return false;
+
+  if (nested_in_vect_loop_p (loop, stmt))
+    {
+      /* For the rest of the analysis we use the outer-loop step.  */
+      step = STMT_VINFO_DR_STEP (stmt_info);
+      dr_step = TREE_INT_CST_LOW (step);
+
+      if (dr_step == 0)
+        {
+          if (vect_print_dump_info (REPORT_ALIGNMENT))
+            fprintf (vect_dump, "zero step in outer loop.");
+          /* A zero outer-loop step is accepted for reads only.  */
+          if (DR_IS_READ (dr))
+            return true;
+          else
+            return false;
+        }
+    }
+
+  /* Consecutive?  */
+  if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type)))
+    {
+      /* Mark that it is not interleaving.  */
+      DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) = NULL_TREE;
+      return true;
+    }
+
+  if (nested_in_vect_loop_p (loop, stmt))
+    {
+      if (vect_print_dump_info (REPORT_ALIGNMENT))
+        fprintf (vect_dump, "strided access in outer loop.");
+      return false;
+    }
+
+  /* Not consecutive access - check if it's a part of interleaving group.  */
+  return vect_analyze_group_access (dr);
+}
+
+
/* Function vect_analyze_data_ref_accesses.
Analyze the access pattern of all the data references in the loop.
@@ -2266,6 +2378,697 @@ vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo)
}
+/* Release the SLP tree rooted at NODE: both subtrees first, then the
+   statement vectors owned by NODE, and finally NODE itself.  A NULL NODE
+   is accepted and ignored.  */
+
+void
+vect_free_slp_tree (slp_tree node)
+{
+  if (node == NULL)
+    return;
+
+  /* The recursive call returns immediately for a missing child.  */
+  vect_free_slp_tree (SLP_TREE_LEFT (node));
+  vect_free_slp_tree (SLP_TREE_RIGHT (node));
+
+  VEC_free (tree, heap, SLP_TREE_SCALAR_STMTS (node));
+
+  if (SLP_TREE_VEC_STMTS (node) != NULL)
+    VEC_free (tree, heap, SLP_TREE_VEC_STMTS (node));
+
+  free (node);
+}
+
+
+/* Get the defs for the RHS (collect them in DEF_STMTS0/1), check that they are
+ of a legal type and that they match the defs of the first stmt of the SLP
+ group (stored in FIRST_STMT_...).
+ LOOP_VINFO is handed to vect_is_simple_use to classify each operand.
+ SLP_NODE is the node being built; the stmt_vec_info of its first scalar
+ stmt and the node itself are passed to the cost-model functions.
+ RHS is examined operand by operand; when TREE_OPERAND_LENGTH (RHS) is 0
+ it is treated as a store and RHS itself is the single operand.
+ FIRST_STMT_DT0/1, FIRST_STMT_DEF0/1_TYPE and FIRST_STMT_CONST_OPRND are
+ in/out: they are filled in while they are still unset and compared
+ against on later calls.
+ NCOPIES_FOR_COST is forwarded to the cost-model functions.
+ Return FALSE if a def cannot be found, the defs do not match those
+ recorded for the first stmt, the stmt has two constant operands, or a
+ def is of an unsupported kind. Otherwise return TRUE. */
+
+static bool
+vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, slp_tree slp_node,
+ tree rhs, VEC (tree, heap) **def_stmts0,
+ VEC (tree, heap) **def_stmts1,
+ enum vect_def_type *first_stmt_dt0,
+ enum vect_def_type *first_stmt_dt1,
+ tree *first_stmt_def0_type,
+ tree *first_stmt_def1_type,
+ tree *first_stmt_const_oprnd,
+ int ncopies_for_cost)
+{
+ tree oprnd;
+ enum operation_type op_type = TREE_OPERAND_LENGTH (rhs);
+ unsigned int i, number_of_oprnds = op_type;
+ tree def, def_stmt;
+ enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
+ stmt_vec_info stmt_info =
+ vinfo_for_stmt (VEC_index (tree, SLP_TREE_SCALAR_STMTS (slp_node), 0));
+
+ /* Store. */
+ if (!op_type)
+ number_of_oprnds = 1;
+ else
+ gcc_assert (op_type == unary_op || op_type == binary_op);
+
+ for (i = 0; i < number_of_oprnds; i++)
+ {
+ /* For a store the operand is RHS itself. */
+ if (op_type)
+ oprnd = TREE_OPERAND (rhs, i);
+ else
+ oprnd = rhs;
+
+ /* A missing def stmt is tolerated only for constant operands. */
+ if (!vect_is_simple_use (oprnd, loop_vinfo, &def_stmt, &def, &dt[i])
+ || (!def_stmt && dt[i] != vect_constant_def))
+ {
+ if (vect_print_dump_info (REPORT_SLP))
+ {
+ fprintf (vect_dump, "Build SLP failed: can't find def for ");
+ print_generic_expr (vect_dump, oprnd, TDF_SLIM);
+ }
+
+ return false;
+ }
+
+ /* A zero *FIRST_STMT_DT0 means nothing has been recorded for the group
+ yet. */
+ if (!*first_stmt_dt0)
+ {
+ /* op0 of the first stmt of the group - store its info. */
+ *first_stmt_dt0 = dt[i];
+ if (def)
+ *first_stmt_def0_type = TREE_TYPE (def);
+ else
+ *first_stmt_const_oprnd = oprnd;
+
+ /* Analyze costs (for the first stmt of the group only). */
+ if (op_type)
+ /* Not memory operation (we don't call this function for loads). */
+ vect_model_simple_cost (stmt_info, ncopies_for_cost, dt, slp_node);
+ else
+ /* Store. */
+ vect_model_store_cost (stmt_info, ncopies_for_cost, dt[0], slp_node);
+ }
+
+ else
+ {
+ if (!*first_stmt_dt1 && i == 1)
+ {
+ /* op1 of the first stmt of the group - store its info. */
+ *first_stmt_dt1 = dt[i];
+ if (def)
+ *first_stmt_def1_type = TREE_TYPE (def);
+ else
+ {
+ /* We assume that the stmt contains only one constant
+ operand. We fail otherwise, to be on the safe side. */
+ if (*first_stmt_const_oprnd)
+ {
+ if (vect_print_dump_info (REPORT_SLP))
+ fprintf (vect_dump, "Build SLP failed: two constant "
+ "oprnds in stmt");
+ return false;
+ }
+ *first_stmt_const_oprnd = oprnd;
+ }
+ }
+ else
+ {
+ /* Not first stmt of the group, check that the def-stmt/s match
+ the def-stmt/s of the first stmt. */
+ /* Operand I must have the def kind recorded for that position and,
+ when both have a def, the same def type; an operand without a
+ def must have the type of the recorded constant operand. */
+ if ((i == 0
+ && (*first_stmt_dt0 != dt[i]
+ || (*first_stmt_def0_type && def
+ && *first_stmt_def0_type != TREE_TYPE (def))))
+ || (i == 1
+ && (*first_stmt_dt1 != dt[i]
+ || (*first_stmt_def1_type && def
+ && *first_stmt_def1_type != TREE_TYPE (def))))
+ || (!def
+ && TREE_TYPE (*first_stmt_const_oprnd)
+ != TREE_TYPE (oprnd)))
+ {
+ if (vect_print_dump_info (REPORT_SLP))
+ fprintf (vect_dump, "Build SLP failed: different types ");
+
+ return false;
+ }
+ }
+ }
+
+ /* Check the types of the definitions. */
+ switch (dt[i])
+ {
+ case vect_constant_def:
+ case vect_invariant_def:
+ break;
+
+ case vect_loop_def:
+ /* Only defs of this kind are collected for the caller: operand 0
+ goes to DEF_STMTS0, operand 1 to DEF_STMTS1. */
+ if (i == 0)
+ VEC_safe_push (tree, heap, *def_stmts0, def_stmt);
+ else
+ VEC_safe_push (tree, heap, *def_stmts1, def_stmt);
+ break;
+
+ default:
+ /* FORNOW: Not supported. */
+ if (vect_print_dump_info (REPORT_SLP))
+ {
+ fprintf (vect_dump, "Build SLP failed: illegal type of def ");
+ print_generic_expr (vect_dump, def, TDF_SLIM);
+ }
+
+ return false;
+ }
+ }
+
+ return true;
+}
+
+
+/* Recursively build an SLP tree starting from NODE.
+ Fail (and return FALSE) if def-stmts are not isomorphic, require data
+ permutation or are of unsupported types of operation. Otherwise, return
+ TRUE.
+ GROUP_SIZE is used to size the vectors that collect the def stmts and is
+ passed unchanged to the recursive calls.
+ SLP_IMPOSSIBLE is TRUE if it is impossible to SLP in the loop, for example
+ in the case of multiple types for now.
+ The costs recorded in each visited node are added to *INSIDE_COST and
+ *OUTSIDE_COST. NCOPIES_FOR_COST is forwarded to the cost-model
+ functions.
+ NOTE(review): DEF_STMTS0/1 are allocated on entry and are not released
+ on the `return false' paths below, nor when they are not attached to a
+ child node; likewise a child node whose recursive build fails is not
+ freed here. Confirm whether this is cleaned up elsewhere. */
+
+static bool
+vect_build_slp_tree (loop_vec_info loop_vinfo, slp_tree *node,
+ unsigned int group_size, bool *slp_impossible,
+ int *inside_cost, int *outside_cost,
+ int ncopies_for_cost)
+{
+ VEC (tree, heap) *def_stmts0 = VEC_alloc (tree, heap, group_size);
+ VEC (tree, heap) *def_stmts1 = VEC_alloc (tree, heap, group_size);
+ unsigned int i;
+ VEC (tree, heap) *stmts = SLP_TREE_SCALAR_STMTS (*node);
+ tree stmt = VEC_index (tree, stmts, 0);
+ enum vect_def_type first_stmt_dt0 = 0, first_stmt_dt1 = 0;
+ enum tree_code first_stmt_code = 0;
+ tree first_stmt_def1_type = NULL_TREE, first_stmt_def0_type = NULL_TREE;
+ tree lhs, rhs, prev_stmt = NULL_TREE;
+ bool stop_recursion = false, need_same_oprnds = false;
+ tree vectype, scalar_type, first_op1 = NULL_TREE;
+ unsigned int vectorization_factor = 0, ncopies;
+ optab optab;
+ int icode;
+ enum machine_mode optab_op2_mode;
+ enum machine_mode vec_mode;
+ tree first_stmt_const_oprnd = NULL_TREE;
+ struct data_reference *first_dr;
+
+ /* For every stmt in NODE find its def stmt/s. */
+ for (i = 0; VEC_iterate (tree, stmts, i, stmt); i++)
+ {
+ if (vect_print_dump_info (REPORT_SLP))
+ {
+ fprintf (vect_dump, "Build SLP for ");
+ print_generic_expr (vect_dump, stmt, TDF_SLIM);
+ }
+
+ if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
+ {
+ if (vect_print_dump_info (REPORT_SLP))
+ {
+ fprintf (vect_dump, "Build SLP failed: not MODIFY_STMT ");
+ print_generic_expr (vect_dump, stmt, TDF_SLIM);
+ }
+
+ return false;
+ }
+
+ /* The vector type is derived from the type of the stmt's LHS. */
+ scalar_type = TREE_TYPE (GIMPLE_STMT_OPERAND (stmt, 0));
+ vectype = get_vectype_for_scalar_type (scalar_type);
+ gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
+ vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+ ncopies = vectorization_factor / TYPE_VECTOR_SUBPARTS (vectype);
+ if (ncopies > 1)
+ {
+ /* FORNOW. */
+ if (vect_print_dump_info (REPORT_SLP))
+ fprintf (vect_dump, "SLP failed - multiple types ");
+
+ /* This is the only failure in this function that sets
+ *SLP_IMPOSSIBLE. */
+ *slp_impossible = true;
+ return false;
+ }
+
+ lhs = GIMPLE_STMT_OPERAND (stmt, 0);
+ rhs = GIMPLE_STMT_OPERAND (stmt, 1);
+
+ /* Check the operation. */
+ if (i == 0)
+ {
+ /* Remember the tree code of the first stmt; every other stmt of
+ the node must have the same code. */
+ first_stmt_code = TREE_CODE (rhs);
+
+ /* Shift arguments should be equal in all the packed stmts for a
+ vector shift with scalar shift operand. */
+ if (TREE_CODE (rhs) == LSHIFT_EXPR || TREE_CODE (rhs) == RSHIFT_EXPR)
+ {
+ vec_mode = TYPE_MODE (vectype);
+ optab = optab_for_tree_code (TREE_CODE (rhs), vectype);
+ if (!optab)
+ {
+ if (vect_print_dump_info (REPORT_SLP))
+ fprintf (vect_dump, "Build SLP failed: no optab.");
+ return false;
+ }
+ icode = (int) optab->handlers[(int) vec_mode].insn_code;
+ optab_op2_mode = insn_data[icode].operand[2].mode;
+ if (!VECTOR_MODE_P (optab_op2_mode))
+ {
+ need_same_oprnds = true;
+ first_op1 = TREE_OPERAND (rhs, 1);
+ }
+ }
+ }
+ else
+ {
+ if (first_stmt_code != TREE_CODE (rhs))
+ {
+ if (vect_print_dump_info (REPORT_SLP))
+ {
+ fprintf (vect_dump,
+ "Build SLP failed: different operation in stmt ");
+ print_generic_expr (vect_dump, stmt, TDF_SLIM);
+ }
+
+ return false;
+ }
+
+ if (need_same_oprnds
+ && !operand_equal_p (first_op1, TREE_OPERAND (rhs, 1), 0))
+ {
+ if (vect_print_dump_info (REPORT_SLP))
+ {
+ fprintf (vect_dump,
+ "Build SLP failed: different shift arguments in ");
+ print_generic_expr (vect_dump, stmt, TDF_SLIM);
+ }
+
+ return false;
+ }
+ }
+
+ /* Strided store or load. */
+ if (STMT_VINFO_STRIDED_ACCESS (vinfo_for_stmt (stmt)))
+ {
+ if (REFERENCE_CLASS_P (lhs))
+ {
+ /* Store. */
+ if (!vect_get_and_check_slp_defs (loop_vinfo, *node, rhs,
+ &def_stmts0, &def_stmts1,
+ &first_stmt_dt0,
+ &first_stmt_dt1,
+ &first_stmt_def0_type,
+ &first_stmt_def1_type,
+ &first_stmt_const_oprnd,
+ ncopies_for_cost))
+ return false;
+ }
+ else
+ {
+ /* Load. */
+ if (i == 0)
+ {
+ /* First stmt of the SLP group should be the first load of
+ the interleaving loop if data permutation is not
+ allowed. */
+ if (DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) != stmt)
+ {
+ /* FORNOW: data permutations are not supported. */
+ if (vect_print_dump_info (REPORT_SLP))
+ {
+ fprintf (vect_dump, "Build SLP failed: strided "
+ " loads need permutation ");
+ print_generic_expr (vect_dump, stmt, TDF_SLIM);
+ }
+
+ return false;
+ }
+
+ first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
+ if (vect_supportable_dr_alignment (first_dr)
+ == dr_unaligned_unsupported)
+ {
+ if (vect_print_dump_info (REPORT_SLP))
+ {
+ fprintf (vect_dump, "Build SLP failed: unsupported "
+ " unaligned load ");
+ print_generic_expr (vect_dump, stmt, TDF_SLIM);
+ }
+
+ return false;
+ }
+
+ /* Analyze costs (for the first stmt in the group). */
+ vect_model_load_cost (vinfo_for_stmt (stmt),
+ ncopies_for_cost, *node);
+ }
+ else
+ {
+ /* Each later load must be the DR_GROUP_NEXT_DR of the load
+ handled in the previous iteration. */
+ if (DR_GROUP_NEXT_DR (vinfo_for_stmt (prev_stmt)) != stmt)
+ {
+ /* FORNOW: data permutations are not supported. */
+ if (vect_print_dump_info (REPORT_SLP))
+ {
+ fprintf (vect_dump, "Build SLP failed: strided "
+ " loads need permutation ");
+ print_generic_expr (vect_dump, stmt, TDF_SLIM);
+ }
+ return false;
+ }
+ }
+
+ prev_stmt = stmt;
+
+ /* We stop the tree when we reach a group of loads. */
+ stop_recursion = true;
+ continue;
+ }
+ } /* Strided access. */
+ else
+ {
+ if (REFERENCE_CLASS_P (rhs))
+ {
+ /* Not strided load. */
+ if (vect_print_dump_info (REPORT_SLP))
+ {
+ fprintf (vect_dump, "Build SLP failed: not strided load ");
+ print_generic_expr (vect_dump, stmt, TDF_SLIM);
+ }
+
+ /* FORNOW: Not strided loads are not supported. */
+ return false;
+ }
+
+ /* Not memory operation. */
+ if (!BINARY_CLASS_P (rhs) && !UNARY_CLASS_P (rhs))
+ {
+ if (vect_print_dump_info (REPORT_SLP))
+ {
+ fprintf (vect_dump, "Build SLP failed: operation");
+ fprintf (vect_dump, " unsupported ");
+ print_generic_expr (vect_dump, stmt, TDF_SLIM);
+ }
+
+ return false;
+ }
+
+ /* Find the def-stmts. */
+ if (!vect_get_and_check_slp_defs (loop_vinfo, *node, rhs, &def_stmts0,
+ &def_stmts1, &first_stmt_dt0,
+ &first_stmt_dt1,
+ &first_stmt_def0_type,
+ &first_stmt_def1_type,
+ &first_stmt_const_oprnd,
+ ncopies_for_cost))
+ return false;
+ }
+ }
+
+ /* Add the costs of the node to the overall instance costs. */
+ *inside_cost += SLP_TREE_INSIDE_OF_LOOP_COST (*node);
+ *outside_cost += SLP_TREE_OUTSIDE_OF_LOOP_COST (*node);
+
+ /* Strided loads were reached - stop the recursion. */
+ if (stop_recursion)
+ return true;
+
+ /* Create SLP_TREE nodes for the definition node/s. */
+ /* A child is built only when the corresponding operand of the first stmt
+ is a vect_loop_def; it is attached to *NODE only after its own subtree
+ was built successfully. */
+ if (first_stmt_dt0 == vect_loop_def)
+ {
+ slp_tree left_node = XNEW (struct _slp_tree);
+ SLP_TREE_SCALAR_STMTS (left_node) = def_stmts0;
+ SLP_TREE_VEC_STMTS (left_node) = NULL;
+ SLP_TREE_LEFT (left_node) = NULL;
+ SLP_TREE_RIGHT (left_node) = NULL;
+ SLP_TREE_OUTSIDE_OF_LOOP_COST (left_node) = 0;
+ SLP_TREE_INSIDE_OF_LOOP_COST (left_node) = 0;
+ if (!vect_build_slp_tree (loop_vinfo, &left_node, group_size,
+ slp_impossible, inside_cost, outside_cost,
+ ncopies_for_cost))
+ return false;
+
+ SLP_TREE_LEFT (*node) = left_node;
+ }
+
+ if (first_stmt_dt1 == vect_loop_def)
+ {
+ slp_tree right_node = XNEW (struct _slp_tree);
+ SLP_TREE_SCALAR_STMTS (right_node) = def_stmts1;
+ SLP_TREE_VEC_STMTS (right_node) = NULL;
+ SLP_TREE_LEFT (right_node) = NULL;
+ SLP_TREE_RIGHT (right_node) = NULL;
+ SLP_TREE_OUTSIDE_OF_LOOP_COST (right_node) = 0;
+ SLP_TREE_INSIDE_OF_LOOP_COST (right_node) = 0;
+ if (!vect_build_slp_tree (loop_vinfo, &right_node, group_size,
+ slp_impossible, inside_cost, outside_cost,
+ ncopies_for_cost))
+ return false;
+
+ SLP_TREE_RIGHT (*node) = right_node;
+ }
+
+ return true;
+}
+
+
+/* Dump the scalar stmts of the SLP tree rooted at NODE to VECT_DUMP:
+   NODE itself first, then its left subtree, then its right subtree.
+   A NULL NODE prints nothing.  */
+
+static void
+vect_print_slp_tree (slp_tree node)
+{
+  int idx;
+  tree scalar_stmt;
+  VEC (tree, heap) *scalar_stmts;
+
+  if (node == NULL)
+    return;
+
+  scalar_stmts = SLP_TREE_SCALAR_STMTS (node);
+
+  fprintf (vect_dump, "node ");
+  for (idx = 0; VEC_iterate (tree, scalar_stmts, idx, scalar_stmt); idx++)
+    {
+      fprintf (vect_dump, "\n\tstmt %d ", idx);
+      print_generic_expr (vect_dump, scalar_stmt, TDF_SLIM);
+    }
+  fprintf (vect_dump, "\n");
+
+  vect_print_slp_tree (SLP_TREE_LEFT (node));
+  vect_print_slp_tree (SLP_TREE_RIGHT (node));
+}
+
+
+/* Set the SLP type of stmts in the tree rooted at NODE to MARK (PURE_SLP or
+   HYBRID).  A negative J selects every stmt of every node; otherwise only
+   the stmt at index J of each node is marked.  Both subtrees are visited
+   with the same MARK and J.  */
+
+static void
+vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
+{
+  int idx;
+  tree scalar_stmt;
+  VEC (tree, heap) *scalar_stmts;
+
+  if (node == NULL)
+    return;
+
+  scalar_stmts = SLP_TREE_SCALAR_STMTS (node);
+  for (idx = 0; VEC_iterate (tree, scalar_stmts, idx, scalar_stmt); idx++)
+    {
+      /* Skip stmts other than the requested one when J is an index.  */
+      if (j >= 0 && idx != j)
+        continue;
+
+      STMT_SLP_TYPE (vinfo_for_stmt (scalar_stmt)) = mark;
+    }
+
+  vect_mark_slp_stmts (SLP_TREE_LEFT (node), mark, j);
+  vect_mark_slp_stmts (SLP_TREE_RIGHT (node), mark, j);
+}
+
+
+/* Analyze an SLP instance starting from a group of strided stores.  Call
+   vect_build_slp_tree to build a tree of packed stmts if possible.
+   STMT is the store from which the group is walked via DR_GROUP_NEXT_DR.
+   On success a new instance is pushed onto LOOP_VINFO_SLP_INSTANCES
+   (LOOP_VINFO).
+   Return FALSE if it's impossible to SLP any stmt in the loop.  Return
+   TRUE both when the instance was created and when only this instance
+   could not be built.  */
+
+static bool
+vect_analyze_slp_instance (loop_vec_info loop_vinfo, tree stmt)
+{
+  slp_instance new_instance;
+  slp_tree node;
+  unsigned int group_size = DR_GROUP_SIZE (vinfo_for_stmt (stmt));
+  unsigned int unrolling_factor = 1, nunits;
+  tree vectype, scalar_type, next;
+  unsigned int vectorization_factor = 0, ncopies;
+  bool slp_impossible = false;
+  int inside_cost = 0, outside_cost = 0, ncopies_for_cost;
+
+  /* FORNOW: multiple types are not supported.  */
+  scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))));
+  vectype = get_vectype_for_scalar_type (scalar_type);
+  nunits = TYPE_VECTOR_SUBPARTS (vectype);
+  vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+  ncopies = vectorization_factor / nunits;
+  if (ncopies > 1)
+    {
+      if (vect_print_dump_info (REPORT_SLP))
+        fprintf (vect_dump, "SLP failed - multiple types ");
+
+      return false;
+    }
+
+  /* Allocate the root only after the check above, so that the early return
+     does not leak it.  */
+  node = XNEW (struct _slp_tree);
+
+  /* Create a node (a root of the SLP tree) for the packed strided stores.  */
+  SLP_TREE_SCALAR_STMTS (node) = VEC_alloc (tree, heap, group_size);
+  next = stmt;
+  /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS.  */
+  while (next)
+    {
+      VEC_safe_push (tree, heap, SLP_TREE_SCALAR_STMTS (node), next);
+      next = DR_GROUP_NEXT_DR (vinfo_for_stmt (next));
+    }
+
+  SLP_TREE_VEC_STMTS (node) = NULL;
+  SLP_TREE_NUMBER_OF_VEC_STMTS (node) = 0;
+  SLP_TREE_LEFT (node) = NULL;
+  SLP_TREE_RIGHT (node) = NULL;
+  SLP_TREE_OUTSIDE_OF_LOOP_COST (node) = 0;
+  SLP_TREE_INSIDE_OF_LOOP_COST (node) = 0;
+
+  /* Calculate the unrolling factor.  */
+  unrolling_factor = least_common_multiple (nunits, group_size) / group_size;
+
+  /* Calculate the number of vector stmts to create based on the unrolling
+     factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is
+     GROUP_SIZE / NUNITS otherwise).  */
+  ncopies_for_cost = unrolling_factor * group_size / nunits;
+
+  /* Build the tree for the SLP instance.  */
+  if (vect_build_slp_tree (loop_vinfo, &node, group_size, &slp_impossible,
+                           &inside_cost, &outside_cost, ncopies_for_cost))
+    {
+      /* Create a new SLP instance.  */
+      new_instance = XNEW (struct _slp_instance);
+      SLP_INSTANCE_TREE (new_instance) = node;
+      SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
+      SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
+      SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (new_instance) = outside_cost;
+      SLP_INSTANCE_INSIDE_OF_LOOP_COST (new_instance) = inside_cost;
+      VEC_safe_push (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo),
+                     new_instance);
+      if (vect_print_dump_info (REPORT_SLP))
+        vect_print_slp_tree (node);
+
+      return true;
+    }
+
+  /* Failed to SLP.  */
+  /* Free the allocated memory.  */
+  vect_free_slp_tree (node);
+
+  if (slp_impossible)
+    return false;
+
+  /* SLP failed for this instance, but it is still possible to SLP other stmts
+     in the loop.  */
+  return true;
+}
+
+
+/* Check whether there are stmts in the loop that can be vectorized using
+   SLP: try to build an SLP instance for every group recorded in
+   LOOP_VINFO_STRIDED_STORES (LOOP_VINFO).  Return FALSE as soon as
+   vect_analyze_slp_instance reports failure, and TRUE otherwise.  */
+
+static bool
+vect_analyze_slp (loop_vec_info loop_vinfo)
+{
+  unsigned int idx;
+  tree first_store;
+  VEC (tree, heap) *store_groups = LOOP_VINFO_STRIDED_STORES (loop_vinfo);
+
+  if (vect_print_dump_info (REPORT_SLP))
+    fprintf (vect_dump, "=== vect_analyze_slp ===");
+
+  for (idx = 0; VEC_iterate (tree, store_groups, idx, first_store); idx++)
+    {
+      if (vect_analyze_slp_instance (loop_vinfo, first_store))
+        continue;
+
+      /* SLP failed.  No instance can be SLPed in the loop.  */
+      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
+        fprintf (vect_dump, "SLP failed.");
+
+      return false;
+    }
+
+  return true;
+}
+
+
+/* Decide which of the SLP instances recorded in LOOP_VINFO to SLP (FORNOW:
+   all of them), mark their stmts as PURE_SLP, and store the largest
+   instance unrolling factor in LOOP_VINFO_SLP_UNROLLING_FACTOR.  */
+
+static void
+vect_make_slp_decision (loop_vec_info loop_vinfo)
+{
+  unsigned int idx, max_unroll = 1;
+  int n_chosen = 0;
+  slp_instance inst;
+  VEC (slp_instance, heap) *instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
+
+  if (vect_print_dump_info (REPORT_SLP))
+    fprintf (vect_dump, "=== vect_make_slp_decision ===");
+
+  for (idx = 0; VEC_iterate (slp_instance, instances, idx, inst); idx++)
+    {
+      /* FORNOW: SLP if you can.  */
+      if (SLP_INSTANCE_UNROLLING_FACTOR (inst) > max_unroll)
+        max_unroll = SLP_INSTANCE_UNROLLING_FACTOR (inst);
+
+      /* Every stmt of the instance starts out as PURE_SLP.  Stmts that
+         also need loop-based vectorization are re-marked as HYBRID later
+         by vect_detect_hybrid_slp ().  */
+      vect_mark_slp_stmts (SLP_INSTANCE_TREE (inst), pure_slp, -1);
+      n_chosen++;
+    }
+
+  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = max_unroll;
+
+  if (n_chosen && vect_print_dump_info (REPORT_SLP))
+    fprintf (vect_dump, "Decided to SLP %d instances. Unrolling factor %d",
+             n_chosen, max_unroll);
+}
+
+
+/* Walk the SLP tree rooted at NODE and find PURE_SLP stmts whose SSA_NAME
+   result has an immediate use in a stmt that has a stmt_vec_info but no SLP
+   type.  Such stmts must be both vectorized and SLPed; for each one,
+   vect_mark_slp_stmts is called with HYBRID and the stmt's index.  */
+
+static void
+vect_detect_hybrid_slp_stmts (slp_tree node)
+{
+  int idx;
+  tree scalar_stmt, lhs;
+  tree use_stmt;
+  imm_use_iterator imm_iter;
+  stmt_vec_info use_info;
+
+  if (node == NULL)
+    return;
+
+  for (idx = 0;
+       VEC_iterate (tree, SLP_TREE_SCALAR_STMTS (node), idx, scalar_stmt);
+       idx++)
+    {
+      if (!PURE_SLP_STMT (vinfo_for_stmt (scalar_stmt)))
+        continue;
+
+      lhs = GIMPLE_STMT_OPERAND (scalar_stmt, 0);
+      if (TREE_CODE (lhs) != SSA_NAME)
+        continue;
+
+      FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
+        {
+          use_info = vinfo_for_stmt (use_stmt);
+          if (use_info && !STMT_SLP_TYPE (use_info))
+            vect_mark_slp_stmts (node, hybrid, idx);
+        }
+    }
+
+  vect_detect_hybrid_slp_stmts (SLP_TREE_LEFT (node));
+  vect_detect_hybrid_slp_stmts (SLP_TREE_RIGHT (node));
+}
+
+
+/* Run vect_detect_hybrid_slp_stmts on the tree of every SLP instance
+   recorded in LOOP_VINFO, to find the stmts that must be both vectorized
+   and SLPed.  */
+
+static void
+vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
+{
+  unsigned int idx;
+  slp_instance inst;
+  VEC (slp_instance, heap) *instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
+
+  if (vect_print_dump_info (REPORT_SLP))
+    fprintf (vect_dump, "=== vect_detect_hybrid_slp ===");
+
+  for (idx = 0; VEC_iterate (slp_instance, instances, idx, inst); idx++)
+    {
+      vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (inst));
+    }
+}
+
+
/* Function vect_analyze_data_refs.
Find all the data references in the loop.
@@ -3424,6 +4227,17 @@ vect_analyze_loop (struct loop *loop)
return NULL;
}
+ /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
+ ok = vect_analyze_slp (loop_vinfo);
+ if (ok)
+ {
+ /* Decide which possible SLP instances to SLP. */
+ vect_make_slp_decision (loop_vinfo);
+
+ /* Find stmts that need to be both vectorized and SLPed. */
+ vect_detect_hybrid_slp (loop_vinfo);
+ }
+
/* This pass will decide on using loop versioning and/or loop peeling in
order to enhance the alignment of data references in the loop. */
diff --git a/gcc/tree-vect-transform.c b/gcc/tree-vect-transform.c
index cddebde26ad..9698b417885 100644
--- a/gcc/tree-vect-transform.c
+++ b/gcc/tree-vect-transform.c
@@ -46,7 +46,7 @@ along with GCC; see the file COPYING3. If not see
#include "real.h"
/* Utility functions for the code transformation. */
-static bool vect_transform_stmt (tree, block_stmt_iterator *, bool *);
+static bool vect_transform_stmt (tree, block_stmt_iterator *, bool *, slp_tree);
static tree vect_create_destination_var (tree, tree);
static tree vect_create_data_ref_ptr
(tree, struct loop*, tree, tree *, tree *, bool, tree, bool *);
@@ -125,6 +125,8 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
int nbbs = loop->num_nodes;
int byte_misalign;
int innerloop_iters = 0, factor;
+ VEC (slp_instance, heap) *slp_instances;
+ slp_instance instance;
/* Cost model disabled. */
if (!flag_vect_cost_model)
@@ -287,6 +289,14 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
targetm.vectorize.builtin_vectorization_cost (runtime_test));
}
+ /* Add SLP costs. */
+ slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
+ for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
+ {
+ vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
+ vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
+ }
+
/* Calculate number of iterations required to make the vector version
profitable, relative to the loop bodies only. The following condition
must hold true: ((SIC*VF)-VIC)*niters > VOC*VF, where
@@ -452,30 +462,55 @@ vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
}
+/* Store in *INSIDE_COST_FIELD and *OUTSIDE_COST_FIELD the addresses of the
+   inside-of-loop and outside-of-loop cost fields to update: those of
+   SLP_NODE when it is not NULL, and those of STMT_INFO otherwise.  */
+
+static inline void
+vect_get_cost_fields (stmt_vec_info stmt_info, slp_tree slp_node,
+                      int **inside_cost_field, int **outside_cost_field)
+{
+  if (!slp_node)
+    {
+      /* No SLP node: the costs live in the stmt's own info.  */
+      *inside_cost_field = &(STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info));
+      *outside_cost_field = &(STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
+      return;
+    }
+
+  *inside_cost_field = &(SLP_TREE_INSIDE_OF_LOOP_COST (slp_node));
+  *outside_cost_field = &(SLP_TREE_OUTSIDE_OF_LOOP_COST (slp_node));
+}
+
+
/* Function vect_model_simple_cost.
Models cost for simple operations, i.e. those that only emit ncopies of a
single op. Right now, this does not account for multiple insns that could
be generated for the single vector op. We will handle that shortly. */
-static void
-vect_model_simple_cost (stmt_vec_info stmt_info, int ncopies, enum vect_def_type *dt)
+void
+vect_model_simple_cost (stmt_vec_info stmt_info, int ncopies,
+ enum vect_def_type *dt, slp_tree slp_node)
{
int i;
+ int *inside_cost_field, *outside_cost_field;
- STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;
+ /* Take addresses of relevant fields to update in the function. */
+ vect_get_cost_fields (stmt_info, slp_node, &inside_cost_field,
+ &outside_cost_field);
+
+ *inside_cost_field = ncopies * TARG_VEC_STMT_COST;
/* FORNOW: Assuming maximum 2 args per stmts. */
- for (i=0; i<2; i++)
+ for (i = 0; i < 2; i++)
{
if (dt[i] == vect_constant_def || dt[i] == vect_invariant_def)
- STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) += TARG_SCALAR_TO_VEC_COST;
+ *outside_cost_field += TARG_SCALAR_TO_VEC_COST;
}
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "vect_model_simple_cost: inside_cost = %d, "
- "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
- STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
+ "outside_cost = %d .", *inside_cost_field, *outside_cost_field);
}
@@ -502,14 +537,20 @@ vect_cost_strided_group_size (stmt_vec_info stmt_info)
Models cost for stores. In the case of strided accesses, one access
has the overhead of the strided access attributed to it. */
-static void
-vect_model_store_cost (stmt_vec_info stmt_info, int ncopies, enum vect_def_type dt)
+void
+vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
+ enum vect_def_type dt, slp_tree slp_node)
{
int cost = 0;
int group_size;
+ int *inside_cost_field, *outside_cost_field;
+
+ /* Take addresses of relevant fields to update in the function. */
+ vect_get_cost_fields (stmt_info, slp_node, &inside_cost_field,
+ &outside_cost_field);
if (dt == vect_constant_def || dt == vect_invariant_def)
- STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = TARG_SCALAR_TO_VEC_COST;
+ *outside_cost_field = TARG_SCALAR_TO_VEC_COST;
/* Strided access? */
if (DR_GROUP_FIRST_DR (stmt_info))
@@ -535,12 +576,11 @@ vect_model_store_cost (stmt_vec_info stmt_info, int ncopies, enum vect_def_type
/* Costs of the stores. */
cost += ncopies * TARG_VEC_STORE_COST;
- STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = cost;
+ *inside_cost_field = cost;
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "vect_model_store_cost: inside_cost = %d, "
- "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
- STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
+ "outside_cost = %d .", *inside_cost_field, *outside_cost_field);
}
@@ -551,8 +591,8 @@ vect_model_store_cost (stmt_vec_info stmt_info, int ncopies, enum vect_def_type
accesses are supported for loads, we also account for the costs of the
access scheme chosen. */
-static void
-vect_model_load_cost (stmt_vec_info stmt_info, int ncopies)
+void
+vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, slp_tree slp_node)
{
int inner_cost = 0;
@@ -560,10 +600,15 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies)
int alignment_support_cheme;
tree first_stmt;
struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
+ int *inside_cost_field, *outside_cost_field;
+
+ /* Take addresses of relevant fields to update in the function. */
+ vect_get_cost_fields (stmt_info, slp_node, &inside_cost_field,
+ &outside_cost_field);
/* Strided accesses? */
first_stmt = DR_GROUP_FIRST_DR (stmt_info);
- if (first_stmt)
+ if (first_stmt && !slp_node)
{
group_size = vect_cost_strided_group_size (stmt_info);
first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
@@ -641,14 +686,14 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies)
access in the group. Inside the loop, there is a load op
and a realignment op. */
- if ((!DR_GROUP_FIRST_DR (stmt_info)) || group_size > 1)
+ if ((!DR_GROUP_FIRST_DR (stmt_info)) || group_size > 1 || slp_node)
{
outer_cost = 2*TARG_VEC_STMT_COST;
if (targetm.vectorize.builtin_mask_for_load)
outer_cost += TARG_VEC_STMT_COST;
}
- STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
+ *outside_cost_field = outer_cost;
inner_cost += ncopies * (TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);
@@ -659,12 +704,11 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies)
gcc_unreachable ();
}
- STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = inner_cost;
+ *inside_cost_field = inner_cost;
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "vect_model_load_cost: inside_cost = %d, "
- "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
- STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
+ "outside_cost = %d .", *inside_cost_field, *outside_cost_field);
}
@@ -1256,6 +1300,177 @@ vect_init_vector (tree stmt, tree vector_var, tree vector_type,
}
+/* For constant and loop invariant defs of SLP_NODE this function returns
+ (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
+ OP_NUM determines if we gather defs for operand 0 or operand 1 of the scalar
+ stmts. */
+
+static void
+vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
+ unsigned int op_num)
+{
+ VEC (tree, heap) *stmts = SLP_TREE_SCALAR_STMTS (slp_node);
+ tree stmt = VEC_index (tree, stmts, 0);
+ stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
+ tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
+ int nunits = TYPE_VECTOR_SUBPARTS (vectype);
+ tree vec_cst;
+ tree t = NULL_TREE;
+ int j, number_of_places_left_in_vector;
+ tree vector_type;
+ tree op, vop, operation;
+ int group_size = VEC_length (tree, stmts);
+ unsigned int vec_num, i;
+ int number_of_copies = 1;
+ bool is_store = false;
+ unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
+ VEC (tree, heap) *voprnds = VEC_alloc (tree, heap, number_of_vectors);
+
+ if (STMT_VINFO_DATA_REF (stmt_vinfo))
+ is_store = true;
+
+ /* NUMBER_OF_COPIES is the number of times we need to use the same values in
+ created vectors. It is greater than 1 if unrolling is performed.
+
+ For example, we have two scalar operands, s1 and s2 (e.g., group of
+ strided accesses of size two), while NUNITS is four (i.e., four scalars
+ of this type can be packed in a vector). The output vector will contain
+ two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
+ will be 2).
+
+ If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
+ containing the operands.
+
+ For example, NUNITS is four as before, and the group size is 8
+ (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
+ {s5, s6, s7, s8}. */
+
+ number_of_copies = least_common_multiple (nunits, group_size) / group_size;
+
+ number_of_places_left_in_vector = nunits;
+ for (j = 0; j < number_of_copies; j++)
+ {
+ for (i = group_size - 1; VEC_iterate (tree, stmts, i, stmt); i--)
+ {
+ operation = GIMPLE_STMT_OPERAND (stmt, 1);
+ if (is_store)
+ op = operation;
+ else
+ op = TREE_OPERAND (operation, op_num);
+
+ /* Create 'vect_ = {op0,op1,...,opn}'. */
+ t = tree_cons (NULL_TREE, op, t);
+
+ number_of_places_left_in_vector--;
+
+ if (number_of_places_left_in_vector == 0)
+ {
+ number_of_places_left_in_vector = nunits;
+
+ vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
+ vec_cst = build_constructor_from_list (vector_type, t);
+ VEC_quick_push (tree, voprnds,
+ vect_init_vector (stmt, vec_cst, vector_type,
+ NULL));
+ t = NULL_TREE;
+ }
+ }
+ }
+
+ /* Since the vectors are created in the reverse order, we should invert
+ them. */
+ vec_num = VEC_length (tree, voprnds);
+ for (j = vec_num - 1; j >= 0; j--)
+ {
+ vop = VEC_index (tree, voprnds, j);
+ VEC_quick_push (tree, *vec_oprnds, vop);
+ }
+
+ VEC_free (tree, heap, voprnds);
+
+ /* In case that VF is greater than the unrolling factor needed for the SLP
+ group of stmts, NUMBER_OF_VECTORS to be created is greater than
+ NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
+ to replicate the vectors. */
+ while (number_of_vectors > VEC_length (tree, *vec_oprnds))
+ {
+ for (i = 0; VEC_iterate (tree, *vec_oprnds, i, vop) && i < vec_num; i++)
+ VEC_quick_push (tree, *vec_oprnds, vop);
+ }
+}
+
+
+/* Get vectorized definitions from SLP_NODE that contains corresponding
+ vectorized def-stmts. */
+
+static void
+vect_get_slp_vect_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds)
+{
+ tree vec_oprnd;
+ tree vec_def_stmt;
+ unsigned int i;
+
+ gcc_assert (SLP_TREE_VEC_STMTS (slp_node));
+
+ for (i = 0;
+ VEC_iterate (tree, SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt);
+ i++)
+ {
+ gcc_assert (vec_def_stmt);
+ vec_oprnd = GIMPLE_STMT_OPERAND (vec_def_stmt, 0);
+ VEC_quick_push (tree, *vec_oprnds, vec_oprnd);
+ }
+}
+
+
+/* Get vectorized definitions for SLP_NODE.
+ If the scalar definitions are loop invariants or constants, collect them and
+ call vect_get_constant_vectors() to create vector stmts.
+ Otherwise, the def-stmts must be already vectorized and the vectorized stmts
+ must be stored in the LEFT/RIGHT node of SLP_NODE, and we call
+ vect_get_slp_vect_defs() to retrieve them. */
+
+static void
+vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0,
+ VEC (tree,heap) **vec_oprnds1)
+{
+ tree operation, first_stmt;
+
+ /* Allocate memory for vectorized defs. */
+ *vec_oprnds0 = VEC_alloc (tree, heap,
+ SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
+
+ /* SLP_NODE corresponds either to a group of stores or to a group of
+ unary/binary operations. We don't call this function for loads. */
+ if (SLP_TREE_LEFT (slp_node))
+ /* The defs are already vectorized. */
+ vect_get_slp_vect_defs (SLP_TREE_LEFT (slp_node), vec_oprnds0);
+ else
+ /* Build vectors from scalar defs. */
+ vect_get_constant_vectors (slp_node, vec_oprnds0, 0);
+
+ first_stmt = VEC_index (tree, SLP_TREE_SCALAR_STMTS (slp_node), 0);
+ if (STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)))
+ /* Since we don't call this function with loads, this is a group of
+ stores. */
+ return;
+
+ operation = GIMPLE_STMT_OPERAND (first_stmt, 1);
+ if (TREE_OPERAND_LENGTH (operation) == unary_op)
+ return;
+
+ *vec_oprnds1 = VEC_alloc (tree, heap,
+ SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
+
+ if (SLP_TREE_RIGHT (slp_node))
+ /* The defs are already vectorized. */
+ vect_get_slp_vect_defs (SLP_TREE_RIGHT (slp_node), vec_oprnds1);
+ else
+ /* Build vectors from scalar defs. */
+ vect_get_constant_vectors (slp_node, vec_oprnds1, 1);
+}
+
+
/* Function get_initial_def_for_induction
Input:
@@ -1744,6 +1959,54 @@ vect_get_vec_def_for_stmt_copy (enum vect_def_type dt, tree vec_oprnd)
}
+/* Get vectorized definitions for the operands to create a copy of an original
+ stmt (see vect_get_vec_def_for_stmt_copy ()). VEC_OPRNDS1 may be NULL/empty. */
+
+static void
+vect_get_vec_defs_for_stmt_copy (enum vect_def_type *dt,
+ VEC(tree,heap) **vec_oprnds0,
+ VEC(tree,heap) **vec_oprnds1)
+{
+ tree vec_oprnd = VEC_pop (tree, *vec_oprnds0);
+
+ vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd);
+ VEC_quick_push (tree, *vec_oprnds0, vec_oprnd);
+
+ if (vec_oprnds1 && *vec_oprnds1) /* Unary ops leave *VEC_OPRNDS1 NULL. */
+ {
+ vec_oprnd = VEC_pop (tree, *vec_oprnds1);
+ vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd);
+ VEC_quick_push (tree, *vec_oprnds1, vec_oprnd);
+ }
+}
+
+
+/* Get vectorized definitions for OP0 and OP1, or SLP_NODE if it is not NULL. */
+
+static void
+vect_get_vec_defs (tree op0, tree op1, tree stmt, VEC(tree,heap) **vec_oprnds0,
+ VEC(tree,heap) **vec_oprnds1, slp_tree slp_node)
+{
+ if (slp_node)
+ vect_get_slp_defs (slp_node, vec_oprnds0, vec_oprnds1);
+ else
+ {
+ tree vec_oprnd;
+
+ *vec_oprnds0 = VEC_alloc (tree, heap, 1);
+ vec_oprnd = vect_get_vec_def_for_operand (op0, stmt, NULL);
+ VEC_quick_push (tree, *vec_oprnds0, vec_oprnd);
+
+ if (op1)
+ {
+ *vec_oprnds1 = VEC_alloc (tree, heap, 1);
+ vec_oprnd = vect_get_vec_def_for_operand (op1, stmt, NULL);
+ VEC_quick_push (tree, *vec_oprnds1, vec_oprnd);
+ }
+ }
+}
+
+
/* Function vect_finish_stmt_generation.
Insert a new stmt. */
@@ -2399,6 +2662,10 @@ vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
gcc_assert (ncopies >= 1);
+ /* FORNOW: SLP not supported. */
+ if (STMT_SLP_TYPE (stmt_info))
+ return false;
+
/* 1. Is vectorizable reduction? */
/* Not supportable if the reduction variable is used in the loop. */
@@ -2707,6 +2974,10 @@ vectorizable_call (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
return false;
+ /* FORNOW: SLP not supported. */
+ if (STMT_SLP_TYPE (stmt_info))
+ return false;
+
/* FORNOW: not yet supported. */
if (STMT_VINFO_LIVE_P (stmt_info))
{
@@ -2815,7 +3086,7 @@ vectorizable_call (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "=== vectorizable_call ===");
- vect_model_simple_cost (stmt_info, ncopies, dt);
+ vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
return true;
}
@@ -3005,16 +3276,14 @@ vect_gen_widened_results_half (enum tree_code code, tree vectype, tree decl,
}
-/* Function vectorizable_conversion.
-
-Check if STMT performs a conversion operation, that can be vectorized.
-If VEC_STMT is also passed, vectorize the STMT: create a vectorized
-stmt to replace it, put it in VEC_STMT, and insert it at BSI.
-Return FALSE if not a vectorizable STMT, TRUE otherwise. */
+/* Check if STMT performs a conversion operation, that can be vectorized.
+ If VEC_STMT is also passed, vectorize the STMT: create a vectorized
+ stmt to replace it, put it in VEC_STMT, and insert it at BSI.
+ Return FALSE if not a vectorizable STMT, TRUE otherwise. */
bool
-vectorizable_conversion (tree stmt, block_stmt_iterator * bsi,
- tree * vec_stmt)
+vectorizable_conversion (tree stmt, block_stmt_iterator *bsi,
+ tree *vec_stmt, slp_tree slp_node)
{
tree vec_dest;
tree scalar_dest;
@@ -3028,8 +3297,8 @@ vectorizable_conversion (tree stmt, block_stmt_iterator * bsi,
tree decl1 = NULL_TREE, decl2 = NULL_TREE;
tree new_temp;
tree def, def_stmt;
- enum vect_def_type dt0;
- tree new_stmt;
+ enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
+ tree new_stmt = NULL_TREE;
stmt_vec_info prev_stmt_info;
int nunits_in;
int nunits_out;
@@ -3039,6 +3308,9 @@ vectorizable_conversion (tree stmt, block_stmt_iterator * bsi,
tree rhs_type, lhs_type;
tree builtin_decl;
enum { NARROW, NONE, WIDEN } modifier;
+ int i;
+ VEC(tree,heap) *vec_oprnds0 = NULL;
+ tree vop0;
/* Is STMT a vectorizable conversion? */
@@ -3067,7 +3339,7 @@ vectorizable_conversion (tree stmt, block_stmt_iterator * bsi,
if (code != FIX_TRUNC_EXPR && code != FLOAT_EXPR)
return false;
- /* Check types of lhs and rhs */
+ /* Check types of lhs and rhs. */
op0 = TREE_OPERAND (operation, 0);
rhs_type = TREE_TYPE (op0);
vectype_in = get_vectype_for_scalar_type (rhs_type);
@@ -3091,7 +3363,7 @@ vectorizable_conversion (tree stmt, block_stmt_iterator * bsi,
if (modifier == NONE)
gcc_assert (STMT_VINFO_VECTYPE (stmt_info) == vectype_out);
- /* Bail out if the types are both integral or non-integral */
+ /* Bail out if the types are both integral or non-integral. */
if ((INTEGRAL_TYPE_P (rhs_type) && INTEGRAL_TYPE_P (lhs_type))
|| (!INTEGRAL_TYPE_P (rhs_type) && !INTEGRAL_TYPE_P (lhs_type)))
return false;
@@ -3101,6 +3373,11 @@ vectorizable_conversion (tree stmt, block_stmt_iterator * bsi,
else
ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
+ /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
+ this, so we can safely override NCOPIES with 1 here. */
+ if (slp_node)
+ ncopies = 1;
+
/* Sanity check: make sure that at least one copy of the vectorized stmt
needs to be generated. */
gcc_assert (ncopies >= 1);
@@ -3114,7 +3391,7 @@ vectorizable_conversion (tree stmt, block_stmt_iterator * bsi,
}
/* Check the operands of the operation. */
- if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0))
+ if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0]))
{
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "use not simple.");
@@ -3138,7 +3415,12 @@ vectorizable_conversion (tree stmt, block_stmt_iterator * bsi,
}
if (modifier != NONE)
- STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
+ {
+ STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
+ /* FORNOW: SLP not supported. */
+ if (STMT_SLP_TYPE (stmt_info))
+ return false;
+ }
if (!vec_stmt) /* transformation not required. */
{
@@ -3153,6 +3435,9 @@ vectorizable_conversion (tree stmt, block_stmt_iterator * bsi,
/* Handle def. */
vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
+ if (modifier == NONE && !slp_node)
+ vec_oprnds0 = VEC_alloc (tree, heap, 1);
+
prev_stmt_info = NULL;
switch (modifier)
{
@@ -3163,24 +3448,30 @@ vectorizable_conversion (tree stmt, block_stmt_iterator * bsi,
ssa_op_iter iter;
if (j == 0)
- vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
+ vect_get_vec_defs (op0, NULL, stmt, &vec_oprnds0, NULL, slp_node);
else
- vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
+ vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, NULL);
builtin_decl =
targetm.vectorize.builtin_conversion (code, vectype_in);
- new_stmt = build_call_expr (builtin_decl, 1, vec_oprnd0);
+ for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++)
+ {
+ new_stmt = build_call_expr (builtin_decl, 1, vop0);
- /* Arguments are ready. create the new vector stmt. */
- new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
- new_temp = make_ssa_name (vec_dest, new_stmt);
- GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
- vect_finish_stmt_generation (stmt, new_stmt, bsi);
- FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS)
- {
- if (TREE_CODE (sym) == SSA_NAME)
- sym = SSA_NAME_VAR (sym);
- mark_sym_for_renaming (sym);
+ /* Arguments are ready. Create the new vector stmt. */
+ new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt);
+ new_temp = make_ssa_name (vec_dest, new_stmt);
+ GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
+ vect_finish_stmt_generation (stmt, new_stmt, bsi);
+ FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter,
+ SSA_OP_ALL_VIRTUALS)
+ {
+ if (TREE_CODE (sym) == SSA_NAME)
+ sym = SSA_NAME_VAR (sym);
+ mark_sym_for_renaming (sym);
+ }
+ if (slp_node)
+ VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
}
if (j == 0)
@@ -3201,7 +3492,7 @@ vectorizable_conversion (tree stmt, block_stmt_iterator * bsi,
if (j == 0)
vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
else
- vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
+ vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
STMT_VINFO_VECTYPE (stmt_info) = vectype_in;
@@ -3237,12 +3528,12 @@ vectorizable_conversion (tree stmt, block_stmt_iterator * bsi,
if (j == 0)
{
vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
- vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
+ vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
}
else
{
- vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd1);
- vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0);
+ vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd1);
+ vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
}
/* Arguments are ready. Create the new vector stmt. */
@@ -3262,6 +3553,7 @@ vectorizable_conversion (tree stmt, block_stmt_iterator * bsi,
*vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
}
+
return true;
}
@@ -3274,12 +3566,12 @@ vectorizable_conversion (tree stmt, block_stmt_iterator * bsi,
Return FALSE if not a vectorizable STMT, TRUE otherwise. */
bool
-vectorizable_assignment (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
+vectorizable_assignment (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
+ slp_tree slp_node)
{
tree vec_dest;
tree scalar_dest;
tree op;
- tree vec_oprnd;
stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
tree vectype = STMT_VINFO_VECTYPE (stmt_info);
loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
@@ -3288,6 +3580,9 @@ vectorizable_assignment (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
int nunits = TYPE_VECTOR_SUBPARTS (vectype);
int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
+ int i;
+ VEC(tree,heap) *vec_oprnds = NULL;
+ tree vop;
gcc_assert (ncopies >= 1);
if (ncopies > 1)
@@ -3328,7 +3623,7 @@ vectorizable_assignment (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
STMT_VINFO_TYPE (stmt_info) = assignment_vec_info_type;
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "=== vectorizable_assignment ===");
- vect_model_simple_cost (stmt_info, ncopies, dt);
+ vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
return true;
}
@@ -3340,15 +3635,22 @@ vectorizable_assignment (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
vec_dest = vect_create_destination_var (scalar_dest, vectype);
/* Handle use. */
- op = GIMPLE_STMT_OPERAND (stmt, 1);
- vec_oprnd = vect_get_vec_def_for_operand (op, stmt, NULL);
+ vect_get_vec_defs (op, NULL, stmt, &vec_oprnds, NULL, slp_node);
/* Arguments are ready. create the new vector stmt. */
- *vec_stmt = build_gimple_modify_stmt (vec_dest, vec_oprnd);
- new_temp = make_ssa_name (vec_dest, *vec_stmt);
- GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
- vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
+ for (i = 0; VEC_iterate (tree, vec_oprnds, i, vop); i++)
+ {
+ *vec_stmt = build_gimple_modify_stmt (vec_dest, vop);
+ new_temp = make_ssa_name (vec_dest, *vec_stmt);
+ GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp;
+ vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
+ STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt;
+
+ if (slp_node)
+ VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), *vec_stmt);
+ }
+ VEC_free (tree, heap, vec_oprnds);
return true;
}
@@ -3403,6 +3705,10 @@ vectorizable_induction (tree phi, block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
if (!STMT_VINFO_RELEVANT_P (stmt_info))
return false;
+ /* FORNOW: SLP not supported. */
+ if (STMT_SLP_TYPE (stmt_info))
+ return false;
+
gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
if (STMT_VINFO_LIVE_P (stmt_info))
@@ -3444,13 +3750,14 @@ vectorizable_induction (tree phi, block_stmt_iterator *bsi ATTRIBUTE_UNUSED,
Return FALSE if not a vectorizable STMT, TRUE otherwise. */
bool
-vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
+vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
+ slp_tree slp_node)
{
tree vec_dest;
tree scalar_dest;
tree operation;
tree op0, op1 = NULL;
- tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
+ tree vec_oprnd1 = NULL_TREE;
stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
tree vectype = STMT_VINFO_VECTYPE (stmt_info);
loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
@@ -3464,14 +3771,20 @@ vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
enum machine_mode optab_op2_mode;
tree def, def_stmt;
enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
- tree new_stmt;
+ tree new_stmt = NULL_TREE;
stmt_vec_info prev_stmt_info;
int nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
int nunits_out;
tree vectype_out;
int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
- int j;
-
+ int j, i;
+ VEC(tree,heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
+ tree vop0, vop1;
+
+ /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
+ this, so we can safely override NCOPIES with 1 here. */
+ if (slp_node)
+ ncopies = 1;
gcc_assert (ncopies >= 1);
/* FORNOW. This restriction should be relaxed. */
if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1)
@@ -3601,7 +3914,7 @@ vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
STMT_VINFO_TYPE (stmt_info) = op_vec_info_type;
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "=== vectorizable_operation ===");
- vect_model_simple_cost (stmt_info, ncopies, dt);
+ vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
return true;
}
@@ -3613,6 +3926,13 @@ vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
/* Handle def. */
vec_dest = vect_create_destination_var (scalar_dest, vectype);
+ if (!slp_node)
+ {
+ vec_oprnds0 = VEC_alloc (tree, heap, 1);
+ if (op_type == binary_op)
+ vec_oprnds1 = VEC_alloc (tree, heap, 1);
+ }
+
/* In case the vectorization factor (VF) is bigger than the number
of elements that we can fit in a vectype (nunits), we have to generate
more than one vector stmt - i.e - we need to "unroll" the
@@ -3672,45 +3992,53 @@ vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
/* Handle uses. */
if (j == 0)
{
- vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
- if (op_type == binary_op)
+ if (op_type == binary_op
+ && (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
+ && !slp_node)
{
- if (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
- {
- /* Vector shl and shr insn patterns can be defined with
- scalar operand 2 (shift operand). In this case, use
- constant or loop invariant op1 directly, without
- extending it to vector mode first. */
- optab_op2_mode = insn_data[icode].operand[2].mode;
- if (!VECTOR_MODE_P (optab_op2_mode))
- {
- if (vect_print_dump_info (REPORT_DETAILS))
- fprintf (vect_dump, "operand 1 using scalar mode.");
- vec_oprnd1 = op1;
- }
- }
- if (!vec_oprnd1)
- vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL);
+ /* Vector shl and shr insn patterns can be defined with scalar
+ operand 2 (shift operand). In this case, use constant or loop
+ invariant op1 directly, without extending it to vector mode
+ first. */
+ optab_op2_mode = insn_data[icode].operand[2].mode;
+ if (!VECTOR_MODE_P (optab_op2_mode))
+ {
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "operand 1 using scalar mode.");
+ vec_oprnd1 = op1;
+ VEC_quick_push (tree, vec_oprnds1, vec_oprnd1);
+ }
}
+
+ if (op_type == binary_op && !vec_oprnd1)
+ vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
+ slp_node);
+ else
+ vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, &vec_oprnds1,
+ slp_node);
}
else
- {
- vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0);
- if (op_type == binary_op)
- vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd1);
- }
+ vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, &vec_oprnds1);
- /* Arguments are ready. create the new vector stmt. */
+ /* Arguments are ready. Create the new vector stmt. */
+ for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++)
+ {
+ if (op_type == binary_op)
+ {
+ vop1 = VEC_index (tree, vec_oprnds1, i);
+ new_stmt = build_gimple_modify_stmt (vec_dest,
+ build2 (code, vectype, vop0, vop1));
+ }
+ else
+ new_stmt = build_gimple_modify_stmt (vec_dest,
+ build1 (code, vectype, vop0));
- if (op_type == binary_op)
- new_stmt = build_gimple_modify_stmt (vec_dest,
- build2 (code, vectype, vec_oprnd0, vec_oprnd1));
- else
- new_stmt = build_gimple_modify_stmt (vec_dest,
- build1 (code, vectype, vec_oprnd0));
- new_temp = make_ssa_name (vec_dest, new_stmt);
- GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
- vect_finish_stmt_generation (stmt, new_stmt, bsi);
+ new_temp = make_ssa_name (vec_dest, new_stmt);
+ GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
+ vect_finish_stmt_generation (stmt, new_stmt, bsi);
+ if (slp_node)
+ VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
+ }
if (j == 0)
STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
@@ -3719,6 +4047,10 @@ vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
prev_stmt_info = vinfo_for_stmt (new_stmt);
}
+ VEC_free (tree, heap, vec_oprnds0);
+ if (vec_oprnds1)
+ VEC_free (tree, heap, vec_oprnds1);
+
return true;
}
@@ -3829,7 +4161,7 @@ vectorizable_type_demotion (tree stmt, block_stmt_iterator *bsi,
STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "=== vectorizable_demotion ===");
- vect_model_simple_cost (stmt_info, ncopies, dt);
+ vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
return true;
}
@@ -4000,7 +4332,7 @@ vectorizable_type_promotion (tree stmt, block_stmt_iterator *bsi,
STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "=== vectorizable_promotion ===");
- vect_model_simple_cost (stmt_info, 2*ncopies, dt);
+ vect_model_simple_cost (stmt_info, 2*ncopies, dt, NULL);
return true;
}
@@ -4095,6 +4427,7 @@ vect_strided_store_supported (tree vectype)
fprintf (vect_dump, "interleave op not supported by target.");
return false;
}
+
return true;
}
@@ -4242,7 +4575,8 @@ vect_permute_store_chain (VEC(tree,heap) *dr_chain,
Return FALSE if not a vectorizable STMT, TRUE otherwise. */
bool
-vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
+vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
+ slp_tree slp_node)
{
tree scalar_dest;
tree data_ref;
@@ -4268,6 +4602,15 @@ vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
unsigned int group_size, i;
VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL;
bool inv_p;
+ VEC(tree,heap) *vec_oprnds = NULL;
+ bool slp = (slp_node != NULL);
+ stmt_vec_info first_stmt_vinfo;
+ unsigned int vec_num;
+
+ /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
+ this, so we can safely override NCOPIES with 1 here. */
+ if (slp)
+ ncopies = 1;
gcc_assert (ncopies >= 1);
@@ -4300,7 +4643,7 @@ vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
if (TREE_CODE (scalar_dest) != ARRAY_REF
&& TREE_CODE (scalar_dest) != INDIRECT_REF
- && !DR_GROUP_FIRST_DR (stmt_info))
+ && !STMT_VINFO_STRIDED_ACCESS (stmt_info))
return false;
op = GIMPLE_STMT_OPERAND (stmt, 1);
@@ -4320,17 +4663,19 @@ vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
if (!STMT_VINFO_DATA_REF (stmt_info))
return false;
- if (DR_GROUP_FIRST_DR (stmt_info))
+ if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
{
strided_store = true;
- if (!vect_strided_store_supported (vectype))
+ if (!vect_strided_store_supported (vectype)
+ && !PURE_SLP_STMT (stmt_info) && !slp)
return false;
}
if (!vec_stmt) /* transformation not required. */
{
STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
- vect_model_store_cost (stmt_info, ncopies, dt);
+ if (!PURE_SLP_STMT (stmt_info))
+ vect_model_store_cost (stmt_info, ncopies, dt, NULL);
return true;
}
@@ -4350,17 +4695,28 @@ vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
/* We vectorize all the stmts of the interleaving group when we
reach the last stmt in the group. */
if (DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))
- < DR_GROUP_SIZE (vinfo_for_stmt (first_stmt)))
+ < DR_GROUP_SIZE (vinfo_for_stmt (first_stmt))
+ && !slp)
{
*vec_stmt = NULL_TREE;
return true;
}
+
+ if (slp)
+ strided_store = false;
+
+ /* VEC_NUM is the number of vect stmts to be created for this group. */
+ if (slp && SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) < group_size)
+ vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
+ else
+ vec_num = group_size;
}
else
{
first_stmt = stmt;
first_dr = dr;
- group_size = 1;
+ group_size = vec_num = 1;
+ first_stmt_vinfo = stmt_info;
}
if (vect_print_dump_info (REPORT_DETAILS))
@@ -4420,26 +4776,39 @@ vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
if (j == 0)
{
- /* For interleaved stores we collect vectorized defs for all the
- stores in the group in DR_CHAIN and OPRNDS. DR_CHAIN is then used
- as an input to vect_permute_store_chain(), and OPRNDS as an input
- to vect_get_vec_def_for_stmt_copy() for the next copy.
- If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
- OPRNDS are of size 1. */
- next_stmt = first_stmt;
- for (i = 0; i < group_size; i++)
- {
- /* Since gaps are not supported for interleaved stores, GROUP_SIZE
- is the exact number of stmts in the chain. Therefore, NEXT_STMT
- can't be NULL_TREE. In case that there is no interleaving,
- GROUP_SIZE is 1, and only one iteration of the loop will be
- executed. */
- gcc_assert (next_stmt);
- op = GIMPLE_STMT_OPERAND (next_stmt, 1);
- vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt, NULL);
- VEC_quick_push(tree, dr_chain, vec_oprnd);
- VEC_quick_push(tree, oprnds, vec_oprnd);
- next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
+ if (slp)
+ {
+ /* Get vectorized arguments for SLP_NODE. */
+ vect_get_slp_defs (slp_node, &vec_oprnds, NULL);
+
+ vec_oprnd = VEC_index (tree, vec_oprnds, 0);
+ }
+ else
+ {
+ /* For interleaved stores we collect vectorized defs for all the
+ stores in the group in DR_CHAIN and OPRNDS. DR_CHAIN is then
+ used as an input to vect_permute_store_chain(), and OPRNDS as
+ an input to vect_get_vec_def_for_stmt_copy() for the next copy.
+
+ If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and
+ OPRNDS are of size 1. */
+ next_stmt = first_stmt;
+ for (i = 0; i < group_size; i++)
+ {
+ /* Since gaps are not supported for interleaved stores,
+ GROUP_SIZE is the exact number of stmts in the chain.
+ Therefore, NEXT_STMT can't be NULL_TREE. In case that
+ there is no interleaving, GROUP_SIZE is 1, and only one
+ iteration of the loop will be executed. */
+ gcc_assert (next_stmt);
+ op = GIMPLE_STMT_OPERAND (next_stmt, 1);
+
+ vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt,
+ NULL);
+ VEC_quick_push(tree, dr_chain, vec_oprnd);
+ VEC_quick_push(tree, oprnds, vec_oprnd);
+ next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
+ }
}
dataref_ptr = vect_create_data_ref_ptr (first_stmt, NULL, NULL_TREE,
&dummy, &ptr_incr, false,
@@ -4448,6 +4817,9 @@ vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
}
else
{
+ /* FORNOW SLP doesn't work for multiple types. */
+ gcc_assert (!slp);
+
/* For interleaved stores we created vectorized defs for all the
defs stored in OPRNDS in the previous iteration (previous copy).
DR_CHAIN is then used as an input to vect_permute_store_chain(),
@@ -4476,12 +4848,19 @@ vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
}
next_stmt = first_stmt;
- for (i = 0; i < group_size; i++)
+ for (i = 0; i < vec_num; i++)
{
- /* For strided stores vectorized defs are interleaved in
- vect_permute_store_chain(). */
- if (strided_store)
- vec_oprnd = VEC_index(tree, result_chain, i);
+ if (i > 0)
+ /* Bump the vector pointer. */
+ dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt,
+ NULL_TREE);
+
+ if (slp)
+ vec_oprnd = VEC_index (tree, vec_oprnds, i);
+ else if (strided_store)
+ /* For strided stores vectorized defs are interleaved in
+ vect_permute_store_chain(). */
+ vec_oprnd = VEC_index (tree, result_chain, i);
data_ref = build_fold_indirect_ref (dataref_ptr);
/* Arguments are ready. Create the new vector stmt. */
@@ -4498,9 +4877,6 @@ vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
if (!next_stmt)
break;
- /* Bump the vector pointer. */
- dataref_ptr =
- bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, NULL_TREE);
}
}
@@ -5021,7 +5397,8 @@ vect_transform_strided_load (tree stmt, VEC(tree,heap) *dr_chain, int size,
Return FALSE if not a vectorizable STMT, TRUE otherwise. */
bool
-vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
+vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt,
+ slp_tree slp_node)
{
tree scalar_dest;
tree vec_dest = NULL;
@@ -5056,6 +5433,13 @@ vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
bool inv_p;
bool compute_in_loop = false;
struct loop *at_loop;
+ int vec_num;
+ bool slp = (slp_node != NULL);
+
+ /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
+ this, so we can safely override NCOPIES with 1 here. */
+ if (slp)
+ ncopies = 1;
gcc_assert (ncopies >= 1);
@@ -5092,7 +5476,7 @@ vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
op = GIMPLE_STMT_OPERAND (stmt, 1);
if (TREE_CODE (op) != ARRAY_REF
&& TREE_CODE (op) != INDIRECT_REF
- && !DR_GROUP_FIRST_DR (stmt_info))
+ && !STMT_VINFO_STRIDED_ACCESS (stmt_info))
return false;
if (!STMT_VINFO_DATA_REF (stmt_info))
@@ -5111,21 +5495,22 @@ vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
}
/* Check if the load is a part of an interleaving chain. */
- if (DR_GROUP_FIRST_DR (stmt_info))
+ if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
{
strided_load = true;
/* FORNOW */
gcc_assert (! nested_in_vect_loop);
/* Check if interleaving is supported. */
- if (!vect_strided_load_supported (vectype))
+ if (!vect_strided_load_supported (vectype)
+ && !PURE_SLP_STMT (stmt_info) && !slp)
return false;
}
if (!vec_stmt) /* transformation not required. */
{
STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
- vect_model_load_cost (stmt_info, ncopies);
+ vect_model_load_cost (stmt_info, ncopies, NULL);
return true;
}
@@ -5146,12 +5531,21 @@ vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt));
dr_chain = VEC_alloc (tree, heap, group_size);
+
+ /* VEC_NUM is the number of vect stmts to be created for this group. */
+ if (slp)
+ {
+ strided_load = false;
+ vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
+ }
+ else
+ vec_num = group_size;
}
else
{
first_stmt = stmt;
first_dr = dr;
- group_size = 1;
+ group_size = vec_num = 1;
}
alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
@@ -5296,8 +5690,12 @@ vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
dataref_ptr =
bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, NULL_TREE);
- for (i = 0; i < group_size; i++)
+ for (i = 0; i < vec_num; i++)
{
+ if (i > 0)
+ dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt,
+ NULL_TREE);
+
/* 2. Create the vector-load in the loop. */
switch (alignment_support_scheme)
{
@@ -5373,7 +5771,7 @@ vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
if (alignment_support_scheme == dr_explicit_realign_optimized)
{
- if (i == group_size - 1 && j == ncopies - 1)
+ if (i == vec_num - 1 && j == ncopies - 1)
add_phi_arg (phi, lsq, loop_latch_edge (containing_loop));
msq = lsq;
}
@@ -5414,13 +5812,20 @@ vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
gcc_unreachable (); /* FORNOW. */
}
- if (strided_load)
- VEC_quick_push (tree, dr_chain, new_temp);
- if (i < group_size - 1)
- dataref_ptr =
- bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, NULL_TREE);
+ /* Collect vector loads and later create their permutation in
+ vect_transform_strided_load (). */
+ if (strided_load)
+ VEC_quick_push (tree, dr_chain, new_temp);
+
+ /* Store vector loads in the corresponding SLP_NODE. */
+ if (slp)
+ VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
}
+ /* FORNOW: SLP with multiple types is unsupported. */
+ if (slp)
+ return true;
+
if (strided_load)
{
if (!vect_transform_strided_load (stmt, dr_chain, group_size, bsi))
@@ -5586,6 +5991,10 @@ vectorizable_condition (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def)
return false;
+ /* FORNOW: SLP not supported. */
+ if (STMT_SLP_TYPE (stmt_info))
+ return false;
+
/* FORNOW: not yet supported. */
if (STMT_VINFO_LIVE_P (stmt_info))
{
@@ -5676,12 +6085,14 @@ vectorizable_condition (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
return true;
}
+
/* Function vect_transform_stmt.
Create a vectorized stmt to replace STMT, and insert it at BSI. */
-bool
-vect_transform_stmt (tree stmt, block_stmt_iterator *bsi, bool *strided_store)
+static bool
+vect_transform_stmt (tree stmt, block_stmt_iterator *bsi, bool *strided_store,
+ slp_tree slp_node)
{
bool is_store = false;
tree vec_stmt = NULL_TREE;
@@ -5692,44 +6103,47 @@ vect_transform_stmt (tree stmt, block_stmt_iterator *bsi, bool *strided_store)
switch (STMT_VINFO_TYPE (stmt_info))
{
case type_demotion_vec_info_type:
+ gcc_assert (!slp_node);
done = vectorizable_type_demotion (stmt, bsi, &vec_stmt);
gcc_assert (done);
break;
case type_promotion_vec_info_type:
+ gcc_assert (!slp_node);
done = vectorizable_type_promotion (stmt, bsi, &vec_stmt);
gcc_assert (done);
break;
case type_conversion_vec_info_type:
- done = vectorizable_conversion (stmt, bsi, &vec_stmt);
+ done = vectorizable_conversion (stmt, bsi, &vec_stmt, slp_node);
gcc_assert (done);
break;
case induc_vec_info_type:
+ gcc_assert (!slp_node);
done = vectorizable_induction (stmt, bsi, &vec_stmt);
gcc_assert (done);
break;
case op_vec_info_type:
- done = vectorizable_operation (stmt, bsi, &vec_stmt);
+ done = vectorizable_operation (stmt, bsi, &vec_stmt, slp_node);
gcc_assert (done);
break;
case assignment_vec_info_type:
- done = vectorizable_assignment (stmt, bsi, &vec_stmt);
+ done = vectorizable_assignment (stmt, bsi, &vec_stmt, slp_node);
gcc_assert (done);
break;
case load_vec_info_type:
- done = vectorizable_load (stmt, bsi, &vec_stmt);
+ done = vectorizable_load (stmt, bsi, &vec_stmt, slp_node);
gcc_assert (done);
break;
case store_vec_info_type:
- done = vectorizable_store (stmt, bsi, &vec_stmt);
+ done = vectorizable_store (stmt, bsi, &vec_stmt, slp_node);
gcc_assert (done);
- if (DR_GROUP_FIRST_DR (stmt_info))
+ if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
{
/* In case of interleaving, the whole chain is vectorized when the
last store in the chain is reached. Store stmts before the last
@@ -5744,15 +6158,18 @@ vect_transform_stmt (tree stmt, block_stmt_iterator *bsi, bool *strided_store)
break;
case condition_vec_info_type:
+ gcc_assert (!slp_node);
done = vectorizable_condition (stmt, bsi, &vec_stmt);
gcc_assert (done);
break;
case call_vec_info_type:
+ gcc_assert (!slp_node);
done = vectorizable_call (stmt, bsi, &vec_stmt);
break;
case reduc_vec_info_type:
+ gcc_assert (!slp_node);
done = vectorizable_reduction (stmt, bsi, &vec_stmt);
gcc_assert (done);
break;
@@ -6165,7 +6582,7 @@ vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters)
int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
int nelements = TYPE_VECTOR_SUBPARTS (vectype);
- if (DR_GROUP_FIRST_DR (stmt_info))
+ if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
{
/* For interleaved access element size must be multiplied by the size of
the interleaved group. */
@@ -6593,6 +7010,115 @@ vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo,
}
+/* Remove a group of stores (for SLP or interleaving), free their
+ stmt_vec_info. */
+
+static void
+vect_remove_stores (tree first_stmt)
+{
+ stmt_ann_t ann;
+ tree next = first_stmt;
+ tree tmp;
+ stmt_vec_info next_stmt_info;
+ block_stmt_iterator next_si;
+
+ while (next)
+ {
+ /* Free the attached stmt_vec_info and remove the stmt. */
+ next_si = bsi_for_stmt (next);
+ bsi_remove (&next_si, true);
+ next_stmt_info = vinfo_for_stmt (next);
+ ann = stmt_ann (next);
+ tmp = DR_GROUP_NEXT_DR (next_stmt_info);
+ free (next_stmt_info);
+ set_stmt_info (ann, NULL);
+ next = tmp;
+ }
+}
+
+
+/* Vectorize SLP instance tree in postorder. */
+
+static bool
+vect_schedule_slp_instance (slp_tree node, unsigned int vec_stmts_size)
+{
+ tree stmt;
+ bool strided_store, is_store;
+ block_stmt_iterator si;
+ stmt_vec_info stmt_info;
+
+ if (!node)
+ return false;
+
+ vect_schedule_slp_instance (SLP_TREE_LEFT (node), vec_stmts_size);
+ vect_schedule_slp_instance (SLP_TREE_RIGHT (node), vec_stmts_size);
+
+ stmt = VEC_index(tree, SLP_TREE_SCALAR_STMTS (node), 0);
+ stmt_info = vinfo_for_stmt (stmt);
+ SLP_TREE_VEC_STMTS (node) = VEC_alloc (tree, heap, vec_stmts_size);
+ SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
+
+ if (vect_print_dump_info (REPORT_DETAILS))
+ {
+ fprintf (vect_dump, "------>vectorizing SLP node starting from: ");
+ print_generic_expr (vect_dump, stmt, TDF_SLIM);
+ }
+
+ si = bsi_for_stmt (stmt);
+ is_store = vect_transform_stmt (stmt, &si, &strided_store, node);
+ if (is_store)
+ {
+ if (DR_GROUP_FIRST_DR (stmt_info))
+ /* If IS_STORE is TRUE, the vectorization of the
+ interleaving chain was completed - free all the stores in
+ the chain. */
+ vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
+ else
+ /* FORNOW: SLP originates only from strided stores. */
+ gcc_unreachable ();
+
+ return true;
+ }
+
+ /* FORNOW: SLP originates only from strided stores. */
+ return false;
+}
+
+
+static bool
+vect_schedule_slp (loop_vec_info loop_vinfo, unsigned int nunits)
+{
+ VEC (slp_instance, heap) *slp_instances =
+ LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
+ slp_instance instance;
+ unsigned int vec_stmts_size;
+ unsigned int group_size, i;
+ unsigned int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+ bool is_store = false;
+
+ for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
+ {
+ group_size = SLP_INSTANCE_GROUP_SIZE (instance);
+ /* For each SLP instance calculate number of vector stmts to be created
+ for the scalar stmts in each node of the SLP tree. Number of vector
+ stmts in one vector iteration is the number of scalar elements in
+ one scalar iteration (GROUP_SIZE) multiplied by VF divided by vector
+ size. */
+ vec_stmts_size = vectorization_factor * group_size / nunits;
+
+ /* Schedule the tree of INSTANCE. */
+ is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
+ vec_stmts_size);
+
+ if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS)
+ || vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
+ fprintf (vect_dump, "vectorizing stmts using SLP.");
+ }
+
+ return is_store;
+}
+
+
/* Function vect_transform_loop.
The analysis phase has determined that the loop is vectorizable.
@@ -6610,6 +7136,8 @@ vect_transform_loop (loop_vec_info loop_vinfo)
tree ratio = NULL;
int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
bool strided_store;
+ bool slp_scheduled = false;
+ unsigned int nunits;
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "=== vec_transform_loop ===");
@@ -6744,6 +7272,7 @@ vect_transform_loop (loop_vec_info loop_vinfo)
stmt_info = vinfo_for_stmt (phi);
if (!stmt_info)
continue;
+
if (!STMT_VINFO_RELEVANT_P (stmt_info)
&& !STMT_VINFO_LIVE_P (stmt_info))
continue;
@@ -6757,7 +7286,7 @@ vect_transform_loop (loop_vec_info loop_vinfo)
{
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "transform phi.");
- vect_transform_stmt (phi, NULL, NULL);
+ vect_transform_stmt (phi, NULL, NULL, NULL);
}
}
@@ -6791,21 +7320,56 @@ vect_transform_loop (loop_vec_info loop_vinfo)
}
gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
- if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
- != (unsigned HOST_WIDE_INT) vectorization_factor)
- && vect_print_dump_info (REPORT_DETAILS))
- fprintf (vect_dump, "multiple-types.");
+ nunits =
+ (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
+ if (!STMT_SLP_TYPE (stmt_info)
+ && nunits != (unsigned int) vectorization_factor
+ && vect_print_dump_info (REPORT_DETAILS))
+ /* For SLP VF is set according to unrolling factor, and not to
+ vector size, hence for SLP this print is not valid. */
+ fprintf (vect_dump, "multiple-types.");
+
+ /* SLP. Schedule all the SLP instances when the first SLP stmt is
+ reached. */
+ if (STMT_SLP_TYPE (stmt_info))
+ {
+ if (!slp_scheduled)
+ {
+ slp_scheduled = true;
+
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "=== scheduling SLP instances ===");
+ is_store = vect_schedule_slp (loop_vinfo, nunits);
+
+ /* IS_STORE is true if STMT is a store. Stores cannot be of
+ hybrid SLP type. They are removed in
+ vect_schedule_slp_instance and their vinfo is destroyed. */
+ if (is_store)
+ {
+ bsi_next (&si);
+ continue;
+ }
+ }
+
+ /* Hybrid SLP stmts must be vectorized in addition to SLP. */
+ if (PURE_SLP_STMT (stmt_info))
+ {
+ bsi_next (&si);
+ continue;
+ }
+ }
+
/* -------- vectorize statement ------------ */
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "transform statement.");
strided_store = false;
- is_store = vect_transform_stmt (stmt, &si, &strided_store);
+ is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL);
if (is_store)
{
stmt_ann_t ann;
- if (DR_GROUP_FIRST_DR (stmt_info))
+ if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
{
/* Interleaving. If IS_STORE is TRUE, the vectorization of the
interleaving chain was completed - free all the stores in
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index c239d296316..29acfac5289 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -1359,6 +1359,7 @@ new_stmt_vec_info (tree stmt, loop_vec_info loop_vinfo)
STMT_VINFO_SAME_ALIGN_REFS (res) = VEC_alloc (dr_p, heap, 5);
STMT_VINFO_INSIDE_OF_LOOP_COST (res) = 0;
STMT_VINFO_OUTSIDE_OF_LOOP_COST (res) = 0;
+ STMT_SLP_TYPE (res) = 0;
DR_GROUP_FIRST_DR (res) = NULL_TREE;
DR_GROUP_NEXT_DR (res) = NULL_TREE;
DR_GROUP_SIZE (res) = 0;
@@ -1478,7 +1479,9 @@ new_loop_vec_info (struct loop *loop)
VEC_alloc (tree, heap, PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
LOOP_VINFO_MAY_ALIAS_DDRS (res) =
VEC_alloc (ddr_p, heap, PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
-
+ LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (tree, heap, 10);
+ LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
+ LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
return res;
}
@@ -1497,6 +1500,8 @@ destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
int nbbs;
block_stmt_iterator si;
int j;
+ VEC (slp_instance, heap) *slp_instances;
+ slp_instance instance;
if (!loop_vinfo)
return;
@@ -1571,6 +1576,10 @@ destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
VEC_free (tree, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
+ slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
+ for (j = 0; VEC_iterate (slp_instance, slp_instances, j, instance); j++)
+ vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
+ VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
free (loop_vinfo);
loop->aux = NULL;
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 1dd472c3e2d..0168c2397bb 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -60,7 +60,7 @@ enum dr_alignment_support {
/* Define type of def-use cross-iteration cycle. */
enum vect_def_type {
- vect_constant_def,
+ vect_constant_def = 1,
vect_invariant_def,
vect_loop_def,
vect_induction_def,
@@ -77,11 +77,80 @@ enum verbosity_levels {
REPORT_DR_DETAILS,
REPORT_BAD_FORM_LOOPS,
REPORT_OUTER_LOOPS,
+ REPORT_SLP,
REPORT_DETAILS,
/* New verbosity levels should be added before this one. */
MAX_VERBOSITY_LEVEL
};
+/************************************************************************
+ SLP
+ ************************************************************************/
+
+/* A computation tree of an SLP instance. Each node corresponds to a group of
+ stmts to be packed in a SIMD stmt. */
+typedef struct _slp_tree {
+ /* Only binary and unary operations are supported. LEFT child corresponds to
+ the first operand and RIGHT child to the second if the operation is
+ binary. */
+ struct _slp_tree *left;
+ struct _slp_tree *right;
+ /* A group of scalar stmts to be vectorized together. */
+ VEC (tree, heap) *stmts;
+ /* Vectorized stmt/s. */
+ VEC (tree, heap) *vec_stmts;
+ /* Number of vector stmts that are created to replace the group of scalar
+ stmts. It is calculated during the transformation phase as the number of
+ scalar elements in one scalar iteration (GROUP_SIZE) multiplied by VF
+ divided by vector size. */
+ unsigned int vec_stmts_size;
+ /* Vectorization costs associated with SLP node. */
+ struct
+ {
+ int outside_of_loop; /* Statements generated outside loop. */
+ int inside_of_loop; /* Statements generated inside loop. */
+ } cost;
+} *slp_tree;
+
+
+/* SLP instance is a sequence of stmts in a loop that can be packed into
+ SIMD stmts. */
+typedef struct _slp_instance {
+ /* The root of SLP tree. */
+ slp_tree root;
+
+ /* Size of groups of scalar stmts that will be replaced by SIMD stmt/s. */
+ unsigned int group_size;
+
+ /* The unrolling factor required to vectorize this SLP instance. */
+ unsigned int unrolling_factor;
+
+ /* Vectorization costs associated with SLP instance. */
+ struct
+ {
+ int outside_of_loop; /* Statements generated outside loop. */
+ int inside_of_loop; /* Statements generated inside loop. */
+ } cost;
+} *slp_instance;
+
+DEF_VEC_P(slp_instance);
+DEF_VEC_ALLOC_P(slp_instance, heap);
+
+/* Access Functions. */
+#define SLP_INSTANCE_TREE(S) (S)->root
+#define SLP_INSTANCE_GROUP_SIZE(S) (S)->group_size
+#define SLP_INSTANCE_UNROLLING_FACTOR(S) (S)->unrolling_factor
+#define SLP_INSTANCE_OUTSIDE_OF_LOOP_COST(S) (S)->cost.outside_of_loop
+#define SLP_INSTANCE_INSIDE_OF_LOOP_COST(S) (S)->cost.inside_of_loop
+
+#define SLP_TREE_LEFT(S) (S)->left
+#define SLP_TREE_RIGHT(S) (S)->right
+#define SLP_TREE_SCALAR_STMTS(S) (S)->stmts
+#define SLP_TREE_VEC_STMTS(S) (S)->vec_stmts
+#define SLP_TREE_NUMBER_OF_VEC_STMTS(S) (S)->vec_stmts_size
+#define SLP_TREE_OUTSIDE_OF_LOOP_COST(S) (S)->cost.outside_of_loop
+#define SLP_TREE_INSIDE_OF_LOOP_COST(S) (S)->cost.inside_of_loop
+
/*-----------------------------------------------------------------*/
/* Info on vectorized loops. */
/*-----------------------------------------------------------------*/
@@ -141,6 +210,18 @@ typedef struct _loop_vec_info {
/* The loop location in the source. */
LOC loop_line_number;
+
+ /* All interleaving chains of stores in the loop, represented by the first
+ stmt in the chain. */
+ VEC(tree, heap) *strided_stores;
+
+ /* All SLP instances in the loop. This is a subset of the set of STRIDED_STORES
+ of the loop. */
+ VEC(slp_instance, heap) *slp_instances;
+
+ /* The unrolling factor needed to SLP the loop. In case that pure SLP is
+ applied to the loop, i.e., no unrolling is needed, this is 1. */
+ unsigned slp_unrolling_factor;
} *loop_vec_info;
/* Access Functions. */
@@ -159,6 +240,9 @@ typedef struct _loop_vec_info {
#define LOOP_VINFO_MAY_MISALIGN_STMTS(L) (L)->may_misalign_stmts
#define LOOP_VINFO_LOC(L) (L)->loop_line_number
#define LOOP_VINFO_MAY_ALIAS_DDRS(L) (L)->may_alias_ddrs
+#define LOOP_VINFO_STRIDED_STORES(L) (L)->strided_stores
+#define LOOP_VINFO_SLP_INSTANCES(L) (L)->slp_instances
+#define LOOP_VINFO_SLP_UNROLLING_FACTOR(L) (L)->slp_unrolling_factor
#define NITERS_KNOWN_P(n) \
(host_integerp ((n),0) \
@@ -216,6 +300,29 @@ enum vect_relevant {
vect_used_in_loop
};
+/* The type of vectorization that can be applied to the stmt: regular loop-based
+ vectorization; pure SLP - the stmt is a part of SLP instances and does not
+ have uses outside SLP instances; or hybrid SLP and loop-based - the stmt is
+ a part of SLP instance and also must be loop-based vectorized, since it has
+ uses outside SLP sequences.
+
+ In the loop context the meanings of pure and hybrid SLP are slightly
+ different. By saying that pure SLP is applied to the loop, we mean that we
+ exploit only intra-iteration parallelism in the loop; i.e., the loop can be
+ vectorized without doing any conceptual unrolling, because we don't pack
+ together stmts from different iterations, only within a single iteration.
+ Loop hybrid SLP means that we exploit both intra-iteration and
+ inter-iteration parallelism (e.g., number of elements in the vector is 4
+ and the slp-group-size is 2, in which case we don't have enough parallelism
+ within an iteration, so we obtain the rest of the parallelism from subsequent
+ iterations by unrolling the loop by 2). */
+enum slp_vect_type {
+ loop_vect = 0,
+ pure_slp,
+ hybrid
+};
+
+
typedef struct data_reference *dr_p;
DEF_VEC_P(dr_p);
DEF_VEC_ALLOC_P(dr_p,heap);
@@ -309,6 +416,9 @@ typedef struct _stmt_vec_info {
int outside_of_loop; /* Statements generated outside loop. */
int inside_of_loop; /* Statements generated inside loop. */
} cost;
+
+ /* Whether the stmt is SLPed, loop-based vectorized, or both. */
+ enum slp_vect_type slp_type;
} *stmt_vec_info;
/* Access Functions. */
@@ -338,6 +448,7 @@ typedef struct _stmt_vec_info {
#define STMT_VINFO_DR_GROUP_GAP(S) (S)->gap
#define STMT_VINFO_DR_GROUP_SAME_DR_STMT(S)(S)->same_dr_stmt
#define STMT_VINFO_DR_GROUP_READ_WRITE_DEPENDENCE(S) (S)->read_write_dep
+#define STMT_VINFO_STRIDED_ACCESS(S) ((S)->first_dr != NULL)
#define DR_GROUP_FIRST_DR(S) (S)->first_dr
#define DR_GROUP_NEXT_DR(S) (S)->next_dr
@@ -351,6 +462,10 @@ typedef struct _stmt_vec_info {
#define STMT_VINFO_OUTSIDE_OF_LOOP_COST(S) (S)->cost.outside_of_loop
#define STMT_VINFO_INSIDE_OF_LOOP_COST(S) (S)->cost.inside_of_loop
+#define HYBRID_SLP_STMT(S) ((S)->slp_type == hybrid)
+#define PURE_SLP_STMT(S) ((S)->slp_type == pure_slp)
+#define STMT_SLP_TYPE(S) (S)->slp_type
+
/* These are some defines for the initial implementation of the vectorizer's
cost model. These will later be target specific hooks. */
@@ -524,6 +639,7 @@ extern stmt_vec_info new_stmt_vec_info (tree stmt, loop_vec_info);
/** In tree-vect-analyze.c **/
/* Driver for analysis stage. */
extern loop_vec_info vect_analyze_loop (struct loop *);
+extern void vect_free_slp_tree (slp_tree);
/** In tree-vect-patterns.c **/
@@ -536,14 +652,16 @@ void vect_pattern_recog (loop_vec_info);
/** In tree-vect-transform.c **/
-extern bool vectorizable_load (tree, block_stmt_iterator *, tree *);
-extern bool vectorizable_store (tree, block_stmt_iterator *, tree *);
-extern bool vectorizable_operation (tree, block_stmt_iterator *, tree *);
+extern bool vectorizable_load (tree, block_stmt_iterator *, tree *, slp_tree);
+extern bool vectorizable_store (tree, block_stmt_iterator *, tree *, slp_tree);
+extern bool vectorizable_operation (tree, block_stmt_iterator *, tree *,
+ slp_tree);
extern bool vectorizable_type_promotion (tree, block_stmt_iterator *, tree *);
extern bool vectorizable_type_demotion (tree, block_stmt_iterator *, tree *);
extern bool vectorizable_conversion (tree, block_stmt_iterator *,
- tree *);
-extern bool vectorizable_assignment (tree, block_stmt_iterator *, tree *);
+ tree *, slp_tree);
+extern bool vectorizable_assignment (tree, block_stmt_iterator *, tree *,
+ slp_tree);
extern tree vectorizable_function (tree, tree, tree);
extern bool vectorizable_call (tree, block_stmt_iterator *, tree *);
extern bool vectorizable_condition (tree, block_stmt_iterator *, tree *);
@@ -551,6 +669,11 @@ extern bool vectorizable_live_operation (tree, block_stmt_iterator *, tree *);
extern bool vectorizable_reduction (tree, block_stmt_iterator *, tree *);
extern bool vectorizable_induction (tree, block_stmt_iterator *, tree *);
extern int vect_estimate_min_profitable_iters (loop_vec_info);
+extern void vect_model_simple_cost (stmt_vec_info, int, enum vect_def_type *,
+ slp_tree);
+extern void vect_model_store_cost (stmt_vec_info, int, enum vect_def_type,
+ slp_tree);
+extern void vect_model_load_cost (stmt_vec_info, int, slp_tree);
/* Driver for transformation stage. */
extern void vect_transform_loop (loop_vec_info);