diff options
52 files changed, 5780 insertions, 310 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 5b93ed1e9f1..de8ecc2b79d 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,69 @@ +2007-09-09 Ira Rosen <irar@il.ibm.com> + + * tree-vectorizer.h (enum vect_def_type): Start enumeration from 1. + (struct _slp_tree, struct _slp_instance): Define new data structures + along with macros for their access. + (struct _loop_vec_info): Define new fields: strided_stores, + slp_instances, and slp_unrolling_factor along with macros for their access. + (enum slp_vect_type): New. + (struct _stmt_vec_info): Define new field, slp_type, and macros for its + access. + (STMT_VINFO_STRIDED_ACCESS): New macro. + (vect_free_slp_tree): Declare. + (vectorizable_load): Add an argument of type slp_tree. + (vectorizable_store, vectorizable_operation, vectorizable_conversion, + vectorizable_assignment): Likewise. + (vect_model_simple_cost, vect_model_store_cost, vect_model_load_cost): + Declare (make extern). + * tree-vectorizer.c (new_stmt_vec_info): Initialize the new field. + (new_loop_vec_info): Likewise. + (destroy_loop_vec_info): Free memory allocated for SLP structures. + * tree-vect-analyze.c: Include recog.h. + (vect_update_slp_costs_according_to_vf): New. + (vect_analyze_operations): Add argument for calls to vectorizable_ () + functions. For not pure SLP stmts with strided access check that the + group size is power of 2. Update the vectorization factor according to + SLP. Call vect_update_slp_costs_according_to_vf. + (vect_analyze_group_access): New. + (vect_analyze_data_ref_access): Call vect_analyze_group_access. + (vect_free_slp_tree): New functions. + (vect_get_and_check_slp_defs, vect_build_slp_tree, vect_print_slp_tree, + vect_mark_slp_stmts, vect_analyze_slp_instance, vect_analyze_slp, + vect_make_slp_decision, vect_detect_hybrid_slp_stmts, + vect_detect_hybrid_slp): Likewise. + (vect_analyze_loop): Call vect_analyze_slp, vect_make_slp_decision + and vect_detect_hybrid_slp. 
+ * tree-vect-transform.c (vect_estimate_min_profitable_iters): Take + SLP costs into account. + (vect_get_cost_fields): New function. + (vect_model_simple_cost): Make extern, add SLP parameter and handle + SLP. + (vect_model_store_cost, vect_model_load_cost): Likewise. + (vect_get_constant_vectors): New function. + (vect_get_slp_vect_defs, vect_get_slp_defs, + vect_get_vec_defs_for_stmt_copy, + vect_get_vec_defs): Likewise. + (vectorizable_reduction): Don't handle SLP for now. + (vectorizable_call): Don't handle SLP for now. Add argument to + vect_model_simple_cost. + (vectorizable_conversion): Handle SLP (call vect_get_vec_defs to + get SLPed and vectorized defs). Fix indentation and spacing. + (vectorizable_assignment): Handle SLP. + (vectorizable_induction): Don't handle SLP for now. + (vectorizable_operation): Likewise. + (vectorizable_type_demotion): Add argument to + vect_model_simple_cost. + (vectorizable_type_promotion): Likewise. + (vectorizable_store, vectorizable_load): Handle SLP. + (vectorizable_condition): Don't handle SLP for now. + (vect_transform_stmt): Add a new argument for SLP. Check that there is + no SLP transformation required for unsupported cases. Add SLP + argument for supported cases. + (vect_remove_stores): New function. + (vect_schedule_slp_instance, vect_schedule_slp): Likewise. + (vect_transform_loop): Schedule SLP instances. + * Makefile.in: (tree-vect-analyze.o): Depend on recog.h. 
+ 2007-09-09 Andrew Haley <aph@redhat.com> * optabs.c (sign_expand_binop): Set libcall_gen = NULL in the diff --git a/gcc/Makefile.in b/gcc/Makefile.in index 724c452cab8..348dbc4b774 100644 --- a/gcc/Makefile.in +++ b/gcc/Makefile.in @@ -2246,7 +2246,7 @@ tree-data-ref.o: tree-data-ref.c $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \ $(TREE_FLOW_H) $(TREE_DUMP_H) $(TIMEVAR_H) $(CFGLOOP_H) \ $(TREE_DATA_REF_H) $(SCEV_H) tree-pass.h tree-chrec.h langhooks.h tree-vect-analyze.o: tree-vect-analyze.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \ - $(TM_H) $(GGC_H) $(OPTABS_H) $(TREE_H) $(BASIC_BLOCK_H) \ + $(TM_H) $(GGC_H) $(OPTABS_H) $(TREE_H) $(RECOG_H) $(BASIC_BLOCK_H) \ $(DIAGNOSTIC_H) $(TREE_FLOW_H) $(TREE_DUMP_H) $(TIMEVAR_H) $(CFGLOOP_H) \ tree-vectorizer.h $(TREE_DATA_REF_H) $(SCEV_H) $(EXPR_H) tree-chrec.h tree-vect-patterns.o: tree-vect-patterns.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \ diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 06d1e728f1f..4d294e10fac 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,29 @@ +2007-09-09 Ira Rosen <irar@il.ibm.com> + + * gcc.dg/vect/vect.exp: Compile tests starting with slp-. + Remove "vect" part from test names for -ffast-math, -fno-math-errno, + -fwrapv, -ftrapv tests. Add -fno-tree-scev-cprop for slp- tests. + Compile tests with -fno-tree-pre. + * gcc.dg/vect/costmodel/ppc/ppc-costmodel-vect.exp: Run SLP tests. + * lib/target-supports.exp (check_effective_target_vect_strided): New. 
+ * gcc.dg/vect/slp-1.c, gcc.dg/vect/slp-2.c, gcc.dg/vect/slp-3.c, + gcc.dg/vect/slp-4.c, gcc.dg/vect/slp-5.c, gcc.dg/vect/slp-6.c, + gcc.dg/vect/slp-7.c, gcc.dg/vect/slp-8.c, gcc.dg/vect/slp-9.c, + gcc.dg/vect/slp-10.c, gcc.dg/vect/slp-11.c, gcc.dg/vect/slp-12.c, + gcc.dg/vect/slp-13.c, gcc.dg/vect/slp-14.c, gcc.dg/vect/slp-15.c, + gcc.dg/vect/slp-16.c, gcc.dg/vect/slp-17.c, gcc.dg/vect/slp-18.c, + gcc.dg/vect/slp-19.c, gcc.dg/vect/slp-20.c, gcc.dg/vect/slp-21.c, + gcc.dg/vect/slp-22.c, gcc.dg/vect/slp-23.c, gcc.dg/vect/slp-24.c, + gcc.dg/vect/slp-25.c, gcc.dg/vect/slp-26.c, gcc.dg/vect/slp-28.c, + gcc.dg/vect/fast-math-slp-27.c, gcc.dg/vect/no-tree-pre-slp-29.c, + gcc.dg/vect/no-scevccp-slp-30.c, gcc.dg/vect/no-scevccp-slp-31.c, + gcc.dg/vect/no-math-errno-slp-32.c, gcc.dg/vect/slp-33.c, + gcc.dg/vect/slp-34.c, gcc.dg/vect/slp-35.c, gcc.dg/vect/slp-36.c, + gcc.dg/vect/slp-37.c, gcc.dg/vect/vect-vfa-slp.c, + gcc.dg/vect/costmodel/ppc/costmodel-slp-12.c, + gcc.dg/vect/costmodel/ppc/costmodel-slp-33.c: New testcases. + * gcc.dg/vect/vect-vfa-03.c: Change the test to prevent SLP. 
+ 2007-09-09 Joseph Myers <joseph@codesourcery.com> * lib/file-format.exp (gcc_target_object_format): Use remote_exec diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-slp-12.c b/gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-slp-12.c new file mode 100644 index 00000000000..752c4f61390 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-slp-12.c @@ -0,0 +1,120 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include <stdio.h> +#include "../../tree-vect.h" + +#define N 8 + +int +main1 () +{ + int i; + unsigned int out[N*8], a0, a1, a2, a3, a4, a5, a6, a7, b1, b0, b2, b3, b4, b5, b6, b7; + unsigned int in[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}; + float out2[N*8], fa[N*4]; + unsigned int ia[N], ib[N*2]; + + for (i = 0; i < N; i++) + { + + a0 = in[i*8] + 5; + a1 = in[i*8 + 1] + 6; + a2 = in[i*8 + 2] + 7; + a3 = in[i*8 + 3] + 8; + a4 = in[i*8 + 4] + 9; + a5 = in[i*8 + 5] + 10; + a6 = in[i*8 + 6] + 11; + a7 = in[i*8 + 7] + 12; + + b0 = a0 * 3; + b1 = a1 * 2; + b2 = a2 * 12; + b3 = a3 * 5; + b4 = a4 * 8; + b5 = a5 * 4; + b6 = a6 * 3; + b7 = a7 * 2; + + out[i*8] = b0 - 2; + out[i*8 + 1] = b1 - 3; + out[i*8 + 2] = b2 - 2; + out[i*8 + 3] = b3 - 1; + out[i*8 + 4] = b4 - 8; + out[i*8 + 5] = b5 - 7; + out[i*8 + 6] = b6 - 3; + out[i*8 + 7] = b7 - 7; + + ia[i] = b6; + } + + /* check results: */ + for (i = 0; i < N; i++) + { + if (out[i*8] != (in[i*8] + 5) * 3 - 2 + || out[i*8 + 1] != (in[i*8 + 1] + 6) * 2 - 3 + || out[i*8 + 2] != (in[i*8 + 2] + 7) * 12 - 2 + || out[i*8 + 3] != (in[i*8 + 3] + 8) * 5 - 1 + || out[i*8 + 4] != (in[i*8 + 4] + 9) * 8 - 8 + || out[i*8 + 5] != (in[i*8 + 5] + 10) * 4 - 7 + || out[i*8 + 6] != (in[i*8 + 6] + 11) * 3 - 3 + || out[i*8 + 7] != (in[i*8 + 7] + 12) * 2 - 7 + || ia[i] != (in[i*8 + 6] + 11) * 3) + abort (); + } + + for (i = 0; i < N*2; 
i++) + { + out[i*4] = (in[i*4] + 2) * 3; + out[i*4 + 1] = (in[i*4 + 1] + 2) * 7; + out[i*4 + 2] = (in[i*4 + 2] + 7) * 3; + out[i*4 + 3] = (in[i*4 + 3] + 7) * 7; + + ib[i] = 7; + } + + /* check results: */ + for (i = 0; i < N*2; i++) + { + if (out[i*4] != (in[i*4] + 2) * 3 + || out[i*4 + 1] != (in[i*4 + 1] + 2) * 7 + || out[i*4 + 2] != (in[i*4 + 2] + 7) * 3 + || out[i*4 + 3] != (in[i*4 + 3] + 7) * 7 + || ib[i] != 7) + abort (); + } + + for (i = 0; i < N*4; i++) + { + out2[i*2] = (float) (in[i*2] * 2 + 11) ; + out2[i*2 + 1] = (float) (in[i*2 + 1] * 3 + 7); + + fa[i] = (float) in[i*2+1]; + } + + /* check results: */ + for (i = 0; i < N*4; i++) + { + if (out2[i*2] != (float) (in[i*2] * 2 + 11) + || out2[i*2 + 1] != (float) (in[i*2 + 1] * 3 + 7) + || fa[i] != (float) in[i*2+1]) + abort (); + } + + + return 0; +} + +int main (void) +{ + check_vect (); + + main1 (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" {target { vect_strided && vect_int_mult } } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" {target { vect_strided && vect_int_mult } } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-slp-33.c b/gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-slp-33.c new file mode 100644 index 00000000000..9cae12fdbb3 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-slp-33.c @@ -0,0 +1,45 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include "../../tree-vect.h" + +#define N 32 + +struct s{ + short a; /* aligned */ + char b[N-1]; /* unaligned (offset 2B) */ +}; + +int main1 () +{ + int i; + struct s tmp; + + /* unaligned */ + for (i = 0; i < N/4; i++) + { + tmp.b[2*i] = 5; + tmp.b[2*i+1] = 15; + } + + /* check results: */ + for (i = 0; i <N/4; i++) + { + if (tmp.b[2*i] != 5 + || tmp.b[2*i+1] != 15) + abort (); + } + + return 0; +} + +int main (void) +{ + check_vect (); + + return 
main1 (); +} + +/* { dg-final { scan-tree-dump-times "vectorization not profitable" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 0 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/ppc/ppc-costmodel-vect.exp b/gcc/testsuite/gcc.dg/vect/costmodel/ppc/ppc-costmodel-vect.exp index 63f5349ee84..4f710634e8a 100644 --- a/gcc/testsuite/gcc.dg/vect/costmodel/ppc/ppc-costmodel-vect.exp +++ b/gcc/testsuite/gcc.dg/vect/costmodel/ppc/ppc-costmodel-vect.exp @@ -64,6 +64,8 @@ dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/costmodel-pr*.\[cS\]]] \ "" $DEFAULT_VECTCFLAGS dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/costmodel-vect-*.\[cS\]]] \ "" $DEFAULT_VECTCFLAGS +dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/costmodel-slp-*.\[cS\]]] \ + "" $DEFAULT_VECTCFLAGS #### Tests with special options global SAVED_DEFAULT_VECTCFLAGS diff --git a/gcc/testsuite/gcc.dg/vect/fast-math-slp-27.c b/gcc/testsuite/gcc.dg/vect/fast-math-slp-27.c new file mode 100644 index 00000000000..d4c7d19925b --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/fast-math-slp-27.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_float } */ + +float x[256]; + +void foo(void) +{ + int i; + for (i=0; i<256; ++i) + { + x[2*i] = x[2*i] * x[2*i]; + x[2*i+1] = x[2*i+1] * x[2*i+1]; + } +} + +/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" { target vect_strided } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/no-math-errno-slp-32.c b/gcc/testsuite/gcc.dg/vect/no-math-errno-slp-32.c new file mode 100644 index 00000000000..c952e7f1b11 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/no-math-errno-slp-32.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_double } */ + +double x[256]; + +void foo(void) +{ + int i; + for (i=0; i<128; ++i) + { + x[2*i] = __builtin_pow (x[2*i], 0.5); + x[2*i+1] = 
__builtin_pow (x[2*i+1], 0.5); + } +} + +/* { dg-final { scan-tree-dump "pattern recognized" "vect" { xfail spu*-*-* } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/no-scevccp-slp-30.c b/gcc/testsuite/gcc.dg/vect/no-scevccp-slp-30.c new file mode 100644 index 00000000000..30cb947bed0 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/no-scevccp-slp-30.c @@ -0,0 +1,58 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include <stdio.h> +#include "tree-vect.h" + +#define N 128 + +int +main1 () +{ + int i, j; + unsigned short out[N*8], a[N]; + + for (j = 0; j < N; j++) + { + for (i = 0; i < N; i++) + { + out[i*4] = 8; + out[i*4 + 1] = 18; + out[i*4 + 2] = 28; + out[i*4 + 3] = 38; + } + a[j] = 8; + } + + /* check results: */ + for (j = 0; j < N; j++) + { + for (i = 0; i < N; i++) + { + if (out[i*4] != 8 + || out[i*4 + 1] != 18 + || out[i*4 + 2] != 28 + || out[i*4 + 3] != 38) + abort(); + } + + if (a[j] != 8) + abort (); + } + + return 0; +} + +int main (void) +{ + check_vect (); + + main1 (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + diff --git a/gcc/testsuite/gcc.dg/vect/no-scevccp-slp-31.c b/gcc/testsuite/gcc.dg/vect/no-scevccp-slp-31.c new file mode 100644 index 00000000000..2e43db13601 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/no-scevccp-slp-31.c @@ -0,0 +1,58 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include <stdio.h> +#include "tree-vect.h" + +#define N 128 + +int +main1 () +{ + int i, j; + unsigned short out[N*8], a[N][N]; + + for (i = 0; i < N; i++) + { + for (j = 0; j < N; j++) + { + a[i][j] = 8; + } + out[i*4] = 8; + out[i*4 + 1] = 18; + out[i*4 + 2] = 28; + out[i*4 + 3] = 38; + } + + /* check results: */ + for (i = 0; i < N; i++) + { + for (j = 0; j < N; j++) + { + if 
(a[i][j] != 8) + abort (); + } + if (out[i*4] != 8 + || out[i*4 + 1] != 18 + || out[i*4 + 2] != 28 + || out[i*4 + 3] != 38) + abort(); + } + + + return 0; +} + +int main (void) +{ + check_vect (); + + main1 (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + diff --git a/gcc/testsuite/gcc.dg/vect/no-tree-pre-slp-29.c b/gcc/testsuite/gcc.dg/vect/no-tree-pre-slp-29.c new file mode 100644 index 00000000000..9c75e9dc5b0 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/no-tree-pre-slp-29.c @@ -0,0 +1,79 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include <stdio.h> +#include "tree-vect.h" + +#define N 8 + +unsigned short in2[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}; + +int +main1 (unsigned short *in) +{ + int i; + unsigned short out[N*8]; + + for (i = 0; i < N; i++) + { + out[i*4] = in[i*4]; + out[i*4 + 1] = in[i*4 + 1]; + out[i*4 + 2] = in[i*4 + 2]; + out[i*4 + 3] = in[i*4 + 3]; + } + + /* check results: */ + for (i = 0; i < N; i++) + { + if (out[i*4] != in[i*4] + || out[i*4 + 1] != in[i*4 + 1] + || out[i*4 + 2] != in[i*4 + 2] + || out[i*4 + 3] != in[i*4 + 3]) + abort (); + } + + return 0; +} + +int +main2 (unsigned short * __restrict__ in, unsigned short * __restrict__ out) +{ + int i; + + for (i = 0; i < N; i++) + { + out[i*4] = in[i*4]; + out[i*4 + 1] = in[i*4 + 1]; + out[i*4 + 2] = in[i*4 + 2]; + out[i*4 + 3] = in[i*4 + 3]; + } + + /* check results: */ + for (i = 0; i < N; i++) + { + if (out[i*4] != in[i*4] + || out[i*4 + 1] != in[i*4 + 1] + || out[i*4 + 2] != in[i*4 + 2] + || out[i*4 + 3] != in[i*4 + 3]) + abort (); + } + + return 0; +} + +int main (void) +{ + unsigned short out[N*8]; + + check_vect (); + + 
main1 (&in2[5]); + main2 (&in2[3], &out[3]); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { xfail vect_no_align } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { xfail vect_no_align } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + diff --git a/gcc/testsuite/gcc.dg/vect/slp-1.c b/gcc/testsuite/gcc.dg/vect/slp-1.c new file mode 100644 index 00000000000..f7e20973818 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-1.c @@ -0,0 +1,124 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include <stdio.h> +#include "tree-vect.h" + +#define N 128 + +int +main1 () +{ + int i; + unsigned short out[N*8]; + + for (i = 0; i < N; i++) + { + out[i*4] = 8; + out[i*4 + 1] = 18; + out[i*4 + 2] = 28; + out[i*4 + 3] = 38; + } + + /* check results: */ + for (i = 0; i < N; i++) + { + if (out[i*4] != 8 + || out[i*4 + 1] != 18 + || out[i*4 + 2] != 28 + || out[i*4 + 3] != 38) + abort (); + } + + for (i = 0; i < N; i++) + { + out[i*8] = 8; + out[i*8 + 1] = 7; + out[i*8 + 2] = 81; + out[i*8 + 3] = 28; + out[i*8 + 4] = 18; + out[i*8 + 5] = 85; + out[i*8 + 6] = 5; + out[i*8 + 7] = 4; + } + + /* check results: */ + for (i = 0; i < N; i++) + { + if (out[i*8] != 8 + || out[i*8 + 1] != 7 + || out[i*8 + 2] != 81 + || out[i*8 + 3] != 28 + || out[i*8 + 4] != 18 + || out[i*8 + 5] != 85 + || out[i*8 + 6] != 5 + || out[i*8 + 7] != 4) + abort (); + } + + /* SLP with unrolling by 8. */ + for (i = 0; i < N; i++) + { + out[i*5] = 8; + out[i*5 + 1] = 7; + out[i*5 + 2] = 81; + out[i*5 + 3] = 28; + out[i*5 + 4] = 18; + } + + /* check results: */ + for (i = 0; i < N; i++) + { + if (out[i*5] != 8 + || out[i*5 + 1] != 7 + || out[i*5 + 2] != 81 + || out[i*5 + 3] != 28 + || out[i*5 + 4] != 18) + abort (); + } + + /* SLP with unrolling by 8. 
*/ + for (i = 0; i < N/2; i++) + { + out[i*9] = 8; + out[i*9 + 1] = 7; + out[i*9 + 2] = 81; + out[i*9 + 3] = 28; + out[i*9 + 4] = 18; + out[i*9 + 5] = 85; + out[i*9 + 6] = 5; + out[i*9 + 7] = 4; + out[i*9 + 8] = 14; + } + + /* check results: */ + for (i = 0; i < N/2; i++) + { + if (out[i*9] != 8 + || out[i*9 + 1] != 7 + || out[i*9 + 2] != 81 + || out[i*9 + 3] != 28 + || out[i*9 + 4] != 18 + || out[i*9 + 5] != 85 + || out[i*9 + 6] != 5 + || out[i*9 + 7] != 4 + || out[i*9 + 8] != 14) + abort (); + } + + return 0; +} + +int main (void) +{ + check_vect (); + + main1 (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 4 loops" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + diff --git a/gcc/testsuite/gcc.dg/vect/slp-10.c b/gcc/testsuite/gcc.dg/vect/slp-10.c new file mode 100644 index 00000000000..737e1e7860d --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-10.c @@ -0,0 +1,114 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include <stdio.h> +#include "tree-vect.h" + +#define N 8 + +int +main1 () +{ + int i; + unsigned int out[N*8], a0, a1, a2, a3, a4, a5, a6, a7, b1, b0, b2, b3, b4, b5, b6, b7; + unsigned int in[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}; + float out2[N*8]; + + for (i = 0; i < N; i++) + { + + a0 = in[i*8] + 5; + a1 = in[i*8 + 1] + 6; + a2 = in[i*8 + 2] + 7; + a3 = in[i*8 + 3] + 8; + a4 = in[i*8 + 4] + 9; + a5 = in[i*8 + 5] + 10; + a6 = in[i*8 + 6] + 11; + a7 = in[i*8 + 7] + 12; + + b0 = a0 * 3; + b1 = a1 * 2; + b2 = a2 * 12; + b3 = a3 * 5; + b4 = a4 * 8; + b5 = a5 * 4; + b6 = a6 * 3; + b7 = a7 * 2; + + out[i*8] = b0 - 2; + out[i*8 + 1] = b1 - 3; + out[i*8 + 2] = b2 - 2; + out[i*8 + 3] = b3 - 1; + out[i*8 + 4] = b4 - 8; + out[i*8 + 5] = b5 - 7; + out[i*8 + 6] 
= b6 - 3; + out[i*8 + 7] = b7 - 7; + } + + /* check results: */ + for (i = 0; i < N; i++) + { + if (out[i*8] != (in[i*8] + 5) * 3 - 2 + || out[i*8 + 1] != (in[i*8 + 1] + 6) * 2 - 3 + || out[i*8 + 2] != (in[i*8 + 2] + 7) * 12 - 2 + || out[i*8 + 3] != (in[i*8 + 3] + 8) * 5 - 1 + || out[i*8 + 4] != (in[i*8 + 4] + 9) * 8 - 8 + || out[i*8 + 5] != (in[i*8 + 5] + 10) * 4 - 7 + || out[i*8 + 6] != (in[i*8 + 6] + 11) * 3 - 3 + || out[i*8 + 7] != (in[i*8 + 7] + 12) * 2 - 7) + abort (); + } + + for (i = 0; i < N*2; i++) + { + out[i*4] = (in[i*4] + 2) * 3; + out[i*4 + 1] = (in[i*4 + 1] + 2) * 7; + out[i*4 + 2] = (in[i*4 + 2] + 7) * 3; + out[i*4 + 3] = (in[i*4 + 3] + 7) * 7; + } + + /* check results: */ + for (i = 0; i < N*2; i++) + { + if (out[i*4] != (in[i*4] + 2) * 3 + || out[i*4 + 1] != (in[i*4 + 1] + 2) * 7 + || out[i*4 + 2] != (in[i*4 + 2] + 7) * 3 + || out[i*4 + 3] != (in[i*4 + 3] + 7) * 7) + abort (); + } + + for (i = 0; i < N*4; i++) + { + out2[i*2] = (float) (in[i*2] * 2 + 5) ; + out2[i*2 + 1] = (float) (in[i*2 + 1] * 3 + 7); + } + + /* check results: */ + for (i = 0; i < N*4; i++) + { + if (out2[i*2] != (float) (in[i*2] * 2 + 5) + || out2[i*2 + 1] != (float) (in[i*2 + 1] * 3 + 7)) + abort (); + } + + + return 0; +} + +int main (void) +{ + check_vect (); + + main1 (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" {target {vect_intfloat_cvt && vect_int_mult} } } } */ +/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" {target {{! { vect_intfloat_cvt}} && vect_int_mult} } } } */ +/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" {target {{! { vect_intfloat_cvt}} && {!{vect_int_mult}}} } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" {target {vect_intfloat_cvt && vect_int_mult} } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" {target {{! 
{ vect_intfloat_cvt}} && vect_int_mult} } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" {target {{! { vect_intfloat_cvt}} && {!{vect_int_mult}}} } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + diff --git a/gcc/testsuite/gcc.dg/vect/slp-11.c b/gcc/testsuite/gcc.dg/vect/slp-11.c new file mode 100644 index 00000000000..118818c97bd --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-11.c @@ -0,0 +1,113 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include <stdio.h> +#include "tree-vect.h" + +#define N 8 + +int +main1 () +{ + int i; + unsigned int out[N*8], a0, a1, a2, a3, a4, a5, a6, a7, b1, b0, b2, b3, b4, b5, b6, b7; + unsigned int in[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}; + float out2[N*8]; + + /* Different operations - not SLPable. */ + for (i = 0; i < N; i++) + { + a0 = in[i*8] + 5; + a1 = in[i*8 + 1] * 6; + a2 = in[i*8 + 2] + 7; + a3 = in[i*8 + 3] + 8; + a4 = in[i*8 + 4] + 9; + a5 = in[i*8 + 5] + 10; + a6 = in[i*8 + 6] + 11; + a7 = in[i*8 + 7] + 12; + + b0 = a0 * 3; + b1 = a1 * 2; + b2 = a2 * 12; + b3 = a3 * 5; + b4 = a4 * 8; + b5 = a5 * 4; + b6 = a6 * 3; + b7 = a7 * 2; + + out[i*8] = b0 - 2; + out[i*8 + 1] = b1 - 3; + out[i*8 + 2] = b2 - 2; + out[i*8 + 3] = b3 - 1; + out[i*8 + 4] = b4 - 8; + out[i*8 + 5] = b5 - 7; + out[i*8 + 6] = b6 - 3; + out[i*8 + 7] = b7 - 7; + } + + /* check results: */ + for (i = 0; i < N; i++) + { + if (out[i*8] != (in[i*8] + 5) * 3 - 2 + || out[i*8 + 1] != (in[i*8 + 1] * 6) * 2 - 3 + || out[i*8 + 2] != (in[i*8 + 2] + 7) * 12 - 2 + || out[i*8 + 3] != (in[i*8 + 3] + 8) * 5 - 1 + || out[i*8 + 4] != (in[i*8 + 4] + 9) * 8 - 8 + || out[i*8 + 5] != (in[i*8 + 5] + 10) * 4 - 7 + || out[i*8 + 6] != (in[i*8 + 6] + 11) * 3 - 3 + || out[i*8 + 7] != (in[i*8 + 7] + 12) * 2 - 7) + abort (); + } + + /* Requires permutation - 
not SLPable. */ + for (i = 0; i < N*2; i++) + { + out[i*4] = (in[i*4] + 2) * 3; + out[i*4 + 1] = (in[i*4 + 2] + 2) * 7; + out[i*4 + 2] = (in[i*4 + 1] + 7) * 3; + out[i*4 + 3] = (in[i*4 + 3] + 3) * 4; + } + + /* check results: */ + for (i = 0; i < N*2; i++) + { + if (out[i*4] != (in[i*4] + 2) * 3 + || out[i*4 + 1] != (in[i*4 + 2] + 2) * 7 + || out[i*4 + 2] != (in[i*4 + 1] + 7) * 3 + || out[i*4 + 3] != (in[i*4 + 3] + 3) * 4) + abort (); + } + + /* Different operations - not SLPable. */ + for (i = 0; i < N*4; i++) + { + out2[i*2] = ((float) in[i*2] * 2 + 6) ; + out2[i*2 + 1] = (float) (in[i*2 + 1] * 3 + 7); + } + + /* check results: */ + for (i = 0; i < N*4; i++) + { + if (out2[i*2] != ((float) in[i*2] * 2 + 6) + || out2[i*2 + 1] != (float) (in[i*2 + 1] * 3 + 7)) + abort (); + } + + + return 0; +} + +int main (void) +{ + check_vect (); + + main1 (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { target { vect_strided && vect_int_mult } } } } */ +/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" {target { ! 
{ vect_int_mult && vect_strided } } } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + diff --git a/gcc/testsuite/gcc.dg/vect/slp-12a.c b/gcc/testsuite/gcc.dg/vect/slp-12a.c new file mode 100644 index 00000000000..066bf7ff9a3 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-12a.c @@ -0,0 +1,105 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include <stdio.h> +#include "tree-vect.h" + +#define N 8 + +int +main1 () +{ + int i; + unsigned int out[N*8], a0, a1, a2, a3, a4, a5, a6, a7, b1, b0, b2, b3, b4, b5, b6, b7; + unsigned int in[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}; + unsigned int ia[N], ib[N*2]; + + for (i = 0; i < N; i++) + { + + a0 = in[i*8] + 5; + a1 = in[i*8 + 1] + 6; + a2 = in[i*8 + 2] + 7; + a3 = in[i*8 + 3] + 8; + a4 = in[i*8 + 4] + 9; + a5 = in[i*8 + 5] + 10; + a6 = in[i*8 + 6] + 11; + a7 = in[i*8 + 7] + 12; + + b0 = a0 * 3; + b1 = a1 * 2; + b2 = a2 * 12; + b3 = a3 * 5; + b4 = a4 * 8; + b5 = a5 * 4; + b6 = a6 * 3; + b7 = a7 * 2; + + out[i*8] = b0 - 2; + out[i*8 + 1] = b1 - 3; + out[i*8 + 2] = b2 - 2; + out[i*8 + 3] = b3 - 1; + out[i*8 + 4] = b4 - 8; + out[i*8 + 5] = b5 - 7; + out[i*8 + 6] = b6 - 3; + out[i*8 + 7] = b7 - 7; + + ia[i] = b6; + } + + /* check results: */ + for (i = 0; i < N; i++) + { + if (out[i*8] != (in[i*8] + 5) * 3 - 2 + || out[i*8 + 1] != (in[i*8 + 1] + 6) * 2 - 3 + || out[i*8 + 2] != (in[i*8 + 2] + 7) * 12 - 2 + || out[i*8 + 3] != (in[i*8 + 3] + 8) * 5 - 1 + || out[i*8 + 4] != (in[i*8 + 4] + 9) * 8 - 8 + || out[i*8 + 5] != (in[i*8 + 5] + 10) * 4 - 7 + || out[i*8 + 6] != (in[i*8 + 6] + 11) * 3 - 3 + || out[i*8 + 7] != (in[i*8 + 7] + 12) * 2 - 7 + || ia[i] != (in[i*8 + 6] + 11) * 3) + abort (); + } + + for (i = 0; i < N*2; i++) + { + out[i*4] = (in[i*4] + 2) * 3; + 
out[i*4 + 1] = (in[i*4 + 1] + 2) * 7; + out[i*4 + 2] = (in[i*4 + 2] + 7) * 3; + out[i*4 + 3] = (in[i*4 + 3] + 7) * 7; + + ib[i] = 7; + } + + /* check results: */ + for (i = 0; i < N*2; i++) + { + if (out[i*4] != (in[i*4] + 2) * 3 + || out[i*4 + 1] != (in[i*4 + 1] + 2) * 7 + || out[i*4 + 2] != (in[i*4 + 2] + 7) * 3 + || out[i*4 + 3] != (in[i*4 + 3] + 7) * 7 + || ib[i] != 7) + abort (); + } + + return 0; +} + +int main (void) +{ + check_vect (); + + main1 (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" {target { vect_strided && vect_int_mult} } } } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" {target { {! {vect_strided}} && vect_int_mult } } } } */ +/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" {target { ! vect_int_mult } } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" {target { vect_strided && vect_int_mult } } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" {target { {! {vect_strided}} && vect_int_mult } } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" {target { ! 
vect_int_mult } } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + diff --git a/gcc/testsuite/gcc.dg/vect/slp-12b.c b/gcc/testsuite/gcc.dg/vect/slp-12b.c new file mode 100644 index 00000000000..39570016f38 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-12b.c @@ -0,0 +1,51 @@ +/* { dg-require-effective-target vect_intfloat_cvt } */ + +#include <stdarg.h> +#include <stdio.h> +#include "tree-vect.h" + +#define N 64 + +int +main1 () +{ + int i; + unsigned int in[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}; + float out2[N*8], fa[N*4]; + + for (i = 0; i < N; i++) + { + out2[i*2] = (float) (in[i*2] * 2 + 11) ; + out2[i*2 + 1] = (float) (in[i*2 + 1] * 3 + 7); + + fa[i] = (float) in[i*2+1]; + } + + /* check results: */ + for (i = 0; i < N; i++) + { + if (out2[i*2] != (float) (in[i*2] * 2 + 11) + || out2[i*2 + 1] != (float) (in[i*2 + 1] * 3 + 7) + || fa[i] != (float) in[i*2+1]) + abort (); + } + + + return 0; +} + +int main (void) +{ + check_vect (); + + main1 (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" {target { vect_strided && vect_int_mult } } } } */ +/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" {target { { ! { vect_int_mult }} || { ! {vect_strided}}} } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" {target { vect_strided && vect_int_mult } } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" {target { { ! { vect_int_mult }} || { ! 
{vect_strided}}} } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + diff --git a/gcc/testsuite/gcc.dg/vect/slp-13.c b/gcc/testsuite/gcc.dg/vect/slp-13.c new file mode 100644 index 00000000000..0041526b972 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-13.c @@ -0,0 +1,134 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include <stdio.h> +#include "tree-vect.h" + +#define N 8 + +int +main1 () +{ + int i; + unsigned short out[N*8]; + unsigned short in[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}; + unsigned int in2[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}; + unsigned int out2[N*8]; + + /* Induction is not SLPable yet. */ + for (i = 0; i < N; i++) + { + out[i*8] = in[i*8] + i; + out[i*8 + 1] = in[i*8 + 1] + i; + out[i*8 + 2] = in[i*8 + 2] + i; + out[i*8 + 3] = in[i*8 + 3] + i; + out[i*8 + 4] = in[i*8 + 4] + i; + out[i*8 + 5] = in[i*8 + 5] + i; + out[i*8 + 6] = in[i*8 + 6] + i; + out[i*8 + 7] = in[i*8 + 7] + i; + } + + /* check results: */ + for (i = 0; i < N; i++) + { + if (out[i*8] != in[i*8] + i + || out[i*8 + 1] != in[i*8 + 1] + i + || out[i*8 + 2] != in[i*8 + 2] + i + || out[i*8 + 3] != in[i*8 + 3] + i + || out[i*8 + 4] != in[i*8 + 4] + i + || out[i*8 + 5] != in[i*8 + 5] + i + || out[i*8 + 6] != in[i*8 + 6] + i + || out[i*8 + 7] != in[i*8 + 7] + i) + abort (); + } + + /* Induction is not SLPable yet and strided group size must be a power of 2 + to get vectorized. 
*/ + for (i = 0; i < N/2; i++) + { + out2[i*12] = in2[i*12] + i; + out2[i*12 + 1] = in2[i*12 + 1] + i; + out2[i*12 + 2] = in2[i*12 + 2] + i; + out2[i*12 + 3] = in2[i*12 + 3] + i; + out2[i*12 + 4] = in2[i*12 + 4] + i; + out2[i*12 + 5] = in2[i*12 + 5] + i; + out2[i*12 + 6] = in2[i*12 + 6] + i; + out2[i*12 + 7] = in2[i*12 + 7] + i; + out2[i*12 + 8] = in2[i*12 + 8] + i; + out2[i*12 + 9] = in2[i*12 + 9] + i; + out2[i*12 + 10] = in2[i*12 + 10] + i; + out2[i*12 + 11] = in2[i*12 + 11] + i; + } + + /* check results: */ + for (i = 0; i < N/2; i++) + { + if (out2[i*12] != in2[i*12] + i + || out2[i*12 + 1] != in2[i*12 + 1] + i + || out2[i*12 + 2] != in2[i*12 + 2] + i + || out2[i*12 + 3] != in2[i*12 + 3] + i + || out2[i*12 + 4] != in2[i*12 + 4] + i + || out2[i*12 + 5] != in2[i*12 + 5] + i + || out2[i*12 + 6] != in2[i*12 + 6] + i + || out2[i*12 + 7] != in2[i*12 + 7] + i + || out2[i*12 + 8] != in2[i*12 + 8] + i + || out2[i*12 + 9] != in2[i*12 + 9] + i + || out2[i*12 + 10] != in2[i*12 + 10] + i + || out2[i*12 + 11] != in2[i*12 + 11] + i) + abort (); + } + + /* Not power of 2 but SLPable. 
*/ + for (i = 0; i < N/2; i++) + { + out2[i*12] = in2[i*12] + 1; + out2[i*12 + 1] = in2[i*12 + 1] + 2; + out2[i*12 + 2] = in2[i*12 + 2] + 3; + out2[i*12 + 3] = in2[i*12 + 3] + 4; + out2[i*12 + 4] = in2[i*12 + 4] + 5; + out2[i*12 + 5] = in2[i*12 + 5] + 6; + out2[i*12 + 6] = in2[i*12 + 6] + 7; + out2[i*12 + 7] = in2[i*12 + 7] + 8; + out2[i*12 + 8] = in2[i*12 + 8] + 9; + out2[i*12 + 9] = in2[i*12 + 9] + 10; + out2[i*12 + 10] = in2[i*12 + 10] + 11; + out2[i*12 + 11] = in2[i*12 + 11] + 12; + } + + /* check results: */ + for (i = 0; i < N/2; i++) + { + if (out2[i*12] != in2[i*12] + 1 + || out2[i*12 + 1] != in2[i*12 + 1] + 2 + || out2[i*12 + 2] != in2[i*12 + 2] + 3 + || out2[i*12 + 3] != in2[i*12 + 3] + 4 + || out2[i*12 + 4] != in2[i*12 + 4] + 5 + || out2[i*12 + 5] != in2[i*12 + 5] + 6 + || out2[i*12 + 6] != in2[i*12 + 6] + 7 + || out2[i*12 + 7] != in2[i*12 + 7] + 8 + || out2[i*12 + 8] != in2[i*12 + 8] + 9 + || out2[i*12 + 9] != in2[i*12 + 9] + 10 + || out2[i*12 + 10] != in2[i*12 + 10] + 11 + || out2[i*12 + 11] != in2[i*12 + 11] + 12) + abort (); + } + + + + return 0; +} + +int main (void) +{ + check_vect (); + + main1 (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { target { vect_interleave && vect_extract_even_odd } } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + diff --git a/gcc/testsuite/gcc.dg/vect/slp-14.c b/gcc/testsuite/gcc.dg/vect/slp-14.c new file mode 100644 index 00000000000..62610dc0233 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-14.c @@ -0,0 +1,118 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include <stdio.h> +#include "tree-vect.h" + +#define N 8 + +int +main1 (int n) +{ + int i; + unsigned int out[N*8], a0, a1, a2, a3, a4, a5, a6, a7, b1, b0, b2, b3, b4, b5, b6, b7; + unsigned int 
in[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}; + unsigned short in2[N*16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}; + unsigned short out2[N*16]; + + /* Multiple types are not SLPable yet. */ + for (i = 0; i < n; i++) + { + a0 = in[i*8] + 5; + a1 = in[i*8 + 1] + 6; + a2 = in[i*8 + 2] + 7; + a3 = in[i*8 + 3] + 8; + a4 = in[i*8 + 4] + 9; + a5 = in[i*8 + 5] + 10; + a6 = in[i*8 + 6] + 11; + a7 = in[i*8 + 7] + 12; + + b0 = a0 * 3; + b1 = a1 * 2; + b2 = a2 * 12; + b3 = a3 * 5; + b4 = a4 * 8; + b5 = a5 * 4; + b6 = a6 * 3; + b7 = a7 * 2; + + out[i*8] = b0 - 2; + out[i*8 + 1] = b1 - 3; + out[i*8 + 2] = b2 - 2; + out[i*8 + 3] = b3 - 1; + out[i*8 + 4] = b4 - 8; + out[i*8 + 5] = b5 - 7; + out[i*8 + 6] = b6 - 3; + out[i*8 + 7] = b7 - 7; + + out2[i*16] = in2[i*16] + 2; + out2[i*16 + 1] = in2[i*16 + 1] + 3; + out2[i*16 + 2] = in2[i*16 + 2] + 4; + out2[i*16 + 3] = in2[i*16 + 3] + 3; + out2[i*16 + 4] = in2[i*16 + 4] + 2; + out2[i*16 + 5] = in2[i*16 + 5] + 3; + out2[i*16 + 6] = in2[i*16 + 6] + 2; + out2[i*16 + 7] = in2[i*16 + 7] + 4; + out2[i*16 + 8] = in2[i*16 + 8] + 2; + out2[i*16 + 9] = in2[i*16 + 9] + 5; + out2[i*16 + 10] = in2[i*16 + 10] + 2; + out2[i*16 + 11] = in2[i*16 + 11] + 3; + out2[i*16 + 12] = in2[i*16 + 12] + 4; + out2[i*16 + 13] = in2[i*16 + 13] + 4; + out2[i*16 + 14] = in2[i*16 + 14] + 3; + out2[i*16 + 15] = in2[i*16 + 15] + 2; +} + + /* check results: */ + for (i = 0; i < n; i++) + { + if (out[i*8] != (in[i*8] + 5) * 3 - 2 + || out[i*8 + 1] != (in[i*8 + 1] + 6) * 2 - 3 + || out[i*8 + 2] != (in[i*8 + 2] + 7) * 12 - 2 + || out[i*8 + 3] != (in[i*8 + 3] + 8) * 5 - 1 + || out[i*8 + 4] != (in[i*8 + 4] + 9) * 8 - 8 + || out[i*8 + 5] != (in[i*8 + 5] + 10) * 4 - 7 + || out[i*8 + 6] != 
(in[i*8 + 6] + 11) * 3 - 3 + || out[i*8 + 7] != (in[i*8 + 7] + 12) * 2 - 7) + abort (); + + if (out2[i*16] != in2[i*16] + 2 + || out2[i*16 + 1] != in2[i*16 + 1] + 3 + || out2[i*16 + 2] != in2[i*16 + 2] + 4 + || out2[i*16 + 3] != in2[i*16 + 3] + 3 + || out2[i*16 + 4] != in2[i*16 + 4] + 2 + || out2[i*16 + 5] != in2[i*16 + 5] + 3 + || out2[i*16 + 6] != in2[i*16 + 6] + 2 + || out2[i*16 + 7] != in2[i*16 + 7] + 4 + || out2[i*16 + 8] != in2[i*16 + 8] + 2 + || out2[i*16 + 9] != in2[i*16 + 9] + 5 + || out2[i*16 + 10] != in2[i*16 + 10] + 2 + || out2[i*16 + 11] != in2[i*16 + 11] + 3 + || out2[i*16 + 12] != in2[i*16 + 12] + 4 + || out2[i*16 + 13] != in2[i*16 + 13] + 4 + || out2[i*16 + 14] != in2[i*16 + 14] + 3 + || out2[i*16 + 15] != in2[i*16 + 15] + 2) + abort (); + + } + + + + return 0; +} + +int main (void) +{ + check_vect (); + + main1 (N); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { vect_strided && vect_int_mult } } } } */ +/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" {target { ! 
{ vect_strided && vect_int_mult } } } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + diff --git a/gcc/testsuite/gcc.dg/vect/slp-15.c b/gcc/testsuite/gcc.dg/vect/slp-15.c new file mode 100644 index 00000000000..6f04e6a3784 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-15.c @@ -0,0 +1,117 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include <stdio.h> +#include "tree-vect.h" + +#define N 8 + +int +main1 (int n) +{ + int i; + unsigned int out[N*8], a0, a1, a2, a3, a4, a5, a6, a7, b1, b0, b2, b3, b4, b5, b6, b7; + unsigned int in[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}; + unsigned int in2[N*16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}; + unsigned int out2[N*16]; + + for (i = 0; i < n; i++) + { + a0 = in[i*8] + 5; + a1 = in[i*8 + 1] + 6; + a2 = in[i*8 + 2] + 7; + a3 = in[i*8 + 3] + 8; + a4 = in[i*8 + 4] + 9; + a5 = in[i*8 + 5] + 10; + a6 = in[i*8 + 6] + 11; + a7 = in[i*8 + 7] + 12; + + b0 = a0 * 3; + b1 = a1 * 2; + b2 = a2 * 12; + b3 = a3 * 5; + b4 = a4 * 8; + b5 = a5 * 4; + b6 = a6 * 3; + b7 = a7 * 2; + + out[i*8] = b0 - 2; + out[i*8 + 1] = b1 - 3; + out[i*8 + 2] = b2 - 2; + out[i*8 + 3] = b3 - 1; + out[i*8 + 4] = b4 - 8; + out[i*8 + 5] = b5 - 7; + out[i*8 + 6] = b6 - 3; + out[i*8 + 7] = b7 - 7; + + out2[i*16] = in2[i*16] * 2; + out2[i*16 + 1] = in2[i*16 + 1] * 3; + out2[i*16 + 2] = in2[i*16 + 2] * 4; + out2[i*16 + 3] = in2[i*16 + 3] * 3; + out2[i*16 + 4] = in2[i*16 + 4] * 2; + out2[i*16 + 5] = in2[i*16 + 5] * 3; + out2[i*16 + 6] = in2[i*16 + 6] * 2; + 
out2[i*16 + 7] = in2[i*16 + 7] * 4; + out2[i*16 + 8] = in2[i*16 + 8] * 2; + out2[i*16 + 9] = in2[i*16 + 9] * 5; + out2[i*16 + 10] = in2[i*16 + 10] * 2; + out2[i*16 + 11] = in2[i*16 + 11] * 3; + out2[i*16 + 12] = in2[i*16 + 12] * 4; + out2[i*16 + 13] = in2[i*16 + 13] * 4; + out2[i*16 + 14] = in2[i*16 + 14] * 3; + out2[i*16 + 15] = in2[i*16 + 15] * 2; +} + + /* check results: */ + for (i = 0; i < n; i++) + { + if (out[i*8] != (in[i*8] + 5) * 3 - 2 + || out[i*8 + 1] != (in[i*8 + 1] + 6) * 2 - 3 + || out[i*8 + 2] != (in[i*8 + 2] + 7) * 12 - 2 + || out[i*8 + 3] != (in[i*8 + 3] + 8) * 5 - 1 + || out[i*8 + 4] != (in[i*8 + 4] + 9) * 8 - 8 + || out[i*8 + 5] != (in[i*8 + 5] + 10) * 4 - 7 + || out[i*8 + 6] != (in[i*8 + 6] + 11) * 3 - 3 + || out[i*8 + 7] != (in[i*8 + 7] + 12) * 2 - 7) + abort (); + + if (out2[i*16] != in2[i*16] * 2 + || out2[i*16 + 1] != in2[i*16 + 1] * 3 + || out2[i*16 + 2] != in2[i*16 + 2] * 4 + || out2[i*16 + 3] != in2[i*16 + 3] * 3 + || out2[i*16 + 4] != in2[i*16 + 4] * 2 + || out2[i*16 + 5] != in2[i*16 + 5] * 3 + || out2[i*16 + 6] != in2[i*16 + 6] * 2 + || out2[i*16 + 7] != in2[i*16 + 7] * 4 + || out2[i*16 + 8] != in2[i*16 + 8] * 2 + || out2[i*16 + 9] != in2[i*16 + 9] * 5 + || out2[i*16 + 10] != in2[i*16 + 10] * 2 + || out2[i*16 + 11] != in2[i*16 + 11] * 3 + || out2[i*16 + 12] != in2[i*16 + 12] * 4 + || out2[i*16 + 13] != in2[i*16 + 13] * 4 + || out2[i*16 + 14] != in2[i*16 + 14] * 3 + || out2[i*16 + 15] != in2[i*16 + 15] * 2) + abort (); + + } + + + + return 0; +} + +int main (void) +{ + check_vect (); + + main1 (N); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" {target vect_int_mult } } } */ +/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" {target { ! { vect_int_mult } } } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" {target vect_int_mult } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" {target { ! 
{ vect_int_mult } } } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + diff --git a/gcc/testsuite/gcc.dg/vect/slp-16.c b/gcc/testsuite/gcc.dg/vect/slp-16.c new file mode 100644 index 00000000000..cbc47cd1f2b --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-16.c @@ -0,0 +1,70 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include <stdio.h> +#include "tree-vect.h" + +#define N 8 + +int +main1 () +{ + int i; + unsigned int out[N*8], a0, a1, a2, a3, a4, a5, a6, a7, b1, b0, b2, b3, b4, b5, b6, b7; + unsigned int in[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}; + unsigned int in2[N*16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}; + unsigned int out2[N*16]; + + /* SLP group of size that is not a multiple of vector size. + Unrolling by 2. 
*/ + for (i = 0; i < N; i++) + { + a0 = in[i*2] + 5; + a1 = in[i*2 + 1] + 6; + + b0 = a0 * 3; + b1 = a1 * 2; + + out[i*2] = b0 - 2; + out[i*2 + 1] = b1 - 3; + + out2[i*6] = in2[i*6] * 2; + out2[i*6 + 1] = in2[i*6 + 1] * 3; + out2[i*6 + 2] = in2[i*6 + 2] * 4; + out2[i*6 + 3] = in2[i*6 + 3] * 2; + out2[i*6 + 4] = in2[i*6 + 4] * 4; + out2[i*6 + 5] = in2[i*6 + 5] * 3; + } + + /* check results: */ + for (i = 0; i < N; i++) + { + if (out[i*2] != (in[i*2] + 5) * 3 - 2 + || out[i*2 + 1] != (in[i*2 + 1] + 6) * 2 - 3 + || out2[i*6] != in2[i*6] * 2 + || out2[i*6 + 1] != in2[i*6 + 1] * 3 + || out2[i*6 + 2] != in2[i*6 + 2] * 4 + || out2[i*6 + 3] != in2[i*6 + 3] * 2 + || out2[i*6 + 4] != in2[i*6 + 4] * 4 + || out2[i*6 + 5] != in2[i*6 + 5] * 3) + abort (); + } + + + + return 0; +} + +int main (void) +{ + check_vect (); + + main1 (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_int_mult } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target vect_int_mult } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + diff --git a/gcc/testsuite/gcc.dg/vect/slp-17.c b/gcc/testsuite/gcc.dg/vect/slp-17.c new file mode 100644 index 00000000000..0a760bf2c68 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-17.c @@ -0,0 +1,56 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include <stdio.h> +#include "tree-vect.h" + +#define N 8 + +int +main1 () +{ + int i; + unsigned short out[N*8]; + unsigned short in[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}; + unsigned short in2[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}; + unsigned short out2[N*8]; + + for (i = 0; i < N*2; i++) + { + 
out[i*2] = in[i*2] + 5; + out[i*2 + 1] = in[i*2 + 1] + 6; + + out2[i*4] = in2[i*4] + 2; + out2[i*4 + 1] = in2[i*4 + 1] + 2; + out2[i*4 + 2] = in2[i*4 + 2] + 1; + out2[i*4 + 3] = in2[i*4 + 3] + 3; + } + + /* check results: */ + for (i = 0; i < N*2; i++) + { + if (out[i*2] != in[i*2] + 5 + || out[i*2 + 1] != in[i*2 + 1] + 6 + || out2[i*4] != in2[i*4] + 2 + || out2[i*4 + 1] != in2[i*4 + 1] + 2 + || out2[i*4 + 2] != in2[i*4 + 2] + 1 + || out2[i*4 + 3] != in2[i*4 + 3] + 3) + abort (); + } + + return 0; +} + +int main (void) +{ + check_vect (); + + main1 (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + diff --git a/gcc/testsuite/gcc.dg/vect/slp-18.c b/gcc/testsuite/gcc.dg/vect/slp-18.c new file mode 100644 index 00000000000..b8e122c6cfa --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-18.c @@ -0,0 +1,97 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include <stdio.h> +#include "tree-vect.h" + +#define N 8 + +int +main1 () +{ + int i; + unsigned int out[N*8], a0, a1, a2, a3, a4, a5, a6, a7, b1, b0, b2, b3, b4, b5, b6, b7; + unsigned int in[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}; + float out2[N*8]; + + for (i = 0; i < N; i++) + { + + a0 = in[i*8] + 5; + a1 = in[i*8 + 1] + 6; + a2 = in[i*8 + 2] + 7; + a3 = in[i*8 + 3] + 8; + a4 = in[i*8 + 4] + 9; + a5 = in[i*8 + 5] + 10; + a6 = in[i*8 + 6] + 11; + a7 = in[i*8 + 7] + 12; + + b0 = a0 * 3; + b1 = a1 * 2; + b2 = a2 * 12; + b3 = a3 * 5; + b4 = a4 * 8; + b5 = a5 * 4; + b6 = a6 * 3; + b7 = a7 * 2; + + out[i*8] = b0 - 2; + out[i*8 + 1] = b1 - 3; + out[i*8 + 2] = b2 - 2; + out[i*8 + 3] = b3 - 1; + out[i*8 + 4] = b4 - 8; + out[i*8 + 5] = b5 - 7; + out[i*8 + 6] = b6 - 3; + 
out[i*8 + 7] = b7 - 7; + + + out2[i*8] = (float) b0; + out2[i*8 + 1] = (float) b1; + out2[i*8 + 2] = (float) b2; + out2[i*8 + 3] = (float) b3; + out2[i*8 + 4] = (float) b4; + out2[i*8 + 5] = (float) b5; + out2[i*8 + 6] = (float) b6; + out2[i*8 + 7] = (float) b7; + } + + /* check results: */ + for (i = 0; i < N; i++) + { + if (out[i*8] != (in[i*8] + 5) * 3 - 2 + || out[i*8 + 1] != (in[i*8 + 1] + 6) * 2 - 3 + || out[i*8 + 2] != (in[i*8 + 2] + 7) * 12 - 2 + || out[i*8 + 3] != (in[i*8 + 3] + 8) * 5 - 1 + || out[i*8 + 4] != (in[i*8 + 4] + 9) * 8 - 8 + || out[i*8 + 5] != (in[i*8 + 5] + 10) * 4 - 7 + || out[i*8 + 6] != (in[i*8 + 6] + 11) * 3 - 3 + || out[i*8 + 7] != (in[i*8 + 7] + 12) * 2 - 7) + abort (); + + if (out2[i*8] != (float) ((in[i*8] + 5) * 3) + || out2[i*8 + 1] != (float) ((in[i*8 + 1] + 6) * 2) + || out2[i*8 + 2] != (float) ((in[i*8 + 2] + 7) * 12) + || out2[i*8 + 3] != (float) ((in[i*8 + 3] + 8) * 5) + || out2[i*8 + 4] != (float) ((in[i*8 + 4] + 9) * 8) + || out2[i*8 + 5] != (float) ((in[i*8 + 5] + 10) * 4) + || out2[i*8 + 6] != (float) ((in[i*8 + 6] + 11) * 3) + || out2[i*8 + 7] != (float) ((in[i*8 + 7] + 12) * 2)) + abort (); + } + + return 0; +} + +int main (void) +{ + check_vect (); + + main1 (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { vect_strided } } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target { vect_strided } } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + diff --git a/gcc/testsuite/gcc.dg/vect/slp-19.c b/gcc/testsuite/gcc.dg/vect/slp-19.c new file mode 100644 index 00000000000..d9a68cd69d4 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-19.c @@ -0,0 +1,155 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include <stdio.h> +#include "tree-vect.h" + +#define N 16 + +int +main1 () +{ + unsigned int i; + unsigned int out[N*8]; + unsigned int in[N*8] = 
{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}; + unsigned int ia[N*2], a0, a1, a2, a3; + + for (i = 0; i < N; i++) + { + out[i*8] = in[i*8]; + out[i*8 + 1] = in[i*8 + 1]; + out[i*8 + 2] = in[i*8 + 2]; + out[i*8 + 3] = in[i*8 + 3]; + out[i*8 + 4] = in[i*8 + 4]; + out[i*8 + 5] = in[i*8 + 5]; + out[i*8 + 6] = in[i*8 + 6]; + out[i*8 + 7] = in[i*8 + 7]; + + ia[i] = in[i*8 + 2]; + } + + /* check results: */ + for (i = 0; i < N; i++) + { + if (out[i*8] != in[i*8] + || out[i*8 + 1] != in[i*8 + 1] + || out[i*8 + 2] != in[i*8 + 2] + || out[i*8 + 3] != in[i*8 + 3] + || out[i*8 + 4] != in[i*8 + 4] + || out[i*8 + 5] != in[i*8 + 5] + || out[i*8 + 6] != in[i*8 + 6] + || out[i*8 + 7] != in[i*8 + 7] + || ia[i] != in[i*8 + 2]) + abort (); + } + + for (i = 0; i < N*2; i++) + { + a0 = in[i*4] + 1; + a1 = in[i*4 + 1] + 2; + a2 = in[i*4 + 2] + 3; + a3 = in[i*4 + 3] + 4; + + out[i*4] = a0; + out[i*4 + 1] = a1; + out[i*4 + 2] = a2; + out[i*4 + 3] = a3; + + ia[i] = a2; + } + + /* check results: */ + for (i = 0; i < N*2; i++) + { + if (out[i*4] != in[i*4] + 1 + || out[i*4 + 1] != in[i*4 + 1] + 2 + || out[i*4 + 2] != in[i*4 + 2] + 3 + || out[i*4 + 3] != in[i*4 + 3] + 4 + || ia[i] != in[i*4 + 2] + 3) + abort (); + } + + /* The last stmt requires interleaving of not power of 2 size - not + vectorizable. 
*/ + for (i = 0; i < N/2; i++) + { + out[i*12] = in[i*12]; + out[i*12 + 1] = in[i*12 + 1]; + out[i*12 + 2] = in[i*12 + 2]; + out[i*12 + 3] = in[i*12 + 3]; + out[i*12 + 4] = in[i*12 + 4]; + out[i*12 + 5] = in[i*12 + 5]; + out[i*12 + 6] = in[i*12 + 6]; + out[i*12 + 7] = in[i*12 + 7]; + out[i*12 + 8] = in[i*12 + 8]; + out[i*12 + 9] = in[i*12 + 9]; + out[i*12 + 10] = in[i*12 + 10]; + out[i*12 + 11] = in[i*12 + 11]; + + ia[i] = in[i*12 + 7]; + } + + /* check results: */ + for (i = 0; i < N/2; i++) + { + if (out[i*12] != in[i*12] + || out[i*12 + 1] != in[i*12 + 1] + || out[i*12 + 2] != in[i*12 + 2] + || out[i*12 + 3] != in[i*12 + 3] + || out[i*12 + 4] != in[i*12 + 4] + || out[i*12 + 5] != in[i*12 + 5] + || out[i*12 + 6] != in[i*12 + 6] + || out[i*12 + 7] != in[i*12 + 7] + || out[i*12 + 8] != in[i*12 + 8] + || out[i*12 + 9] != in[i*12 + 9] + || out[i*12 + 10] != in[i*12 + 10] + || out[i*12 + 11] != in[i*12 + 11] + || ia[i] != in[i*12 + 7]) + abort (); + } + + /* Hybrid SLP with unrolling by 2. */ + for (i = 0; i < N; i++) + { + out[i*6] = in[i*6]; + out[i*6 + 1] = in[i*6 + 1]; + out[i*6 + 2] = in[i*6 + 2]; + out[i*6 + 3] = in[i*6 + 3]; + out[i*6 + 4] = in[i*6 + 4]; + out[i*6 + 5] = in[i*6 + 5]; + + ia[i] = i; + } + + /* check results: */ + for (i = 0; i < N/2; i++) + { + if (out[i*6] != in[i*6] + || out[i*6 + 1] != in[i*6 + 1] + || out[i*6 + 2] != in[i*6 + 2] + || out[i*6 + 3] != in[i*6 + 3] + || out[i*6 + 4] != in[i*6 + 4] + || out[i*6 + 5] != in[i*6 + 5] + || ia[i] != i) + abort (); + } + + + return 0; +} + +int main (void) +{ + check_vect (); + + main1 (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { target vect_strided } } } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { ! 
{ vect_strided } } } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { target vect_strided } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { ! { vect_strided } } } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + diff --git a/gcc/testsuite/gcc.dg/vect/slp-2.c b/gcc/testsuite/gcc.dg/vect/slp-2.c new file mode 100644 index 00000000000..2731747dbcf --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-2.c @@ -0,0 +1,146 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include <stdio.h> +#include "tree-vect.h" + +#define N 128 + +int +main1 (unsigned short a0, unsigned short a1, unsigned short a2, + unsigned short a3, unsigned short a4, unsigned short a5, + unsigned short a6, unsigned short a7, unsigned short a8, + unsigned short a9, unsigned short a10, unsigned short a11, + unsigned short a12, unsigned short a13, unsigned short a14, + unsigned short a15) +{ + int i; + unsigned short out[N*16]; + + for (i = 0; i < N; i++) + { + out[i*4] = a8; + out[i*4 + 1] = a1; + out[i*4 + 2] = a2; + out[i*4 + 3] = a3; + } + + /* check results: */ + for (i = 0; i < N; i++) + { + if (out[i*4] != a8 + || out[i*4 + 1] != a1 + || out[i*4 + 2] != a2 + || out[i*4 + 3] != a3) + abort (); + } + + for (i = 0; i < N; i++) + { + out[i*16] = a8; + out[i*16 + 1] = a7; + out[i*16 + 2] = a1; + out[i*16 + 3] = a2; + out[i*16 + 4] = a8; + out[i*16 + 5] = a5; + out[i*16 + 6] = a5; + out[i*16 + 7] = a4; + out[i*16 + 8] = a12; + out[i*16 + 9] = a13; + out[i*16 + 10] = a14; + out[i*16 + 11] = a15; + out[i*16 + 12] = a6; + out[i*16 + 13] = a9; + out[i*16 + 14] = a0; + out[i*16 + 15] = a7; + } + + /* check results: */ + for (i = 0; i < N; i++) + { + if (out[i*16] != a8 + || out[i*16 + 1] != a7 + || out[i*16 + 2] != a1 + || out[i*16 + 3] != a2 + || out[i*16 + 4] != a8 + || out[i*16 + 5] != a5 + || out[i*16 + 6] != a5 + || out[i*16 + 7] != a4 + || out[i*16 + 8] != a12 + || out[i*16 + 9] != a13 + 
|| out[i*16 + 10] != a14 + || out[i*16 + 11] != a15 + || out[i*16 + 12] != a6 + || out[i*16 + 13] != a9 + || out[i*16 + 14] != a0 + || out[i*16 + 15] != a7) + abort (); + } + + /* SLP with unrolling by 8. */ + for (i = 0; i < N; i++) + { + out[i*3] = a8; + out[i*3 + 1] = a1; + out[i*3 + 2] = a2; + } + + /* check results: */ + for (i = 0; i < N; i++) + { + if (out[i*3] != a8 + || out[i*3 + 1] != a1 + || out[i*3 + 2] != a2) + abort (); + } + + /* SLP with unrolling by 8. */ + for (i = 0; i < N; i++) + { + out[i*11] = a8; + out[i*11 + 1] = a7; + out[i*11 + 2] = a1; + out[i*11 + 3] = a2; + out[i*11 + 4] = a8; + out[i*11 + 5] = a5; + out[i*11 + 6] = a5; + out[i*11 + 7] = a4; + out[i*11 + 8] = a12; + out[i*11 + 9] = a13; + out[i*11 + 10] = a14; + } + + /* check results: */ + for (i = 0; i < N; i++) + { + if (out[i*11] != a8 + || out[i*11 + 1] != a7 + || out[i*11 + 2] != a1 + || out[i*11 + 3] != a2 + || out[i*11 + 4] != a8 + || out[i*11 + 5] != a5 + || out[i*11 + 6] != a5 + || out[i*11 + 7] != a4 + || out[i*11 + 8] != a12 + || out[i*11 + 9] != a13 + || out[i*11 + 10] != a14) + abort (); + } + + + return 0; +} + +int main (void) +{ + check_vect (); + + main1 (15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 4 loops" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + diff --git a/gcc/testsuite/gcc.dg/vect/slp-20.c b/gcc/testsuite/gcc.dg/vect/slp-20.c new file mode 100644 index 00000000000..86d3927a42c --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-20.c @@ -0,0 +1,116 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include <stdio.h> +#include "tree-vect.h" + +#define N 128 + +int +main1 (unsigned short a0, unsigned short a1, unsigned short a2, + unsigned short a3, unsigned short a4, unsigned short a5, + unsigned short a6, unsigned short a7, unsigned short a8) +{ + int i; + unsigned short 
out[N*8], out2[N*8], b0, b1, b2, b3, b4, b5, b6, b7, b8; + + for (i = 0; i < N; i++) + { + b0 = a0 + 8; + b1 = a1 + 7; + b2 = a2 + 6; + b3 = a3 + 5; + b4 = a4 + 4; + b5 = a5 + 3; + + out[i*4] = b0; + out[i*4 + 1] = b1; + out[i*4 + 2] = b2; + out[i*4 + 3] = b3; + + out2[i*4] = b0; + out2[i*4 + 1] = b1; + out2[i*4 + 2] = b4; + out2[i*4 + 3] = b5; + } + + /* check results: */ + for (i = 0; i < N; i++) + { + if (out[i*4] != b0 + || out[i*4 + 1] != b1 + || out[i*4 + 2] != b2 + || out[i*4 + 3] != b3) + abort (); + + if (out2[i*4] != b0 + || out2[i*4 + 1] != b1 + || out2[i*4 + 2] != b4 + || out2[i*4 + 3] != b5) + abort (); + } + + for (i = 0; i < N; i++) + { + b0 = a0 + 8; + b1 = a1 + 7; + b2 = a2 + 6; + b3 = a3 + 5; + b4 = a4 + 4; + b5 = a5 + 3; + b6 = a6 + 2; + b7 = a7 + 1; + b8 = a8 + 9; + + out[i*4] = b0; + out[i*4 + 1] = b1; + out[i*4 + 2] = b2; + out[i*4 + 3] = b3; + + out2[i*8] = b0; + out2[i*8 + 1] = b1; + out2[i*8 + 2] = b4; + out2[i*8 + 3] = b5; + out2[i*8 + 4] = b6; + out2[i*8 + 5] = b2; + out2[i*8 + 6] = b7; + out2[i*8 + 7] = b8; + } + + /* check results: */ + for (i = 0; i < N; i++) + { + if (out[i*4] != b0 + || out[i*4 + 1] != b1 + || out[i*4 + 2] != b2 + || out[i*4 + 3] != b3) + abort (); + + if (out2[i*8] != b0 + || out2[i*8 + 1] != b1 + || out2[i*8 + 2] != b4 + || out2[i*8 + 3] != b5 + || out2[i*8 + 4] != b6 + || out2[i*8 + 5] != b2 + || out2[i*8 + 6] != b7 + || out2[i*8 + 7] != b8) + abort (); + } + + + return 0; +} + +int main (void) +{ + check_vect (); + + main1 (8,7,6,5,4,3,2,1,0); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + diff --git a/gcc/testsuite/gcc.dg/vect/slp-21.c b/gcc/testsuite/gcc.dg/vect/slp-21.c new file mode 100644 index 00000000000..327045e4789 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-21.c @@ -0,0 +1,208 @@ +/* { dg-require-effective-target 
vect_int } */ + +#include <stdarg.h> +#include <stdio.h> +#include "tree-vect.h" + +#define N 128 + +int +main1 () +{ + unsigned short i; + unsigned short out[N*8], out2[N*8], b0, b1, b2, b3, b4, a0, a1, a2, a3, b5; + unsigned short in[N*8]; + + for (i = 0; i < N*8; i++) + { + in[i] = i; + } + + /* Different operations in both cases - vectorization with interleaving. */ + for (i = 0; i < N; i++) + { + a0 = in[i*4]; + a1 = in[i*4 + 1]; + a2 = in[i*4 + 2]; + a3 = in[i*4 + 3]; + + b0 = a0 * 8; + b1 = a1 + 7; + b2 = a2 + 6; + b3 = a3 * 5; + + b4 = a2 + 4; + b5 = a3 + 3; + + out[i*4] = b0; + out[i*4 + 1] = b1; + out[i*4 + 2] = b2; + out[i*4 + 3] = b3; + + out2[i*4] = b0; + out2[i*4 + 1] = b1; + out2[i*4 + 2] = b4; + out2[i*4 + 3] = b5; + } + + /* check results: */ + for (i = 0; i < N; i++) + { + a0 = in[i*4]; + a1 = in[i*4 + 1]; + a2 = in[i*4 + 2]; + a3 = in[i*4 + 3]; + + b0 = a0 * 8; + b1 = a1 + 7; + b2 = a2 + 6; + b3 = a3 * 5; + + b4 = a2 + 4; + b5 = a3 + 3; + + if (out[i*4] != b0 + || out[i*4 + 1] != b1 + || out[i*4 + 2] != b2 + || out[i*4 + 3] != b3) + abort (); + + if (out2[i*4] != b0 + || out2[i*4 + 1] != b1 + || out2[i*4 + 2] != b4 + || out2[i*4 + 3] != b5) + abort (); + } + + /* Different operations in the first case - vectorization with interleaving. 
*/ + for (i = 0; i < N; i++) + { + a0 = in[i*4]; + a1 = in[i*4 + 1]; + a2 = in[i*4 + 2]; + a3 = in[i*4 + 3]; + + b0 = a0 + 8; + b1 = a1 + 7; + b2 = a2 + 6; + b3 = a3 * 5; + + b4 = a2 + 4; + b5 = a3 + 3; + + out[i*4] = b0; + out[i*4 + 1] = b1; + out[i*4 + 2] = b2; + out[i*4 + 3] = b3; + + out2[i*4] = b0; + out2[i*4 + 1] = b1; + out2[i*4 + 2] = b4; + out2[i*4 + 3] = b5; + } + + /* check results: */ + for (i = 0; i < N; i++) + { + a0 = in[i*4]; + a1 = in[i*4 + 1]; + a2 = in[i*4 + 2]; + a3 = in[i*4 + 3]; + + b0 = a0 + 8; + b1 = a1 + 7; + b2 = a2 + 6; + b3 = a3 * 5; + + b4 = a2 + 4; + b5 = a3 + 3; + + if (out[i*4] != b0 + || out[i*4 + 1] != b1 + || out[i*4 + 2] != b2 + || out[i*4 + 3] != b3) + abort (); + + if (out2[i*4] != b0 + || out2[i*4 + 1] != b1 + || out2[i*4 + 2] != b4 + || out2[i*4 + 3] != b5) + abort (); + } + + + /* Different operations in the second case - vectorization with interleaving. */ + for (i = 0; i < N; i++) + { + a0 = in[i*4]; + a1 = in[i*4 + 1]; + a2 = in[i*4 + 2]; + a3 = in[i*4 + 3]; + + b0 = a0 + 8; + b1 = a1 + 7; + b2 = a2 + 6; + b3 = a3 + 5; + + b4 = a2 * 4; + b5 = a3 + 3; + + out[i*4] = b0; + out[i*4 + 1] = b1; + out[i*4 + 2] = b2; + out[i*4 + 3] = b3; + + out2[i*4] = b0; + out2[i*4 + 1] = b1; + out2[i*4 + 2] = b4; + out2[i*4 + 3] = b5; + } + + /* check results: */ + for (i = 0; i < N; i++) + { + a0 = in[i*4]; + a1 = in[i*4 + 1]; + a2 = in[i*4 + 2]; + a3 = in[i*4 + 3]; + + b0 = a0 + 8; + b1 = a1 + 7; + b2 = a2 + 6; + b3 = a3 + 5; + + b4 = a2 * 4; + b5 = a3 + 3; + + if (out[i*4] != b0 + || out[i*4 + 1] != b1 + || out[i*4 + 2] != b2 + || out[i*4 + 3] != b3) + abort (); + + if (out2[i*4] != b0 + || out2[i*4 + 1] != b1 + || out2[i*4 + 2] != b4 + || out2[i*4 + 3] != b5) + abort (); + } + + + return 0; +} + +int main (void) +{ + check_vect (); + + main1 (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 4 loops" 1 "vect" { target vect_strided } } } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target 
{ ! { vect_strided } } } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target vect_strided } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target { ! { vect_strided } } } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + diff --git a/gcc/testsuite/gcc.dg/vect/slp-22.c b/gcc/testsuite/gcc.dg/vect/slp-22.c new file mode 100644 index 00000000000..18df4269a6c --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-22.c @@ -0,0 +1,135 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include <stdio.h> +#include "tree-vect.h" + +#define N 128 + +int +main1 (unsigned short a0, unsigned short a1, unsigned short a2, + unsigned short a3, unsigned short a4, unsigned short a5, + unsigned short a6, unsigned short a7, unsigned short a8) +{ + int i; + unsigned short out[N*8], out2[N*8], out3[N*8], b0, b1, b2, b3, b4, b5, b6, b7, b8; + + for (i = 0; i < N; i++) + { + b0 = a0 + 8; + b1 = a1 + 7; + b2 = a2 + 6; + b3 = a3 + 5; + b4 = a4 + 4; + b5 = a5 + 3; + + out[i*4] = b0; + out[i*4 + 1] = b1; + out[i*4 + 2] = b2; + out[i*4 + 3] = b3; + + out2[i*4] = b0; + out2[i*4 + 1] = b1; + out2[i*4 + 2] = b4; + out2[i*4 + 3] = b5; + + out3[i*4] = b2; + out3[i*4 + 1] = b1; + out3[i*4 + 2] = b4; + out3[i*4 + 3] = b5; + } + + /* check results: */ + for (i = 0; i < N; i++) + { + if (out[i*4] != b0 + || out[i*4 + 1] != b1 + || out[i*4 + 2] != b2 + || out[i*4 + 3] != b3) + abort (); + + + if (out2[i*4] != b0 + || out2[i*4 + 1] != b1 + || out2[i*4 + 2] != b4 + || out2[i*4 + 3] != b5) + abort (); + + if (out3[i*4] != b2 + || out3[i*4 + 1] != b1 + || out3[i*4 + 2] != b4 + || out3[i*4 + 3] != b5) + abort (); + } + + for (i = 0; i < N; i++) + { + b0 = a0 + 8; + b1 = a1 + 7; + b2 = a2 + 6; + b3 = a3 + 5; + b4 = a4 + 4; + b5 = a5 + 3; + b6 = a6 + 2; + b7 = a7 + 1; + b8 = a8 + 9; + + out[i*4] = b0; + out[i*4 + 1] = b1; + out[i*4 + 2] = b2; + out[i*4 + 3] = b3; + + out2[i*8] = b0; + out2[i*8 + 1] = b1; 
+ out2[i*8 + 2] = b4; + out2[i*8 + 3] = b5; + out2[i*8 + 4] = b6; + out2[i*8 + 5] = b2; + out2[i*8 + 6] = b7; + out2[i*8 + 7] = b8; + + out3[2*i + 1] = a0; + out3[2*i] = b8; + } + + /* check results: */ + for (i = 0; i < N; i++) + { + if (out[i*4] != b0 + || out[i*4 + 1] != b1 + || out[i*4 + 2] != b2 + || out[i*4 + 3] != b3) + abort (); + + if (out2[i*8] != b0 + || out2[i*8 + 1] != b1 + || out2[i*8 + 2] != b4 + || out2[i*8 + 3] != b5 + || out2[i*8 + 4] != b6 + || out2[i*8 + 5] != b2 + || out2[i*8 + 6] != b7 + || out2[i*8 + 7] != b8) + abort (); + + if (out3[2*i] != b8 + || out3[2*i+1] != a0) + abort(); + } + + + return 0; +} + +int main (void) +{ + check_vect (); + + main1 (8,7,6,5,4,3,2,1,0); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 6 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + diff --git a/gcc/testsuite/gcc.dg/vect/slp-23.c b/gcc/testsuite/gcc.dg/vect/slp-23.c new file mode 100644 index 00000000000..2bba580271d --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-23.c @@ -0,0 +1,113 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include "tree-vect.h" + +#define N 128 + +typedef struct { + int a; + int b; + int c; + int d; + int e; + int f; + int g; + int h; +} s; + +int +main1 (s *arr) +{ + int i; + s *ptr = arr; + s res[N]; + + for (i = 0; i < N; i++) + { + res[i].c = ptr->c + ptr->c; + res[i].a = ptr->a + ptr->a; + res[i].d = ptr->d + ptr->d; + res[i].b = ptr->b + ptr->b; + res[i].f = ptr->f + ptr->f; + res[i].e = ptr->e + ptr->e; + res[i].h = ptr->h + ptr->h; + res[i].g = ptr->g + ptr->g; + ptr++; + } + + /* check results: */ + for (i = 0; i < N; i++) + { + if (res[i].c != arr[i].c + arr[i].c + || res[i].a != arr[i].a + arr[i].a + || res[i].d != arr[i].d + arr[i].d + || res[i].b != arr[i].b + arr[i].b + || res[i].f != arr[i].f + arr[i].f + || res[i].e != arr[i].e + arr[i].e + || res[i].h != 
arr[i].h + arr[i].h + || res[i].g != arr[i].g + arr[i].g) + abort(); + } + + ptr = arr; + for (i = 0; i < N; i++) + { + res[i].c = ptr->c + ptr->c; + res[i].a = ptr->a + ptr->a; + res[i].d = ptr->d + ptr->d; + res[i].b = ptr->b + ptr->b; + res[i].f = ptr->f + ptr->f; + res[i].e = ptr->e + ptr->e; + res[i].h = ptr->e + ptr->e; + res[i].g = ptr->g + ptr->g; + ptr++; + } + + /* check results: */ + for (i = 0; i < N; i++) + { + if (res[i].c != arr[i].c + arr[i].c + || res[i].a != arr[i].a + arr[i].a + || res[i].d != arr[i].d + arr[i].d + || res[i].b != arr[i].b + arr[i].b + || res[i].f != arr[i].f + arr[i].f + || res[i].e != arr[i].e + arr[i].e + || res[i].h != arr[i].e + arr[i].e + || res[i].g != arr[i].g + arr[i].g) + abort(); + } + +} + +int main (void) +{ + int i; + s arr[N]; + + check_vect (); + + for (i = 0; i < N; i++) + { + arr[i].a = i; + arr[i].b = i * 2; + arr[i].c = 17; + arr[i].d = i+34; + arr[i].e = i * 3 + 5; + arr[i].f = i * 5; + arr[i].g = i - 3; + arr[i].h = 56; + if (arr[i].a == 178) + abort(); + } + + main1 (arr); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { target { vect_strided } && {! { vect_no_align} } } } } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { ! 
{ vect_strided || vect_no_align} } } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail vect_no_align } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + diff --git a/gcc/testsuite/gcc.dg/vect/slp-24.c b/gcc/testsuite/gcc.dg/vect/slp-24.c new file mode 100644 index 00000000000..b3bf0735b02 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-24.c @@ -0,0 +1,82 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include <stdio.h> +#include "tree-vect.h" + +#define N 16 +#define DIFF 242 + +typedef struct { + unsigned char a; + unsigned char b; + unsigned char c; + unsigned char d; +} s; + +void +main1 (unsigned char x, unsigned char max_result, unsigned char min_result, s *arr) +{ + int i; + unsigned char ub[N*2] = {1,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,1,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45}; + unsigned char uc[N] = {1,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; + unsigned char udiff = 2; + unsigned char umax = x; + unsigned char umin = x; + unsigned char ua1[N*2]; + s *pIn = arr; + s out[N]; + + for (i = 0; i < N; i++) { + udiff += (unsigned char)(ub[i] - uc[i]); + + ua1[2*i+1] = ub[2*i+1]; + ua1[2*i] = ub[2*i]; + + out[i].d = pIn->d - 1; + out[i].b = pIn->b - 4; + out[i].c = pIn->c - 8; + out[i].a = pIn->a - 3; + + pIn++; + } + + for (i = 0; i < N; i++) { + if (ua1[2*i] != ub[2*i] + || ua1[2*i+1] != ub[2*i+1] + || out[i].a != arr[i].a - 3 + || out[i].b != arr[i].b - 4 + || out[i].c != arr[i].c - 8 + || out[i].d != arr[i].d - 1) + abort(); + } + + /* check results: */ + if (udiff != DIFF) + abort (); +} + +int main (void) +{ + int i; + s arr[N]; + + for (i = 0; i < N; i++) + { + arr[i].a = i + 9; + arr[i].b = i * 2 + 10; + arr[i].c = 17; + arr[i].d = i+34; + if (arr[i].a == 178) + abort(); + } + check_vect (); + + main1 (100, 100, 1, arr); + main1 (0, 15, 0, arr); + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail vect_no_align } } } */ +/* { 
dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { xfail vect_no_align } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/slp-25.c b/gcc/testsuite/gcc.dg/vect/slp-25.c new file mode 100644 index 00000000000..21f1900dc20 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-25.c @@ -0,0 +1,59 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include "tree-vect.h" + +#define N 128 + +/* Unaligned stores. */ + +int main1 (int n) +{ + int i; + int ia[N+1]; + short sa[N+1]; + + for (i = 1; i <= N/2; i++) + { + ia[2*i] = 25; + ia[2*i + 1] = 5; + } + + /* check results: */ + for (i = 1; i <= N/2; i++) + { + if (ia[2*i] != 25 + || ia[2*i + 1] != 5) + abort (); + } + + for (i = 1; i <= n/2; i++) + { + sa[2*i] = 25; + sa[2*i + 1] = 5; + } + + /* check results: */ + for (i = 1; i <= n/2; i++) + { + if (sa[2*i] != 25 + || sa[2*i + 1] != 5) + abort (); + } + + + return 0; +} + +int main (void) +{ + + check_vect (); + + return main1 (N); +} + +/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 0 "vect" } } */ +/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 2 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/slp-26.c b/gcc/testsuite/gcc.dg/vect/slp-26.c new file mode 100644 index 00000000000..14be68bd289 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-26.c @@ -0,0 +1,53 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include <stdio.h> +#include "tree-vect.h" + +#define N 8 + +int +main1 () +{ + int i; + unsigned short in[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}; + unsigned short out[N*8], a[N], b[N] = {3,6,9,12,15,18,21,24}; + + /* 
Partial SLP is not supported. */ + for (i = 0; i < N; i++) + { + out[i*4] = in[i*4]; + out[i*4 + 1] = in[i*4 + 1]; + out[i*4 + 2] = in[i*4 + 2]; + out[i*4 + 3] = in[i*4 + 3]; + + a[i] = b[i] / 3; + } + + /* check results: */ + for (i = 0; i < N; i++) + { + if (out[i*4] != in[i*4] + || out[i*4 + 1] != in[i*4 + 1] + || out[i*4 + 2] != in[i*4 + 2] + || out[i*4 + 3] != in[i*4 + 3] + || a[i] != b[i] / 3) + abort (); + } + + return 0; +} + +int main (void) +{ + check_vect (); + + main1 (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + diff --git a/gcc/testsuite/gcc.dg/vect/slp-28.c b/gcc/testsuite/gcc.dg/vect/slp-28.c new file mode 100644 index 00000000000..069116ac58e --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-28.c @@ -0,0 +1,86 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include <stdio.h> +#include "tree-vect.h" + +#define N 32 + +int +main1 () +{ + int i; + unsigned short in[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}; + unsigned short in2[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}; + unsigned short in3[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}; + unsigned short check[N] = {0,1,2,3,5,6,7,8,10,11,12,13,15,16,17,18,20,21,22,23,25,26,27,28,30,31,32,33,35,36,37,38}; + unsigned short check3[N] = {0,1,2,3,4,5,6,7,8,9,10,11,5,6,7,8,9,10,11,12,13,14,15,16,10,11,12,13,14,15,16,17}; + + for (i = 0; i < N/4; i++) + { + in[i*4] = in[i*4] + 5; + in[i*4 + 1] = in[i*4 + 1] + 5; + in[i*4 + 2] = in[i*4 + 2] + 5; + in[i*4 + 3] = in[i*4 + 3] + 5; + + } + + /* check results: */ + for (i = 0; i < N; i++) + { + if (in[i] != i+5) + abort (); + } + + /* Not vectorizable because of data dependencies. 
*/ + for (i = 1; i < N/4; i++) + { + in2[i*4] = in2[(i-1)*4] + 5; + in2[i*4 + 1] = in2[(i-1)*4 + 1] + 5; + in2[i*4 + 2] = in2[(i-1)*4 + 2] + 5; + in2[i*4 + 3] = in2[(i-1)*4 + 3] + 5; + + } + + /* check results: */ + for (i = 4; i < N; i++) + { + if (in2[i] != check[i]) + abort (); + } + + /* Not vectorizable because of data dependencies: distance 3 is greater than + the actual VF with SLP (2), but the analysis fail to detect that for now. */ + for (i = 3; i < N/4; i++) + { + in3[i*4] = in3[(i-3)*4] + 5; + in3[i*4 + 1] = in3[(i-3)*4 + 1] + 5; + in3[i*4 + 2] = in3[(i-3)*4 + 2] + 5; + in3[i*4 + 3] = in3[(i-3)*4 + 3] + 5; + + } + + /* check results: */ + for (i = 12; i < N; i++) + { + if (in3[i] != check3[i]) + abort (); + } + + + return 0; +} + +int main (void) +{ + check_vect (); + + main1 (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + diff --git a/gcc/testsuite/gcc.dg/vect/slp-3.c b/gcc/testsuite/gcc.dg/vect/slp-3.c new file mode 100644 index 00000000000..474bfe8285f --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-3.c @@ -0,0 +1,147 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include <stdio.h> +#include "tree-vect.h" + +#define N 8 + +int +main1 () +{ + int i; + unsigned short out[N*8]; + unsigned short in[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}; + + for (i = 0; i < N; i++) + { + out[i*8] = in[i*8]; + out[i*8 + 1] = in[i*8 + 1]; + out[i*8 + 2] = in[i*8 + 2]; + out[i*8 + 3] = in[i*8 + 3]; + out[i*8 + 4] = in[i*8 + 4]; + out[i*8 + 5] = in[i*8 + 5]; + out[i*8 + 6] = in[i*8 + 6]; + out[i*8 + 7] = in[i*8 + 7]; + } + + /* check results: */ + for (i = 0; i < N; i++) + { + if (out[i*8] != in[i*8] + || out[i*8 
+ 1] != in[i*8 + 1] + || out[i*8 + 2] != in[i*8 + 2] + || out[i*8 + 3] != in[i*8 + 3] + || out[i*8 + 4] != in[i*8 + 4] + || out[i*8 + 5] != in[i*8 + 5] + || out[i*8 + 6] != in[i*8 + 6] + || out[i*8 + 7] != in[i*8 + 7]) + abort (); + } + + for (i = 0; i < N*2; i++) + { + out[i*4] = in[i*4]; + out[i*4 + 1] = in[i*4 + 1]; + out[i*4 + 2] = in[i*4 + 2]; + out[i*4 + 3] = in[i*4 + 3]; + } + + /* check results: */ + for (i = 0; i < N*2; i++) + { + if (out[i*4] != in[i*4] + || out[i*4 + 1] != in[i*4 + 1] + || out[i*4 + 2] != in[i*4 + 2] + || out[i*4 + 3] != in[i*4 + 3]) + abort (); + } + + for (i = 0; i < N/2; i++) + { + out[i*16] = in[i*16]; + out[i*16 + 1] = in[i*16 + 1]; + out[i*16 + 2] = in[i*16 + 2]; + out[i*16 + 3] = in[i*16 + 3]; + out[i*16 + 4] = in[i*16 + 4]; + out[i*16 + 5] = in[i*16 + 5]; + out[i*16 + 6] = in[i*16 + 6]; + out[i*16 + 7] = in[i*16 + 7]; + out[i*16 + 8] = in[i*16 + 8]; + out[i*16 + 9] = in[i*16 + 9]; + out[i*16 + 10] = in[i*16 + 10]; + out[i*16 + 11] = in[i*16 + 11]; + out[i*16 + 12] = in[i*16 + 12]; + out[i*16 + 13] = in[i*16 + 13]; + out[i*16 + 14] = in[i*16 + 14]; + out[i*16 + 15] = in[i*16 + 15]; + } + + /* check results: */ + for (i = 0; i < N/2; i++) + { + if (out[i*16] != in[i*16] + || out[i*16 + 1] != in[i*16 + 1] + || out[i*16 + 2] != in[i*16 + 2] + || out[i*16 + 3] != in[i*16 + 3] + || out[i*16 + 4] != in[i*16 + 4] + || out[i*16 + 5] != in[i*16 + 5] + || out[i*16 + 6] != in[i*16 + 6] + || out[i*16 + 7] != in[i*16 + 7] + || out[i*16 + 8] != in[i*16 + 8] + || out[i*16 + 9] != in[i*16 + 9] + || out[i*16 + 10] != in[i*16 + 10] + || out[i*16 + 11] != in[i*16 + 11] + || out[i*16 + 12] != in[i*16 + 12] + || out[i*16 + 13] != in[i*16 + 13] + || out[i*16 + 14] != in[i*16 + 14] + || out[i*16 + 15] != in[i*16 + 15]) + abort (); + } + + /* SLP with unrolling by 8. 
*/ + for (i = 0; i < N/2; i++) + { + out[i*9] = in[i*9]; + out[i*9 + 1] = in[i*9 + 1]; + out[i*9 + 2] = in[i*9 + 2]; + out[i*9 + 3] = in[i*9 + 3]; + out[i*9 + 4] = in[i*9 + 4]; + out[i*9 + 5] = in[i*9 + 5]; + out[i*9 + 6] = in[i*9 + 6]; + out[i*9 + 7] = in[i*9 + 7]; + out[i*9 + 8] = in[i*9 + 8]; + } + + /* check results: */ + for (i = 0; i < N/2; i++) + { + if (out[i*9] != in[i*9] + || out[i*9 + 1] != in[i*9 + 1] + || out[i*9 + 2] != in[i*9 + 2] + || out[i*9 + 3] != in[i*9 + 3] + || out[i*9 + 4] != in[i*9 + 4] + || out[i*9 + 5] != in[i*9 + 5] + || out[i*9 + 6] != in[i*9 + 6] + || out[i*9 + 7] != in[i*9 + 7] + || out[i*9 + 8] != in[i*9 + 8]) + abort (); + } + + + return 0; +} + +int main (void) +{ + check_vect (); + + main1 (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + diff --git a/gcc/testsuite/gcc.dg/vect/slp-33.c b/gcc/testsuite/gcc.dg/vect/slp-33.c new file mode 100644 index 00000000000..86a641cfdf5 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-33.c @@ -0,0 +1,112 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include <stdio.h> +#include "tree-vect.h" + +#define N 8 + +int +main1 () +{ + int i; + unsigned int out[N*8], a0, a1, a2, a3, a4, a5, a6, a7, b1, b0, b2, b3, b4, b5, b6, b7; + unsigned int in[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}; + float out2[N*8]; + + /* SLP with unrolling by 4. 
*/ + for (i = 0; i < N; i++) + { + a0 = in[i*7] + 5; + a1 = in[i*7 + 1] + 6; + a2 = in[i*7 + 2] + 7; + a3 = in[i*7 + 3] + 8; + a4 = in[i*7 + 4] + 9; + a5 = in[i*7 + 5] + 10; + a6 = in[i*7 + 6] + 11; + + b0 = a0 * 3; + b1 = a1 * 2; + b2 = a2 * 12; + b3 = a3 * 5; + b4 = a4 * 8; + b5 = a5 * 4; + b6 = a6 * 3; + + out[i*7] = b0 - 2; + out[i*7 + 1] = b1 - 3; + out[i*7 + 2] = b2 - 2; + out[i*7 + 3] = b3 - 1; + out[i*7 + 4] = b4 - 8; + out[i*7 + 5] = b5 - 7; + out[i*7 + 6] = b6 - 3; + } + + /* check results: */ + for (i = 0; i < N; i++) + { + if (out[i*7] != (in[i*7] + 5) * 3 - 2 + || out[i*7 + 1] != (in[i*7 + 1] + 6) * 2 - 3 + || out[i*7 + 2] != (in[i*7 + 2] + 7) * 12 - 2 + || out[i*7 + 3] != (in[i*7 + 3] + 8) * 5 - 1 + || out[i*7 + 4] != (in[i*7 + 4] + 9) * 8 - 8 + || out[i*7 + 5] != (in[i*7 + 5] + 10) * 4 - 7 + || out[i*7 + 6] != (in[i*7 + 6] + 11) * 3 - 3) + abort (); + } + + /* SLP with unrolling by 4. */ + for (i = 0; i < N*2; i++) + { + out[i*3] = (in[i*3] + 2) * 3; + out[i*3 + 1] = (in[i*3 + 1] + 2) * 7; + out[i*3 + 2] = (in[i*3 + 2] + 7) * 3; + } + + /* check results: */ + for (i = 0; i < N*2; i++) + { + if (out[i*3] != (in[i*3] + 2) * 3 + || out[i*3 + 1] != (in[i*3 + 1] + 2) * 7 + || out[i*3 + 2] != (in[i*3 + 2] + 7) * 3) + abort (); + } + + /* SLP with unrolling by 4. */ + for (i = 0; i < N*2; i++) + { + out2[i*3] = (float) (in[i*3] * 2 + 5) ; + out2[i*3 + 1] = (float) (in[i*3 + 1] * 3 + 7); + out2[i*3 + 2] = (float) (in[i*3 + 2] * 5 + 4); + } + + /* check results: */ + for (i = 0; i < N*2; i++) + { + if (out2[i*3] != (float) (in[i*3] * 2 + 5) + || out2[i*3 + 1] != (float) (in[i*3 + 1] * 3 + 7) + || out2[i*3 + 2] != (float) (in[i*3 + 2] * 5 + 4)) + abort (); + } + + + return 0; +} + +int main (void) +{ + check_vect (); + + main1 (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" {target {vect_intfloat_cvt && vect_int_mult} } } } */ +/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" {target {{! 
{ vect_intfloat_cvt}} && vect_int_mult} } } } */ +/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" {target {{! { vect_intfloat_cvt}} && {!{vect_int_mult}}} } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" {target {vect_intfloat_cvt && vect_int_mult} } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" {target {{! { vect_intfloat_cvt}} && vect_int_mult} } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" {target {{! { vect_intfloat_cvt}} && {!{vect_int_mult}}} } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + diff --git a/gcc/testsuite/gcc.dg/vect/slp-34.c b/gcc/testsuite/gcc.dg/vect/slp-34.c new file mode 100644 index 00000000000..d25eef02101 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-34.c @@ -0,0 +1,61 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include <stdio.h> +#include "tree-vect.h" + +#define N 8 + +int +main1 () +{ + int i; + unsigned short out[N*8]; + unsigned short in[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}; + unsigned short in2[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}; + unsigned short out2[N*8]; + + /* SLP with unrolling by 8. 
*/ + for (i = 0; i < N; i++) + { + out[i*3] = in[i*3] + 5; + out[i*3 + 1] = in[i*3 + 1] + 6; + out[i*3 + 2] = in[i*3 + 2] + 16; + + out2[i*5] = in2[i*5] + 2; + out2[i*5 + 1] = in2[i*5 + 1] + 2; + out2[i*5 + 2] = in2[i*5 + 2] + 1; + out2[i*5 + 3] = in2[i*5 + 3] + 3; + out2[i*5 + 4] = in2[i*5 + 4] + 13; + } + + /* check results: */ + for (i = 0; i < N; i++) + { + if (out[i*3] != in[i*3] + 5 + || out[i*3 + 1] != in[i*3 + 1] + 6 + || out[i*3 + 2] != in[i*3 + 2] + 16 + || out2[i*5] != in2[i*5] + 2 + || out2[i*5 + 1] != in2[i*5 + 1] + 2 + || out2[i*5 + 2] != in2[i*5 + 2] + 1 + || out2[i*5 + 3] != in2[i*5 + 3] + 3 + || out2[i*5 + 4] != in2[i*5 + 4] + 13) + abort (); + } + + return 0; +} + +int main (void) +{ + check_vect (); + + main1 (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + diff --git a/gcc/testsuite/gcc.dg/vect/slp-35.c b/gcc/testsuite/gcc.dg/vect/slp-35.c new file mode 100644 index 00000000000..39a7089ae7b --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-35.c @@ -0,0 +1,73 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include "tree-vect.h" + +#define N 128 + +typedef struct { + int a; + int b; + int c; + int d; + int e; +} s; + +int +main1 (s *arr) +{ + int i; + s *ptr = arr; + s res[N]; + + /* SLP with unrolling by 4. 
*/ + for (i = 0; i < N; i++) + { + res[i].c = ptr->c + ptr->c; + res[i].a = ptr->a + ptr->a; + res[i].d = ptr->d + ptr->d; + res[i].b = ptr->b + ptr->b; + res[i].e = ptr->e + ptr->e; + ptr++; + } + + /* check results: */ + for (i = 0; i < N; i++) + { + if (res[i].c != arr[i].c + arr[i].c + || res[i].a != arr[i].a + arr[i].a + || res[i].d != arr[i].d + arr[i].d + || res[i].b != arr[i].b + arr[i].b + || res[i].e != arr[i].e + arr[i].e) + abort(); + } + +} + +int main (void) +{ + int i; + s arr[N]; + + check_vect (); + + for (i = 0; i < N; i++) + { + arr[i].a = i; + arr[i].b = i * 2; + arr[i].c = 17; + arr[i].d = i+34; + arr[i].e = i * 3 + 5; + if (arr[i].a == 178) + abort(); + } + + main1 (arr); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail vect_no_align } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail vect_no_align } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + diff --git a/gcc/testsuite/gcc.dg/vect/slp-36.c b/gcc/testsuite/gcc.dg/vect/slp-36.c new file mode 100644 index 00000000000..98d1473419e --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-36.c @@ -0,0 +1,75 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_shift } */ + +#define N 32 + +/* All the loops are vectorizable on platforms with vector shift argument. */ + +void +test_1 (void) +{ + static unsigned int bm[N]; + static unsigned int cm[N]; + int j; + + /* Vectorizable on platforms with scalar shift argument. */ + for (j = 0; j < N/2; j++) + { + bm[2*j] <<= 8; + bm[2*j+1] <<= 8; + } + + /* Not vectorizable on platforms with scalar shift argument. */ + for (j = 0; j < N/2; j++) + { + cm[2*j] <<= 8; + cm[2*j+1] <<= 7; + } +} + +void +test_2 (int a, int b) +{ + static unsigned int bm[N]; + static unsigned int cm[N]; + int j; + + /* Vectorizable on platforms with scalar shift argument. 
*/ + for (j = 0; j < N/2; j++) + { + bm[2*j] <<= a; + bm[2*j+1] <<= a; + } + + /* Not vectorizable on platforms with scalar shift argument. */ + for (j = 0; j < N/2; j++) + { + cm[2*j] <<= a; + cm[2*j+1] <<= b; + } +} + +void +test_3 (void) +{ + static unsigned int bm[N]; + int am[N]; + int j; + + /* Not vectorizable on platforms with scalar shift argument. */ + for (j = 0; j < N/2; j++) + { + bm[2*j] <<= am[j]; + bm[2*j+1] <<= am[j]; + } + + /* Not vectorizable on platforms with scalar shift argument. */ + for (j = 0; j < N/2; j++) + { + bm[2*j] <<= am[2*j]; + bm[2*j+1] <<= am[2*j+1]; + } + +} + +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/slp-37.c b/gcc/testsuite/gcc.dg/vect/slp-37.c new file mode 100644 index 00000000000..48642db96e7 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-37.c @@ -0,0 +1,67 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include <stdlib.h> +#include "tree-vect.h" + +#define N 128 + +typedef struct { + int a; + int b; + void *c; +} s1; + +int +foo1 (s1 *arr) +{ + int i; + s1 *ptr = arr; + + /* Different constant types - not SLPable. The group size is not power of 2, + interleaving is not supported either. 
*/ + for (i = 0; i < N; i++) + { + ptr->a = 6; + ptr->b = 7; + ptr->c = NULL; + ptr++; + } + + /* check results: */ + for (i = 0; i < N; i++) + { + if (arr[i].a != 6 + || arr[i].b != 7 + || arr[i].c != NULL) + abort(); + } +} + +int main (void) +{ + int i; + s1 arr1[N]; + + check_vect (); + + for (i = 0; i < N; i++) + { + arr1[i].a = i; + arr1[i].b = i * 2; + arr1[i].c = (void *)arr1; + + if (arr1[i].a == 178) + abort(); + } + + + foo1 (arr1); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 0 "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + diff --git a/gcc/testsuite/gcc.dg/vect/slp-4.c b/gcc/testsuite/gcc.dg/vect/slp-4.c new file mode 100644 index 00000000000..e1353584fd2 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-4.c @@ -0,0 +1,128 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include <stdio.h> +#include "tree-vect.h" + +#define N 16 + +int +main1 () +{ + int i; + unsigned short out[N*8]; + unsigned short in[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}; + unsigned int ia[N*2]; + + for (i = 0; i < N; i++) + { + out[i*8] = in[i*8]; + out[i*8 + 1] = in[i*8 + 1]; + out[i*8 + 2] = in[i*8 + 2]; + out[i*8 + 3] = in[i*8 + 3]; + out[i*8 + 4] = in[i*8 + 4]; + out[i*8 + 5] = in[i*8 + 5]; + out[i*8 + 6] = in[i*8 + 6]; + out[i*8 + 7] = in[i*8 + 7]; + + ia[i] = 7; + } + + /* check results: */ + for (i = 0; i < N; i++) + { + if (out[i*8] != in[i*8] + || out[i*8 + 1] != in[i*8 + 1] + || out[i*8 + 2] != in[i*8 + 2] + || out[i*8 + 3] != in[i*8 + 3] + || out[i*8 + 4] != in[i*8 + 4] + || out[i*8 + 
5] != in[i*8 + 5] + || out[i*8 + 6] != in[i*8 + 6] + || out[i*8 + 7] != in[i*8 + 7] + || ia[i] != 7) + abort (); + } + + for (i = 0; i < N*2; i++) + { + out[i*4] = in[i*4]; + out[i*4 + 1] = in[i*4 + 1]; + out[i*4 + 2] = in[i*4 + 2]; + out[i*4 + 3] = in[i*4 + 3]; + + ia[i] = 12; + } + + /* check results: */ + for (i = 0; i < N*2; i++) + { + if (out[i*4] != in[i*4] + || out[i*4 + 1] != in[i*4 + 1] + || out[i*4 + 2] != in[i*4 + 2] + || out[i*4 + 3] != in[i*4 + 3] + || ia[i] != 12) + abort (); + } + + for (i = 0; i < N/2; i++) + { + out[i*16] = in[i*16]; + out[i*16 + 1] = in[i*16 + 1]; + out[i*16 + 2] = in[i*16 + 2]; + out[i*16 + 3] = in[i*16 + 3]; + out[i*16 + 4] = in[i*16 + 4]; + out[i*16 + 5] = in[i*16 + 5]; + out[i*16 + 6] = in[i*16 + 6]; + out[i*16 + 7] = in[i*16 + 7]; + out[i*16 + 8] = in[i*16 + 8]; + out[i*16 + 9] = in[i*16 + 9]; + out[i*16 + 10] = in[i*16 + 10]; + out[i*16 + 11] = in[i*16 + 11]; + out[i*16 + 12] = in[i*16 + 12]; + out[i*16 + 13] = in[i*16 + 13]; + out[i*16 + 14] = in[i*16 + 14]; + out[i*16 + 15] = in[i*16 + 15]; + + ia[i] = 21; + } + + /* check results: */ + for (i = 0; i < N/2; i++) + { + if (out[i*16] != in[i*16] + || out[i*16 + 1] != in[i*16 + 1] + || out[i*16 + 2] != in[i*16 + 2] + || out[i*16 + 3] != in[i*16 + 3] + || out[i*16 + 4] != in[i*16 + 4] + || out[i*16 + 5] != in[i*16 + 5] + || out[i*16 + 6] != in[i*16 + 6] + || out[i*16 + 7] != in[i*16 + 7] + || out[i*16 + 8] != in[i*16 + 8] + || out[i*16 + 9] != in[i*16 + 9] + || out[i*16 + 10] != in[i*16 + 10] + || out[i*16 + 11] != in[i*16 + 11] + || out[i*16 + 12] != in[i*16 + 12] + || out[i*16 + 13] != in[i*16 + 13] + || out[i*16 + 14] != in[i*16 + 14] + || out[i*16 + 15] != in[i*16 + 15] + || ia[i] != 21) + abort (); + } + + + return 0; +} + +int main (void) +{ + check_vect (); + + main1 (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" } } */ +/* { dg-final { 
cleanup-tree-dump "vect" } } */ + diff --git a/gcc/testsuite/gcc.dg/vect/slp-5.c b/gcc/testsuite/gcc.dg/vect/slp-5.c new file mode 100644 index 00000000000..0f9c2eefb21 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-5.c @@ -0,0 +1,128 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include <stdio.h> +#include "tree-vect.h" + +#define N 16 + +int +main1 () +{ + int i; + unsigned int out[N*8]; + unsigned int in[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}; + unsigned short ia[N]; + unsigned int ib[N*2]; + + /* Not SLPable for now: multiple types with SLP of the smaller type. */ + for (i = 0; i < N; i++) + { + out[i*8] = in[i*8]; + out[i*8 + 1] = in[i*8 + 1]; + out[i*8 + 2] = in[i*8 + 2]; + out[i*8 + 3] = in[i*8 + 3]; + out[i*8 + 4] = in[i*8 + 4]; + out[i*8 + 5] = in[i*8 + 5]; + out[i*8 + 6] = in[i*8 + 6]; + out[i*8 + 7] = in[i*8 + 7]; + + ia[i] = 7; + } + + /* check results: */ + for (i = 0; i < N; i++) + { + if (out[i*8] != in[i*8] + || out[i*8 + 1] != in[i*8 + 1] + || out[i*8 + 2] != in[i*8 + 2] + || out[i*8 + 3] != in[i*8 + 3] + || out[i*8 + 4] != in[i*8 + 4] + || out[i*8 + 5] != in[i*8 + 5] + || out[i*8 + 6] != in[i*8 + 6] + || out[i*8 + 7] != in[i*8 + 7] + || ia[i] != 7) + abort (); + } + + for (i = 0; i < N*2; i++) + { + out[i*4] = in[i*4]; + out[i*4 + 1] = in[i*4 + 1]; + out[i*4 + 2] = in[i*4 + 2]; + out[i*4 + 3] = in[i*4 + 3]; + + ib[i] = 12; + } + + /* check results: */ + for (i = 0; i < N*2; i++) + { + if (out[i*4] != in[i*4] + || out[i*4 + 1] != in[i*4 + 1] + || out[i*4 + 2] != in[i*4 + 2] + || out[i*4 + 3] != in[i*4 + 3] + || ib[i] != 12) + abort (); + } + + for (i = 0; i < N/2; i++) + { + out[i*16] = 
in[i*16]; + out[i*16 + 1] = in[i*16 + 1]; + out[i*16 + 2] = in[i*16 + 2]; + out[i*16 + 3] = in[i*16 + 3]; + out[i*16 + 4] = in[i*16 + 4]; + out[i*16 + 5] = in[i*16 + 5]; + out[i*16 + 6] = in[i*16 + 6]; + out[i*16 + 7] = in[i*16 + 7]; + out[i*16 + 8] = in[i*16 + 8]; + out[i*16 + 9] = in[i*16 + 9]; + out[i*16 + 10] = in[i*16 + 10]; + out[i*16 + 11] = in[i*16 + 11]; + out[i*16 + 12] = in[i*16 + 12]; + out[i*16 + 13] = in[i*16 + 13]; + out[i*16 + 14] = in[i*16 + 14]; + out[i*16 + 15] = in[i*16 + 15]; + } + + /* check results: */ + for (i = 0; i < N/2; i++) + { + if (out[i*16] != in[i*16] + || out[i*16 + 1] != in[i*16 + 1] + || out[i*16 + 2] != in[i*16 + 2] + || out[i*16 + 3] != in[i*16 + 3] + || out[i*16 + 4] != in[i*16 + 4] + || out[i*16 + 5] != in[i*16 + 5] + || out[i*16 + 6] != in[i*16 + 6] + || out[i*16 + 7] != in[i*16 + 7] + || out[i*16 + 8] != in[i*16 + 8] + || out[i*16 + 9] != in[i*16 + 9] + || out[i*16 + 10] != in[i*16 + 10] + || out[i*16 + 11] != in[i*16 + 11] + || out[i*16 + 12] != in[i*16 + 12] + || out[i*16 + 13] != in[i*16 + 13] + || out[i*16 + 14] != in[i*16 + 14] + || out[i*16 + 15] != in[i*16 + 15]) + abort (); + } + + + return 0; +} + +int main (void) +{ + check_vect (); + + main1 (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { target { vect_strided } } } } */ +/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { target { ! 
{ vect_strided } } } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + diff --git a/gcc/testsuite/gcc.dg/vect/slp-6.c b/gcc/testsuite/gcc.dg/vect/slp-6.c new file mode 100644 index 00000000000..5e86410588a --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-6.c @@ -0,0 +1,122 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include <stdio.h> +#include "tree-vect.h" + +#define N 8 + +int +main1 () +{ + int i; + unsigned short out[N*8]; + unsigned short in[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}; + unsigned int in2[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}; + unsigned int out2[N*8]; + + for (i = 0; i < N; i++) + { + out[i*8] = in[i*8] + 5; + out[i*8 + 1] = in[i*8 + 1] + 6; + out[i*8 + 2] = in[i*8 + 2] + 7; + out[i*8 + 3] = in[i*8 + 3] + 8; + out[i*8 + 4] = in[i*8 + 4] + 9; + out[i*8 + 5] = in[i*8 + 5] + 10; + out[i*8 + 6] = in[i*8 + 6] + 11; + out[i*8 + 7] = in[i*8 + 7] + 12; + } + + /* check results: */ + for (i = 0; i < N; i++) + { + if (out[i*8] != in[i*8] + 5 + || out[i*8 + 1] != in[i*8 + 1] + 6 + || out[i*8 + 2] != in[i*8 + 2] + 7 + || out[i*8 + 3] != in[i*8 + 3] + 8 + || out[i*8 + 4] != in[i*8 + 4] + 9 + || out[i*8 + 5] != in[i*8 + 5] + 10 + || out[i*8 + 6] != in[i*8 + 6] + 11 + || out[i*8 + 7] != in[i*8 + 7] + 12) + abort (); + } + + for (i = 0; i < N*2; i++) + { + out[i*4] = in[i*4] + 2; + out[i*4 + 1] = in[i*4 + 1] + 2; + out[i*4 + 2] = in[i*4 + 2] + 1; + out[i*4 + 3] = in[i*4 + 3] + 3; + } + + /* check results: */ + for (i = 0; i < N*2; i++) + { + if (out[i*4] != in[i*4] + 2 + || out[i*4 + 1] != in[i*4 + 1] + 2 + || out[i*4 + 2] != in[i*4 + 2] + 1 
+ || out[i*4 + 3] != in[i*4 + 3] + 3) + abort (); + } + + for (i = 0; i < N/2; i++) + { + out2[i*16] = in2[i*16] * 2; + out2[i*16 + 1] = in2[i*16 + 1] * 3; + out2[i*16 + 2] = in2[i*16 + 2] * 4; + out2[i*16 + 3] = in2[i*16 + 3] * 3; + out2[i*16 + 4] = in2[i*16 + 4] * 2; + out2[i*16 + 5] = in2[i*16 + 5] * 3; + out2[i*16 + 6] = in2[i*16 + 6] * 2; + out2[i*16 + 7] = in2[i*16 + 7] * 4; + out2[i*16 + 8] = in2[i*16 + 8] * 2; + out2[i*16 + 9] = in2[i*16 + 9] * 5; + out2[i*16 + 10] = in2[i*16 + 10] * 2; + out2[i*16 + 11] = in2[i*16 + 11] * 3; + out2[i*16 + 12] = in2[i*16 + 12] * 4; + out2[i*16 + 13] = in2[i*16 + 13] * 4; + out2[i*16 + 14] = in2[i*16 + 14] * 3; + out2[i*16 + 15] = in2[i*16 + 15] * 2; + } + + /* check results: */ + for (i = 0; i < N/2; i++) + { + if (out2[i*16] != in2[i*16] * 2 + || out2[i*16 + 1] != in2[i*16 + 1] * 3 + || out2[i*16 + 2] != in2[i*16 + 2] * 4 + || out2[i*16 + 3] != in2[i*16 + 3] * 3 + || out2[i*16 + 4] != in2[i*16 + 4] * 2 + || out2[i*16 + 5] != in2[i*16 + 5] * 3 + || out2[i*16 + 6] != in2[i*16 + 6] * 2 + || out2[i*16 + 7] != in2[i*16 + 7] * 4 + || out2[i*16 + 8] != in2[i*16 + 8] * 2 + || out2[i*16 + 9] != in2[i*16 + 9] * 5 + || out2[i*16 + 10] != in2[i*16 + 10] * 2 + || out2[i*16 + 11] != in2[i*16 + 11] * 3 + || out2[i*16 + 12] != in2[i*16 + 12] * 4 + || out2[i*16 + 13] != in2[i*16 + 13] * 4 + || out2[i*16 + 14] != in2[i*16 + 14] * 3 + || out2[i*16 + 15] != in2[i*16 + 15] * 2) + abort (); + } + + + return 0; +} + +int main (void) +{ + check_vect (); + + main1 (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" {target vect_int_mult} } } */ +/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" {target { ! { vect_int_mult } } } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" {target vect_int_mult } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" {target { ! 
{ vect_int_mult } } } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + diff --git a/gcc/testsuite/gcc.dg/vect/slp-7.c b/gcc/testsuite/gcc.dg/vect/slp-7.c new file mode 100644 index 00000000000..4ee7029af0e --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-7.c @@ -0,0 +1,127 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include <stdio.h> +#include "tree-vect.h" + +#define N 8 + +int +main1 () +{ + int i; + unsigned int out[N*8], ia[N*2]; + unsigned int in[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}; + unsigned short in2[N*16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}; + unsigned short sa[N], out2[N*16]; + + for (i = 0; i < N; i++) + { + out[i*8] = in[i*8] + 5; + out[i*8 + 1] = in[i*8 + 1] + 6; + out[i*8 + 2] = in[i*8 + 2] + 7; + out[i*8 + 3] = in[i*8 + 3] + 8; + out[i*8 + 4] = in[i*8 + 4] + 9; + out[i*8 + 5] = in[i*8 + 5] + 10; + out[i*8 + 6] = in[i*8 + 6] + 11; + out[i*8 + 7] = in[i*8 + 7] + 12; + + ia[i] = in[i]; + } + + /* check results: */ + for (i = 0; i < N; i++) + { + if (out[i*8] != in[i*8] + 5 + || out[i*8 + 1] != in[i*8 + 1] + 6 + || out[i*8 + 2] != in[i*8 + 2] + 7 + || out[i*8 + 3] != in[i*8 + 3] + 8 + || out[i*8 + 4] != in[i*8 + 4] + 9 + || out[i*8 + 5] != in[i*8 + 5] + 10 + || out[i*8 + 6] != in[i*8 + 6] + 11 + || out[i*8 + 7] != in[i*8 + 7] + 12 + || ia[i] != in[i]) + abort (); + } + + for (i = 0; i < N*2; i++) + { + out[i*4] = in[i*4] + 1; + out[i*4 + 1] = in[i*4 + 1] + 2; + out[i*4 + 2] = in[i*4 + 2] + 3; + out[i*4 + 3] = in[i*4 + 3] + 4; + + ia[i] 
= in[i]; + } + + /* check results: */ + for (i = 0; i < N*2; i++) + { + if (out[i*4] != in[i*4] + 1 + || out[i*4 + 1] != in[i*4 + 1] + 2 + || out[i*4 + 2] != in[i*4 + 2] + 3 + || out[i*4 + 3] != in[i*4 + 3] + 4 + || ia[i] != in[i]) + abort (); + } + + for (i = 0; i < N; i++) + { + out2[i*16] = in2[i*16] * 2; + out2[i*16 + 1] = in2[i*16 + 1] * 3; + out2[i*16 + 2] = in2[i*16 + 2] * 4; + out2[i*16 + 3] = in2[i*16 + 3] * 3; + out2[i*16 + 4] = in2[i*16 + 4] * 2; + out2[i*16 + 5] = in2[i*16 + 5] * 3; + out2[i*16 + 6] = in2[i*16 + 6] * 2; + out2[i*16 + 7] = in2[i*16 + 7] * 4; + out2[i*16 + 8] = in2[i*16 + 8] * 2; + out2[i*16 + 9] = in2[i*16 + 9] * 5; + out2[i*16 + 10] = in2[i*16 + 10] * 2; + out2[i*16 + 11] = in2[i*16 + 11] * 3; + out2[i*16 + 12] = in2[i*16 + 12] * 4; + out2[i*16 + 13] = in2[i*16 + 13] * 4; + out2[i*16 + 14] = in2[i*16 + 14] * 3; + out2[i*16 + 15] = in2[i*16 + 15] * 2; + } + + /* check results: */ + for (i = 0; i < N; i++) + { + if (out2[i*16] != in2[i*16] * 2 + || out2[i*16 + 1] != in2[i*16 + 1] * 3 + || out2[i*16 + 2] != in2[i*16 + 2] * 4 + || out2[i*16 + 3] != in2[i*16 + 3] * 3 + || out2[i*16 + 4] != in2[i*16 + 4] * 2 + || out2[i*16 + 5] != in2[i*16 + 5] * 3 + || out2[i*16 + 6] != in2[i*16 + 6] * 2 + || out2[i*16 + 7] != in2[i*16 + 7] * 4 + || out2[i*16 + 8] != in2[i*16 + 8] * 2 + || out2[i*16 + 9] != in2[i*16 + 9] * 5 + || out2[i*16 + 10] != in2[i*16 + 10] * 2 + || out2[i*16 + 11] != in2[i*16 + 11] * 3 + || out2[i*16 + 12] != in2[i*16 + 12] * 4 + || out2[i*16 + 13] != in2[i*16 + 13] * 4 + || out2[i*16 + 14] != in2[i*16 + 14] * 3 + || out2[i*16 + 15] != in2[i*16 + 15] * 2) + abort (); + } + + + return 0; +} + +int main (void) +{ + check_vect (); + + main1 (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" {target { vect_strided && vect_int_mult } } } }*/ +/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" {target { ! 
{ vect_strided && vect_int_mult } } } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + diff --git a/gcc/testsuite/gcc.dg/vect/slp-8.c b/gcc/testsuite/gcc.dg/vect/slp-8.c new file mode 100644 index 00000000000..1260ddce504 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-8.c @@ -0,0 +1,45 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include "tree-vect.h" + +#define N 32 + +int main1 () +{ + int i; + int ib[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45}; + float fa[N]; + + /* int -> float */ + for (i = 0; i < N/4; i++) + { + fa[4*i] = (float) ib[4*i]; + fa[4*i + 1] = (float) ib[4*i + 1]; + fa[4*i + 2] = (float) ib[4*i + 2]; + fa[4*i + 3] = (float) ib[4*i + 3]; + } + + /* check results: */ + for (i = 0; i < N/4; i++) + { + if (fa[4*i] != (float) ib[4*i] + || fa[4*i + 1] != (float) ib[4*i + 1] + || fa[4*i + 2] != (float) ib[4*i + 2] + || fa[4*i + 3] != (float) ib[4*i + 3]) + abort (); + } + + return 0; +} + +int main (void) +{ + check_vect (); + + return main1 (); +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target powerpc*-*-* i?86-*-* x86_64-*-* } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target powerpc*-*-* i?86-*-* x86_64-*-* } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/slp-9.c b/gcc/testsuite/gcc.dg/vect/slp-9.c new file mode 100644 index 00000000000..cfb30bd718b --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-9.c @@ -0,0 +1,47 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include "tree-vect.h" + +#define N 64 + +short X[N] __attribute__ ((__aligned__(16))); +short Y[N] __attribute__ ((__aligned__(16))); +int result[N]; + +/* short->int widening-mult */ +int +foo1(int len) { + int i; + + for (i=0; i<len/2; i++) { + result[2*i] = X[2*i] * 
Y[2*i]; + result[2*i+1] = X[2*i+1] * Y[2*i+1]; + } +} + +int main (void) +{ + int i; + + check_vect (); + + for (i=0; i<N; i++) { + X[i] = i; + Y[i] = 64-i; + } + + foo1 (N); + + for (i=0; i<N; i++) { + if (result[i] != X[i] * Y[i]) + abort (); + } + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { vect_strided && vect_widen_mult_hi_to_si } } } }*/ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + diff --git a/gcc/testsuite/gcc.dg/vect/vect-vfa-03.c b/gcc/testsuite/gcc.dg/vect/vect-vfa-03.c index 53d781677ae..7d684aa1ff0 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-vfa-03.c +++ b/gcc/testsuite/gcc.dg/vect/vect-vfa-03.c @@ -10,9 +10,9 @@ struct S unsigned short b; }; -struct S result[N] = {12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, - 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24, - 24, 25, 25, 26, 26, 27, 27, 28}; +struct S result[N] = {20, 13, 22, 14, 24, 15, 26, 16, 28, 17, 30, 18, + 32, 19, 34, 20, 36, 21, 38, 22, 40, 23, 42, 24, + 44, 25, 46, 26, 48, 27, 50, 28}; struct S X[N] = {10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24, 24, 25, 25}; @@ -25,7 +25,7 @@ foo (struct S * in, struct S * out) for (i = 0; i < N; i++) { - out[i].a = in[i].a + 2; + out[i].a = in[i].a * 2; out[i].b = in[i].b + 3; } } @@ -42,10 +42,10 @@ main (void) /* check results: */ for (i = 0; i < N; i++) { - if (Y[i].a != result[i].a) + if (Y[i].a != result[i].a) abort (); - if (Y[i].b != result[i].b) + if (Y[i].b != result[i].b) abort (); } diff --git a/gcc/testsuite/gcc.dg/vect/vect-vfa-slp.c b/gcc/testsuite/gcc.dg/vect/vect-vfa-slp.c new file mode 100644 index 00000000000..27560c72d9d --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-vfa-slp.c @@ -0,0 +1,56 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include "tree-vect.h" + +#define N 16 +struct S +{ + 
unsigned short a; + unsigned short b; +}; + +struct S result[N] = {12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, + 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24, + 24, 25, 25, 26, 26, 27, 27, 28}; +struct S X[N] = {10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, + 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, + 23, 23, 24, 24, 25, 25}; +struct S Y[N] = {}; + +__attribute__ ((noinline)) void +foo (struct S * in, struct S * out) +{ + int i; + + for (i = 0; i < N; i++) + { + out[i].a = in[i].a + 2; + out[i].b = in[i].b + 3; + } +} + +int +main (void) +{ + int i; + + check_vect (); + + foo (X, Y); + + /* check results: */ + for (i = 0; i < N; i++) + { + if (Y[i].a != result[i].a) + abort (); + + if (Y[i].b != result[i].b) + abort (); + + } + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect.exp b/gcc/testsuite/gcc.dg/vect/vect.exp index fba5aa8f591..e9be793621e 100644 --- a/gcc/testsuite/gcc.dg/vect/vect.exp +++ b/gcc/testsuite/gcc.dg/vect/vect.exp @@ -108,6 +108,8 @@ dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/pr*.\[cS\]]] \ "" $DEFAULT_VECTCFLAGS dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/vect-*.\[cS\]]] \ "" $DEFAULT_VECTCFLAGS +dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/slp-*.\[cS\]]] \ + "" $DEFAULT_VECTCFLAGS #### Tests with special options global SAVED_DEFAULT_VECTCFLAGS @@ -122,25 +124,25 @@ dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-vfa-*.\[cS\]]] \ # -ffast-math tests set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS lappend DEFAULT_VECTCFLAGS "-ffast-math" -dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/fast-math-vect*.\[cS\]]] \ +dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/fast-math-*.\[cS\]]] \ "" $DEFAULT_VECTCFLAGS # -fno-math-errno tests set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS lappend DEFAULT_VECTCFLAGS "-fno-math-errno" -dg-runtest [lsort [glob 
-nocomplain $srcdir/$subdir/no-math-errno-vect*.\[cS\]]] \ +dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-math-errno-*.\[cS\]]] \ "" $DEFAULT_VECTCFLAGS # -fwrapv tests set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS lappend DEFAULT_VECTCFLAGS "-fwrapv" -dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/wrapv-vect*.\[cS\]]] \ +dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/wrapv-*.\[cS\]]] \ "" $DEFAULT_VECTCFLAGS # -ftrapv tests set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS lappend DEFAULT_VECTCFLAGS "-ftrapv" -dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/trapv-vect*.\[cS\]]] \ +dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/trapv-*.\[cS\]]] \ "" $DEFAULT_VECTCFLAGS # -fdump-tree-dceloop-details tests @@ -197,12 +199,24 @@ lappend DEFAULT_VECTCFLAGS "-fno-tree-scev-cprop" "-fno-tree-reassoc" dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-scevccp-noreassoc-*.\[cS\]]] \ "" $DEFAULT_VECTCFLAGS +# -fno-tree-scev-cprop +set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS +lappend DEFAULT_VECTCFLAGS "-fno-tree-scev-cprop" +dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-scevccp-slp-*.\[cS\]]] \ + "" $DEFAULT_VECTCFLAGS + # -fno-tree-dominator-opts set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS lappend DEFAULT_VECTCFLAGS "-fno-tree-dominator-opts" dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-tree-dom-*.\[cS\]]] \ "" $DEFAULT_VECTCFLAGS +# -fno-tree-pre +set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS +lappend DEFAULT_VECTCFLAGS "-fno-tree-pre" +dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/no-tree-pre-*.\[cS\]]] \ + "" $DEFAULT_VECTCFLAGS + # With -Os set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS lappend DEFAULT_VECTCFLAGS "-Os" diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp index fdf25444f9d..55e2a8a4c30 100644 --- a/gcc/testsuite/lib/target-supports.exp +++ b/gcc/testsuite/lib/target-supports.exp @@ -2043,7 +2043,7 @@ proc check_effective_target_vect_no_align { 
} { set et_vect_no_align_saved 0 if { [istarget mipsisa64*-*-*] || [istarget sparc*-*-*] - || [istarget ia64-*-*] } { + || [istarget ia64-*-*] } { set et_vect_no_align_saved 1 } } @@ -2255,6 +2255,24 @@ proc check_effective_target_vect_interleave { } { return $et_vect_interleave_saved } +# Return 1 if the target supports vector interleaving and extract even/odd, 0 otherwise. +proc check_effective_target_vect_strided { } { + global et_vect_strided_saved + + if [info exists et_vect_strided_saved] { + verbose "check_effective_target_vect_strided: using cached result" 2 + } else { + set et_vect_strided_saved 0 + if { [check_effective_target_vect_interleave] + && [check_effective_target_vect_extract_even_odd] } { + set et_vect_strided_saved 1 + } + } + + verbose "check_effective_target_vect_strided: returning $et_vect_strided_saved" 2 + return $et_vect_strided_saved +} + # Return 1 if the target supports section-anchors proc check_effective_target_section_anchors { } { diff --git a/gcc/tree-vect-analyze.c b/gcc/tree-vect-analyze.c index a37fcf4395b..684d12dfcb4 100644 --- a/gcc/tree-vect-analyze.c +++ b/gcc/tree-vect-analyze.c @@ -39,6 +39,7 @@ along with GCC; see the file COPYING3. If not see #include "tree-scalar-evolution.h" #include "tree-vectorizer.h" #include "toplev.h" +#include "recog.h" /* Main analysis functions. */ static loop_vec_info vect_analyze_loop_form (struct loop *); @@ -300,6 +301,30 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo) } +/* SLP costs are calculated according to SLP instance unrolling factor (i.e., + the number of created vector stmts depends on the unrolling factor). However, + the actual number of vector stmts for every SLP node depends on VF which is + set later in vect_analyze_operations(). Hence, SLP costs should be updated. + In this function we assume that the inside costs calculated in + vect_model_xxx_cost are linear in ncopies. 
*/ + +static void +vect_update_slp_costs_according_to_vf (loop_vec_info loop_vinfo) +{ + unsigned int i, vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); + VEC (slp_instance, heap) *slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo); + slp_instance instance; + + if (vect_print_dump_info (REPORT_SLP)) + fprintf (vect_dump, "=== vect_update_slp_costs_according_to_vf ==="); + + for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++) + /* We assume that costs are linear in ncopies. */ + SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance) *= vf + / SLP_INSTANCE_UNROLLING_FACTOR (instance); +} + + /* Function vect_analyze_operations. Scan the loop stmts and make sure they are all vectorizable. */ @@ -320,6 +345,7 @@ vect_analyze_operations (loop_vec_info loop_vinfo) int min_profitable_iters; int min_scalar_loop_bound; unsigned int th; + bool only_slp_in_loop = true; if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "=== vect_analyze_operations ==="); @@ -456,12 +482,12 @@ vect_analyze_operations (loop_vec_info loop_vinfo) ok = (vectorizable_type_promotion (stmt, NULL, NULL) || vectorizable_type_demotion (stmt, NULL, NULL) - || vectorizable_conversion (stmt, NULL, NULL) - || vectorizable_operation (stmt, NULL, NULL) - || vectorizable_assignment (stmt, NULL, NULL) - || vectorizable_load (stmt, NULL, NULL) + || vectorizable_conversion (stmt, NULL, NULL, NULL) + || vectorizable_operation (stmt, NULL, NULL, NULL) + || vectorizable_assignment (stmt, NULL, NULL, NULL) + || vectorizable_load (stmt, NULL, NULL, NULL) || vectorizable_call (stmt, NULL, NULL) - || vectorizable_store (stmt, NULL, NULL) + || vectorizable_store (stmt, NULL, NULL, NULL) || vectorizable_condition (stmt, NULL, NULL) || vectorizable_reduction (stmt, NULL, NULL)); @@ -480,6 +506,30 @@ vect_analyze_operations (loop_vec_info loop_vinfo) } return false; } + + if (!PURE_SLP_STMT (stmt_info)) + { + /* STMT needs loop-based vectorization. 
*/ + only_slp_in_loop = false; + + /* Groups of strided accesses whose size is not a power of 2 are + not vectorizable yet using loop-vectorization. Therefore, if + this stmt feeds non-SLP-able stmts (i.e., this stmt has to be + both SLPed and loop-based vectorized), the loop cannot be + vectorized. */ + if (STMT_VINFO_STRIDED_ACCESS (stmt_info) + && exact_log2 (DR_GROUP_SIZE (vinfo_for_stmt ( + DR_GROUP_FIRST_DR (stmt_info)))) == -1) + { + if (vect_print_dump_info (REPORT_DETAILS)) + { + fprintf (vect_dump, "not vectorized: the size of group " + "of strided accesses is not a power of 2"); + print_generic_expr (vect_dump, stmt, TDF_SLIM); + } + return false; + } + } } /* stmts in bb */ } /* bbs */ @@ -499,6 +549,18 @@ vect_analyze_operations (loop_vec_info loop_vinfo) return false; } + /* If all the stmts in the loop can be SLPed, we perform only SLP, and + vectorization factor of the loop is the unrolling factor required by the + SLP instances. If that unrolling factor is 1, we say, that we perform + pure SLP on loop - cross iteration parallelism is not exploited. */ + if (only_slp_in_loop) + vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo); + else + vectorization_factor = least_common_multiple (vectorization_factor, + LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo)); + + LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor; + if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, @@ -518,6 +580,10 @@ vect_analyze_operations (loop_vec_info loop_vinfo) /* Analyze cost. Decide if worth while to vectorize. */ + /* Once VF is set, SLP costs should be updated since the number of created + vector stmts depends on VF. 
*/ + vect_update_slp_costs_according_to_vf (loop_vinfo); + min_profitable_iters = vect_estimate_min_profitable_iters (loop_vinfo); LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters; if (min_profitable_iters < 0) @@ -1462,9 +1528,9 @@ vect_update_misalignment_for_peel (struct data_reference *dr, /* For interleaved data accesses the step in the loop must be multiplied by the size of the interleaving group. */ - if (DR_GROUP_FIRST_DR (stmt_info)) + if (STMT_VINFO_STRIDED_ACCESS (stmt_info)) dr_size *= DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_DR (stmt_info))); - if (DR_GROUP_FIRST_DR (peel_stmt_info)) + if (STMT_VINFO_STRIDED_ACCESS (peel_stmt_info)) dr_peel_size *= DR_GROUP_SIZE (peel_stmt_info); /* It can be assumed that the data refs with the same alignment as dr_peel @@ -1516,7 +1582,7 @@ vect_verify_datarefs_alignment (loop_vec_info loop_vinfo) stmt_vec_info stmt_info = vinfo_for_stmt (stmt); /* For interleaving, only the alignment of the first access matters. */ - if (DR_GROUP_FIRST_DR (stmt_info) + if (STMT_VINFO_STRIDED_ACCESS (stmt_info) && DR_GROUP_FIRST_DR (stmt_info) != stmt) continue; @@ -1554,7 +1620,7 @@ vector_alignment_reachable_p (struct data_reference *dr) stmt_vec_info stmt_info = vinfo_for_stmt (stmt); tree vectype = STMT_VINFO_VECTYPE (stmt_info); - if (DR_GROUP_FIRST_DR (stmt_info)) + if (STMT_VINFO_STRIDED_ACCESS (stmt_info)) { /* For interleaved access we peel only if number of iterations in the prolog loop ({VF - misalignment}), is a multiple of the @@ -1768,7 +1834,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo) /* For interleaving, only the alignment of the first access matters. */ - if (DR_GROUP_FIRST_DR (stmt_info) + if (STMT_VINFO_STRIDED_ACCESS (stmt_info) && DR_GROUP_FIRST_DR (stmt_info) != stmt) continue; @@ -1818,7 +1884,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo) members of the group, therefore we divide the number of iterations by the group size. 
*/ stmt_info = vinfo_for_stmt (DR_STMT (dr0)); - if (DR_GROUP_FIRST_DR (stmt_info)) + if (STMT_VINFO_STRIDED_ACCESS (stmt_info)) npeel /= DR_GROUP_SIZE (stmt_info); if (vect_print_dump_info (REPORT_DETAILS)) @@ -1837,7 +1903,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo) stmt_info = vinfo_for_stmt (stmt); /* For interleaving, only the alignment of the first access matters. */ - if (DR_GROUP_FIRST_DR (stmt_info) + if (STMT_VINFO_STRIDED_ACCESS (stmt_info) && DR_GROUP_FIRST_DR (stmt_info) != stmt) continue; @@ -1907,7 +1973,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo) /* For interleaving, only the alignment of the first access matters. */ if (aligned_access_p (dr) - || (DR_GROUP_FIRST_DR (stmt_info) + || (STMT_VINFO_STRIDED_ACCESS (stmt_info) && DR_GROUP_FIRST_DR (stmt_info) != stmt)) continue; @@ -2019,13 +2085,13 @@ vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo) } -/* Function vect_analyze_data_ref_access. - - Analyze the access pattern of the data-reference DR. For now, a data access - has to be consecutive to be considered vectorizable. */ +/* Analyze groups of strided accesses: check that DR belongs to a group of + strided accesses of legal size, step, etc. Detect gaps, single element + interleaving, and other special cases. Set strided access info. + Collect groups of strided stores for further use in SLP analysis. 
*/ static bool -vect_analyze_data_ref_access (struct data_reference *dr) +vect_analyze_group_access (struct data_reference *dr) { tree step = DR_STEP (dr); tree scalar_type = TREE_TYPE (DR_REF (dr)); @@ -2033,50 +2099,14 @@ vect_analyze_data_ref_access (struct data_reference *dr) tree stmt = DR_STMT (dr); stmt_vec_info stmt_info = vinfo_for_stmt (stmt); loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); - struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step); HOST_WIDE_INT stride; + bool slp_impossible = false; - /* Don't allow invariant accesses. */ - if (dr_step == 0) - return false; - - if (nested_in_vect_loop_p (loop, stmt)) - { - /* For the rest of the analysis we use the outer-loop step. */ - step = STMT_VINFO_DR_STEP (stmt_info); - dr_step = TREE_INT_CST_LOW (step); - - if (dr_step == 0) - { - if (vect_print_dump_info (REPORT_ALIGNMENT)) - fprintf (vect_dump, "zero step in outer loop."); - if (DR_IS_READ (dr)) - return true; - else - return false; - } - } - /* For interleaving, STRIDE is STEP counted in elements, i.e., the size of the interleaving group (including gaps). */ stride = dr_step / type_size; - /* Consecutive? */ - if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))) - { - /* Mark that it is not interleaving. */ - DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) = NULL_TREE; - return true; - } - - if (nested_in_vect_loop_p (loop, stmt)) - { - if (vect_print_dump_info (REPORT_ALIGNMENT)) - fprintf (vect_dump, "strided access in outer loop."); - return false; - } - /* Not consecutive access is possible only if it is a part of interleaving. */ if (!DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt))) { @@ -2119,99 +2149,105 @@ vect_analyze_data_ref_access (struct data_reference *dr) HOST_WIDE_INT diff, count_in_bytes; while (next) - { - /* Skip same data-refs. 
In case that two or more stmts share data-ref - (supported only for loads), we vectorize only the first stmt, and - the rest get their vectorized loads from the first one. */ - if (!tree_int_cst_compare (DR_INIT (data_ref), - DR_INIT (STMT_VINFO_DATA_REF ( - vinfo_for_stmt (next))))) - { + { + /* Skip same data-refs. In case that two or more stmts share data-ref + (supported only for loads), we vectorize only the first stmt, and + the rest get their vectorized loads from the first one. */ + if (!tree_int_cst_compare (DR_INIT (data_ref), + DR_INIT (STMT_VINFO_DATA_REF ( + vinfo_for_stmt (next))))) + { if (!DR_IS_READ (data_ref)) - { + { if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "Two store stmts share the same dr."); - return false; + return false; } - /* Check that there is no load-store dependencies for this loads + /* Check that there is no load-store dependencies for this loads to prevent a case of load-store-load to the same location. */ if (DR_GROUP_READ_WRITE_DEPENDENCE (vinfo_for_stmt (next)) || DR_GROUP_READ_WRITE_DEPENDENCE (vinfo_for_stmt (prev))) { if (vect_print_dump_info (REPORT_DETAILS)) - fprintf (vect_dump, + fprintf (vect_dump, "READ_WRITE dependence in interleaving."); return false; } - /* For load use the same data-ref load. */ - DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev; + /* For load use the same data-ref load. */ + DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev; - prev = next; - next = DR_GROUP_NEXT_DR (vinfo_for_stmt (next)); - continue; - } - prev = next; + prev = next; + next = DR_GROUP_NEXT_DR (vinfo_for_stmt (next)); + continue; + } + prev = next; - /* Check that all the accesses have the same STEP. */ - next_step = DR_STEP (STMT_VINFO_DATA_REF (vinfo_for_stmt (next))); - if (tree_int_cst_compare (step, next_step)) - { - if (vect_print_dump_info (REPORT_DETAILS)) - fprintf (vect_dump, "not consecutive access in interleaving"); - return false; - } + /* Check that all the accesses have the same STEP. 
*/ + next_step = DR_STEP (STMT_VINFO_DATA_REF (vinfo_for_stmt (next))); + if (tree_int_cst_compare (step, next_step)) + { + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "not consecutive access in interleaving"); + return false; + } - data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next)); - /* Check that the distance between two accesses is equal to the type - size. Otherwise, we have gaps. */ - diff = (TREE_INT_CST_LOW (DR_INIT (data_ref)) - - TREE_INT_CST_LOW (prev_init)) / type_size; - if (!DR_IS_READ (data_ref) && diff != 1) + data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next)); + /* Check that the distance between two accesses is equal to the type + size. Otherwise, we have gaps. */ + diff = (TREE_INT_CST_LOW (DR_INIT (data_ref)) + - TREE_INT_CST_LOW (prev_init)) / type_size; + if (diff != 1) { - if (vect_print_dump_info (REPORT_DETAILS)) - fprintf (vect_dump, "interleaved store with gaps"); - return false; + /* FORNOW: SLP of accesses with gaps is not supported. */ + slp_impossible = true; + if (!DR_IS_READ (data_ref)) + { + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "interleaved store with gaps"); + return false; + } } - /* Store the gap from the previous member of the group. If there is no + + /* Store the gap from the previous member of the group. If there is no gap in the access, DR_GROUP_GAP is always 1. */ - DR_GROUP_GAP (vinfo_for_stmt (next)) = diff; + DR_GROUP_GAP (vinfo_for_stmt (next)) = diff; - prev_init = DR_INIT (data_ref); - next = DR_GROUP_NEXT_DR (vinfo_for_stmt (next)); - /* Count the number of data-refs in the chain. */ - count++; - } + prev_init = DR_INIT (data_ref); + next = DR_GROUP_NEXT_DR (vinfo_for_stmt (next)); + /* Count the number of data-refs in the chain. */ + count++; + } - /* COUNT is the number of accesses found, we multiply it by the size of - the type to get COUNT_IN_BYTES. 
*/ + /* COUNT is the number of accesses found, we multiply it by the size of + the type to get COUNT_IN_BYTES. */ count_in_bytes = type_size * count; /* Check that the size of the interleaving is not greater than STEP. */ - if (dr_step < count_in_bytes) - { - if (vect_print_dump_info (REPORT_DETAILS)) - { - fprintf (vect_dump, "interleaving size is greater than step for "); - print_generic_expr (vect_dump, DR_REF (dr), TDF_SLIM); - } - return false; - } + if (dr_step < count_in_bytes) + { + if (vect_print_dump_info (REPORT_DETAILS)) + { + fprintf (vect_dump, "interleaving size is greater than step for "); + print_generic_expr (vect_dump, DR_REF (dr), TDF_SLIM); + } + return false; + } - /* Check that the size of the interleaving is equal to STEP for stores, - i.e., that there are no gaps. */ - if (!DR_IS_READ (dr) && dr_step != count_in_bytes) - { - if (vect_print_dump_info (REPORT_DETAILS)) - fprintf (vect_dump, "interleaved store with gaps"); - return false; - } + /* Check that the size of the interleaving is equal to STEP for stores, + i.e., that there are no gaps. */ + if (!DR_IS_READ (dr) && dr_step != count_in_bytes) + { + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "interleaved store with gaps"); + return false; + } /* Check that STEP is a multiple of type size. */ if ((dr_step % type_size) != 0) - { - if (vect_print_dump_info (REPORT_DETAILS)) + { + if (vect_print_dump_info (REPORT_DETAILS)) { fprintf (vect_dump, "step is not a multiple of type size: step "); print_generic_expr (vect_dump, step, TDF_SLIM); @@ -2219,22 +2255,98 @@ vect_analyze_data_ref_access (struct data_reference *dr) print_generic_expr (vect_dump, TYPE_SIZE_UNIT (scalar_type), TDF_SLIM); } - return false; - } + return false; + } - /* FORNOW: we handle only interleaving that is a power of 2. */ + /* FORNOW: we handle only interleaving that is a power of 2. + We don't fail here if it may be still possible to vectorize the + group using SLP. 
If not, the size of the group will be checked in + vect_analyze_operations, and the vectorization will fail. */ if (exact_log2 (stride) == -1) { if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "interleaving is not a power of 2"); - return false; + + if (slp_impossible) + return false; } DR_GROUP_SIZE (vinfo_for_stmt (stmt)) = stride; + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "Detected interleaving of size %d", (int)stride); + + /* SLP: create an SLP data structure for every interleaving group of + stores for further analysis in vect_analyze_slp. */ + if (!DR_IS_READ (dr) && !slp_impossible) + VEC_safe_push (tree, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo), stmt); } + return true; } +/* Analyze the access pattern of the data-reference DR. + In case of non-consecutive accesses, call vect_analyze_group_access() to + analyze groups of strided accesses. */ + +static bool +vect_analyze_data_ref_access (struct data_reference *dr) +{ + tree step = DR_STEP (dr); + tree scalar_type = TREE_TYPE (DR_REF (dr)); + tree stmt = DR_STMT (dr); + stmt_vec_info stmt_info = vinfo_for_stmt (stmt); + loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); + HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step); + + if (!step) + { + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "bad data-ref access"); + return false; + } + + /* Don't allow invariant accesses. */ + if (dr_step == 0) + return false; + + if (nested_in_vect_loop_p (loop, stmt)) + { + /* For the rest of the analysis we use the outer-loop step. */ + step = STMT_VINFO_DR_STEP (stmt_info); + dr_step = TREE_INT_CST_LOW (step); + + if (dr_step == 0) + { + if (vect_print_dump_info (REPORT_ALIGNMENT)) + fprintf (vect_dump, "zero step in outer loop."); + if (DR_IS_READ (dr)) + return true; + else + return false; + } + } + + /* Consecutive? 
*/ + if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))) + { + /* Mark that it is not interleaving. */ + DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) = NULL_TREE; + return true; + } + + if (nested_in_vect_loop_p (loop, stmt)) + { + if (vect_print_dump_info (REPORT_ALIGNMENT)) + fprintf (vect_dump, "strided access in outer loop."); + return false; + } + + /* Not consecutive access - check if it's a part of interleaving group. */ + return vect_analyze_group_access (dr); +} + + /* Function vect_analyze_data_ref_accesses. Analyze the access pattern of all the data references in the loop. @@ -2266,6 +2378,697 @@ vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo) } +/* Recursively free the memory allocated for the SLP tree rooted at NODE. */ + +void +vect_free_slp_tree (slp_tree node) +{ + if (!node) + return; + + if (SLP_TREE_LEFT (node)) + vect_free_slp_tree (SLP_TREE_LEFT (node)); + + if (SLP_TREE_RIGHT (node)) + vect_free_slp_tree (SLP_TREE_RIGHT (node)); + + VEC_free (tree, heap, SLP_TREE_SCALAR_STMTS (node)); + + if (SLP_TREE_VEC_STMTS (node)) + VEC_free (tree, heap, SLP_TREE_VEC_STMTS (node)); + + free (node); +} + + +/* Get the defs for the RHS (collect them in DEF_STMTS0/1), check that they are + of a legal type and that they match the defs of the first stmt of the SLP + group (stored in FIRST_STMT_...). 
*/ + +static bool +vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, slp_tree slp_node, + tree rhs, VEC (tree, heap) **def_stmts0, + VEC (tree, heap) **def_stmts1, + enum vect_def_type *first_stmt_dt0, + enum vect_def_type *first_stmt_dt1, + tree *first_stmt_def0_type, + tree *first_stmt_def1_type, + tree *first_stmt_const_oprnd, + int ncopies_for_cost) +{ + tree oprnd; + enum operation_type op_type = TREE_OPERAND_LENGTH (rhs); + unsigned int i, number_of_oprnds = op_type; + tree def, def_stmt; + enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type}; + stmt_vec_info stmt_info = + vinfo_for_stmt (VEC_index (tree, SLP_TREE_SCALAR_STMTS (slp_node), 0)); + + /* Store. */ + if (!op_type) + number_of_oprnds = 1; + else + gcc_assert (op_type == unary_op || op_type == binary_op); + + for (i = 0; i < number_of_oprnds; i++) + { + if (op_type) + oprnd = TREE_OPERAND (rhs, i); + else + oprnd = rhs; + + if (!vect_is_simple_use (oprnd, loop_vinfo, &def_stmt, &def, &dt[i]) + || (!def_stmt && dt[i] != vect_constant_def)) + { + if (vect_print_dump_info (REPORT_SLP)) + { + fprintf (vect_dump, "Build SLP failed: can't find def for "); + print_generic_expr (vect_dump, oprnd, TDF_SLIM); + } + + return false; + } + + if (!*first_stmt_dt0) + { + /* op0 of the first stmt of the group - store its info. */ + *first_stmt_dt0 = dt[i]; + if (def) + *first_stmt_def0_type = TREE_TYPE (def); + else + *first_stmt_const_oprnd = oprnd; + + /* Analyze costs (for the first stmt of the group only). */ + if (op_type) + /* Not memory operation (we don't call this functions for loads). */ + vect_model_simple_cost (stmt_info, ncopies_for_cost, dt, slp_node); + else + /* Store. */ + vect_model_store_cost (stmt_info, ncopies_for_cost, dt[0], slp_node); + } + + else + { + if (!*first_stmt_dt1 && i == 1) + { + /* op1 of the first stmt of the group - store its info. 
*/ + *first_stmt_dt1 = dt[i]; + if (def) + *first_stmt_def1_type = TREE_TYPE (def); + else + { + /* We assume that the stmt contains only one constant + operand. We fail otherwise, to be on the safe side. */ + if (*first_stmt_const_oprnd) + { + if (vect_print_dump_info (REPORT_SLP)) + fprintf (vect_dump, "Build SLP failed: two constant " + "oprnds in stmt"); + return false; + } + *first_stmt_const_oprnd = oprnd; + } + } + else + { + /* Not first stmt of the group, check that the def-stmt/s match + the def-stmt/s of the first stmt. */ + if ((i == 0 + && (*first_stmt_dt0 != dt[i] + || (*first_stmt_def0_type && def + && *first_stmt_def0_type != TREE_TYPE (def)))) + || (i == 1 + && (*first_stmt_dt1 != dt[i] + || (*first_stmt_def1_type && def + && *first_stmt_def1_type != TREE_TYPE (def)))) + || (!def + && TREE_TYPE (*first_stmt_const_oprnd) + != TREE_TYPE (oprnd))) + { + if (vect_print_dump_info (REPORT_SLP)) + fprintf (vect_dump, "Build SLP failed: different types "); + + return false; + } + } + } + + /* Check the types of the definitions. */ + switch (dt[i]) + { + case vect_constant_def: + case vect_invariant_def: + break; + + case vect_loop_def: + if (i == 0) + VEC_safe_push (tree, heap, *def_stmts0, def_stmt); + else + VEC_safe_push (tree, heap, *def_stmts1, def_stmt); + break; + + default: + /* FORNOW: Not supported. */ + if (vect_print_dump_info (REPORT_SLP)) + { + fprintf (vect_dump, "Build SLP failed: illegal type of def "); + print_generic_expr (vect_dump, def, TDF_SLIM); + } + + return false; + } + } + + return true; +} + + +/* Recursively build an SLP tree starting from NODE. + Fail (and return FALSE) if def-stmts are not isomorphic, require data + permutation or are of unsupported types of operation. Otherwise, return + TRUE. + SLP_IMPOSSIBLE is TRUE if it is impossible to SLP in the loop, for example + in the case of multiple types for now. 
*/ + +static bool +vect_build_slp_tree (loop_vec_info loop_vinfo, slp_tree *node, + unsigned int group_size, bool *slp_impossible, + int *inside_cost, int *outside_cost, + int ncopies_for_cost) +{ + VEC (tree, heap) *def_stmts0 = VEC_alloc (tree, heap, group_size); + VEC (tree, heap) *def_stmts1 = VEC_alloc (tree, heap, group_size); + unsigned int i; + VEC (tree, heap) *stmts = SLP_TREE_SCALAR_STMTS (*node); + tree stmt = VEC_index (tree, stmts, 0); + enum vect_def_type first_stmt_dt0 = 0, first_stmt_dt1 = 0; + enum tree_code first_stmt_code = 0; + tree first_stmt_def1_type = NULL_TREE, first_stmt_def0_type = NULL_TREE; + tree lhs, rhs, prev_stmt = NULL_TREE; + bool stop_recursion = false, need_same_oprnds = false; + tree vectype, scalar_type, first_op1 = NULL_TREE; + unsigned int vectorization_factor = 0, ncopies; + optab optab; + int icode; + enum machine_mode optab_op2_mode; + enum machine_mode vec_mode; + tree first_stmt_const_oprnd = NULL_TREE; + struct data_reference *first_dr; + + /* For every stmt in NODE find its def stmt/s. */ + for (i = 0; VEC_iterate (tree, stmts, i, stmt); i++) + { + if (vect_print_dump_info (REPORT_SLP)) + { + fprintf (vect_dump, "Build SLP for "); + print_generic_expr (vect_dump, stmt, TDF_SLIM); + } + + if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT) + { + if (vect_print_dump_info (REPORT_SLP)) + { + fprintf (vect_dump, "Build SLP failed: not MODIFY_STMT "); + print_generic_expr (vect_dump, stmt, TDF_SLIM); + } + + return false; + } + + scalar_type = TREE_TYPE (GIMPLE_STMT_OPERAND (stmt, 0)); + vectype = get_vectype_for_scalar_type (scalar_type); + gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo)); + vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); + ncopies = vectorization_factor / TYPE_VECTOR_SUBPARTS (vectype); + if (ncopies > 1) + { + /* FORNOW. 
*/ + if (vect_print_dump_info (REPORT_SLP)) + fprintf (vect_dump, "SLP failed - multiple types "); + + *slp_impossible = true; + return false; + } + + lhs = GIMPLE_STMT_OPERAND (stmt, 0); + rhs = GIMPLE_STMT_OPERAND (stmt, 1); + + /* Check the operation. */ + if (i == 0) + { + first_stmt_code = TREE_CODE (rhs); + + /* Shift arguments should be equal in all the packed stmts for a + vector shift with scalar shift operand. */ + if (TREE_CODE (rhs) == LSHIFT_EXPR || TREE_CODE (rhs) == RSHIFT_EXPR) + { + vec_mode = TYPE_MODE (vectype); + optab = optab_for_tree_code (TREE_CODE (rhs), vectype); + if (!optab) + { + if (vect_print_dump_info (REPORT_SLP)) + fprintf (vect_dump, "Build SLP failed: no optab."); + return false; + } + icode = (int) optab->handlers[(int) vec_mode].insn_code; + optab_op2_mode = insn_data[icode].operand[2].mode; + if (!VECTOR_MODE_P (optab_op2_mode)) + { + need_same_oprnds = true; + first_op1 = TREE_OPERAND (rhs, 1); + } + } + } + else + { + if (first_stmt_code != TREE_CODE (rhs)) + { + if (vect_print_dump_info (REPORT_SLP)) + { + fprintf (vect_dump, + "Build SLP failed: different operation in stmt "); + print_generic_expr (vect_dump, stmt, TDF_SLIM); + } + + return false; + } + + if (need_same_oprnds + && !operand_equal_p (first_op1, TREE_OPERAND (rhs, 1), 0)) + { + if (vect_print_dump_info (REPORT_SLP)) + { + fprintf (vect_dump, + "Build SLP failed: different shift arguments in "); + print_generic_expr (vect_dump, stmt, TDF_SLIM); + } + + return false; + } + } + + /* Strided store or load. */ + if (STMT_VINFO_STRIDED_ACCESS (vinfo_for_stmt (stmt))) + { + if (REFERENCE_CLASS_P (lhs)) + { + /* Store. */ + if (!vect_get_and_check_slp_defs (loop_vinfo, *node, rhs, + &def_stmts0, &def_stmts1, + &first_stmt_dt0, + &first_stmt_dt1, + &first_stmt_def0_type, + &first_stmt_def1_type, + &first_stmt_const_oprnd, + ncopies_for_cost)) + return false; + } + else + { + /* Load. 
*/ + if (i == 0) + { + /* First stmt of the SLP group should be the first load of + the interleaving loop if data permutation is not + allowed. */ + if (DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) != stmt) + { + /* FORNOW: data permutations are not supported. */ + if (vect_print_dump_info (REPORT_SLP)) + { + fprintf (vect_dump, "Build SLP failed: strided " + " loads need permutation "); + print_generic_expr (vect_dump, stmt, TDF_SLIM); + } + + return false; + } + + first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)); + if (vect_supportable_dr_alignment (first_dr) + == dr_unaligned_unsupported) + { + if (vect_print_dump_info (REPORT_SLP)) + { + fprintf (vect_dump, "Build SLP failed: unsupported " + " unaligned load "); + print_generic_expr (vect_dump, stmt, TDF_SLIM); + } + + return false; + } + + /* Analyze costs (for the first stmt in the group). */ + vect_model_load_cost (vinfo_for_stmt (stmt), + ncopies_for_cost, *node); + } + else + { + if (DR_GROUP_NEXT_DR (vinfo_for_stmt (prev_stmt)) != stmt) + { + /* FORNOW: data permutations are not supported. */ + if (vect_print_dump_info (REPORT_SLP)) + { + fprintf (vect_dump, "Build SLP failed: strided " + " loads need permutation "); + print_generic_expr (vect_dump, stmt, TDF_SLIM); + } + return false; + } + } + + prev_stmt = stmt; + + /* We stop the tree when we reach a group of loads. */ + stop_recursion = true; + continue; + } + } /* Strided access. */ + else + { + if (REFERENCE_CLASS_P (rhs)) + { + /* Not strided load. */ + if (vect_print_dump_info (REPORT_SLP)) + { + fprintf (vect_dump, "Build SLP failed: not strided load "); + print_generic_expr (vect_dump, stmt, TDF_SLIM); + } + + /* FORNOW: Not strided loads are not supported. */ + return false; + } + + /* Not memory operation. 
*/ + if (!BINARY_CLASS_P (rhs) && !UNARY_CLASS_P (rhs)) + { + if (vect_print_dump_info (REPORT_SLP)) + { + fprintf (vect_dump, "Build SLP failed: operation"); + fprintf (vect_dump, " unsupported "); + print_generic_expr (vect_dump, stmt, TDF_SLIM); + } + + return false; + } + + /* Find the def-stmts. */ + if (!vect_get_and_check_slp_defs (loop_vinfo, *node, rhs, &def_stmts0, + &def_stmts1, &first_stmt_dt0, + &first_stmt_dt1, + &first_stmt_def0_type, + &first_stmt_def1_type, + &first_stmt_const_oprnd, + ncopies_for_cost)) + return false; + } + } + + /* Add the costs of the node to the overall instance costs. */ + *inside_cost += SLP_TREE_INSIDE_OF_LOOP_COST (*node); + *outside_cost += SLP_TREE_OUTSIDE_OF_LOOP_COST (*node); + + /* Strided loads were reached - stop the recursion. */ + if (stop_recursion) + return true; + + /* Create SLP_TREE nodes for the definition node/s. */ + if (first_stmt_dt0 == vect_loop_def) + { + slp_tree left_node = XNEW (struct _slp_tree); + SLP_TREE_SCALAR_STMTS (left_node) = def_stmts0; + SLP_TREE_VEC_STMTS (left_node) = NULL; + SLP_TREE_LEFT (left_node) = NULL; + SLP_TREE_RIGHT (left_node) = NULL; + SLP_TREE_OUTSIDE_OF_LOOP_COST (left_node) = 0; + SLP_TREE_INSIDE_OF_LOOP_COST (left_node) = 0; + if (!vect_build_slp_tree (loop_vinfo, &left_node, group_size, + slp_impossible, inside_cost, outside_cost, + ncopies_for_cost)) + return false; + + SLP_TREE_LEFT (*node) = left_node; + } + + if (first_stmt_dt1 == vect_loop_def) + { + slp_tree right_node = XNEW (struct _slp_tree); + SLP_TREE_SCALAR_STMTS (right_node) = def_stmts1; + SLP_TREE_VEC_STMTS (right_node) = NULL; + SLP_TREE_LEFT (right_node) = NULL; + SLP_TREE_RIGHT (right_node) = NULL; + SLP_TREE_OUTSIDE_OF_LOOP_COST (right_node) = 0; + SLP_TREE_INSIDE_OF_LOOP_COST (right_node) = 0; + if (!vect_build_slp_tree (loop_vinfo, &right_node, group_size, + slp_impossible, inside_cost, outside_cost, + ncopies_for_cost)) + return false; + + SLP_TREE_RIGHT (*node) = right_node; + } + + return true; 
+} + + +static void +vect_print_slp_tree (slp_tree node) +{ + int i; + tree stmt; + + if (!node) + return; + + fprintf (vect_dump, "node "); + for (i = 0; VEC_iterate (tree, SLP_TREE_SCALAR_STMTS (node), i, stmt); i++) + { + fprintf (vect_dump, "\n\tstmt %d ", i); + print_generic_expr (vect_dump, stmt, TDF_SLIM); + } + fprintf (vect_dump, "\n"); + + vect_print_slp_tree (SLP_TREE_LEFT (node)); + vect_print_slp_tree (SLP_TREE_RIGHT (node)); +} + + +/* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID). + If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index + J). Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the + stmts in NODE are to be marked. */ + +static void +vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j) +{ + int i; + tree stmt; + + if (!node) + return; + + for (i = 0; VEC_iterate (tree, SLP_TREE_SCALAR_STMTS (node), i, stmt); i++) + if (j < 0 || i == j) + STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark; + + vect_mark_slp_stmts (SLP_TREE_LEFT (node), mark, j); + vect_mark_slp_stmts (SLP_TREE_RIGHT (node), mark, j); +} + + +/* Analyze an SLP instance starting from a group of strided stores. Call + vect_build_slp_tree to build a tree of packed stmts if possible. + Return FALSE if it's impossible to SLP any stmt in the loop. */ + +static bool +vect_analyze_slp_instance (loop_vec_info loop_vinfo, tree stmt) +{ + slp_instance new_instance; + slp_tree node = XNEW (struct _slp_tree); + unsigned int group_size = DR_GROUP_SIZE (vinfo_for_stmt (stmt)); + unsigned int unrolling_factor = 1, nunits; + tree vectype, scalar_type, next; + unsigned int vectorization_factor = 0, ncopies; + bool slp_impossible = false; + int inside_cost = 0, outside_cost = 0, ncopies_for_cost; + + /* FORNOW: multiple types are not supported. 
*/ + scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))); + vectype = get_vectype_for_scalar_type (scalar_type); + nunits = TYPE_VECTOR_SUBPARTS (vectype); + vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); + ncopies = vectorization_factor / nunits; + if (ncopies > 1) + { + if (vect_print_dump_info (REPORT_SLP)) + fprintf (vect_dump, "SLP failed - multiple types "); + + return false; + } + + /* Create a node (a root of the SLP tree) for the packed strided stores. */ + SLP_TREE_SCALAR_STMTS (node) = VEC_alloc (tree, heap, group_size); + next = stmt; + /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */ + while (next) + { + VEC_safe_push (tree, heap, SLP_TREE_SCALAR_STMTS (node), next); + next = DR_GROUP_NEXT_DR (vinfo_for_stmt (next)); + } + + SLP_TREE_VEC_STMTS (node) = NULL; + SLP_TREE_NUMBER_OF_VEC_STMTS (node) = 0; + SLP_TREE_LEFT (node) = NULL; + SLP_TREE_RIGHT (node) = NULL; + SLP_TREE_OUTSIDE_OF_LOOP_COST (node) = 0; + SLP_TREE_INSIDE_OF_LOOP_COST (node) = 0; + + /* Calculate the unrolling factor. */ + unrolling_factor = least_common_multiple (nunits, group_size) / group_size; + + /* Calculate the number of vector stmts to create based on the unrolling + factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is + GROUP_SIZE / NUNITS otherwise). */ + ncopies_for_cost = unrolling_factor * group_size / nunits; + + /* Build the tree for the SLP instance. */ + if (vect_build_slp_tree (loop_vinfo, &node, group_size, &slp_impossible, + &inside_cost, &outside_cost, ncopies_for_cost)) + { + /* Create a new SLP instance. 
*/ + new_instance = XNEW (struct _slp_instance); + SLP_INSTANCE_TREE (new_instance) = node; + SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size; + SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor; + SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (new_instance) = outside_cost; + SLP_INSTANCE_INSIDE_OF_LOOP_COST (new_instance) = inside_cost; + VEC_safe_push (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo), + new_instance); + if (vect_print_dump_info (REPORT_SLP)) + vect_print_slp_tree (node); + + return true; + } + + /* Failed to SLP. */ + /* Free the allocated memory. */ + vect_free_slp_tree (node); + + if (slp_impossible) + return false; + + /* SLP failed for this instance, but it is still possible to SLP other stmts + in the loop. */ + return true; +} + + +/* Check if there are stmts in the loop that can be vectorized using SLP. Build SLP + trees of packed scalar stmts if SLP is possible. */ + +static bool +vect_analyze_slp (loop_vec_info loop_vinfo) +{ + unsigned int i; + VEC (tree, heap) *strided_stores = LOOP_VINFO_STRIDED_STORES (loop_vinfo); + tree store; + + if (vect_print_dump_info (REPORT_SLP)) + fprintf (vect_dump, "=== vect_analyze_slp ==="); + + for (i = 0; VEC_iterate (tree, strided_stores, i, store); i++) + if (!vect_analyze_slp_instance (loop_vinfo, store)) + { + /* SLP failed. No instance can be SLPed in the loop. */ + if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS)) + fprintf (vect_dump, "SLP failed."); + + return false; + } + + return true; +} + + +/* For each possible SLP instance decide whether to SLP it and calculate overall + unrolling factor needed to SLP the loop. 
*/ + +static void +vect_make_slp_decision (loop_vec_info loop_vinfo) +{ + unsigned int i, unrolling_factor = 1; + VEC (slp_instance, heap) *slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo); + slp_instance instance; + int decided_to_slp = 0; + + if (vect_print_dump_info (REPORT_SLP)) + fprintf (vect_dump, "=== vect_make_slp_decision ==="); + + for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++) + { + /* FORNOW: SLP if you can. */ + if (unrolling_factor < SLP_INSTANCE_UNROLLING_FACTOR (instance)) + unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (instance); + + /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we + call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and + loop-based vectorization. Such stmts will be marked as HYBRID. */ + vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1); + decided_to_slp++; + } + + LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor; + + if (decided_to_slp && vect_print_dump_info (REPORT_SLP)) + fprintf (vect_dump, "Decided to SLP %d instances. Unrolling factor %d", + decided_to_slp, unrolling_factor); +} + + +/* Find stmts that must be both vectorized and SLPed (since they feed stmts that + can't be SLPed) in the tree rooted at NODE. Mark such stmts as HYBRID. 
*/ + +static void +vect_detect_hybrid_slp_stmts (slp_tree node) +{ + int i; + tree stmt; + imm_use_iterator imm_iter; + tree use_stmt; + + if (!node) + return; + + for (i = 0; VEC_iterate (tree, SLP_TREE_SCALAR_STMTS (node), i, stmt); i++) + if (PURE_SLP_STMT (vinfo_for_stmt (stmt)) + && TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) == SSA_NAME) + FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, GIMPLE_STMT_OPERAND (stmt, 0)) + if (vinfo_for_stmt (use_stmt) + && !STMT_SLP_TYPE (vinfo_for_stmt (use_stmt))) + vect_mark_slp_stmts (node, hybrid, i); + + vect_detect_hybrid_slp_stmts (SLP_TREE_LEFT (node)); + vect_detect_hybrid_slp_stmts (SLP_TREE_RIGHT (node)); +} + + +/* Find stmts that must be both vectorized and SLPed. */ + +static void +vect_detect_hybrid_slp (loop_vec_info loop_vinfo) +{ + unsigned int i; + VEC (slp_instance, heap) *slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo); + slp_instance instance; + + if (vect_print_dump_info (REPORT_SLP)) + fprintf (vect_dump, "=== vect_detect_hybrid_slp ==="); + + for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++) + vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance)); +} + + /* Function vect_analyze_data_refs. Find all the data references in the loop. @@ -3424,6 +4227,17 @@ vect_analyze_loop (struct loop *loop) return NULL; } + /* Check the SLP opportunities in the loop, analyze and build SLP trees. */ + ok = vect_analyze_slp (loop_vinfo); + if (ok) + { + /* Decide which possible SLP instances to SLP. */ + vect_make_slp_decision (loop_vinfo); + + /* Find stmts that need to be both vectorized and SLPed. */ + vect_detect_hybrid_slp (loop_vinfo); + } + /* This pass will decide on using loop versioning and/or loop peeling in order to enhance the alignment of data references in the loop. 
*/ diff --git a/gcc/tree-vect-transform.c b/gcc/tree-vect-transform.c index cddebde26ad..9698b417885 100644 --- a/gcc/tree-vect-transform.c +++ b/gcc/tree-vect-transform.c @@ -46,7 +46,7 @@ along with GCC; see the file COPYING3. If not see #include "real.h" /* Utility functions for the code transformation. */ -static bool vect_transform_stmt (tree, block_stmt_iterator *, bool *); +static bool vect_transform_stmt (tree, block_stmt_iterator *, bool *, slp_tree); static tree vect_create_destination_var (tree, tree); static tree vect_create_data_ref_ptr (tree, struct loop*, tree, tree *, tree *, bool, tree, bool *); @@ -125,6 +125,8 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo) int nbbs = loop->num_nodes; int byte_misalign; int innerloop_iters = 0, factor; + VEC (slp_instance, heap) *slp_instances; + slp_instance instance; /* Cost model disabled. */ if (!flag_vect_cost_model) @@ -287,6 +289,14 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo) targetm.vectorize.builtin_vectorization_cost (runtime_test)); } + /* Add SLP costs. */ + slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo); + for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++) + { + vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance); + vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance); + } + /* Calculate number of iterations required to make the vector version profitable, relative to the loop bodies only. The following condition must hold true: ((SIC*VF)-VIC)*niters > VOC*VF, where @@ -452,30 +462,55 @@ vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies) } +/* Return addresses of the cost fields of SLP_NODE if it's not NULL, and of + the stmt otherwise. 
*/ + +static inline void +vect_get_cost_fields (stmt_vec_info stmt_info, slp_tree slp_node, + int **inside_cost_field, int **outside_cost_field) +{ + if (slp_node) + { + *inside_cost_field = &(SLP_TREE_INSIDE_OF_LOOP_COST (slp_node)); + *outside_cost_field = &(SLP_TREE_OUTSIDE_OF_LOOP_COST (slp_node)); + } + else + { + *inside_cost_field = &(STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)); + *outside_cost_field = &(STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info)); + } +} + + /* Function vect_model_simple_cost. Models cost for simple operations, i.e. those that only emit ncopies of a single op. Right now, this does not account for multiple insns that could be generated for the single vector op. We will handle that shortly. */ -static void -vect_model_simple_cost (stmt_vec_info stmt_info, int ncopies, enum vect_def_type *dt) +void +vect_model_simple_cost (stmt_vec_info stmt_info, int ncopies, + enum vect_def_type *dt, slp_tree slp_node) { int i; + int *inside_cost_field, *outside_cost_field; - STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST; + /* Take addresses of relevant fields to update in the function. */ + vect_get_cost_fields (stmt_info, slp_node, &inside_cost_field, + &outside_cost_field); + + *inside_cost_field = ncopies * TARG_VEC_STMT_COST; /* FORNOW: Assuming maximum 2 args per stmts. */ - for (i=0; i<2; i++) + for (i = 0; i < 2; i++) { if (dt[i] == vect_constant_def || dt[i] == vect_invariant_def) - STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) += TARG_SCALAR_TO_VEC_COST; + *outside_cost_field += TARG_SCALAR_TO_VEC_COST; } if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "vect_model_simple_cost: inside_cost = %d, " - "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info), - STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info)); + "outside_cost = %d .", *inside_cost_field, *outside_cost_field); } @@ -502,14 +537,20 @@ vect_cost_strided_group_size (stmt_vec_info stmt_info) Models cost for stores. 
In the case of strided accesses, one access has the overhead of the strided access attributed to it. */ -static void -vect_model_store_cost (stmt_vec_info stmt_info, int ncopies, enum vect_def_type dt) +void +vect_model_store_cost (stmt_vec_info stmt_info, int ncopies, + enum vect_def_type dt, slp_tree slp_node) { int cost = 0; int group_size; + int *inside_cost_field, *outside_cost_field; + + /* Take addresses of relevant fields to update in the function. */ + vect_get_cost_fields (stmt_info, slp_node, &inside_cost_field, + &outside_cost_field); if (dt == vect_constant_def || dt == vect_invariant_def) - STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = TARG_SCALAR_TO_VEC_COST; + *outside_cost_field = TARG_SCALAR_TO_VEC_COST; /* Strided access? */ if (DR_GROUP_FIRST_DR (stmt_info)) @@ -535,12 +576,11 @@ vect_model_store_cost (stmt_vec_info stmt_info, int ncopies, enum vect_def_type /* Costs of the stores. */ cost += ncopies * TARG_VEC_STORE_COST; - STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = cost; + *inside_cost_field = cost; if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "vect_model_store_cost: inside_cost = %d, " - "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info), - STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info)); + "outside_cost = %d .", *inside_cost_field, *outside_cost_field); } @@ -551,8 +591,8 @@ vect_model_store_cost (stmt_vec_info stmt_info, int ncopies, enum vect_def_type accesses are supported for loads, we also account for the costs of the access scheme chosen. 
*/ -static void -vect_model_load_cost (stmt_vec_info stmt_info, int ncopies) +void +vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, slp_tree slp_node) { int inner_cost = 0; @@ -560,10 +600,15 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies) int alignment_support_cheme; tree first_stmt; struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr; + int *inside_cost_field, *outside_cost_field; + + /* Take addresses of relevant fields to update in the function. */ + vect_get_cost_fields (stmt_info, slp_node, &inside_cost_field, + &outside_cost_field); /* Strided accesses? */ first_stmt = DR_GROUP_FIRST_DR (stmt_info); - if (first_stmt) + if (first_stmt && !slp_node) { group_size = vect_cost_strided_group_size (stmt_info); first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)); @@ -641,14 +686,14 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies) access in the group. Inside the loop, there is a load op and a realignment op. */ - if ((!DR_GROUP_FIRST_DR (stmt_info)) || group_size > 1) + if ((!DR_GROUP_FIRST_DR (stmt_info)) || group_size > 1 || slp_node) { outer_cost = 2*TARG_VEC_STMT_COST; if (targetm.vectorize.builtin_mask_for_load) outer_cost += TARG_VEC_STMT_COST; } - STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost; + *outside_cost_field = outer_cost; inner_cost += ncopies * (TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST); @@ -659,12 +704,11 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies) gcc_unreachable (); } - STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = inner_cost; + *inside_cost_field = inner_cost; if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "vect_model_load_cost: inside_cost = %d, " - "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info), - STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info)); + "outside_cost = %d .", *inside_cost_field, *outside_cost_field); } @@ -1256,6 +1300,177 @@ vect_init_vector (tree stmt, tree vector_var, tree vector_type, } +/* For constant 
and loop invariant defs of SLP_NODE this function returns + (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts. + OP_NUM determines if we gather defs for operand 0 or operand 1 of the scalar + stmts. */ + +static void +vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds, + unsigned int op_num) +{ + VEC (tree, heap) *stmts = SLP_TREE_SCALAR_STMTS (slp_node); + tree stmt = VEC_index (tree, stmts, 0); + stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt); + tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo); + int nunits = TYPE_VECTOR_SUBPARTS (vectype); + tree vec_cst; + tree t = NULL_TREE; + int j, number_of_places_left_in_vector; + tree vector_type; + tree op, vop, operation; + int group_size = VEC_length (tree, stmts); + unsigned int vec_num, i; + int number_of_copies = 1; + bool is_store = false; + unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); + VEC (tree, heap) *voprnds = VEC_alloc (tree, heap, number_of_vectors); + + if (STMT_VINFO_DATA_REF (stmt_vinfo)) + is_store = true; + + /* NUMBER_OF_COPIES is the number of times we need to use the same values in + created vectors. It is greater than 1 if unrolling is performed. + + For example, we have two scalar operands, s1 and s2 (e.g., group of + strided accesses of size two), while NUNITS is four (i.e., four scalars + of this type can be packed in a vector). The output vector will contain + two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES + will be 2). + + If GROUP_SIZE > NUNITS, the scalars will be split into several vectors + containing the operands. + + For example, NUNITS is four as before, and the group size is 8 + (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and + {s5, s6, s7, s8}. 
*/ + + number_of_copies = least_common_multiple (nunits, group_size) / group_size; + + number_of_places_left_in_vector = nunits; + for (j = 0; j < number_of_copies; j++) + { + for (i = group_size - 1; VEC_iterate (tree, stmts, i, stmt); i--) + { + operation = GIMPLE_STMT_OPERAND (stmt, 1); + if (is_store) + op = operation; + else + op = TREE_OPERAND (operation, op_num); + + /* Create 'vect_ = {op0,op1,...,opn}'. */ + t = tree_cons (NULL_TREE, op, t); + + number_of_places_left_in_vector--; + + if (number_of_places_left_in_vector == 0) + { + number_of_places_left_in_vector = nunits; + + vector_type = get_vectype_for_scalar_type (TREE_TYPE (op)); + vec_cst = build_constructor_from_list (vector_type, t); + VEC_quick_push (tree, voprnds, + vect_init_vector (stmt, vec_cst, vector_type, + NULL)); + t = NULL_TREE; + } + } + } + + /* Since the vectors are created in the reverse order, we should invert + them. */ + vec_num = VEC_length (tree, voprnds); + for (j = vec_num - 1; j >= 0; j--) + { + vop = VEC_index (tree, voprnds, j); + VEC_quick_push (tree, *vec_oprnds, vop); + } + + VEC_free (tree, heap, voprnds); + + /* In case that VF is greater than the unrolling factor needed for the SLP + group of stmts, NUMBER_OF_VECTORS to be created is greater than + NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have + to replicate the vectors. */ + while (number_of_vectors > VEC_length (tree, *vec_oprnds)) + { + for (i = 0; VEC_iterate (tree, *vec_oprnds, i, vop) && i < vec_num; i++) + VEC_quick_push (tree, *vec_oprnds, vop); + } +} + + +/* Get vectorized definitions from SLP_NODE that contains corresponding + vectorized def-stmts. 
*/ + +static void +vect_get_slp_vect_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds) +{ + tree vec_oprnd; + tree vec_def_stmt; + unsigned int i; + + gcc_assert (SLP_TREE_VEC_STMTS (slp_node)); + + for (i = 0; + VEC_iterate (tree, SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt); + i++) + { + gcc_assert (vec_def_stmt); + vec_oprnd = GIMPLE_STMT_OPERAND (vec_def_stmt, 0); + VEC_quick_push (tree, *vec_oprnds, vec_oprnd); + } +} + + +/* Get vectorized definitions for SLP_NODE. + If the scalar definitions are loop invariants or constants, collect them and + call vect_get_constant_vectors() to create vector stmts. + Otherwise, the def-stmts must be already vectorized and the vectorized stmts + must be stored in the LEFT/RIGHT node of SLP_NODE, and we call + vect_get_slp_vect_defs() to retrieve them. */ + +static void +vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0, + VEC (tree,heap) **vec_oprnds1) +{ + tree operation, first_stmt; + + /* Allocate memory for vectorized defs. */ + *vec_oprnds0 = VEC_alloc (tree, heap, + SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node)); + + /* SLP_NODE corresponds either to a group of stores or to a group of + unary/binary operations. We don't call this function for loads. */ + if (SLP_TREE_LEFT (slp_node)) + /* The defs are already vectorized. */ + vect_get_slp_vect_defs (SLP_TREE_LEFT (slp_node), vec_oprnds0); + else + /* Build vectors from scalar defs. */ + vect_get_constant_vectors (slp_node, vec_oprnds0, 0); + + first_stmt = VEC_index (tree, SLP_TREE_SCALAR_STMTS (slp_node), 0); + if (STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt))) + /* Since we don't call this function with loads, this is a group of + stores. */ + return; + + operation = GIMPLE_STMT_OPERAND (first_stmt, 1); + if (TREE_OPERAND_LENGTH (operation) == unary_op) + return; + + *vec_oprnds1 = VEC_alloc (tree, heap, + SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node)); + + if (SLP_TREE_RIGHT (slp_node)) + /* The defs are already vectorized. 
*/ + vect_get_slp_vect_defs (SLP_TREE_RIGHT (slp_node), vec_oprnds1); + else + /* Build vectors from scalar defs. */ + vect_get_constant_vectors (slp_node, vec_oprnds1, 1); +} + + /* Function get_initial_def_for_induction Input: @@ -1744,6 +1959,54 @@ vect_get_vec_def_for_stmt_copy (enum vect_def_type dt, tree vec_oprnd) } +/* Get vectorized definitions for the operands to create a copy of an original + stmt. See vect_get_vec_def_for_stmt_copy() for details. */ + +static void +vect_get_vec_defs_for_stmt_copy (enum vect_def_type *dt, + VEC(tree,heap) **vec_oprnds0, + VEC(tree,heap) **vec_oprnds1) +{ + tree vec_oprnd = VEC_pop (tree, *vec_oprnds0); + + vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd); + VEC_quick_push (tree, *vec_oprnds0, vec_oprnd); + + if (vec_oprnds1) + { + vec_oprnd = VEC_pop (tree, *vec_oprnds1); + vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd); + VEC_quick_push (tree, *vec_oprnds1, vec_oprnd); + } +} + + +/* Get vectorized definitions for OP0 and OP1, or SLP_NODE if it is not NULL. */ + +static void +vect_get_vec_defs (tree op0, tree op1, tree stmt, VEC(tree,heap) **vec_oprnds0, + VEC(tree,heap) **vec_oprnds1, slp_tree slp_node) +{ + if (slp_node) + vect_get_slp_defs (slp_node, vec_oprnds0, vec_oprnds1); + else + { + tree vec_oprnd; + + *vec_oprnds0 = VEC_alloc (tree, heap, 1); + vec_oprnd = vect_get_vec_def_for_operand (op0, stmt, NULL); + VEC_quick_push (tree, *vec_oprnds0, vec_oprnd); + + if (op1) + { + *vec_oprnds1 = VEC_alloc (tree, heap, 1); + vec_oprnd = vect_get_vec_def_for_operand (op1, stmt, NULL); + VEC_quick_push (tree, *vec_oprnds1, vec_oprnd); + } + } +} + + /* Function vect_finish_stmt_generation. Insert a new stmt. */ @@ -2399,6 +2662,10 @@ vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) gcc_assert (ncopies >= 1); + /* FORNOW: SLP not supported. */ + if (STMT_SLP_TYPE (stmt_info)) + return false; + /* 1. Is vectorizable reduction? 
*/ /* Not supportable if the reduction variable is used in the loop. */ @@ -2707,6 +2974,10 @@ vectorizable_call (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def) return false; + /* FORNOW: SLP not supported. */ + if (STMT_SLP_TYPE (stmt_info)) + return false; + /* FORNOW: not yet supported. */ if (STMT_VINFO_LIVE_P (stmt_info)) { @@ -2815,7 +3086,7 @@ vectorizable_call (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) STMT_VINFO_TYPE (stmt_info) = call_vec_info_type; if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "=== vectorizable_call ==="); - vect_model_simple_cost (stmt_info, ncopies, dt); + vect_model_simple_cost (stmt_info, ncopies, dt, NULL); return true; } @@ -3005,16 +3276,14 @@ vect_gen_widened_results_half (enum tree_code code, tree vectype, tree decl, } -/* Function vectorizable_conversion. - -Check if STMT performs a conversion operation, that can be vectorized. -If VEC_STMT is also passed, vectorize the STMT: create a vectorized -stmt to replace it, put it in VEC_STMT, and insert it at BSI. -Return FALSE if not a vectorizable STMT, TRUE otherwise. */ +/* Check if STMT performs a conversion operation, that can be vectorized. + If VEC_STMT is also passed, vectorize the STMT: create a vectorized + stmt to replace it, put it in VEC_STMT, and insert it at BSI. + Return FALSE if not a vectorizable STMT, TRUE otherwise. 
*/ bool -vectorizable_conversion (tree stmt, block_stmt_iterator * bsi, - tree * vec_stmt) +vectorizable_conversion (tree stmt, block_stmt_iterator *bsi, + tree *vec_stmt, slp_tree slp_node) { tree vec_dest; tree scalar_dest; @@ -3028,8 +3297,8 @@ vectorizable_conversion (tree stmt, block_stmt_iterator * bsi, tree decl1 = NULL_TREE, decl2 = NULL_TREE; tree new_temp; tree def, def_stmt; - enum vect_def_type dt0; - tree new_stmt; + enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type}; + tree new_stmt = NULL_TREE; stmt_vec_info prev_stmt_info; int nunits_in; int nunits_out; @@ -3039,6 +3308,9 @@ vectorizable_conversion (tree stmt, block_stmt_iterator * bsi, tree rhs_type, lhs_type; tree builtin_decl; enum { NARROW, NONE, WIDEN } modifier; + int i; + VEC(tree,heap) *vec_oprnds0 = NULL; + tree vop0; /* Is STMT a vectorizable conversion? */ @@ -3067,7 +3339,7 @@ vectorizable_conversion (tree stmt, block_stmt_iterator * bsi, if (code != FIX_TRUNC_EXPR && code != FLOAT_EXPR) return false; - /* Check types of lhs and rhs */ + /* Check types of lhs and rhs. */ op0 = TREE_OPERAND (operation, 0); rhs_type = TREE_TYPE (op0); vectype_in = get_vectype_for_scalar_type (rhs_type); @@ -3091,7 +3363,7 @@ vectorizable_conversion (tree stmt, block_stmt_iterator * bsi, if (modifier == NONE) gcc_assert (STMT_VINFO_VECTYPE (stmt_info) == vectype_out); - /* Bail out if the types are both integral or non-integral */ + /* Bail out if the types are both integral or non-integral. */ if ((INTEGRAL_TYPE_P (rhs_type) && INTEGRAL_TYPE_P (lhs_type)) || (!INTEGRAL_TYPE_P (rhs_type) && !INTEGRAL_TYPE_P (lhs_type))) return false; @@ -3101,6 +3373,11 @@ vectorizable_conversion (tree stmt, block_stmt_iterator * bsi, else ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in; + /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies + this, so we can safely override NCOPIES with 1 here. 
*/ + if (slp_node) + ncopies = 1; + /* Sanity check: make sure that at least one copy of the vectorized stmt needs to be generated. */ gcc_assert (ncopies >= 1); @@ -3114,7 +3391,7 @@ vectorizable_conversion (tree stmt, block_stmt_iterator * bsi, } /* Check the operands of the operation. */ - if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt0)) + if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0])) { if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "use not simple."); @@ -3138,7 +3415,12 @@ vectorizable_conversion (tree stmt, block_stmt_iterator * bsi, } if (modifier != NONE) - STMT_VINFO_VECTYPE (stmt_info) = vectype_in; + { + STMT_VINFO_VECTYPE (stmt_info) = vectype_in; + /* FORNOW: SLP not supported. */ + if (STMT_SLP_TYPE (stmt_info)) + return false; + } if (!vec_stmt) /* transformation not required. */ { @@ -3153,6 +3435,9 @@ vectorizable_conversion (tree stmt, block_stmt_iterator * bsi, /* Handle def. */ vec_dest = vect_create_destination_var (scalar_dest, vectype_out); + if (modifier == NONE && !slp_node) + vec_oprnds0 = VEC_alloc (tree, heap, 1); + prev_stmt_info = NULL; switch (modifier) { @@ -3163,24 +3448,30 @@ vectorizable_conversion (tree stmt, block_stmt_iterator * bsi, ssa_op_iter iter; if (j == 0) - vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL); + vect_get_vec_defs (op0, NULL, stmt, &vec_oprnds0, NULL, slp_node); else - vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0); + vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, NULL); builtin_decl = targetm.vectorize.builtin_conversion (code, vectype_in); - new_stmt = build_call_expr (builtin_decl, 1, vec_oprnd0); + for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++) + { + new_stmt = build_call_expr (builtin_decl, 1, vop0); - /* Arguments are ready. create the new vector stmt. 
*/ - new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt); - new_temp = make_ssa_name (vec_dest, new_stmt); - GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp; - vect_finish_stmt_generation (stmt, new_stmt, bsi); - FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS) - { - if (TREE_CODE (sym) == SSA_NAME) - sym = SSA_NAME_VAR (sym); - mark_sym_for_renaming (sym); + /* Arguments are ready. create the new vector stmt. */ + new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt); + new_temp = make_ssa_name (vec_dest, new_stmt); + GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp; + vect_finish_stmt_generation (stmt, new_stmt, bsi); + FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, + SSA_OP_ALL_VIRTUALS) + { + if (TREE_CODE (sym) == SSA_NAME) + sym = SSA_NAME_VAR (sym); + mark_sym_for_renaming (sym); + } + if (slp_node) + VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), new_stmt); } if (j == 0) @@ -3201,7 +3492,7 @@ vectorizable_conversion (tree stmt, block_stmt_iterator * bsi, if (j == 0) vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL); else - vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0); + vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0); STMT_VINFO_VECTYPE (stmt_info) = vectype_in; @@ -3237,12 +3528,12 @@ vectorizable_conversion (tree stmt, block_stmt_iterator * bsi, if (j == 0) { vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL); - vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0); + vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0); } else { - vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd1); - vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt0, vec_oprnd0); + vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd1); + vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0); } /* Arguments are ready. Create the new vector stmt. 
*/ @@ -3262,6 +3553,7 @@ vectorizable_conversion (tree stmt, block_stmt_iterator * bsi, *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info); } + return true; } @@ -3274,12 +3566,12 @@ vectorizable_conversion (tree stmt, block_stmt_iterator * bsi, Return FALSE if not a vectorizable STMT, TRUE otherwise. */ bool -vectorizable_assignment (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) +vectorizable_assignment (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt, + slp_tree slp_node) { tree vec_dest; tree scalar_dest; tree op; - tree vec_oprnd; stmt_vec_info stmt_info = vinfo_for_stmt (stmt); tree vectype = STMT_VINFO_VECTYPE (stmt_info); loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); @@ -3288,6 +3580,9 @@ vectorizable_assignment (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type}; int nunits = TYPE_VECTOR_SUBPARTS (vectype); int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits; + int i; + VEC(tree,heap) *vec_oprnds = NULL; + tree vop; gcc_assert (ncopies >= 1); if (ncopies > 1) @@ -3328,7 +3623,7 @@ vectorizable_assignment (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) STMT_VINFO_TYPE (stmt_info) = assignment_vec_info_type; if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "=== vectorizable_assignment ==="); - vect_model_simple_cost (stmt_info, ncopies, dt); + vect_model_simple_cost (stmt_info, ncopies, dt, NULL); return true; } @@ -3340,15 +3635,22 @@ vectorizable_assignment (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) vec_dest = vect_create_destination_var (scalar_dest, vectype); /* Handle use. */ - op = GIMPLE_STMT_OPERAND (stmt, 1); - vec_oprnd = vect_get_vec_def_for_operand (op, stmt, NULL); + vect_get_vec_defs (op, NULL, stmt, &vec_oprnds, NULL, slp_node); /* Arguments are ready. create the new vector stmt. 
*/ - *vec_stmt = build_gimple_modify_stmt (vec_dest, vec_oprnd); - new_temp = make_ssa_name (vec_dest, *vec_stmt); - GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp; - vect_finish_stmt_generation (stmt, *vec_stmt, bsi); + for (i = 0; VEC_iterate (tree, vec_oprnds, i, vop); i++) + { + *vec_stmt = build_gimple_modify_stmt (vec_dest, vop); + new_temp = make_ssa_name (vec_dest, *vec_stmt); + GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp; + vect_finish_stmt_generation (stmt, *vec_stmt, bsi); + STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt; + + if (slp_node) + VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), *vec_stmt); + } + VEC_free (tree, heap, vec_oprnds); return true; } @@ -3403,6 +3705,10 @@ vectorizable_induction (tree phi, block_stmt_iterator *bsi ATTRIBUTE_UNUSED, if (!STMT_VINFO_RELEVANT_P (stmt_info)) return false; + /* FORNOW: SLP not supported. */ + if (STMT_SLP_TYPE (stmt_info)) + return false; + gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def); if (STMT_VINFO_LIVE_P (stmt_info)) @@ -3444,13 +3750,14 @@ vectorizable_induction (tree phi, block_stmt_iterator *bsi ATTRIBUTE_UNUSED, Return FALSE if not a vectorizable STMT, TRUE otherwise. 
*/ bool -vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) +vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt, + slp_tree slp_node) { tree vec_dest; tree scalar_dest; tree operation; tree op0, op1 = NULL; - tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE; + tree vec_oprnd1 = NULL_TREE; stmt_vec_info stmt_info = vinfo_for_stmt (stmt); tree vectype = STMT_VINFO_VECTYPE (stmt_info); loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); @@ -3464,14 +3771,20 @@ vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) enum machine_mode optab_op2_mode; tree def, def_stmt; enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type}; - tree new_stmt; + tree new_stmt = NULL_TREE; stmt_vec_info prev_stmt_info; int nunits_in = TYPE_VECTOR_SUBPARTS (vectype); int nunits_out; tree vectype_out; int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in; - int j; - + int j, i; + VEC(tree,heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL; + tree vop0, vop1; + + /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies + this, so we can safely override NCOPIES with 1 here. */ + if (slp_node) + ncopies = 1; gcc_assert (ncopies >= 1); /* FORNOW. This restriction should be relaxed. */ if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1) @@ -3601,7 +3914,7 @@ vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) STMT_VINFO_TYPE (stmt_info) = op_vec_info_type; if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "=== vectorizable_operation ==="); - vect_model_simple_cost (stmt_info, ncopies, dt); + vect_model_simple_cost (stmt_info, ncopies, dt, NULL); return true; } @@ -3613,6 +3926,13 @@ vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) /* Handle def. 
*/ vec_dest = vect_create_destination_var (scalar_dest, vectype); + if (!slp_node) + { + vec_oprnds0 = VEC_alloc (tree, heap, 1); + if (op_type == binary_op) + vec_oprnds1 = VEC_alloc (tree, heap, 1); + } + /* In case the vectorization factor (VF) is bigger than the number of elements that we can fit in a vectype (nunits), we have to generate more than one vector stmt - i.e - we need to "unroll" the @@ -3672,45 +3992,53 @@ vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) /* Handle uses. */ if (j == 0) { - vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL); - if (op_type == binary_op) + if (op_type == binary_op + && (code == LSHIFT_EXPR || code == RSHIFT_EXPR) + && !slp_node) { - if (code == LSHIFT_EXPR || code == RSHIFT_EXPR) - { - /* Vector shl and shr insn patterns can be defined with - scalar operand 2 (shift operand). In this case, use - constant or loop invariant op1 directly, without - extending it to vector mode first. */ - optab_op2_mode = insn_data[icode].operand[2].mode; - if (!VECTOR_MODE_P (optab_op2_mode)) - { - if (vect_print_dump_info (REPORT_DETAILS)) - fprintf (vect_dump, "operand 1 using scalar mode."); - vec_oprnd1 = op1; - } - } - if (!vec_oprnd1) - vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL); + /* Vector shl and shr insn patterns can be defined with scalar + operand 2 (shift operand). In this case, use constant or loop + invariant op1 directly, without extending it to vector mode + first. 
*/ + optab_op2_mode = insn_data[icode].operand[2].mode; + if (!VECTOR_MODE_P (optab_op2_mode)) + { + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "operand 1 using scalar mode."); + vec_oprnd1 = op1; + VEC_quick_push (tree, vec_oprnds1, vec_oprnd1); + } } + + if (op_type == binary_op && !vec_oprnd1) + vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1, + slp_node); + else + vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, &vec_oprnds1, + slp_node); } else - { - vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0); - if (op_type == binary_op) - vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd1); - } + vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, &vec_oprnds1); - /* Arguments are ready. create the new vector stmt. */ + /* Arguments are ready. Create the new vector stmt. */ + for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++) + { + if (op_type == binary_op) + { + vop1 = VEC_index (tree, vec_oprnds1, i); + new_stmt = build_gimple_modify_stmt (vec_dest, + build2 (code, vectype, vop0, vop1)); + } + else + new_stmt = build_gimple_modify_stmt (vec_dest, + build1 (code, vectype, vop0)); - if (op_type == binary_op) - new_stmt = build_gimple_modify_stmt (vec_dest, - build2 (code, vectype, vec_oprnd0, vec_oprnd1)); - else - new_stmt = build_gimple_modify_stmt (vec_dest, - build1 (code, vectype, vec_oprnd0)); - new_temp = make_ssa_name (vec_dest, new_stmt); - GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp; - vect_finish_stmt_generation (stmt, new_stmt, bsi); + new_temp = make_ssa_name (vec_dest, new_stmt); + GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp; + vect_finish_stmt_generation (stmt, new_stmt, bsi); + if (slp_node) + VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), new_stmt); + } if (j == 0) STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt; @@ -3719,6 +4047,10 @@ vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) prev_stmt_info = vinfo_for_stmt (new_stmt); } + 
VEC_free (tree, heap, vec_oprnds0); + if (vec_oprnds1) + VEC_free (tree, heap, vec_oprnds1); + return true; } @@ -3829,7 +4161,7 @@ vectorizable_type_demotion (tree stmt, block_stmt_iterator *bsi, STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type; if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "=== vectorizable_demotion ==="); - vect_model_simple_cost (stmt_info, ncopies, dt); + vect_model_simple_cost (stmt_info, ncopies, dt, NULL); return true; } @@ -4000,7 +4332,7 @@ vectorizable_type_promotion (tree stmt, block_stmt_iterator *bsi, STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type; if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "=== vectorizable_promotion ==="); - vect_model_simple_cost (stmt_info, 2*ncopies, dt); + vect_model_simple_cost (stmt_info, 2*ncopies, dt, NULL); return true; } @@ -4095,6 +4427,7 @@ vect_strided_store_supported (tree vectype) fprintf (vect_dump, "interleave op not supported by target."); return false; } + return true; } @@ -4242,7 +4575,8 @@ vect_permute_store_chain (VEC(tree,heap) *dr_chain, Return FALSE if not a vectorizable STMT, TRUE otherwise. */ bool -vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) +vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt, + slp_tree slp_node) { tree scalar_dest; tree data_ref; @@ -4268,6 +4602,15 @@ vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) unsigned int group_size, i; VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL; bool inv_p; + VEC(tree,heap) *vec_oprnds = NULL; + bool slp = (slp_node != NULL); + stmt_vec_info first_stmt_vinfo; + unsigned int vec_num; + + /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies + this, so we can safely override NCOPIES with 1 here. 
*/ + if (slp) + ncopies = 1; gcc_assert (ncopies >= 1); @@ -4300,7 +4643,7 @@ vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0); if (TREE_CODE (scalar_dest) != ARRAY_REF && TREE_CODE (scalar_dest) != INDIRECT_REF - && !DR_GROUP_FIRST_DR (stmt_info)) + && !STMT_VINFO_STRIDED_ACCESS (stmt_info)) return false; op = GIMPLE_STMT_OPERAND (stmt, 1); @@ -4320,17 +4663,19 @@ vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) if (!STMT_VINFO_DATA_REF (stmt_info)) return false; - if (DR_GROUP_FIRST_DR (stmt_info)) + if (STMT_VINFO_STRIDED_ACCESS (stmt_info)) { strided_store = true; - if (!vect_strided_store_supported (vectype)) + if (!vect_strided_store_supported (vectype) + && !PURE_SLP_STMT (stmt_info) && !slp) return false; } if (!vec_stmt) /* transformation not required. */ { STMT_VINFO_TYPE (stmt_info) = store_vec_info_type; - vect_model_store_cost (stmt_info, ncopies, dt); + if (!PURE_SLP_STMT (stmt_info)) + vect_model_store_cost (stmt_info, ncopies, dt, NULL); return true; } @@ -4350,17 +4695,28 @@ vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) /* We vectorize all the stmts of the interleaving group when we reach the last stmt in the group. */ if (DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt)) - < DR_GROUP_SIZE (vinfo_for_stmt (first_stmt))) + < DR_GROUP_SIZE (vinfo_for_stmt (first_stmt)) + && !slp) { *vec_stmt = NULL_TREE; return true; } + + if (slp) + strided_store = false; + + /* VEC_NUM is the number of vect stmts to be created for this group. 
*/ + if (slp && SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) < group_size) + vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); + else + vec_num = group_size; } else { first_stmt = stmt; first_dr = dr; - group_size = 1; + group_size = vec_num = 1; + first_stmt_vinfo = stmt_info; } if (vect_print_dump_info (REPORT_DETAILS)) @@ -4420,26 +4776,39 @@ vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) if (j == 0) { - /* For interleaved stores we collect vectorized defs for all the - stores in the group in DR_CHAIN and OPRNDS. DR_CHAIN is then used - as an input to vect_permute_store_chain(), and OPRNDS as an input - to vect_get_vec_def_for_stmt_copy() for the next copy. - If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and - OPRNDS are of size 1. */ - next_stmt = first_stmt; - for (i = 0; i < group_size; i++) - { - /* Since gaps are not supported for interleaved stores, GROUP_SIZE - is the exact number of stmts in the chain. Therefore, NEXT_STMT - can't be NULL_TREE. In case that there is no interleaving, - GROUP_SIZE is 1, and only one iteration of the loop will be - executed. */ - gcc_assert (next_stmt); - op = GIMPLE_STMT_OPERAND (next_stmt, 1); - vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt, NULL); - VEC_quick_push(tree, dr_chain, vec_oprnd); - VEC_quick_push(tree, oprnds, vec_oprnd); - next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt)); + if (slp) + { + /* Get vectorized arguments for SLP_NODE. */ + vect_get_slp_defs (slp_node, &vec_oprnds, NULL); + + vec_oprnd = VEC_index (tree, vec_oprnds, 0); + } + else + { + /* For interleaved stores we collect vectorized defs for all the + stores in the group in DR_CHAIN and OPRNDS. DR_CHAIN is then + used as an input to vect_permute_store_chain(), and OPRNDS as + an input to vect_get_vec_def_for_stmt_copy() for the next copy. + + If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and + OPRNDS are of size 1. 
*/ + next_stmt = first_stmt; + for (i = 0; i < group_size; i++) + { + /* Since gaps are not supported for interleaved stores, + GROUP_SIZE is the exact number of stmts in the chain. + Therefore, NEXT_STMT can't be NULL_TREE. In case that + there is no interleaving, GROUP_SIZE is 1, and only one + iteration of the loop will be executed. */ + gcc_assert (next_stmt); + op = GIMPLE_STMT_OPERAND (next_stmt, 1); + + vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt, + NULL); + VEC_quick_push(tree, dr_chain, vec_oprnd); + VEC_quick_push(tree, oprnds, vec_oprnd); + next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt)); + } } dataref_ptr = vect_create_data_ref_ptr (first_stmt, NULL, NULL_TREE, &dummy, &ptr_incr, false, @@ -4448,6 +4817,9 @@ vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) } else { + /* FORNOW SLP doesn't work for multiple types. */ + gcc_assert (!slp); + /* For interleaved stores we created vectorized defs for all the defs stored in OPRNDS in the previous iteration (previous copy). DR_CHAIN is then used as an input to vect_permute_store_chain(), @@ -4476,12 +4848,19 @@ vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) } next_stmt = first_stmt; - for (i = 0; i < group_size; i++) + for (i = 0; i < vec_num; i++) { - /* For strided stores vectorized defs are interleaved in - vect_permute_store_chain(). */ - if (strided_store) - vec_oprnd = VEC_index(tree, result_chain, i); + if (i > 0) + /* Bump the vector pointer. */ + dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, + NULL_TREE); + + if (slp) + vec_oprnd = VEC_index (tree, vec_oprnds, i); + else if (strided_store) + /* For strided stores vectorized defs are interleaved in + vect_permute_store_chain(). */ + vec_oprnd = VEC_index (tree, result_chain, i); data_ref = build_fold_indirect_ref (dataref_ptr); /* Arguments are ready. Create the new vector stmt. 
*/ @@ -4498,9 +4877,6 @@ vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt)); if (!next_stmt) break; - /* Bump the vector pointer. */ - dataref_ptr = - bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, NULL_TREE); } } @@ -5021,7 +5397,8 @@ vect_transform_strided_load (tree stmt, VEC(tree,heap) *dr_chain, int size, Return FALSE if not a vectorizable STMT, TRUE otherwise. */ bool -vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) +vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt, + slp_tree slp_node) { tree scalar_dest; tree vec_dest = NULL; @@ -5056,6 +5433,13 @@ vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) bool inv_p; bool compute_in_loop = false; struct loop *at_loop; + int vec_num; + bool slp = (slp_node != NULL); + + /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies + this, so we can safely override NCOPIES with 1 here. */ + if (slp) + ncopies = 1; gcc_assert (ncopies >= 1); @@ -5092,7 +5476,7 @@ vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) op = GIMPLE_STMT_OPERAND (stmt, 1); if (TREE_CODE (op) != ARRAY_REF && TREE_CODE (op) != INDIRECT_REF - && !DR_GROUP_FIRST_DR (stmt_info)) + && !STMT_VINFO_STRIDED_ACCESS (stmt_info)) return false; if (!STMT_VINFO_DATA_REF (stmt_info)) @@ -5111,21 +5495,22 @@ vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) } /* Check if the load is a part of an interleaving chain. */ - if (DR_GROUP_FIRST_DR (stmt_info)) + if (STMT_VINFO_STRIDED_ACCESS (stmt_info)) { strided_load = true; /* FORNOW */ gcc_assert (! nested_in_vect_loop); /* Check if interleaving is supported. */ - if (!vect_strided_load_supported (vectype)) + if (!vect_strided_load_supported (vectype) + && !PURE_SLP_STMT (stmt_info) && !slp) return false; } if (!vec_stmt) /* transformation not required. 
*/ { STMT_VINFO_TYPE (stmt_info) = load_vec_info_type; - vect_model_load_cost (stmt_info, ncopies); + vect_model_load_cost (stmt_info, ncopies, NULL); return true; } @@ -5146,12 +5531,21 @@ vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)); group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt)); dr_chain = VEC_alloc (tree, heap, group_size); + + /* VEC_NUM is the number of vect stmts to be created for this group. */ + if (slp) + { + strided_load = false; + vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); + } + else + vec_num = group_size; } else { first_stmt = stmt; first_dr = dr; - group_size = 1; + group_size = vec_num = 1; } alignment_support_scheme = vect_supportable_dr_alignment (first_dr); @@ -5296,8 +5690,12 @@ vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, NULL_TREE); - for (i = 0; i < group_size; i++) + for (i = 0; i < vec_num; i++) { + if (i > 0) + dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, + NULL_TREE); + /* 2. Create the vector-load in the loop. */ switch (alignment_support_scheme) { @@ -5373,7 +5771,7 @@ vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) if (alignment_support_scheme == dr_explicit_realign_optimized) { - if (i == group_size - 1 && j == ncopies - 1) + if (i == vec_num - 1 && j == ncopies - 1) add_phi_arg (phi, lsq, loop_latch_edge (containing_loop)); msq = lsq; } @@ -5414,13 +5812,20 @@ vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) gcc_unreachable (); /* FORNOW. */ } - if (strided_load) - VEC_quick_push (tree, dr_chain, new_temp); - if (i < group_size - 1) - dataref_ptr = - bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, NULL_TREE); + /* Collect vector loads and later create their permutation in + vect_transform_strided_load (). 
*/ + if (strided_load) + VEC_quick_push (tree, dr_chain, new_temp); + + /* Store vector loads in the corresponding SLP_NODE. */ + if (slp) + VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), new_stmt); } + /* FORNOW: SLP with multiple types is unsupported. */ + if (slp) + return true; + if (strided_load) { if (!vect_transform_strided_load (stmt, dr_chain, group_size, bsi)) @@ -5586,6 +5991,10 @@ vectorizable_condition (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def) return false; + /* FORNOW: SLP not supported. */ + if (STMT_SLP_TYPE (stmt_info)) + return false; + /* FORNOW: not yet supported. */ if (STMT_VINFO_LIVE_P (stmt_info)) { @@ -5676,12 +6085,14 @@ vectorizable_condition (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) return true; } + /* Function vect_transform_stmt. Create a vectorized stmt to replace STMT, and insert it at BSI. */ -bool -vect_transform_stmt (tree stmt, block_stmt_iterator *bsi, bool *strided_store) +static bool +vect_transform_stmt (tree stmt, block_stmt_iterator *bsi, bool *strided_store, + slp_tree slp_node) { bool is_store = false; tree vec_stmt = NULL_TREE; @@ -5692,44 +6103,47 @@ vect_transform_stmt (tree stmt, block_stmt_iterator *bsi, bool *strided_store) switch (STMT_VINFO_TYPE (stmt_info)) { case type_demotion_vec_info_type: + gcc_assert (!slp_node); done = vectorizable_type_demotion (stmt, bsi, &vec_stmt); gcc_assert (done); break; case type_promotion_vec_info_type: + gcc_assert (!slp_node); done = vectorizable_type_promotion (stmt, bsi, &vec_stmt); gcc_assert (done); break; case type_conversion_vec_info_type: - done = vectorizable_conversion (stmt, bsi, &vec_stmt); + done = vectorizable_conversion (stmt, bsi, &vec_stmt, slp_node); gcc_assert (done); break; case induc_vec_info_type: + gcc_assert (!slp_node); done = vectorizable_induction (stmt, bsi, &vec_stmt); gcc_assert (done); break; case op_vec_info_type: - done = vectorizable_operation (stmt, bsi, 
&vec_stmt); + done = vectorizable_operation (stmt, bsi, &vec_stmt, slp_node); gcc_assert (done); break; case assignment_vec_info_type: - done = vectorizable_assignment (stmt, bsi, &vec_stmt); + done = vectorizable_assignment (stmt, bsi, &vec_stmt, slp_node); gcc_assert (done); break; case load_vec_info_type: - done = vectorizable_load (stmt, bsi, &vec_stmt); + done = vectorizable_load (stmt, bsi, &vec_stmt, slp_node); gcc_assert (done); break; case store_vec_info_type: - done = vectorizable_store (stmt, bsi, &vec_stmt); + done = vectorizable_store (stmt, bsi, &vec_stmt, slp_node); gcc_assert (done); - if (DR_GROUP_FIRST_DR (stmt_info)) + if (STMT_VINFO_STRIDED_ACCESS (stmt_info)) { /* In case of interleaving, the whole chain is vectorized when the last store in the chain is reached. Store stmts before the last @@ -5744,15 +6158,18 @@ vect_transform_stmt (tree stmt, block_stmt_iterator *bsi, bool *strided_store) break; case condition_vec_info_type: + gcc_assert (!slp_node); done = vectorizable_condition (stmt, bsi, &vec_stmt); gcc_assert (done); break; case call_vec_info_type: + gcc_assert (!slp_node); done = vectorizable_call (stmt, bsi, &vec_stmt); break; case reduc_vec_info_type: + gcc_assert (!slp_node); done = vectorizable_reduction (stmt, bsi, &vec_stmt); gcc_assert (done); break; @@ -6165,7 +6582,7 @@ vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters) int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr)))); int nelements = TYPE_VECTOR_SUBPARTS (vectype); - if (DR_GROUP_FIRST_DR (stmt_info)) + if (STMT_VINFO_STRIDED_ACCESS (stmt_info)) { /* For interleaved access element size must be multiplied by the size of the interleaved group. */ @@ -6593,6 +7010,115 @@ vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, } +/* Remove a group of stores (for SLP or interleaving), free their + stmt_vec_info. 
*/ + +static void +vect_remove_stores (tree first_stmt) +{ + stmt_ann_t ann; + tree next = first_stmt; + tree tmp; + stmt_vec_info next_stmt_info; + block_stmt_iterator next_si; + + while (next) + { + /* Free the attached stmt_vec_info and remove the stmt. */ + next_si = bsi_for_stmt (next); + bsi_remove (&next_si, true); + next_stmt_info = vinfo_for_stmt (next); + ann = stmt_ann (next); + tmp = DR_GROUP_NEXT_DR (next_stmt_info); + free (next_stmt_info); + set_stmt_info (ann, NULL); + next = tmp; + } +} + + +/* Vectorize SLP instance tree in postorder. */ + +static bool +vect_schedule_slp_instance (slp_tree node, unsigned int vec_stmts_size) +{ + tree stmt; + bool strided_store, is_store; + block_stmt_iterator si; + stmt_vec_info stmt_info; + + if (!node) + return false; + + vect_schedule_slp_instance (SLP_TREE_LEFT (node), vec_stmts_size); + vect_schedule_slp_instance (SLP_TREE_RIGHT (node), vec_stmts_size); + + stmt = VEC_index(tree, SLP_TREE_SCALAR_STMTS (node), 0); + stmt_info = vinfo_for_stmt (stmt); + SLP_TREE_VEC_STMTS (node) = VEC_alloc (tree, heap, vec_stmts_size); + SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size; + + if (vect_print_dump_info (REPORT_DETAILS)) + { + fprintf (vect_dump, "------>vectorizing SLP node starting from: "); + print_generic_expr (vect_dump, stmt, TDF_SLIM); + } + + si = bsi_for_stmt (stmt); + is_store = vect_transform_stmt (stmt, &si, &strided_store, node); + if (is_store) + { + if (DR_GROUP_FIRST_DR (stmt_info)) + /* If IS_STORE is TRUE, the vectorization of the + interleaving chain was completed - free all the stores in + the chain. */ + vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info)); + else + /* FORNOW: SLP originates only from strided stores. */ + gcc_unreachable (); + + return true; + } + + /* FORNOW: SLP originates only from strided stores. 
*/ + return false; +} + + +static bool +vect_schedule_slp (loop_vec_info loop_vinfo, unsigned int nunits) +{ + VEC (slp_instance, heap) *slp_instances = + LOOP_VINFO_SLP_INSTANCES (loop_vinfo); + slp_instance instance; + unsigned int vec_stmts_size; + unsigned int group_size, i; + unsigned int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); + bool is_store = false; + + for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++) + { + group_size = SLP_INSTANCE_GROUP_SIZE (instance); + /* For each SLP instance calculate number of vector stmts to be created + for the scalar stmts in each node of the SLP tree. Number of vector + elements in one vector iteration is the number of scalar elements in + one scalar iteration (GROUP_SIZE) multiplied by VF divided by vector + size. */ + vec_stmts_size = vectorization_factor * group_size / nunits; + + /* Schedule the tree of INSTANCE. */ + is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance), + vec_stmts_size); + + if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS) + || vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS)) + fprintf (vect_dump, "vectorizing stmts using SLP."); + } + + return is_store; +} + + /* Function vect_transform_loop. The analysis phase has determined that the loop is vectorizable. 
@@ -6610,6 +7136,8 @@ vect_transform_loop (loop_vec_info loop_vinfo) tree ratio = NULL; int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); bool strided_store; + bool slp_scheduled = false; + unsigned int nunits; if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "=== vec_transform_loop ==="); @@ -6744,6 +7272,7 @@ vect_transform_loop (loop_vec_info loop_vinfo) stmt_info = vinfo_for_stmt (phi); if (!stmt_info) continue; + if (!STMT_VINFO_RELEVANT_P (stmt_info) && !STMT_VINFO_LIVE_P (stmt_info)) continue; @@ -6757,7 +7286,7 @@ vect_transform_loop (loop_vec_info loop_vinfo) { if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "transform phi."); - vect_transform_stmt (phi, NULL, NULL); + vect_transform_stmt (phi, NULL, NULL, NULL); } } @@ -6791,21 +7320,56 @@ vect_transform_loop (loop_vec_info loop_vinfo) } gcc_assert (STMT_VINFO_VECTYPE (stmt_info)); - if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)) - != (unsigned HOST_WIDE_INT) vectorization_factor) - && vect_print_dump_info (REPORT_DETAILS)) - fprintf (vect_dump, "multiple-types."); + nunits = + (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)); + if (!STMT_SLP_TYPE (stmt_info) + && nunits != (unsigned int) vectorization_factor + && vect_print_dump_info (REPORT_DETAILS)) + /* For SLP VF is set according to unrolling factor, and not to + vector size, hence for SLP this print is not valid. */ + fprintf (vect_dump, "multiple-types."); + + /* SLP. Schedule all the SLP instances when the first SLP stmt is + reached. */ + if (STMT_SLP_TYPE (stmt_info)) + { + if (!slp_scheduled) + { + slp_scheduled = true; + + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "=== scheduling SLP instances ==="); + is_store = vect_schedule_slp (loop_vinfo, nunits); + + /* IS_STORE is true if STMT is a store. Stores cannot be of + hybrid SLP type. They are removed in + vect_schedule_slp_instance and their vinfo is destroyed. 
*/ + if (is_store) + { + bsi_next (&si); + continue; + } + } + + /* Hybrid SLP stmts must be vectorized in addition to SLP. */ + if (PURE_SLP_STMT (stmt_info)) + { + bsi_next (&si); + continue; + } + } + /* -------- vectorize statement ------------ */ if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "transform statement."); strided_store = false; - is_store = vect_transform_stmt (stmt, &si, &strided_store); + is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL); if (is_store) { stmt_ann_t ann; - if (DR_GROUP_FIRST_DR (stmt_info)) + if (STMT_VINFO_STRIDED_ACCESS (stmt_info)) { /* Interleaving. If IS_STORE is TRUE, the vectorization of the interleaving chain was completed - free all the stores in diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c index c239d296316..29acfac5289 100644 --- a/gcc/tree-vectorizer.c +++ b/gcc/tree-vectorizer.c @@ -1359,6 +1359,7 @@ new_stmt_vec_info (tree stmt, loop_vec_info loop_vinfo) STMT_VINFO_SAME_ALIGN_REFS (res) = VEC_alloc (dr_p, heap, 5); STMT_VINFO_INSIDE_OF_LOOP_COST (res) = 0; STMT_VINFO_OUTSIDE_OF_LOOP_COST (res) = 0; + STMT_SLP_TYPE (res) = 0; DR_GROUP_FIRST_DR (res) = NULL_TREE; DR_GROUP_NEXT_DR (res) = NULL_TREE; DR_GROUP_SIZE (res) = 0; @@ -1478,7 +1479,9 @@ new_loop_vec_info (struct loop *loop) VEC_alloc (tree, heap, PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS)); LOOP_VINFO_MAY_ALIAS_DDRS (res) = VEC_alloc (ddr_p, heap, PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS)); - + LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (tree, heap, 10); + LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10); + LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1; return res; } @@ -1497,6 +1500,8 @@ destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts) int nbbs; block_stmt_iterator si; int j; + VEC (slp_instance, heap) *slp_instances; + slp_instance instance; if (!loop_vinfo) return; @@ -1571,6 +1576,10 @@ destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts) 
free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo)); VEC_free (tree, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)); VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)); + slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo); + for (j = 0; VEC_iterate (slp_instance, slp_instances, j, instance); j++) + vect_free_slp_tree (SLP_INSTANCE_TREE (instance)); + VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo)); free (loop_vinfo); loop->aux = NULL; diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 1dd472c3e2d..0168c2397bb 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -60,7 +60,7 @@ enum dr_alignment_support { /* Define type of def-use cross-iteration cycle. */ enum vect_def_type { - vect_constant_def, + vect_constant_def = 1, vect_invariant_def, vect_loop_def, vect_induction_def, @@ -77,11 +77,80 @@ enum verbosity_levels { REPORT_DR_DETAILS, REPORT_BAD_FORM_LOOPS, REPORT_OUTER_LOOPS, + REPORT_SLP, REPORT_DETAILS, /* New verbosity levels should be added before this one. */ MAX_VERBOSITY_LEVEL }; +/************************************************************************ + SLP + ************************************************************************/ + +/* A computation tree of an SLP instance. Each node corresponds to a group of + stmts to be packed in a SIMD stmt. */ +typedef struct _slp_tree { + /* Only binary and unary operations are supported. LEFT child corresponds to + the first operand and RIGHT child to the second if the operation is + binary. */ + struct _slp_tree *left; + struct _slp_tree *right; + /* A group of scalar stmts to be vectorized together. */ + VEC (tree, heap) *stmts; + /* Vectorized stmt/s. */ + VEC (tree, heap) *vec_stmts; + /* Number of vector stmts that are created to replace the group of scalar + stmts. It is calculated during the transformation phase as the number of + scalar elements in one scalar iteration (GROUP_SIZE) multiplied by VF + divided by vector size. 
*/ + unsigned int vec_stmts_size; + /* Vectorization costs associated with SLP node. */ + struct + { + int outside_of_loop; /* Statements generated outside loop. */ + int inside_of_loop; /* Statements generated inside loop. */ + } cost; +} *slp_tree; + + +/* SLP instance is a sequence of stmts in a loop that can be packed into + SIMD stmts. */ +typedef struct _slp_instance { + /* The root of SLP tree. */ + slp_tree root; + + /* Size of groups of scalar stmts that will be replaced by SIMD stmt/s. */ + unsigned int group_size; + + /* The unrolling factor required to vectorize this SLP instance. */ + unsigned int unrolling_factor; + + /* Vectorization costs associated with SLP instance. */ + struct + { + int outside_of_loop; /* Statements generated outside loop. */ + int inside_of_loop; /* Statements generated inside loop. */ + } cost; +} *slp_instance; + +DEF_VEC_P(slp_instance); +DEF_VEC_ALLOC_P(slp_instance, heap); + +/* Access Functions. */ +#define SLP_INSTANCE_TREE(S) (S)->root +#define SLP_INSTANCE_GROUP_SIZE(S) (S)->group_size +#define SLP_INSTANCE_UNROLLING_FACTOR(S) (S)->unrolling_factor +#define SLP_INSTANCE_OUTSIDE_OF_LOOP_COST(S) (S)->cost.outside_of_loop +#define SLP_INSTANCE_INSIDE_OF_LOOP_COST(S) (S)->cost.inside_of_loop + +#define SLP_TREE_LEFT(S) (S)->left +#define SLP_TREE_RIGHT(S) (S)->right +#define SLP_TREE_SCALAR_STMTS(S) (S)->stmts +#define SLP_TREE_VEC_STMTS(S) (S)->vec_stmts +#define SLP_TREE_NUMBER_OF_VEC_STMTS(S) (S)->vec_stmts_size +#define SLP_TREE_OUTSIDE_OF_LOOP_COST(S) (S)->cost.outside_of_loop +#define SLP_TREE_INSIDE_OF_LOOP_COST(S) (S)->cost.inside_of_loop + /*-----------------------------------------------------------------*/ /* Info on vectorized loops. */ /*-----------------------------------------------------------------*/ @@ -141,6 +210,18 @@ typedef struct _loop_vec_info { /* The loop location in the source. 
*/ LOC loop_line_number; + + /* All interleaving chains of stores in the loop, represented by the first + stmt in the chain. */ + VEC(tree, heap) *strided_stores; + + /* All SLP instances in the loop. This is a subset of the set of STRIDED_STORES + of the loop. */ + VEC(slp_instance, heap) *slp_instances; + + /* The unrolling factor needed to SLP the loop. In case pure SLP is + applied to the loop, i.e., no unrolling is needed, this is 1. */ + unsigned slp_unrolling_factor; } *loop_vec_info; /* Access Functions. */ @@ -159,6 +240,9 @@ typedef struct _loop_vec_info { #define LOOP_VINFO_MAY_MISALIGN_STMTS(L) (L)->may_misalign_stmts #define LOOP_VINFO_LOC(L) (L)->loop_line_number #define LOOP_VINFO_MAY_ALIAS_DDRS(L) (L)->may_alias_ddrs +#define LOOP_VINFO_STRIDED_STORES(L) (L)->strided_stores +#define LOOP_VINFO_SLP_INSTANCES(L) (L)->slp_instances +#define LOOP_VINFO_SLP_UNROLLING_FACTOR(L) (L)->slp_unrolling_factor #define NITERS_KNOWN_P(n) \ (host_integerp ((n),0) \ @@ -216,6 +300,29 @@ enum vect_relevant { vect_used_in_loop }; +/* The type of vectorization that can be applied to the stmt: regular loop-based + vectorization; pure SLP - the stmt is a part of SLP instances and does not + have uses outside SLP instances; or hybrid SLP and loop-based - the stmt is + a part of SLP instance and also must be loop-based vectorized, since it has + uses outside SLP sequences. + + In the loop context the meanings of pure and hybrid SLP are slightly + different. By saying that pure SLP is applied to the loop, we mean that we + exploit only intra-iteration parallelism in the loop; i.e., the loop can be + vectorized without doing any conceptual unrolling, because we don't pack + together stmts from different iterations, only within a single iteration. 
+ Loop hybrid SLP means that we exploit both intra-iteration and + inter-iteration parallelism (e.g., number of elements in the vector is 4 + and the slp-group-size is 2, in which case we don't have enough parallelism + within an iteration, so we obtain the rest of the parallelism from subsequent + iterations by unrolling the loop by 2). */ +enum slp_vect_type { + loop_vect = 0, + pure_slp, + hybrid +}; + + typedef struct data_reference *dr_p; DEF_VEC_P(dr_p); DEF_VEC_ALLOC_P(dr_p,heap); @@ -309,6 +416,9 @@ typedef struct _stmt_vec_info { int outside_of_loop; /* Statements generated outside loop. */ int inside_of_loop; /* Statements generated inside loop. */ } cost; + + /* Whether the stmt is SLPed, loop-based vectorized, or both. */ + enum slp_vect_type slp_type; } *stmt_vec_info; /* Access Functions. */ @@ -338,6 +448,7 @@ typedef struct _stmt_vec_info { #define STMT_VINFO_DR_GROUP_GAP(S) (S)->gap #define STMT_VINFO_DR_GROUP_SAME_DR_STMT(S)(S)->same_dr_stmt #define STMT_VINFO_DR_GROUP_READ_WRITE_DEPENDENCE(S) (S)->read_write_dep +#define STMT_VINFO_STRIDED_ACCESS(S) ((S)->first_dr != NULL) #define DR_GROUP_FIRST_DR(S) (S)->first_dr #define DR_GROUP_NEXT_DR(S) (S)->next_dr @@ -351,6 +462,10 @@ typedef struct _stmt_vec_info { #define STMT_VINFO_OUTSIDE_OF_LOOP_COST(S) (S)->cost.outside_of_loop #define STMT_VINFO_INSIDE_OF_LOOP_COST(S) (S)->cost.inside_of_loop +#define HYBRID_SLP_STMT(S) ((S)->slp_type == hybrid) +#define PURE_SLP_STMT(S) ((S)->slp_type == pure_slp) +#define STMT_SLP_TYPE(S) (S)->slp_type + /* These are some defines for the initial implementation of the vectorizer's cost model. These will later be target specific hooks. */ @@ -524,6 +639,7 @@ extern stmt_vec_info new_stmt_vec_info (tree stmt, loop_vec_info); /** In tree-vect-analyze.c **/ /* Driver for analysis stage. 
*/ extern loop_vec_info vect_analyze_loop (struct loop *); +extern void vect_free_slp_tree (slp_tree); /** In tree-vect-patterns.c **/ @@ -536,14 +652,16 @@ void vect_pattern_recog (loop_vec_info); /** In tree-vect-transform.c **/ -extern bool vectorizable_load (tree, block_stmt_iterator *, tree *); -extern bool vectorizable_store (tree, block_stmt_iterator *, tree *); -extern bool vectorizable_operation (tree, block_stmt_iterator *, tree *); +extern bool vectorizable_load (tree, block_stmt_iterator *, tree *, slp_tree); +extern bool vectorizable_store (tree, block_stmt_iterator *, tree *, slp_tree); +extern bool vectorizable_operation (tree, block_stmt_iterator *, tree *, + slp_tree); extern bool vectorizable_type_promotion (tree, block_stmt_iterator *, tree *); extern bool vectorizable_type_demotion (tree, block_stmt_iterator *, tree *); extern bool vectorizable_conversion (tree, block_stmt_iterator *, - tree *); -extern bool vectorizable_assignment (tree, block_stmt_iterator *, tree *); + tree *, slp_tree); +extern bool vectorizable_assignment (tree, block_stmt_iterator *, tree *, + slp_tree); extern tree vectorizable_function (tree, tree, tree); extern bool vectorizable_call (tree, block_stmt_iterator *, tree *); extern bool vectorizable_condition (tree, block_stmt_iterator *, tree *); @@ -551,6 +669,11 @@ extern bool vectorizable_live_operation (tree, block_stmt_iterator *, tree *); extern bool vectorizable_reduction (tree, block_stmt_iterator *, tree *); extern bool vectorizable_induction (tree, block_stmt_iterator *, tree *); extern int vect_estimate_min_profitable_iters (loop_vec_info); +extern void vect_model_simple_cost (stmt_vec_info, int, enum vect_def_type *, + slp_tree); +extern void vect_model_store_cost (stmt_vec_info, int, enum vect_def_type, + slp_tree); +extern void vect_model_load_cost (stmt_vec_info, int, slp_tree); /* Driver for transformation stage. */ extern void vect_transform_loop (loop_vec_info); |