author     Richard Sandiford <richard.sandiford@linaro.org>  2017-11-08 20:53:26 +0000
committer  Richard Sandiford <richard.sandiford@linaro.org>  2017-11-20 11:41:45 +0000
commit     64ace920aefb090468fd5536f1079a019cbba6a6 (patch)
tree       317845beae5c79190b3d3e51f76b9e03f966fc7d
parent     cc5909432c8ed962d063b9098f7a8597a26b0ec7 (diff)
download   gcc-64ace920aefb090468fd5536f1079a019cbba6a6.tar.gz
SLP reductions with variable-length vectors
Two things stopped us using SLP reductions with variable-length vectors:

(1) We didn't have a way of constructing the initial vector.
    This patch does it by creating a vector full of the neutral
    identity value and then using a shift-and-insert function
    to insert any non-identity inputs into the low-numbered elements.
    (The non-identity values are needed for double reductions.)
    Alternatively, for unchained MIN/MAX reductions that have no
    neutral value, we instead use the same duplicate-and-interleave
    approach as for SLP constant and external definitions (added by
    a previous patch).

(2) The epilogue for constant-length vectors would extract the vector
    elements associated with each SLP statement and do scalar arithmetic
    on these individual elements.  For variable-length vectors, the patch
    instead creates a reduction vector for each SLP statement, replacing
    the elements for other SLP statements with the identity value.
    It then uses a hardware reduction instruction on each vector.

2017-11-09  Richard Sandiford  <richard.sandiford@linaro.org>
	    Alan Hayward  <alan.hayward@arm.com>
	    David Sherwood  <david.sherwood@arm.com>

gcc/
	* doc/md.texi (vec_shl_insert_@var{m}): New optab.
	* internal-fn.def (VEC_SHL_INSERT): New internal function.
	* optabs.def (vec_shl_insert_optab): New optab.
	* tree-vectorizer.h (can_duplicate_and_interleave_p): Declare.
	(duplicate_and_interleave): Likewise.
	* tree-vect-loop.c: Include internal-fn.h.
	(neutral_op_for_slp_reduction): New function, split out from
	get_initial_defs_for_reduction.
	(get_initial_def_for_reduction): Handle option 2 for
	variable-length vectors by loading the neutral value into a
	vector and then shifting the initial value into element 0.
	(get_initial_defs_for_reduction): Replace the code argument with
	the neutral value calculated by neutral_op_for_slp_reduction.
	Use gimple_build_vector for constant-length vectors.
	Use IFN_VEC_SHL_INSERT for variable-length vectors if all
	but the first group_size elements have a neutral value.
	Use duplicate_and_interleave otherwise.
	(vect_create_epilog_for_reduction): Take a neutral_op parameter.
	Update call to get_initial_defs_for_reduction.  Handle SLP
	reductions for variable-length vectors by creating one vector
	result for each scalar result, with the elements associated
	with other scalar results stubbed out with the neutral value.
	(vectorizable_reduction): Call neutral_op_for_slp_reduction.
	Require IFN_VEC_SHL_INSERT for double reductions on
	variable-length vectors, or SLP reductions that have a neutral
	value.  Require can_duplicate_and_interleave_p support for
	variable-length unchained SLP reductions if there is no neutral
	value, such as for MIN/MAX reductions.  Also require the number
	of vector elements to be a multiple of the number of SLP
	statements when doing variable-length unchained SLP reductions.
	Update call to vect_create_epilog_for_reduction.
	* tree-vect-slp.c (can_duplicate_and_interleave_p): Make public
	and remove the default argument values from the definition.
	(duplicate_and_interleave): Make public.
	(vect_get_constant_vectors): Use IFN_VEC_SHL_INSERT for
	variable-length vectors if all but the first group_size elements
	have a neutral value.
	* config/aarch64/aarch64.md (UNSPEC_INSR): New unspec.
	* config/aarch64/aarch64-sve.md (vec_shl_insert_<mode>): New insn.

gcc/testsuite/
	* gcc.dg/vect/pr37027.c: Remove XFAIL for variable-length vectors.
	* gcc.dg/vect/pr67790.c: Likewise.
	* gcc.dg/vect/slp-reduc-1.c: Likewise.
	* gcc.dg/vect/slp-reduc-2.c: Likewise.
	* gcc.dg/vect/slp-reduc-3.c: Likewise.
	* gcc.dg/vect/slp-reduc-5.c: Likewise.
	* gcc.target/aarch64/sve_slp_5.c: New test.
	* gcc.target/aarch64/sve_slp_5_run.c: Likewise.
	* gcc.target/aarch64/sve_slp_6.c: Likewise.
	* gcc.target/aarch64/sve_slp_6_run.c: Likewise.
	* gcc.target/aarch64/sve_slp_7.c: Likewise.
	* gcc.target/aarch64/sve_slp_7_run.c: Likewise.
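[Editorial note: a minimal scalar model may make technique (1) easier to
follow.  This sketch is not code from the patch; the names are invented,
a PLUS reduction (neutral value 0) is assumed, and NELTS is assumed to be
at least GROUP_SIZE, as the patch requires.]

    /* Model of building the initial vector for an SLP PLUS reduction.
       NELTS is the number of vector elements (known only at run time
       for SVE); INIT holds the GROUP_SIZE scalar start values.  */
    static void
    build_initial_vector_model (int *vec, int nelts,
                                const int *init, int group_size)
    {
      /* Splat the neutral value.  */
      for (int i = 0; i < nelts; ++i)
        vec[i] = 0;
      /* One IFN_VEC_SHL_INSERT per start value: shift everything one
         element away from element 0 and drop INIT[k] into the hole.  */
      for (int k = group_size - 1; k >= 0; --k)
        {
          for (int i = nelts - 1; i > 0; --i)
            vec[i] = vec[i - 1];
          vec[0] = init[k];
        }
    }

Afterwards vec is { init[0], ..., init[group_size - 1], 0, 0, ... }, so
any number of trailing elements leaves the reduction result unchanged,
which is exactly what a variable-length vector needs.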
-rw-r--r--  gcc/config/aarch64/aarch64-sve.md                  13
-rw-r--r--  gcc/config/aarch64/aarch64.md                       1
-rw-r--r--  gcc/doc/md.texi                                     8
-rw-r--r--  gcc/internal-fn.def                                 2
-rw-r--r--  gcc/optabs.def                                      1
-rw-r--r--  gcc/testsuite/gcc.dg/vect/pr37027.c                 2
-rw-r--r--  gcc/testsuite/gcc.dg/vect/pr67790.c                 2
-rw-r--r--  gcc/testsuite/gcc.dg/vect/slp-reduc-1.c             2
-rw-r--r--  gcc/testsuite/gcc.dg/vect/slp-reduc-2.c             2
-rw-r--r--  gcc/testsuite/gcc.dg/vect/slp-reduc-3.c             5
-rw-r--r--  gcc/testsuite/gcc.dg/vect/slp-reduc-5.c             2
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_slp_5.c       58
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_slp_5_run.c   35
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_slp_6.c       47
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_slp_6_run.c   37
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_slp_7.c       66
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_slp_7_run.c   39
-rw-r--r--  gcc/tree-vect-loop.c                              322
-rw-r--r--  gcc/tree-vect-slp.c                                28
-rw-r--r--  gcc/tree-vectorizer.h                               5
20 files changed, 592 insertions(+), 85 deletions(-)
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 535f0c3b174..adabf8e7bd6 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -2081,3 +2081,16 @@
operands[5] = gen_reg_rtx (VNx4SImode);
}
)
+
+;; Shift an SVE vector left and insert a scalar into element 0.
+(define_insn "vec_shl_insert_<mode>"
+ [(set (match_operand:SVE_ALL 0 "register_operand" "=w, w")
+ (unspec:SVE_ALL
+ [(match_operand:SVE_ALL 1 "register_operand" "0, 0")
+ (match_operand:<VEL> 2 "register_operand" "rZ, w")]
+ UNSPEC_INSR))]
+ "TARGET_SVE"
+ "@
+ insr\t%0.<Vetype>, %<vwcore>2
+ insr\t%0.<Vetype>, %<Vetype>2"
+)
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 09ce12aba75..6a3ddcdceb6 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -162,6 +162,7 @@
UNSPEC_WHILE_LO
UNSPEC_LDN
UNSPEC_STN
+ UNSPEC_INSR
])
(define_c_enum "unspecv" [
diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 5b700d36c15..35b5e48320f 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -5273,6 +5273,14 @@ operand 1. Add operand 1 to operand 2 and place the widened result in
operand 0. (This is used to express accumulation of elements into an accumulator
of a wider mode.)
+@cindex @code{vec_shl_insert_@var{m}} instruction pattern
+@item @samp{vec_shl_insert_@var{m}}
+Shift the elements in vector input operand 1 left one element (i.e.
+away from element 0) and fill the vacated element 0 with the scalar
+in operand 2. Store the result in vector output operand 0. Operands
+0 and 1 have mode @var{m} and operand 2 has the mode appropriate for
+one element of @var{m}.
+
@cindex @code{vec_shr_@var{m}} instruction pattern
@item @samp{vec_shr_@var{m}}
Whole vector right shift in bits, i.e. towards element 0.
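[Editorial note: the new @samp{vec_shl_insert_@var{m}} semantics above
correspond to this C model.  An illustrative sketch, not GCC code; int
stands in for one element of @var{m}.]

    /* out = vec_shl_insert (in, scalar): shift IN one element away from
       element 0 and fill the vacated element 0 with SCALAR.  */
    static void
    vec_shl_insert_model (int *out, const int *in, int scalar, int nelts)
    {
      for (int i = nelts - 1; i > 0; --i)
        out[i] = in[i - 1];
      out[0] = scalar;
    }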
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index c1329c76f3d..c2edb5e82dd 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -112,6 +112,8 @@ DEF_INTERNAL_OPTAB_FN (VEC_EXTRACT_ODD, ECF_CONST | ECF_NOTHROW,
vec_extract_odd, binary)
DEF_INTERNAL_OPTAB_FN (VEC_REVERSE, ECF_CONST | ECF_NOTHROW,
vec_reverse, unary)
+DEF_INTERNAL_OPTAB_FN (VEC_SHL_INSERT, ECF_CONST | ECF_NOTHROW,
+ vec_shl_insert, binary)
DEF_INTERNAL_OPTAB_FN (RSQRT, ECF_CONST, rsqrt, unary)
diff --git a/gcc/optabs.def b/gcc/optabs.def
index 1f3fee43471..2389351f3f9 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -374,3 +374,4 @@ OPTAB_D (set_thread_pointer_optab, "set_thread_pointer$I$a")
OPTAB_DC (vec_duplicate_optab, "vec_duplicate$a", VEC_DUPLICATE)
OPTAB_DC (vec_series_optab, "vec_series$a", VEC_SERIES)
+OPTAB_D (vec_shl_insert_optab, "vec_shl_insert_$a")
diff --git a/gcc/testsuite/gcc.dg/vect/pr37027.c b/gcc/testsuite/gcc.dg/vect/pr37027.c
index 7e14c8841a6..ef6760ec924 100644
--- a/gcc/testsuite/gcc.dg/vect/pr37027.c
+++ b/gcc/testsuite/gcc.dg/vect/pr37027.c
@@ -32,5 +32,5 @@ foo (void)
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail vect_no_int_add } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail { vect_no_int_add || { vect_variable_length && vect_load_lanes } } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail vect_no_int_add } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/pr67790.c b/gcc/testsuite/gcc.dg/vect/pr67790.c
index 85d846ba259..5e2d506a730 100644
--- a/gcc/testsuite/gcc.dg/vect/pr67790.c
+++ b/gcc/testsuite/gcc.dg/vect/pr67790.c
@@ -37,4 +37,4 @@ int main()
return 0;
}
-/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" { xfail { vect_variable_length && vect_load_lanes } } } } */
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-reduc-1.c b/gcc/testsuite/gcc.dg/vect/slp-reduc-1.c
index 5fc6ece6b90..b353dd7ccf8 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-reduc-1.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-reduc-1.c
@@ -43,5 +43,5 @@ int main (void)
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail vect_no_int_add } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail { vect_no_int_add || { vect_variable_length && vect_load_lanes } } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail vect_no_int_add } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-reduc-2.c b/gcc/testsuite/gcc.dg/vect/slp-reduc-2.c
index 50d7aa80bfc..07c96c00eb0 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-reduc-2.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-reduc-2.c
@@ -38,5 +38,5 @@ int main (void)
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail vect_no_int_add } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail { vect_no_int_add || { vect_variable_length && vect_load_lanes } } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail vect_no_int_add } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-reduc-3.c b/gcc/testsuite/gcc.dg/vect/slp-reduc-3.c
index 34c8da7eac2..9c8124c9b5f 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-reduc-3.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-reduc-3.c
@@ -58,7 +58,4 @@ int main (void)
/* The initialization loop in main also gets vectorized. */
/* { dg-final { scan-tree-dump-times "vect_recog_dot_prod_pattern: detected" 1 "vect" { xfail *-*-* } } } */
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { target { vect_short_mult && { vect_widen_sum_hi_to_si && vect_unpack } } } } } */
-/* We can't yet create the necessary SLP constant vector for variable-length
- SVE and so fall back to Advanced SIMD. This means that we repeat each
- analysis note. */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail { vect_widen_sum_hi_to_si_pattern || { { ! vect_unpack } || { aarch64_sve && vect_variable_length } } } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail { vect_widen_sum_hi_to_si_pattern || { ! vect_unpack } } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-reduc-5.c b/gcc/testsuite/gcc.dg/vect/slp-reduc-5.c
index 0a3951dc26d..fc689e46ba1 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-reduc-5.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-reduc-5.c
@@ -43,5 +43,5 @@ int main (void)
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { xfail vect_no_int_min_max } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail { vect_no_int_min_max || { vect_variable_length && vect_load_lanes } } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail vect_no_int_min_max } } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_slp_5.c b/gcc/testsuite/gcc.target/aarch64/sve_slp_5.c
new file mode 100644
index 00000000000..88fe213f66a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_slp_5.c
@@ -0,0 +1,58 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve -msve-vector-bits=scalable -ffast-math" } */
+
+#include <stdint.h>
+
+#define VEC_PERM(TYPE) \
+void __attribute__ ((noinline, noclone)) \
+vec_slp_##TYPE (TYPE *restrict a, TYPE *restrict b, int n) \
+{ \
+ TYPE x0 = b[0]; \
+ TYPE x1 = b[1]; \
+ for (int i = 0; i < n; ++i) \
+ { \
+ x0 += a[i * 2]; \
+ x1 += a[i * 2 + 1]; \
+ } \
+ b[0] = x0; \
+ b[1] = x1; \
+}
+
+#define TEST_ALL(T) \
+ T (int8_t) \
+ T (uint8_t) \
+ T (int16_t) \
+ T (uint16_t) \
+ T (int32_t) \
+ T (uint32_t) \
+ T (int64_t) \
+ T (uint64_t) \
+ T (_Float16) \
+ T (float) \
+ T (double)
+
+TEST_ALL (VEC_PERM)
+
+/* ??? We don't think it's worth using SLP for the 64-bit loops and fall
+ back to the less efficient non-SLP implementation instead. */
+/* ??? At present we don't treat the int8_t and int16_t loops as
+ reductions. */
+/* { dg-final { scan-assembler-times {\tld1b\t} 2 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tld1h\t} 3 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tld1b\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1h\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tld1w\t} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1d\t} 3 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-not {\tld2b\t} } } */
+/* { dg-final { scan-assembler-not {\tld2h\t} } } */
+/* { dg-final { scan-assembler-not {\tld2w\t} } } */
+/* { dg-final { scan-assembler-not {\tld2d\t} { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.b} 4 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.h} 4 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.b} 2 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.h} 2 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.s} 4 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.d} 4 } } */
+/* { dg-final { scan-assembler-times {\tfaddv\th[0-9]+, p[0-7], z[0-9]+\.h} 2 } } */
+/* { dg-final { scan-assembler-times {\tfaddv\ts[0-9]+, p[0-7], z[0-9]+\.s} 2 } } */
+/* { dg-final { scan-assembler-times {\tfaddv\td[0-9]+, p[0-7], z[0-9]+\.d} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_slp_5_run.c b/gcc/testsuite/gcc.target/aarch64/sve_slp_5_run.c
new file mode 100644
index 00000000000..bb5421700da
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_slp_5_run.c
@@ -0,0 +1,35 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve -ffast-math" } */
+
+#include "sve_slp_5.c"
+
+#define N (141 * 2)
+
+#define HARNESS(TYPE) \
+ { \
+ TYPE a[N], b[2] = { 40, 22 }; \
+ for (unsigned int i = 0; i < N; ++i) \
+ { \
+ a[i] = i * 2 + i % 5; \
+ asm volatile ("" ::: "memory"); \
+ } \
+ vec_slp_##TYPE (a, b, N / 2); \
+ TYPE x0 = 40; \
+ TYPE x1 = 22; \
+ for (unsigned int i = 0; i < N; i += 2) \
+ { \
+ x0 += a[i]; \
+ x1 += a[i + 1]; \
+ asm volatile ("" ::: "memory"); \
+ } \
+ /* _Float16 isn't precise enough for this. */ \
+ if ((TYPE) 0x1000 + 1 != (TYPE) 0x1000 \
+ && (x0 != b[0] || x1 != b[1])) \
+ __builtin_abort (); \
+ }
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+ TEST_ALL (HARNESS)
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_slp_6.c b/gcc/testsuite/gcc.target/aarch64/sve_slp_6.c
new file mode 100644
index 00000000000..fcc75ad0814
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_slp_6.c
@@ -0,0 +1,47 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve -msve-vector-bits=scalable -ffast-math" } */
+
+#include <stdint.h>
+
+#define VEC_PERM(TYPE) \
+void __attribute__ ((noinline, noclone)) \
+vec_slp_##TYPE (TYPE *restrict a, TYPE *restrict b, int n) \
+{ \
+ TYPE x0 = b[0]; \
+ TYPE x1 = b[1]; \
+ TYPE x2 = b[2]; \
+ for (int i = 0; i < n; ++i) \
+ { \
+ x0 += a[i * 3]; \
+ x1 += a[i * 3 + 1]; \
+ x2 += a[i * 3 + 2]; \
+ } \
+ b[0] = x0; \
+ b[1] = x1; \
+ b[2] = x2; \
+}
+
+#define TEST_ALL(T) \
+ T (int8_t) \
+ T (uint8_t) \
+ T (int16_t) \
+ T (uint16_t) \
+ T (int32_t) \
+ T (uint32_t) \
+ T (int64_t) \
+ T (uint64_t) \
+ T (_Float16) \
+ T (float) \
+ T (double)
+
+TEST_ALL (VEC_PERM)
+
+/* These loops can't use SLP. */
+/* { dg-final { scan-assembler-not {\tld1b\t} } } */
+/* { dg-final { scan-assembler-not {\tld1h\t} } } */
+/* { dg-final { scan-assembler-not {\tld1w\t} } } */
+/* { dg-final { scan-assembler-not {\tld1d\t} } } */
+/* { dg-final { scan-assembler {\tld3b\t} } } */
+/* { dg-final { scan-assembler {\tld3h\t} } } */
+/* { dg-final { scan-assembler {\tld3w\t} } } */
+/* { dg-final { scan-assembler {\tld3d\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_slp_6_run.c b/gcc/testsuite/gcc.target/aarch64/sve_slp_6_run.c
new file mode 100644
index 00000000000..e2ad116f91d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_slp_6_run.c
@@ -0,0 +1,37 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve -ffast-math" } */
+
+#include "sve_slp_6.c"
+
+#define N (77 * 3)
+
+#define HARNESS(TYPE) \
+ { \
+ TYPE a[N], b[3] = { 40, 22, 75 }; \
+ for (unsigned int i = 0; i < N; ++i) \
+ { \
+ a[i] = i * 2 + i % 5; \
+ asm volatile ("" ::: "memory"); \
+ } \
+ vec_slp_##TYPE (a, b, N / 3); \
+ TYPE x0 = 40; \
+ TYPE x1 = 22; \
+ TYPE x2 = 75; \
+ for (unsigned int i = 0; i < N; i += 3) \
+ { \
+ x0 += a[i]; \
+ x1 += a[i + 1]; \
+ x2 += a[i + 2]; \
+ asm volatile ("" ::: "memory"); \
+ } \
+ /* _Float16 isn't precise enough for this. */ \
+ if ((TYPE) 0x1000 + 1 != (TYPE) 0x1000 \
+ && (x0 != b[0] || x1 != b[1] || x2 != b[2])) \
+ __builtin_abort (); \
+ }
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+ TEST_ALL (HARNESS)
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_slp_7.c b/gcc/testsuite/gcc.target/aarch64/sve_slp_7.c
new file mode 100644
index 00000000000..6613ea6dd2d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_slp_7.c
@@ -0,0 +1,66 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve -msve-vector-bits=scalable -ffast-math" } */
+
+#include <stdint.h>
+
+#define VEC_PERM(TYPE) \
+void __attribute__ ((noinline, noclone)) \
+vec_slp_##TYPE (TYPE *restrict a, TYPE *restrict b, int n) \
+{ \
+ TYPE x0 = b[0]; \
+ TYPE x1 = b[1]; \
+ TYPE x2 = b[2]; \
+ TYPE x3 = b[3]; \
+ for (int i = 0; i < n; ++i) \
+ { \
+ x0 += a[i * 4]; \
+ x1 += a[i * 4 + 1]; \
+ x2 += a[i * 4 + 2]; \
+ x3 += a[i * 4 + 3]; \
+ } \
+ b[0] = x0; \
+ b[1] = x1; \
+ b[2] = x2; \
+ b[3] = x3; \
+}
+
+#define TEST_ALL(T) \
+ T (int8_t) \
+ T (uint8_t) \
+ T (int16_t) \
+ T (uint16_t) \
+ T (int32_t) \
+ T (uint32_t) \
+ T (int64_t) \
+ T (uint64_t) \
+ T (_Float16) \
+ T (float) \
+ T (double)
+
+TEST_ALL (VEC_PERM)
+
+/* We can't use SLP for the 64-bit loops, since the number of reduction
+ results might be greater than the number of elements in the vector.
+ Otherwise we have two loads per loop, one for the initial vector
+ and one for the loop body. */
+/* ??? At present we don't treat the int8_t and int16_t loops as
+ reductions. */
+/* { dg-final { scan-assembler-times {\tld1b\t} 2 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tld1h\t} 3 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tld1b\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1h\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tld1w\t} 3 } } */
+/* { dg-final { scan-assembler-times {\tld4d\t} 3 } } */
+/* { dg-final { scan-assembler-not {\tld4b\t} } } */
+/* { dg-final { scan-assembler-not {\tld4h\t} } } */
+/* { dg-final { scan-assembler-not {\tld4w\t} } } */
+/* { dg-final { scan-assembler-not {\tld1d\t} } } */
+/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.b} 8 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.h} 8 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.b} 4 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.h} 4 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.s} 8 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.d} 8 } } */
+/* { dg-final { scan-assembler-times {\tfaddv\th[0-9]+, p[0-7], z[0-9]+\.h} 4 } } */
+/* { dg-final { scan-assembler-times {\tfaddv\ts[0-9]+, p[0-7], z[0-9]+\.s} 4 } } */
+/* { dg-final { scan-assembler-times {\tfaddv\td[0-9]+, p[0-7], z[0-9]+\.d} 4 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_slp_7_run.c b/gcc/testsuite/gcc.target/aarch64/sve_slp_7_run.c
new file mode 100644
index 00000000000..5a8bf99bc5b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_slp_7_run.c
@@ -0,0 +1,39 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve -ffast-math" } */
+
+#include "sve_slp_7.c"
+
+#define N (54 * 4)
+
+#define HARNESS(TYPE) \
+ { \
+ TYPE a[N], b[4] = { 40, 22, 75, 19 }; \
+ for (unsigned int i = 0; i < N; ++i) \
+ { \
+ a[i] = i * 2 + i % 5; \
+ asm volatile ("" ::: "memory"); \
+ } \
+ vec_slp_##TYPE (a, b, N / 4); \
+ TYPE x0 = 40; \
+ TYPE x1 = 22; \
+ TYPE x2 = 75; \
+ TYPE x3 = 19; \
+ for (unsigned int i = 0; i < N; i += 4) \
+ { \
+ x0 += a[i]; \
+ x1 += a[i + 1]; \
+ x2 += a[i + 2]; \
+ x3 += a[i + 3]; \
+ asm volatile ("" ::: "memory"); \
+ } \
+ /* _Float16 isn't precise enough for this. */ \
+ if ((TYPE) 0x1000 + 1 != (TYPE) 0x1000 \
+ && (x0 != b[0] || x1 != b[1] || x2 != b[2] || x3 != b[3])) \
+ __builtin_abort (); \
+ }
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+ TEST_ALL (HARNESS)
+}
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 793c22ac61d..2fbc0826fd3 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -50,6 +50,7 @@ along with GCC; see the file COPYING3. If not see
#include "cgraph.h"
#include "tree-cfg.h"
#include "tree-if-conv.h"
+#include "internal-fn.h"
/* Loop Vectorization Pass.
@@ -2449,6 +2450,54 @@ reduction_code_for_scalar_code (enum tree_code code,
}
}
+/* If there is a neutral value X such that SLP reduction NODE would not
+ be affected by the introduction of additional X elements, return that X,
+ otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
+ is true if the SLP statements perform a single reduction, false if each
+ statement performs an independent reduction. */
+
+static tree
+neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
+ bool reduc_chain)
+{
+ vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
+ gimple *stmt = stmts[0];
+ stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
+ tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
+ tree scalar_type = TREE_TYPE (vector_type);
+ struct loop *loop = gimple_bb (stmt)->loop_father;
+ gcc_assert (loop);
+
+ switch (code)
+ {
+ case WIDEN_SUM_EXPR:
+ case DOT_PROD_EXPR:
+ case SAD_EXPR:
+ case PLUS_EXPR:
+ case MINUS_EXPR:
+ case BIT_IOR_EXPR:
+ case BIT_XOR_EXPR:
+ return build_zero_cst (scalar_type);
+
+ case MULT_EXPR:
+ return build_one_cst (scalar_type);
+
+ case BIT_AND_EXPR:
+ return build_all_ones_cst (scalar_type);
+
+ case MAX_EXPR:
+ case MIN_EXPR:
+ /* For MIN/MAX the initial values are neutral. A reduction chain
+ has only a single initial value, so that value is neutral for
+ all statements. */
+ if (reduc_chain)
+ return PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop));
+ return NULL_TREE;
+
+ default:
+ return NULL_TREE;
+ }
+}
/* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
STMT is printed with a message MSG. */
@@ -4014,6 +4063,7 @@ get_initial_def_for_reduction (gimple *stmt, tree init_val,
int int_init_val = 0;
gimple *def_stmt = NULL;
gimple_seq stmts = NULL;
+ unsigned HOST_WIDE_INT count;
gcc_assert (vectype);
nunits = TYPE_VECTOR_SUBPARTS (vectype);
@@ -4086,13 +4136,19 @@ get_initial_def_for_reduction (gimple *stmt, tree init_val,
/* Option1: the first element is '0' or '1' as well. */
init_def = gimple_build_vector_from_val (&stmts, vectype,
def_for_init);
+ else if (!nunits.is_constant (&count))
+ {
+ /* Option2 (variable length): the first element is INIT_VAL. */
+ init_def = build_vector_from_val (vectype, def_for_init);
+ gcall *call = gimple_build_call_internal (IFN_VEC_SHL_INSERT,
+ 2, init_def, init_val);
+ init_def = make_ssa_name (vectype);
+ gimple_call_set_lhs (call, init_def);
+ gimple_seq_add_stmt (&stmts, call);
+ }
else
{
- /* Option2: the first element is INIT_VAL. */
-
- /* Enforced by vectorizable_reduction (which disallows double
- reductions with variable-length vectors). */
- unsigned int count = nunits.to_constant ();
+ /* Option2 (constant length): the first element is INIT_VAL. */
auto_vec<tree, 32> elts (count);
elts.quick_push (init_val);
for (unsigned int i = 1; i < count; ++i)
@@ -4130,34 +4186,32 @@ get_initial_def_for_reduction (gimple *stmt, tree init_val,
}
/* Get at the initial defs for the reduction PHIs in SLP_NODE.
- NUMBER_OF_VECTORS is the number of vector defs to create. */
+ NUMBER_OF_VECTORS is the number of vector defs to create.
+ If NEUTRAL_OP is nonnull, introducing extra elements of that
+ value will not change the result. */
static void
get_initial_defs_for_reduction (slp_tree slp_node,
vec<tree> *vec_oprnds,
unsigned int number_of_vectors,
- enum tree_code code, bool reduc_chain)
+ bool reduc_chain, tree neutral_op)
{
vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
gimple *stmt = stmts[0];
stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
- unsigned nunits;
+ unsigned HOST_WIDE_INT nunits;
unsigned j, number_of_places_left_in_vector;
- tree vector_type, scalar_type;
+ tree vector_type;
tree vop;
int group_size = stmts.length ();
unsigned int vec_num, i;
unsigned number_of_copies = 1;
vec<tree> voprnds;
voprnds.create (number_of_vectors);
- tree neutral_op = NULL;
struct loop *loop;
+ auto_vec<tree, 16> permute_results;
vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
- scalar_type = TREE_TYPE (vector_type);
- /* vectorizable_reduction has already rejected SLP reductions on
- variable-length vectors. */
- nunits = TYPE_VECTOR_SUBPARTS (vector_type).to_constant ();
gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
@@ -4165,45 +4219,7 @@ get_initial_defs_for_reduction (slp_tree slp_node,
gcc_assert (loop);
edge pe = loop_preheader_edge (loop);
- /* op is the reduction operand of the first stmt already. */
- /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
- we need either neutral operands or the original operands. See
- get_initial_def_for_reduction() for details. */
- switch (code)
- {
- case WIDEN_SUM_EXPR:
- case DOT_PROD_EXPR:
- case SAD_EXPR:
- case PLUS_EXPR:
- case MINUS_EXPR:
- case BIT_IOR_EXPR:
- case BIT_XOR_EXPR:
- neutral_op = build_zero_cst (scalar_type);
- break;
-
- case MULT_EXPR:
- neutral_op = build_one_cst (scalar_type);
- break;
-
- case BIT_AND_EXPR:
- neutral_op = build_all_ones_cst (scalar_type);
- break;
-
- /* For MIN/MAX we don't have an easy neutral operand but
- the initial values can be used fine here. Only for
- a reduction chain we have to force a neutral element. */
- case MAX_EXPR:
- case MIN_EXPR:
- if (! reduc_chain)
- neutral_op = NULL;
- else
- neutral_op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
- break;
-
- default:
- gcc_assert (! reduc_chain);
- neutral_op = NULL;
- }
+ gcc_assert (!reduc_chain || neutral_op);
/* NUMBER_OF_COPIES is the number of times we need to use the same values in
created vectors. It is greater than 1 if unrolling is performed.
@@ -4221,6 +4237,9 @@ get_initial_defs_for_reduction (slp_tree slp_node,
(s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
{s5, s6, s7, s8}. */
+ if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
+ nunits = group_size;
+
number_of_copies = nunits * number_of_vectors / group_size;
number_of_places_left_in_vector = nunits;
@@ -4247,7 +4266,40 @@ get_initial_defs_for_reduction (slp_tree slp_node,
if (number_of_places_left_in_vector == 0)
{
gimple_seq ctor_seq = NULL;
- tree init = gimple_build_vector (&ctor_seq, vector_type, elts);
+ tree init;
+ if (must_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
+ /* Build the vector directly from ELTS. */
+ init = gimple_build_vector (&ctor_seq, vector_type, elts);
+ else if (neutral_op)
+ {
+ /* Build a vector of the neutral value and shift the
+ other elements into place. */
+ init = gimple_build_vector_from_val (&ctor_seq, vector_type,
+ neutral_op);
+ int k = nunits;
+ while (k > 0 && elts[k - 1] == neutral_op)
+ k -= 1;
+ while (k > 0)
+ {
+ k -= 1;
+ gcall *call = gimple_build_call_internal
+ (IFN_VEC_SHL_INSERT, 2, init, elts[k]);
+ init = make_ssa_name (vector_type);
+ gimple_call_set_lhs (call, init);
+ gimple_seq_add_stmt (&ctor_seq, call);
+ }
+ }
+ else
+ {
+ /* First time round, duplicate ELTS to fill the
+ required number of vectors, then cherry pick the
+ appropriate result for each iteration. */
+ if (vec_oprnds->is_empty ())
+ duplicate_and_interleave (&ctor_seq, vector_type, elts,
+ number_of_vectors,
+ permute_results);
+ init = permute_results[number_of_vectors - j - 1];
+ }
if (ctor_seq != NULL)
gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
voprnds.quick_push (init);
@@ -4317,6 +4369,8 @@ get_initial_defs_for_reduction (slp_tree slp_node,
DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
SLP_NODE is an SLP node containing a group of reduction statements. The
first one in this group is STMT.
+ NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
+ null if this is not an SLP reduction.
This function:
1. Creates the reduction def-use cycles: sets the arguments for
@@ -4364,7 +4418,8 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
vec<gimple *> reduction_phis,
bool double_reduc,
slp_tree slp_node,
- slp_instance slp_node_instance)
+ slp_instance slp_node_instance,
+ tree neutral_op)
{
stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
stmt_vec_info prev_phi_info;
@@ -4400,6 +4455,7 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
auto_vec<tree> vec_initial_defs;
auto_vec<gimple *> phis;
bool slp_reduc = false;
+ bool direct_slp_reduc;
tree new_phi_result;
gimple *inner_phi = NULL;
tree induction_index = NULL_TREE;
@@ -4443,8 +4499,9 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
vec_initial_defs.reserve (vec_num);
get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
- &vec_initial_defs, vec_num, code,
- GROUP_FIRST_ELEMENT (stmt_info));
+ &vec_initial_defs, vec_num,
+ GROUP_FIRST_ELEMENT (stmt_info),
+ neutral_op);
}
else
{
@@ -4738,6 +4795,12 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
b2 = operation (b1) */
slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
+ /* True if we should implement SLP_REDUC using native reduction operations
+ instead of scalar operations. */
+ direct_slp_reduc = (reduc_code != ERROR_MARK
+ && slp_reduc
+ && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
+
/* In case of reduction chain, e.g.,
# a1 = phi <a3, a0>
a2 = operation (a1)
@@ -4745,7 +4808,7 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
we may end up with more than one vector result. Here we reduce them to
one vector. */
- if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
+ if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) || direct_slp_reduc)
{
tree first_vect = PHI_RESULT (new_phis[0]);
gassign *new_vec_stmt = NULL;
@@ -5032,6 +5095,81 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
scalar_results.safe_push (new_temp);
}
+ else if (direct_slp_reduc)
+ {
+ /* Here we create one vector for each of the GROUP_SIZE results,
+ with the elements for other SLP statements replaced with the
+ neutral value. We can then do a normal reduction on each vector. */
+
+ /* Enforced by vectorizable_reduction. */
+ gcc_assert (new_phis.length () == 1);
+ gcc_assert (pow2p_hwi (group_size));
+
+ slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
+ vec<gimple *> orig_phis = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
+ gimple_seq seq = NULL;
+
+ /* Build a vector {0, 1, 2, ...}, with the same number of elements
+ and the same element size as VECTYPE. */
+ tree index = build_index_vector (vectype, 0, 1);
+ tree index_type = TREE_TYPE (index);
+ tree index_elt_type = TREE_TYPE (index_type);
+ tree mask_type = build_same_sized_truth_vector_type (index_type);
+
+ /* Create a vector that, for each element, identifies which of
+ the GROUP_SIZE results should use it. */
+ tree index_mask = build_int_cst (index_elt_type, group_size - 1);
+ index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
+ build_vector_from_val (index_type, index_mask));
+
+ /* Get a neutral vector value. This is simply a splat of the neutral
+ scalar value if we have one, otherwise the initial scalar value
+ is itself a neutral value. */
+ tree vector_identity = NULL_TREE;
+ if (neutral_op)
+ vector_identity = gimple_build_vector_from_val (&seq, vectype,
+ neutral_op);
+ for (unsigned int i = 0; i < group_size; ++i)
+ {
+ /* If there's no universal neutral value, we can use the
+ initial scalar value from the original PHI. This is used
+ for MIN and MAX reduction, for example. */
+ if (!neutral_op)
+ {
+ tree scalar_value
+ = PHI_ARG_DEF_FROM_EDGE (orig_phis[i],
+ loop_preheader_edge (loop));
+ vector_identity = gimple_build_vector_from_val (&seq, vectype,
+ scalar_value);
+ }
+
+ /* Calculate the equivalent of:
+
+ sel[j] = (index[j] == i);
+
+ which selects the elements of NEW_PHI_RESULT that should
+ be included in the result. */
+ tree compare_val = build_int_cst (index_elt_type, i);
+ compare_val = build_vector_from_val (index_type, compare_val);
+ tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
+ index, compare_val);
+
+ /* Calculate the equivalent of:
+
+ vec = sel ? new_phi_result : vector_identity;
+
+ VEC is now suitable for a full vector reduction. */
+ tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
+ sel, new_phi_result, vector_identity);
+
+ /* Do the reduction and convert it to the appropriate type. */
+ tree scalar = gimple_build (&seq, reduc_code,
+ TREE_TYPE (vectype), vec);
+ scalar = gimple_convert (&seq, scalar_type, scalar);
+ scalar_results.safe_push (scalar);
+ }
+ gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
+ }
else
{
bool reduce_with_shift = have_whole_vector_shift (mode);
@@ -6255,25 +6393,64 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
return false;
}
- if (double_reduc && !nunits_out.is_constant ())
+ /* For SLP reductions, see if there is a neutral value we can use. */
+ tree neutral_op = NULL_TREE;
+ if (slp_node)
+ neutral_op
+ = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis, code,
+ GROUP_FIRST_ELEMENT (stmt_info) != NULL);
+
+ /* For double reductions, and for SLP reductions with a neutral value,
+ we construct a variable-length initial vector by loading a vector
+ full of the neutral value and then shift-and-inserting the start
+ values into the low-numbered elements. */
+ if ((double_reduc || neutral_op)
+ && !nunits_out.is_constant ()
+ && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
+ vectype_out, OPTIMIZE_FOR_SPEED))
{
- /* The current double-reduction code creates the initial value
- element-by-element. */
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "double reduction not supported for variable-length"
- " vectors.\n");
+ "reduction on variable-length vectors requires"
+ " target support for a vector-shift-and-insert"
+ " operation.\n");
return false;
}
- if (slp_node && !nunits_out.is_constant ())
- {
- /* The current SLP code creates the initial value element-by-element. */
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "SLP reduction not supported for variable-length"
- " vectors.\n");
- return false;
+ /* Check extra constraints for variable-length unchained SLP reductions. */
+ if (STMT_SLP_TYPE (stmt_info)
+ && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
+ && !nunits_out.is_constant ())
+ {
+ /* We checked above that we could build the initial vector when
+ there's a neutral element value. Check here for the case in
+ which each SLP statement has its own initial value and in which
+ that value needs to be repeated for every instance of the
+ statement within the initial vector. */
+ unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
+ scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
+ if (!neutral_op
+ && !can_duplicate_and_interleave_p (group_size, elt_mode))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "unsupported form of SLP reduction for"
+ " variable-length vectors: cannot build"
+ " initial vector.\n");
+ return false;
+ }
+ /* The epilogue code relies on the number of elements being a multiple
+ of the group size. The duplicate-and-interleave approach to setting
+ up the initial vector does too. */
+ if (!multiple_p (nunits_out, group_size))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "unsupported form of SLP reduction for"
+ " variable-length vectors: the vector size"
+ " is not a multiple of the number of results.\n");
+ return false;
+ }
}
/* In case of widening multiplication by a constant, we update the type
@@ -6541,7 +6718,8 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
epilog_copies,
epilog_reduc_code, phis,
- double_reduc, slp_node, slp_node_instance);
+ double_reduc, slp_node, slp_node_instance,
+ neutral_op);
return true;
}
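[Editorial note: to summarise the direct_slp_reduc epilogue added above,
element j of the final accumulator belongs to SLP statement
j & (group_size - 1) (group_size is asserted to be a power of two), and
each scalar result reduces a copy of the accumulator in which every
other statement's elements are replaced by the neutral value.  A scalar
model of one such reduction, assuming PLUS; a sketch only, not the
emitted GIMPLE:]

    /* Scalar result for SLP statement I of a PLUS reduction over the
       accumulator ACC.  The select mirrors the BIT_AND_EXPR, EQ_EXPR
       and VEC_COND_EXPR sequence; the final sum mirrors the hardware
       reduction (UADDV/FADDV on SVE).  */
    static int
    reduce_one_slp_result_model (const int *acc, int nelts,
                                 int group_size, int i)
    {
      int result = 0;
      for (int j = 0; j < nelts; ++j)
        result += ((j & (group_size - 1)) == i ? acc[j] : 0);
      return result;
    }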
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index abd2d9bb615..2c633e7e92b 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -214,10 +214,10 @@ vect_get_place_in_interleaving_chain (gimple *stmt, gimple *first_stmt)
(if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
(if nonnull). */
-static bool
+bool
can_duplicate_and_interleave_p (unsigned int count, machine_mode elt_mode,
- unsigned int *nvectors_out = NULL,
- tree *vector_type_out = NULL)
+ unsigned int *nvectors_out,
+ tree *vector_type_out)
{
poly_int64 elt_bytes = count * GET_MODE_SIZE (elt_mode);
poly_int64 nelts;
@@ -3277,7 +3277,7 @@ vect_mask_constant_operand_p (gimple *stmt, int opnum)
We try to find the largest IM for which this sequence works, in order
to cut down on the number of interleaves. */
-static void
+void
duplicate_and_interleave (gimple_seq *seq, tree vector_type, vec<tree> elts,
unsigned int nresults, vec<tree> &results)
{
@@ -3559,6 +3559,26 @@ vect_get_constant_vectors (tree op, slp_tree slp_node,
if (must_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
/* Build the vector directly from ELTS. */
vec_cst = gimple_build_vector (&ctor_seq, vector_type, elts);
+ else if (neutral_op)
+ {
+ /* Build a vector of the neutral value and shift the
+ other elements into place. */
+ vec_cst = gimple_build_vector_from_val (&ctor_seq,
+ vector_type,
+ neutral_op);
+ int k = nunits;
+ while (k > 0 && elts[k - 1] == neutral_op)
+ k -= 1;
+ while (k > 0)
+ {
+ k -= 1;
+ gcall *call = gimple_build_call_internal
+ (IFN_VEC_SHL_INSERT, 2, vec_cst, elts[k]);
+ vec_cst = make_ssa_name (vector_type);
+ gimple_call_set_lhs (call, vec_cst);
+ gimple_seq_add_stmt (&ctor_seq, call);
+ }
+ }
else
{
if (vec_oprnds->is_empty ())
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index f4e29c0c662..224f4a67459 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -1348,6 +1348,11 @@ extern void vect_get_slp_defs (vec<tree> , slp_tree, vec<vec<tree> > *);
extern bool vect_slp_bb (basic_block);
extern gimple *vect_find_last_scalar_stmt_in_slp (slp_tree);
extern bool is_simple_and_all_uses_invariant (gimple *, loop_vec_info);
+extern bool can_duplicate_and_interleave_p (unsigned int, machine_mode,
+ unsigned int * = NULL,
+ tree * = NULL);
+extern void duplicate_and_interleave (gimple_seq *, tree, vec<tree>,
+ unsigned int, vec<tree> &);
/* In tree-vect-patterns.c. */
/* Pattern recognition functions.