author     Richard Sandiford <richard.sandiford@linaro.org>  2017-10-08 12:29:08 +0100
committer  Richard Sandiford <richard.sandiford@linaro.org>  2017-11-20 16:01:23 +0000
commit     f8d18515fd0e87fe9b68a23e6d73b80064baec97 (patch)
tree       bf7c7a9d43942db218b3786c21fea82242787adf
parent     f2ba9afa2bfb21956959a5ab48be1ef1ce2e6f2c (diff)
Allow capped vectorisation factors
[Branch only patch -- not intended for trunk in its current state]

This patch allows the controlling mask for a loop to be clamped to a
compile-time maximum.  We can then vectorise code that has a known
dependence distance, even if it is (or might be) smaller than a vector.

This is almost ready for trunk, but I'd prefer to do some more checking
first.
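As a concrete illustration of the kind of loop this enables (the new
sve_cap_* tests below exercise close variants), consider a dependence
distance of two elements; capping the mask to two active lanes per
vector iteration keeps the dependence safe instead of forcing the
vectoriser to give up:

    /* Sketch only: a[i] depends on a[i - 2], so at most 2 scalar
       iterations can execute in parallel.  The controlling mask is
       clamped to 2 active elements even when the vector is longer.  */
    void
    f (int *a, int n)
    {
      for (int i = 0; i < n; ++i)
        a[i] += a[i - 2];
    }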
 gcc/config/aarch64/aarch64-sve.md                 |  20
 gcc/config/aarch64/aarch64.md                     |   1
 gcc/internal-fn.def                               |   3
 gcc/optabs.def                                    |   2
 gcc/testsuite/gcc.dg/vect/slp-28.c                |   4
 gcc/testsuite/gcc.target/aarch64/sve_cap_1.c      |  44
 gcc/testsuite/gcc.target/aarch64/sve_cap_1_run.c  |  37
 gcc/testsuite/gcc.target/aarch64/sve_cap_2.c      |  53
 gcc/testsuite/gcc.target/aarch64/sve_cap_2_run.c  |  44
 gcc/testsuite/gcc.target/aarch64/sve_cap_3.c      |  53
 gcc/testsuite/gcc.target/aarch64/sve_cap_3_run.c  |  44
 gcc/testsuite/gcc.target/aarch64/sve_cap_4.c      |  42
 gcc/testsuite/gcc.target/aarch64/sve_cap_4_run.c  |  37
 gcc/testsuite/gcc.target/aarch64/sve_cap_5.c      |  24
 gcc/testsuite/gcc.target/aarch64/sve_cap_5_run.c  |  40
 gcc/tree-ssa-loop-ivopts.c                        |  19
 gcc/tree-vect-data-refs.c                         |   7
 gcc/tree-vect-loop-manip.c                        | 102
 gcc/tree-vect-loop.c                              | 134
 gcc/tree-vectorizer.h                             |  35
 gcc/tree-vect-stmts.c                             |  80
 21 files changed, 751 insertions, 74 deletions
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 0ab9c7b0ae8..5d84b7fc595 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -2379,6 +2379,26 @@
"<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %2.<Vetype>"
)
+(define_expand "mask_popcount<mode>"
+ [(set (match_operand:DI 0 "register_operand")
+ (unspec:DI [(match_dup 2)
+ (match_operand:PRED_ALL 1 "register_operand")]
+ UNSPEC_CNTP))]
+ "TARGET_SVE"
+ {
+ operands[2] = force_reg (<MODE>mode, CONSTM1_RTX (<MODE>mode));
+ }
+)
+
+(define_insn "*mask_popcount<mode>"
+ [(set (match_operand:DI 0 "register_operand" "=r")
+ (unspec:DI [(match_operand:PRED_ALL 1 "register_operand" "Upa")
+ (match_operand:PRED_ALL 2 "register_operand" "Upa")]
+ UNSPEC_CNTP))]
+ "TARGET_SVE"
+ "cntp\t%0, %1, %2.<Vetype>"
+)
+
;; Shift an SVE vector left and insert a scalar into element 0.
(define_insn "vec_shl_insert_<mode>"
[(set (match_operand:SVE_ALL 0 "register_operand" "=w, w")
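For background: CNTP counts the active elements of its second predicate
operand under the governing predicate, so forcing operand 2 to an
all-true value in the expander yields a plain population count of
operand 1.  A scalar C model of the semantics (an illustration, not
what the compiler emits):

    /* Illustrative model only: the runtime vectorisation factor is the
       number of active lanes in the controlling predicate.  NLANES
       stands in for the hardware vector length, which is unknown at
       compile time.  */
    static unsigned long
    mask_popcount_model (const _Bool *mask, unsigned long nlanes)
    {
      unsigned long count = 0;
      for (unsigned long i = 0; i < nlanes; ++i)
        count += mask[i];
      return count;
    }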
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 0a4c90c1a39..581e6a753d2 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -167,6 +167,7 @@
UNSPEC_INSR
UNSPEC_CLASTB
UNSPEC_FADDA
+ UNSPEC_CNTP
])
(define_c_enum "unspecv" [
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 3cc62dd68a6..f28519837f2 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -125,6 +125,9 @@ DEF_INTERNAL_OPTAB_FN (STORE_LANES, ECF_CONST, vec_store_lanes, store_lanes)
DEF_INTERNAL_OPTAB_FN (MASK_STORE_LANES, 0,
vec_mask_store_lanes, mask_store_lanes)
+DEF_INTERNAL_OPTAB_FN (MASK_POPCOUNT, ECF_CONST | ECF_NOTHROW,
+ mask_popcount, unary)
+
DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while)
DEF_INTERNAL_OPTAB_FN (VEC_INTERLEAVE_LO, ECF_CONST | ECF_NOTHROW,
diff --git a/gcc/optabs.def b/gcc/optabs.def
index 523906aa198..bf67dfca132 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -397,4 +397,6 @@ OPTAB_D (mask_scatter_store_optab, "mask_scatter_store$a")
OPTAB_DC (vec_duplicate_optab, "vec_duplicate$a", VEC_DUPLICATE)
OPTAB_DC (vec_series_optab, "vec_series$a", VEC_SERIES)
+
+OPTAB_D (mask_popcount_optab, "mask_popcount$a")
OPTAB_D (vec_shl_insert_optab, "vec_shl_insert_$a")
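Together with the internal-fn.def hunk above, this wires the new
operation through GCC's usual layers: the vectoriser emits an
IFN_MASK_POPCOUNT call, the internal function maps to
mask_popcount_optab, and the optab is implemented by the
mask_popcount<mode> expander, i.e. a single CNTP on SVE.  The preheader
GIMPLE then looks roughly like this (a sketch; SSA names invented):

    cap_mask_1 = .WHILE_ULT (0, max_vf_2);
    vf_3 = .MASK_POPCOUNT (cap_mask_1);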
diff --git a/gcc/testsuite/gcc.dg/vect/slp-28.c b/gcc/testsuite/gcc.dg/vect/slp-28.c
index 7778bad4465..4211b94ad7f 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-28.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-28.c
@@ -88,6 +88,6 @@ int main (void)
return 0;
}
-/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_cap_1.c b/gcc/testsuite/gcc.target/aarch64/sve_cap_1.c
new file mode 100644
index 00000000000..1051fd1f7f0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_cap_1.c
@@ -0,0 +1,44 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -fno-inline -march=armv8-a+sve -fno-vect-cost-model" } */
+
+#define LOOP(TYPE) \
+ void \
+ f_##TYPE##_1 (TYPE *a, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ a[i] += a[i - 1]; \
+ } \
+ \
+ void \
+ f_##TYPE##_2 (TYPE *a, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ a[i] += a[i - 2]; \
+ } \
+ \
+ void \
+ f_##TYPE##_5 (TYPE *a, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ a[i] += a[i - 5]; \
+ }
+
+LOOP (char)
+LOOP (short)
+LOOP (float)
+LOOP (double)
+
+/* { dg-final { scan-assembler-times {\tstrb\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tstrh\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tstr\ts[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {\tstr\td[0-9]+} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.b, p[0-7]/z, \[x[0-9]+, x[0-9]+\]} 4 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.h, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 1\]} 4 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 2\]} 4 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 3\]} 4 } } */
+
+/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.b, p[0-7], \[x[0-9]+, x[0-9]+\]} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.h, p[0-7], \[x[0-9]+, x[0-9]+, lsl 1\]} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, x[0-9]+, lsl 2\]} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, x[0-9]+, lsl 3\]} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_cap_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve_cap_1_run.c
new file mode 100644
index 00000000000..0f280b04f0b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_cap_1_run.c
@@ -0,0 +1,37 @@
+/* { dg-do run { target { aarch64_sve_hw } } } */
+/* { dg-options "-O2 -ftree-vectorize -fno-inline -march=armv8-a+sve -fno-vect-cost-model" } */
+
+#include "sve_cap_1.c"
+
+#define N 129
+
+#define F(X) (((X) % 5) * (X))
+
+#define TEST_LOOP(TYPE, M) \
+ { \
+ TYPE a[N + M]; \
+ for (int i = 0; i < N + M; ++i) \
+ a[i] = F (i); \
+ f_##TYPE##_##M (a + M, N); \
+ for (int i = 0; i < N; ++i) \
+ { \
+ TYPE x = a[i]; \
+ TYPE y = F (i + M); \
+ if (a[i + M] != (TYPE) (x + y)) \
+ __builtin_abort (); \
+ } \
+ }
+
+#define TEST_LOOPS(TYPE) \
+ TEST_LOOP (TYPE, 1) \
+ TEST_LOOP (TYPE, 2) \
+ TEST_LOOP (TYPE, 5)
+
+int
+main (void)
+{
+ TEST_LOOPS (char);
+ TEST_LOOPS (short);
+ TEST_LOOPS (float);
+ TEST_LOOPS (double);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_cap_2.c b/gcc/testsuite/gcc.target/aarch64/sve_cap_2.c
new file mode 100644
index 00000000000..d46a08c2ee1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_cap_2.c
@@ -0,0 +1,53 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -fno-inline -march=armv8-a+sve -fno-vect-cost-model" } */
+
+#define LOOP(TYPE) \
+ void __attribute__ ((weak)) \
+ f_##TYPE##_1 (TYPE *a, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ { \
+ a[i * 2] += a[i * 2 - 2]; \
+ a[i * 2 + 1] += a[i * 2 - 1]; \
+ } \
+ } \
+ \
+ void __attribute__ ((weak)) \
+ f_##TYPE##_2 (TYPE *a, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ { \
+ a[i * 2] += a[i * 2 - 4]; \
+ a[i * 2 + 1] += a[i * 2 - 3]; \
+ } \
+ } \
+ \
+ void __attribute__ ((weak)) \
+ f_##TYPE##_5 (TYPE *a, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ { \
+ a[i * 2] += a[i * 2 - 10]; \
+ a[i * 2 + 1] += a[i * 2 - 9]; \
+ } \
+ }
+
+LOOP (char)
+LOOP (short)
+LOOP (float)
+LOOP (double)
+
+/* { dg-final { scan-assembler-times {\tstrb\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tstrh\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tstp\ts[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {\tstp\td[0-9]+} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tld1b\t} 4 } } */
+/* { dg-final { scan-assembler-times {\tld1h\t} 4 } } */
+/* { dg-final { scan-assembler-times {\tld1w\t} 4 } } */
+/* { dg-final { scan-assembler-times {\tld1d\t} 4 } } */
+
+/* { dg-final { scan-assembler-times {\tst1b\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1h\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1w\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1d\t} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_cap_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve_cap_2_run.c
new file mode 100644
index 00000000000..0f8ed957c79
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_cap_2_run.c
@@ -0,0 +1,44 @@
+/* { dg-do run { target { aarch64_sve_hw } } } */
+/* { dg-options "-O2 -ftree-vectorize -fno-inline -march=armv8-a+sve -fno-vect-cost-model" } */
+
+#include "sve_cap_2.c"
+
+#define N 129
+
+#define F(X) (((X) % 5) * (X))
+#define G(X) ((X) ^ 39)
+
+#define TEST_LOOP(TYPE, M) \
+ { \
+ TYPE a[(N + M) * 2]; \
+ for (int i = 0; i < N + M; ++i) \
+ { \
+ a[i * 2] = F (i); \
+ a[i * 2 + 1] = G (i); \
+ } \
+ f_##TYPE##_##M (a + M * 2, N); \
+ for (int i = 0; i < N; ++i) \
+ { \
+ TYPE x0 = a[i * 2]; \
+ TYPE y0 = F (i + M); \
+ TYPE x1 = a[i * 2 + 1]; \
+ TYPE y1 = G (i + M); \
+ if (a[(i + M) * 2] != (TYPE) (x0 + y0) \
+ || a[(i + M) * 2 + 1] != (TYPE) (x1 + y1)) \
+ __builtin_abort (); \
+ } \
+ }
+
+#define TEST_LOOPS(TYPE) \
+ TEST_LOOP (TYPE, 1) \
+ TEST_LOOP (TYPE, 2) \
+ TEST_LOOP (TYPE, 5)
+
+int
+main (void)
+{
+ TEST_LOOPS (char);
+ TEST_LOOPS (short);
+ TEST_LOOPS (float);
+ TEST_LOOPS (double);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_cap_3.c b/gcc/testsuite/gcc.target/aarch64/sve_cap_3.c
new file mode 100644
index 00000000000..6515465b7f6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_cap_3.c
@@ -0,0 +1,53 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -fno-inline -march=armv8-a+sve -fno-vect-cost-model -msve-vector-bits=scalable" } */
+
+#define LOOP(TYPE) \
+ void __attribute__ ((weak)) \
+ f_##TYPE##_1 (TYPE *a, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ { \
+ a[i * 2] += a[i * 2 - 2]; \
+ a[i * 2 + 1] -= a[i * 2 - 1]; \
+ } \
+ } \
+ \
+ void __attribute__ ((weak)) \
+ f_##TYPE##_2 (TYPE *a, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ { \
+ a[i * 2] += a[i * 2 - 4]; \
+ a[i * 2 + 1] -= a[i * 2 - 3]; \
+ } \
+ } \
+ \
+ void __attribute__ ((weak)) \
+ f_##TYPE##_5 (TYPE *a, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ { \
+ a[i * 2] += a[i * 2 - 10]; \
+ a[i * 2 + 1] -= a[i * 2 - 9]; \
+ } \
+ }
+
+LOOP (char)
+LOOP (short)
+LOOP (float)
+LOOP (double)
+
+/* { dg-final { scan-assembler-times {\tstrb\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tstrh\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tstp\ts[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {\tstp\td[0-9]+} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tld2b\t} 4 } } */
+/* { dg-final { scan-assembler-times {\tld2h\t} 4 } } */
+/* { dg-final { scan-assembler-times {\tld2w\t} 4 } } */
+/* { dg-final { scan-assembler-times {\tld2d\t} 4 } } */
+
+/* { dg-final { scan-assembler-times {\tst2b\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tst2h\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tst2w\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tst2d\t} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_cap_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve_cap_3_run.c
new file mode 100644
index 00000000000..fe26162a812
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_cap_3_run.c
@@ -0,0 +1,44 @@
+/* { dg-do run { target { aarch64_sve_hw } } } */
+/* { dg-options "-O2 -ftree-vectorize -fno-inline -march=armv8-a+sve -fno-vect-cost-model" } */
+
+#include "sve_cap_3.c"
+
+#define N 129
+
+#define F(X) (((X) % 5) * (X))
+#define G(X) ((X) ^ 39)
+
+#define TEST_LOOP(TYPE, M) \
+ { \
+ TYPE a[(N + M) * 2]; \
+ for (int i = 0; i < N + M; ++i) \
+ { \
+ a[i * 2] = F (i); \
+ a[i * 2 + 1] = G (i); \
+ } \
+ f_##TYPE##_##M (a + M * 2, N); \
+ for (int i = 0; i < N; ++i) \
+ { \
+ TYPE x0 = a[i * 2]; \
+ TYPE y0 = F (i + M); \
+ TYPE x1 = a[i * 2 + 1]; \
+ TYPE y1 = G (i + M); \
+ if (a[(i + M) * 2] != (TYPE) (y0 + x0) \
+ || a[(i + M) * 2 + 1] != (TYPE) (y1 - x1)) \
+ __builtin_abort (); \
+ } \
+ }
+
+#define TEST_LOOPS(TYPE) \
+ TEST_LOOP (TYPE, 1) \
+ TEST_LOOP (TYPE, 2) \
+ TEST_LOOP (TYPE, 5)
+
+int
+main (void)
+{
+ TEST_LOOPS (char);
+ TEST_LOOPS (short);
+ TEST_LOOPS (float);
+ TEST_LOOPS (double);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_cap_4.c b/gcc/testsuite/gcc.target/aarch64/sve_cap_4.c
new file mode 100644
index 00000000000..c3bf2f326d3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_cap_4.c
@@ -0,0 +1,42 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -fno-inline -march=armv8-a+sve -fno-vect-cost-model" } */
+
+#define LOOP(TYPE) \
+ void \
+ f_##TYPE##_1 (TYPE *a, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ a[i * 5] += a[i * 5 - 5]; \
+ } \
+ \
+ void \
+ f_##TYPE##_2 (TYPE *a, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ a[i * 5] += a[i * 5 - 10]; \
+ } \
+ \
+ void \
+ f_##TYPE##_5 (TYPE *a, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ a[i * 5] += a[i * 5 - 25]; \
+ }
+
+LOOP (char)
+LOOP (short)
+LOOP (float)
+LOOP (double)
+
+/* At the moment we can't use extending loads and truncating stores.
+ Please add ld and st scan-assemblers below if that changes. */
+/* { dg-final { scan-assembler-times {\tstrb\t} 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tstrh\t} 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tstr\ts[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {\tstr\td[0-9]+} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+\.s, sxtw 2\]} 4 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+\.d, lsl 3\]} 4 } } */
+
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+\.s, sxtw 2\]} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+\.d, lsl 3\]} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_cap_4_run.c b/gcc/testsuite/gcc.target/aarch64/sve_cap_4_run.c
new file mode 100644
index 00000000000..f39bc7fc3cb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_cap_4_run.c
@@ -0,0 +1,37 @@
+/* { dg-do run { target { aarch64_sve_hw } } } */
+/* { dg-options "-O2 -ftree-vectorize -fno-inline -march=armv8-a+sve -fno-vect-cost-model" } */
+
+#include "sve_cap_4.c"
+
+#define N 129
+
+#define F(X) (((X) % 5) * (X))
+
+#define TEST_LOOP(TYPE, M) \
+ { \
+ TYPE a[(N + M) * 5]; \
+ for (int i = 0; i < N + M; ++i) \
+ a[i * 5] = F (i); \
+ f_##TYPE##_##M (a + M * 5, N); \
+ for (int i = 0; i < N; ++i) \
+ { \
+ TYPE x = a[i * 5]; \
+ TYPE y = F (i + M); \
+ if (a[(i + M) * 5] != (TYPE) (x + y)) \
+ __builtin_abort (); \
+ } \
+ }
+
+#define TEST_LOOPS(TYPE) \
+ TEST_LOOP (TYPE, 1) \
+ TEST_LOOP (TYPE, 2) \
+ TEST_LOOP (TYPE, 5)
+
+int
+main (void)
+{
+ TEST_LOOPS (char);
+ TEST_LOOPS (short);
+ TEST_LOOPS (float);
+ TEST_LOOPS (double);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_cap_5.c b/gcc/testsuite/gcc.target/aarch64/sve_cap_5.c
new file mode 100644
index 00000000000..4d4987773b6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_cap_5.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -fno-inline -march=armv8-a+sve -fno-vect-cost-model" } */
+
+void __attribute__ ((noinline, noclone))
+f (double *x, float *y, int n)
+{
+ for (int i = 0; i < n; ++i)
+ {
+ x[i * 3 + 18] = x[i * 3 + 0] + y[i];
+ x[i * 3 + 19] = x[i * 3 + 1] - y[i];
+ x[i * 3 + 20] = x[i * 3 + 2];
+ }
+}
+
+/* { dg-final { scan-assembler-times {\tld1w\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld3d\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tst3d\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tpunpklo\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tpunpkhi\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tuunpklo\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tuunpkhi\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tfcvt\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tfadd\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tfsub\t} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_cap_5_run.c b/gcc/testsuite/gcc.target/aarch64/sve_cap_5_run.c
new file mode 100644
index 00000000000..57fd625d20a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_cap_5_run.c
@@ -0,0 +1,40 @@
+/* { dg-do run { target { aarch64_sve_hw } } } */
+/* { dg-options "-O2 -ftree-vectorize -fno-inline -march=armv8-a+sve -fno-vect-cost-model" } */
+
+#include "sve_cap_5.c"
+
+#define COUNT 5
+#define N ((COUNT + 2) * 6)
+
+int
+main (void)
+{
+ double x[N * 3];
+ float y[N];
+ for (int i = 0; i < N; ++i)
+ {
+ x[i * 3 + 0] = i * 2;
+ x[i * 3 + 1] = i * 3;
+ x[i * 3 + 2] = i * 5;
+ y[i] = i * 4;
+ }
+ f (x, y, COUNT * 6);
+ for (int i = 0; i < N; ++i)
+ {
+ if (i >= 6 && i < (COUNT + 1) * 6)
+ {
+ if (x[i * 3 + 0] != x[i * 3 - 18] + (i - 6) * 4
+ || x[i * 3 + 1] != x[i * 3 - 17] - (i - 6) * 4
+ || x[i * 3 + 2] != x[i * 3 - 16])
+ __builtin_abort ();
+ }
+ else
+ {
+ if (x[i * 3 + 0] != i * 2
+ || x[i * 3 + 1] != i * 3
+ || x[i * 3 + 2] != i * 5)
+ __builtin_abort ();
+ }
+ }
+ return 0;
+}
diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c
index e0ba500fce2..224732e7059 100644
--- a/gcc/tree-ssa-loop-ivopts.c
+++ b/gcc/tree-ssa-loop-ivopts.c
@@ -2143,6 +2143,7 @@ constant_multiple_of (tree top, tree bot, widest_int *mul)
enum tree_code code;
unsigned precision = TYPE_PRECISION (TREE_TYPE (top));
widest_int res, p0, p1;
+ gassign *assign;
STRIP_NOPS (top);
STRIP_NOPS (bot);
@@ -2189,6 +2190,24 @@ constant_multiple_of (tree top, tree bot, widest_int *mul)
*mul = wi::sext (wi::divmod_trunc (p0, p1, SIGNED, &res), precision);
return res == 0;
+ case SSA_NAME:
+ /* Handle one important special case: TOP is an SSA_NAME defined
+ to be BOT * CST. This triggers in vector loops with variable
+ vectorization factors. */
+ assign = dyn_cast <gassign *> (SSA_NAME_DEF_STMT (top));
+ if (assign && gimple_assign_rhs_code (assign) == MULT_EXPR)
+ {
+ tree new_top = gimple_assign_rhs1 (assign);
+ mby = gimple_assign_rhs2 (assign);
+ if (TREE_CODE (mby) == INTEGER_CST
+ && constant_multiple_of (new_top, bot, &res))
+ {
+ *mul = wi::sext (res * wi::to_widest (mby), precision);
+ return true;
+ }
+ }
+ return false;
+
default:
if (POLY_INT_CST_P (top)
&& POLY_INT_CST_P (bot)
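A worked example of the case the new SSA_NAME arm handles (a sketch;
names invented).  With a variable VF, the step of an address IV is
often an SSA name defined as a constant multiple of the runtime VF:

    vf_1 = .MASK_POPCOUNT (cap_mask_2);   /* runtime VF, used as BOT */
    step_3 = vf_1 * 8;                    /* TOP: byte step of a double IV */

constant_multiple_of (step_3, vf_1, &mul) now looks through the
MULT_EXPR that defines step_3, recurses on vf_1 against BOT, and
succeeds with *mul == 8, where previously it would have failed on the
SSA_NAME.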
diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
index 373d4be6581..8b2cf1fbafe 100644
--- a/gcc/tree-vect-data-refs.c
+++ b/gcc/tree-vect-data-refs.c
@@ -53,6 +53,7 @@ along with GCC; see the file COPYING3. If not see
#include "tree-cfg.h"
#include "tree-hash-traits.h"
#include "internal-fn.h"
+#include "gimple-fold.h"
/* Return true if load- or store-lanes optab OPTAB is implemented for
COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
@@ -4782,7 +4783,6 @@ vect_create_addr_base_for_vector_ref (gimple *stmt,
return entry->final_addr;
}
-
/* Function vect_create_data_ref_ptr.
Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
@@ -5013,6 +5013,9 @@ vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
{
if (iv_step == NULL_TREE)
{
+ /* The caller must provide an IV_STEP for capped VF. */
+ gcc_assert (!use_capped_vf (loop_vinfo));
+
/* The step of the aggregate pointer is the type size. */
iv_step = TYPE_SIZE_UNIT (aggr_type);
/* One exception to the above is when the scalar step of the load in
@@ -5143,7 +5146,7 @@ bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
}
- if (!ptr_incr)
+ if (!ptr_incr || use_capped_vf (STMT_VINFO_LOOP_VINFO (stmt_info)))
return new_dataref_ptr;
/* Update the vector-pointer's cross-iteration increment. */
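Both changes reflect the same point: with a capped VF, the data pointer
advances by a runtime amount per vector iteration, so neither the
compile-time bump nor a compile-time cross-iteration increment can be
used.  The resulting IV has roughly this shape (a sketch; names
invented):

    /* Preheader, computed once (see vect_mult_by_vf below):  */
    iv_step_1 = vf_2 * 4;    /* capped VF times the scalar byte step */

    /* Loop header:  */
    ivtmp_3 = PHI <base_4 (preheader), ivtmp_5 (latch)>
    ...
    ivtmp_5 = ivtmp_3 + iv_step_1;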
diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
index 9d9a93e00d5..57aab1b764f 100644
--- a/gcc/tree-vect-loop-manip.c
+++ b/gcc/tree-vect-loop-manip.c
@@ -254,15 +254,31 @@ adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
/* Define one loop mask MASK from loop LOOP. INIT_MASK is the value that
the mask should have during the first iteration and NEXT_MASK is the
- value that it should have on subsequent iterations. */
+ value that it should have on subsequent iterations. CAP_MASK, if
+ nonnull, is a cap that should be applied to each value of the mask
+ before the mask is used; add the statement that does so to HEADER_SEQ. */
static void
-vect_set_loop_mask (struct loop *loop, tree mask, tree init_mask,
- tree next_mask)
+vect_set_loop_mask (struct loop *loop, gimple_seq *header_seq, tree mask,
+ tree init_mask, tree next_mask, tree cap_mask)
{
- gphi *phi = create_phi_node (mask, loop->header);
+ tree mask_type = TREE_TYPE (mask);
+ tree uncapped_mask;
+ if (cap_mask)
+ uncapped_mask = make_temp_ssa_name (mask_type, NULL, "uncapped_mask");
+ else
+ uncapped_mask = mask;
+ gphi *phi = create_phi_node (uncapped_mask, loop->header);
add_phi_arg (phi, init_mask, loop_preheader_edge (loop), UNKNOWN_LOCATION);
add_phi_arg (phi, next_mask, loop_latch_edge (loop), UNKNOWN_LOCATION);
+
+ /* Apply the cap mask, if any. */
+ if (cap_mask)
+ {
+ gimple *stmt = gimple_build_assign (mask, BIT_AND_EXPR,
+ uncapped_mask, cap_mask);
+ gimple_seq_add_stmt (header_seq, stmt);
+ }
}
/* Add SEQ to the end of LOOP's preheader block. */
@@ -355,12 +371,14 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm,
/* Helper for vect_set_loop_condition_masked. Generate definitions for
all the masks in RGM and return a mask that is nonzero when the loop
- needs to iterate. Add any new preheader statements to PREHEADER_SEQ.
- Use LOOP_COND_GSI to insert code before the exit gcond.
+ needs to iterate. Add any new preheader statements to PREHEADER_SEQ
+ and any new header statements to HEADER_SEQ. Use LOOP_COND_GSI to
+ insert code before the exit gcond.
RGM belongs to loop LOOP. The loop originally iterated NITERS
times and has been vectorized according to LOOP_VINFO. Each iteration
- of the vectorized loop handles VF iterations of the scalar loop.
+ of the vectorized loop handles CAPPED_VF iterations of the scalar loop,
+ where CAPPED_VF is bounded by the compile-time vectorization factor.
If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
starts with NITERS_SKIP dummy iterations of the scalar loop before
@@ -374,7 +392,7 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm,
does not overflow. However, MIGHT_WRAP_P says whether an induction
variable that starts at 0 and has step:
- VF * RGM->max_nscalars_per_iter
+ CAPPED_VF * RGM->max_nscalars_per_iter
might overflow before hitting a value above:
@@ -386,8 +404,9 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm,
static tree
vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
gimple_seq *preheader_seq,
+ gimple_seq *header_seq,
gimple_stmt_iterator loop_cond_gsi,
- rgroup_masks *rgm, tree vf,
+ rgroup_masks *rgm, tree capped_vf,
tree niters, tree niters_skip,
bool might_wrap_p)
{
@@ -401,7 +420,7 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
of the vector loop, and the number that it should skip during the
first iteration of the vector loop. */
tree nscalars_total = niters;
- tree nscalars_step = vf;
+ tree nscalars_step = capped_vf;
tree nscalars_skip = niters_skip;
if (nscalars_per_iter != 1)
{
@@ -553,8 +572,8 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
else
{
/* FIRST_LIMIT is the maximum number of scalars handled by the
- first iteration of the vector loop. Test the portion
- associated with this mask. */
+ first iteration of the vector loop (before any cap mask
+ is applied). Test the portion associated with this mask. */
start = bias_tree;
end = first_limit;
}
@@ -589,7 +608,18 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
gcall *call = vect_gen_while (next_mask, test_index, this_test_limit);
gsi_insert_before (test_gsi, call, GSI_SAME_STMT);
- vect_set_loop_mask (loop, mask, init_mask, next_mask);
+ /* Get the cap that needs to be ANDed with every mask. */
+ tree cap_mask = LOOP_VINFO_CAP (loop_vinfo).mask;
+ if (use_capped_vf (loop_vinfo)
+ && (!cap_mask || nscalars_per_iter != 1))
+ {
+ cap_mask = make_temp_ssa_name (mask_type, NULL, "cap_mask");
+ call = vect_gen_while (cap_mask, bias_tree, nscalars_step);
+ gimple_seq_add_stmt (preheader_seq, call);
+ }
+
+ vect_set_loop_mask (loop, header_seq, mask, init_mask,
+ next_mask, cap_mask);
}
return next_mask;
}
@@ -658,16 +688,26 @@ vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
else
iv_limit += max_vf - 1;
}
- /* IV_LIMIT is the maximum number of latch iterations, which is also
- the maximum in-range IV value. Round this value down to the previous
- vector alignment boundary and then add an extra full iteration. */
- poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
- iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
+ if (use_capped_vf (loop_vinfo))
+ /* In the worst case the final vector iteration will handle a single
+ scalar iteration, so we'll have up to MAX_VF - 1 inactive
+ iterations. Add 1 to this to get the number of loop iterations
+ instead of the number of latch iterations. */
+ iv_limit += max_vf;
+ else
+ {
+ /* IV_LIMIT is the maximum number of latch iterations, which
+ is also the maximum in-range IV value. Round this value
+ down to the previous vector alignment boundary and then add
+ an extra full iteration. */
+ poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+ iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
+ }
}
- /* Get the vectorization factor in tree form. */
- tree vf = build_int_cst (compare_type,
- LOOP_VINFO_VECT_FACTOR (loop_vinfo));
+ /* Convert the runtime vectorization factor to the appropriate type. */
+ tree capped_vf = gimple_convert (&preheader_seq, compare_type,
+ LOOP_VINFO_CAP (loop_vinfo).niters);
/* Iterate over all the rgroups and fill in their masks. We could use
the first mask from any rgroup for the loop condition; here we
@@ -701,9 +741,10 @@ vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
/* Set up all masks for this group. */
test_mask = vect_set_loop_masks_directly (loop, loop_vinfo,
- &preheader_seq,
- loop_cond_gsi, rgm, vf,
- niters, niters_skip,
+ &preheader_seq, &header_seq,
+ loop_cond_gsi, rgm,
+ capped_vf, niters,
+ niters_skip,
might_wrap_p);
}
@@ -2384,15 +2425,16 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
basic_block guard_bb, guard_to;
profile_probability prob_prolog, prob_vector, prob_epilog;
int estimated_vf;
+ tree vf = LOOP_VINFO_CAP (loop_vinfo).niters;
+ poly_uint64 max_vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
int prolog_peeling = 0;
if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
- poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
poly_uint64 bound_epilog = 0;
if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
&& LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
- bound_epilog += vf - 1;
+ bound_epilog += max_vf - 1;
if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
bound_epilog += 1;
bool epilog_peeling = may_ne (bound_epilog, 0U);
@@ -2449,7 +2491,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
the end of vector loop and skip to the end of epilog loop. */
bool skip_epilog = (prolog_peeling < 0
|| !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
- || !vf.is_constant ());
+ || TREE_CODE (vf) != INTEGER_CST);
/* PEELING_FOR_GAPS is special because epilog loop must be executed. */
if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
skip_epilog = false;
@@ -2530,9 +2572,11 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
/* It's guaranteed that vector loop bound before vectorization is at
least VF, so set range information for newly generated var. */
- if (new_var_p)
+ poly_uint64 const_vf;
+ if (new_var_p && poly_int_tree_p (vf, &const_vf))
set_range_info (niters, VR_RANGE,
- wi::to_wide (build_int_cst (type, vf)),
+ wi::to_wide (build_int_cstu
+ (type, constant_lower_bound (const_vf))),
wi::to_wide (TYPE_MAX_VALUE (type)));
/* Prolog iterates at most bound_prolog times, latch iterates at
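Taken together, the masks for a single-mask rgroup now have roughly
this shape (a sketch; SSA names invented):

    preheader:
      cap_mask_1 = .WHILE_ULT (0, max_vf_2);
      init_mask_3 = .WHILE_ULT (0, limit_4);

    header:
      uncapped_mask_5 = PHI <init_mask_3 (preheader), next_mask_7 (latch)>
      mask_6 = uncapped_mask_5 & cap_mask_1;  /* added by vect_set_loop_mask */
      ...
      next_mask_7 = .WHILE_ULT (next_index_8, limit_4);

The AND keeps at most MAX_VF lanes active in every iteration, which is
what makes the known dependence distance safe.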
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index a39682108f1..e33a83bfa6b 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -1344,6 +1344,85 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
return true;
}
+/* LOOP_VINFO uses a fully-masked loop and needs to use a capped
+ vectorization factor. Decide whether the best way of doing that is:
+
+ cap_mask = IFN_WHILE_ULT (0, max_vf)
+ actual_vf = IFN_MASK_POPCOUNT (cap_mask)
+
+ CAP_MASK can then be used for an rgroup for which nS == 1 and nV == 1
+ (see the comment above rgroup_masks for details).
+
+ Return true if this does seem to be the best implementation and
+ update LOOP_VINFO_CAP accordingly. */
+
+static bool
+vect_maybe_build_capped_vf_via_while (loop_vec_info loop_vinfo,
+ gimple_seq *seq)
+{
+ poly_uint64 nunits = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+ if (nunits.is_constant ())
+ /* In this case the capped number of iterations is known at compile
+ time, so a POPCOUNT would be pointless. */
+ return false;
+
+ if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
+ return false;
+
+ rgroup_masks *rgm = &LOOP_VINFO_MASKS (loop_vinfo)[0];
+ if (rgm->max_nscalars_per_iter != 1)
+ /* There's no nS == 1 && nV == 1 mask that would benefit from
+ having a precomputed cap mask. */
+ return false;
+
+ if (!direct_internal_fn_supported_p (IFN_MASK_POPCOUNT, rgm->mask_type,
+ OPTIMIZE_FOR_SPEED))
+ return false;
+
+ tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
+ tree zero_index = build_int_cst (compare_type, 0);
+ tree limit = build_int_cst (compare_type,
+ LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
+
+ tree cap_mask = make_temp_ssa_name (rgm->mask_type, NULL, "cap_mask");
+ gcall *stmt = vect_gen_while (cap_mask, zero_index, limit);
+ gimple_seq_add_stmt (seq, stmt);
+ LOOP_VINFO_CAP (loop_vinfo).mask = cap_mask;
+
+ tree vf = make_temp_ssa_name (sizetype, NULL, "vf");
+ stmt = gimple_build_call_internal (IFN_MASK_POPCOUNT, 1, cap_mask);
+ gimple_call_set_lhs (stmt, vf);
+ gimple_seq_add_stmt (seq, stmt);
+ LOOP_VINFO_CAP (loop_vinfo).niters = vf;
+
+ return true;
+}
+
+/* Initialize LOOP_VINFO_CAP (LOOP_VINFO). */
+
+static void
+vect_build_cap (loop_vec_info loop_vinfo)
+{
+ tree vf = size_int (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
+ if (!use_capped_vf (loop_vinfo))
+ LOOP_VINFO_CAP (loop_vinfo).niters = vf;
+ else
+ {
+ gimple_seq seq = NULL;
+ if (!vect_maybe_build_capped_vf_via_while (loop_vinfo, &seq))
+ {
+ tree max_vf = size_int (LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
+ LOOP_VINFO_CAP (loop_vinfo).niters
+ = gimple_build (&seq, MIN_EXPR, sizetype, vf, max_vf);
+ }
+ if (seq)
+ {
+ edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
+ gsi_insert_seq_on_edge_immediate (pe, seq);
+ }
+ }
+}
+
/* Calculate the cost of one scalar iteration of the loop. */
static void
vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
@@ -2128,12 +2207,15 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
/* Analyze data dependences between the data-refs in the loop
and adjust the maximum vectorization factor according to
the dependences.
- FORNOW: fail at the first data dependence that we encounter. */
+
+ We might be able to cope with a MAX_VF that is smaller than the full
+ vector width by using a fully-masked loop. Postpone that decision
+ until we know whether full masking is possible. Of course, it might
+ not be a win to use vectors in this situation even if it is supported,
+ but that's a decision for the cost model. */
ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
- if (!ok
- || (max_vf != MAX_VECTORIZATION_FACTOR
- && may_lt (max_vf, min_vf)))
+ if (!ok || max_vf <= 1)
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -2150,14 +2232,6 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
"can't determine vectorization factor.\n");
return false;
}
- if (max_vf != MAX_VECTORIZATION_FACTOR
- && may_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
- {
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "bad data dependence.\n");
- return false;
- }
/* Compute the scalar iteration cost. */
vect_compute_single_scalar_iteration_cost (loop_vinfo);
@@ -2278,6 +2352,19 @@ start_over:
"not using a fully-masked loop.\n");
}
+ if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+ && use_capped_vf (loop_vinfo))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "Need to cap the runtime vectorization factor to "
+ HOST_WIDE_INT_PRINT_DEC " but cannot fully mask"
+ " the loop.\n",
+ LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
+ /* Undoing SLP might allow us to use a mask. */
+ goto again;
+ }
+
/* If epilog loop is required because of data accesses with gaps,
one additional iteration needs to be peeled. Check if there is
enough iterations for vectorization. */
@@ -7347,7 +7434,7 @@ vectorizable_induction (gimple *phi,
gphi *induction_phi;
tree induc_def, vec_dest;
tree init_expr, step_expr;
- poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+ tree vf = LOOP_VINFO_CAP (loop_vinfo).niters;
unsigned i;
tree expr;
gimple_seq stmts;
@@ -7526,12 +7613,9 @@ vectorizable_induction (gimple *phi,
/* Generate [VF*S, VF*S, ... ]. */
if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
- {
- expr = build_int_cst (integer_type_node, vf);
- expr = fold_convert (TREE_TYPE (step_expr), expr);
- }
+ expr = fold_convert (TREE_TYPE (step_expr), vf);
else
- expr = build_int_cst (TREE_TYPE (step_expr), vf);
+ expr = vf;
new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
expr, step_expr);
if (! CONSTANT_CLASS_P (new_name))
@@ -7726,12 +7810,9 @@ vectorizable_induction (gimple *phi,
vec_step = [VF*S, VF*S, VF*S, VF*S] */
gimple_seq seq = NULL;
if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
- {
- expr = build_int_cst (integer_type_node, vf);
- expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
- }
+ expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), vf);
else
- expr = build_int_cst (TREE_TYPE (step_expr), vf);
+ expr = gimple_convert (&seq, TREE_TYPE (step_expr), vf);
new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
expr, step_expr);
if (seq)
@@ -8315,7 +8396,7 @@ vect_transform_loop (loop_vec_info loop_vinfo)
tree step_vector = NULL_TREE;
tree niters_vector_mult_vf = NULL_TREE;
poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
- unsigned int lowest_vf = constant_lower_bound (vf);
+ unsigned int lowest_vf;
bool grouped_store;
bool slp_scheduled = false;
gimple *stmt, *pattern_stmt;
@@ -8325,6 +8406,9 @@ vect_transform_loop (loop_vec_info loop_vinfo)
bool check_profitability = false;
unsigned int th;
+ lowest_vf = constant_lower_bound (vf);
+ lowest_vf = MIN (lowest_vf, LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
+
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
@@ -8389,6 +8473,8 @@ vect_transform_loop (loop_vec_info loop_vinfo)
}
}
+ vect_build_cap (loop_vinfo);
+
tree niters = vect_build_loop_niters (loop_vinfo);
LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
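In summary, vect_build_cap materialises LOOP_VINFO_CAP in the preheader
in one of two forms (a sketch):

    /* Preferred when the first rgroup has nS == 1 and MASK_POPCOUNT is
       supported; CAP_MASK then doubles as the cap for that rgroup:  */
    cap_mask = .WHILE_ULT (0, max_vf);
    vf = .MASK_POPCOUNT (cap_mask);

    /* Otherwise a simple clamp:  */
    vf = MIN (vectorization_factor, max_vf);

When no cap is needed, .niters is simply the compile-time VF and no
preheader code is emitted.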
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index df7ae274047..c0a87dc9275 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -110,6 +110,27 @@ record_stmt_cost (stmt_vector_for_cost *body_cost_vec, int count,
count, kind, stmt_info, misalign, where);
}
+/* Return a tree that represents STEP multiplied by the runtime
+ (possibly capped) vectorization factor. */
+
+static tree
+vect_mult_by_vf (loop_vec_info loop_vinfo, tree step)
+{
+ hash_map<tree, tree> *map = &LOOP_VINFO_VF_MULT_MAP (loop_vinfo);
+ bool existed;
+ tree &entry = map->get_or_insert (step, &existed);
+ if (!existed)
+ {
+ gimple_seq seq = NULL;
+ tree vf = LOOP_VINFO_CAP (loop_vinfo).niters;
+ vf = gimple_convert (&seq, TREE_TYPE (step), vf);
+ entry = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step), vf, step);
+ edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
+ gsi_insert_seq_on_edge_immediate (pe, seq);
+ }
+ return entry;
+}
+
/* Return a variable of type ELEM_TYPE[NELEMS]. */
static tree
@@ -2812,7 +2833,8 @@ vect_get_gather_scatter_ops (struct loop *loop, gimple *stmt,
static void
vect_get_strided_load_store_ops (gimple *stmt, loop_vec_info loop_vinfo,
gather_scatter_info *gs_info,
- tree *dataref_bump, tree *vec_offset)
+ tree *iv_step, tree *dataref_bump,
+ tree *vec_offset)
{
stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
@@ -2827,6 +2849,12 @@ vect_get_strided_load_store_ops (gimple *stmt, loop_vec_info loop_vinfo,
if (stmts)
gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
+ if (use_capped_vf (loop_vinfo))
+ *iv_step = vect_mult_by_vf (loop_vinfo,
+ fold_convert (sizetype, DR_STEP (dr)));
+ else
+ *iv_step = *dataref_bump;
+
/* The offset given in GS_INFO can have pointer type, so use the element
type of the vector instead. */
tree offset_type = TREE_TYPE (gs_info->offset);
@@ -2851,18 +2879,32 @@ vect_get_strided_load_store_ops (gimple *stmt, loop_vec_info loop_vinfo,
being vectorized and MEMORY_ACCESS_TYPE describes the type of
vectorization. */
-static tree
-vect_get_data_ptr_increment (data_reference *dr, tree aggr_type,
- vect_memory_access_type memory_access_type)
+static void
+vect_get_data_ptr_increment (loop_vec_info loop_vinfo, data_reference *dr,
+ tree aggr_type, unsigned int group_size,
+ vect_memory_access_type memory_access_type,
+ tree *iv_step, tree *bump)
{
if (memory_access_type == VMAT_INVARIANT)
- return size_zero_node;
+ {
+ *iv_step = *bump = size_zero_node;
+ return;
+ }
- tree iv_step = TYPE_SIZE_UNIT (aggr_type);
+ *bump = TYPE_SIZE_UNIT (aggr_type);
tree step = vect_dr_behavior (dr)->step;
if (tree_int_cst_sgn (step) == -1)
- iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
- return iv_step;
+ *bump = fold_build1 (NEGATE_EXPR, TREE_TYPE (*bump), *bump);
+
+ if (loop_vinfo && use_capped_vf (loop_vinfo))
+ {
+ tree elt_type = TREE_TYPE (DR_REF (dr));
+ tree bytes_per_iter = size_binop (MULT_EXPR, TYPE_SIZE_UNIT (elt_type),
+ size_int (group_size));
+ *iv_step = vect_mult_by_vf (loop_vinfo, bytes_per_iter);
+ }
+ else
+ *iv_step = *bump;
}
/* Check and perform vectorization of BUILT_IN_BSWAP{16,32,64}. */
@@ -6717,18 +6759,19 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
|| memory_access_type == VMAT_CONTIGUOUS_REVERSE)
offset = size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1);
- tree bump;
+ tree bump, iv_step;
tree vec_offset = NULL_TREE;
if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
{
aggr_type = NULL_TREE;
+ iv_step = NULL_TREE;
bump = NULL_TREE;
}
else if (memory_access_type == VMAT_GATHER_SCATTER)
{
aggr_type = elem_type;
vect_get_strided_load_store_ops (stmt, loop_vinfo, &gs_info,
- &bump, &vec_offset);
+ &iv_step, &bump, &vec_offset);
}
else
{
@@ -6736,7 +6779,8 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
aggr_type = build_array_type_nelts (elem_type, vec_num * nunits);
else
aggr_type = vectype;
- bump = vect_get_data_ptr_increment (dr, aggr_type, memory_access_type);
+ vect_get_data_ptr_increment (loop_vinfo, dr, aggr_type, group_size,
+ memory_access_type, &iv_step, &bump);
}
if (mask)
@@ -6854,7 +6898,7 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
simd_lane_access_p ? loop : NULL,
offset, &dummy, gsi, &ptr_incr,
simd_lane_access_p, &inv_p,
- NULL_TREE, bump);
+ NULL_TREE, iv_step);
gcc_assert (bb_vinfo || !inv_p);
}
else
@@ -7917,18 +7961,19 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
offset = size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1);
- tree bump;
+ tree bump, iv_step;
tree vec_offset = NULL_TREE;
if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
{
aggr_type = NULL_TREE;
+ iv_step = NULL_TREE;
bump = NULL_TREE;
}
else if (memory_access_type == VMAT_GATHER_SCATTER)
{
aggr_type = elem_type;
vect_get_strided_load_store_ops (stmt, loop_vinfo, &gs_info,
- &bump, &vec_offset);
+ &iv_step, &bump, &vec_offset);
}
else
{
@@ -7936,7 +7981,8 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
aggr_type = build_array_type_nelts (elem_type, vec_num * nunits);
else
aggr_type = vectype;
- bump = vect_get_data_ptr_increment (dr, aggr_type, memory_access_type);
+ vect_get_data_ptr_increment (loop_vinfo, dr, aggr_type, group_size,
+ memory_access_type, &iv_step, &bump);
}
tree vec_mask = NULL_TREE;
@@ -7971,7 +8017,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
= vect_create_data_ref_ptr (first_stmt_for_drptr, aggr_type,
at_loop, offset, &dummy, gsi,
&ptr_incr, simd_lane_access_p,
- &inv_p, byte_offset, bump);
+ &inv_p, byte_offset, iv_step);
/* Adjust the pointer by the difference to first_stmt. */
data_reference_p ptrdr
= STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt_for_drptr));
@@ -7993,7 +8039,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
= vect_create_data_ref_ptr (first_stmt, aggr_type, at_loop,
offset, &dummy, gsi, &ptr_incr,
simd_lane_access_p, &inv_p,
- byte_offset, bump);
+ byte_offset, iv_step);
if (mask)
vec_mask = vect_get_vec_def_for_operand (mask, stmt,
mask_vectype);
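The BUMP/IV_STEP split separates two quantities that previously
coincided.  A worked example for a group of two doubles under a capped
VF (a sketch):

    /* BUMP: the step between successive vector accesses within one
       vector iteration; still the compile-time size of the aggregate
       type.  */
    bump = TYPE_SIZE_UNIT (aggr_type);

    /* IV_STEP: the cross-iteration advance of the pointer IV; now a
       runtime value, because only CAPPED_VF scalar iterations are
       processed per vector iteration.  */
    iv_step = capped_vf * 2 * sizeof (double);   /* group_size == 2 */

Without a cap the two are equal and the old behaviour is preserved.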
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 023594771bb..8073ba05a83 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -357,6 +357,18 @@ struct rgroup_masks {
typedef auto_vec<rgroup_masks> vec_loop_masks;
+/* Represents a scalar iteration count <= VF as both an integer count and a
+ vector mask. */
+struct vec_niters_and_mask {
+ vec_niters_and_mask () : niters (NULL_TREE), mask (NULL_TREE) {}
+
+ /* The number of scalar iterations as a sizetype integer. */
+ tree niters;
+
+ /* The mask of scalar iterations, with one element per iteration. */
+ tree mask;
+};
+
/*-----------------------------------------------------------------*/
/* Info on vectorized loops. */
/*-----------------------------------------------------------------*/
@@ -397,6 +409,10 @@ typedef struct _loop_vec_info : public vec_info {
if there is no particular limit. */
unsigned HOST_WIDE_INT max_vectorization_factor;
+ /* The actual runtime vectorization factor, which is the minimum of
+ VECTORIZATION_FACTOR and MAX_VECTORIZATION_FACTOR. */
+ vec_niters_and_mask cap;
+
/* The masks that a fully-masked loop should use to avoid operating
on inactive scalars. */
vec_loop_masks masks;
@@ -526,6 +542,10 @@ typedef struct _loop_vec_info : public vec_info {
/* A hash table used for caching vector base addresses. */
hash_table<vect_addr_base_hasher> vect_addr_base_htab;
+
+ /* A map from X to a precomputed gimple_val containing
+ CAPPED_VECTORIZATION_FACTOR * X. */
+ hash_map<tree, tree> vf_mult_map;
} *loop_vec_info;
/* Access Functions. */
@@ -545,6 +565,7 @@ typedef struct _loop_vec_info : public vec_info {
#define LOOP_VINFO_FULLY_MASKED_P(L) (L)->fully_masked_p
#define LOOP_VINFO_VECT_FACTOR(L) (L)->vectorization_factor
#define LOOP_VINFO_MAX_VECT_FACTOR(L) (L)->max_vectorization_factor
+#define LOOP_VINFO_CAP(L) (L)->cap
#define LOOP_VINFO_MASKS(L) (L)->masks
#define LOOP_VINFO_MASK_SKIP_NITERS(L) (L)->mask_skip_niters
#define LOOP_VINFO_MASK_COMPARE_TYPE(L) (L)->mask_compare_type
@@ -577,6 +598,7 @@ typedef struct _loop_vec_info : public vec_info {
#define LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST(L) (L)->single_scalar_iteration_cost
#define LOOP_VINFO_ORIG_LOOP_INFO(L) (L)->orig_loop_info
#define LOOP_VINFO_ADDR_CACHE(L) (L)->vect_addr_base_htab
+#define LOOP_VINFO_VF_MULT_MAP(L) (L)->vf_mult_map
#define LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT(L) \
((L)->may_misalign_stmts.length () > 0)
@@ -1351,6 +1373,19 @@ unlimited_cost_model (loop_p loop)
return (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED);
}
+/* Return true if the loop needs to use a vectorization factor that
+ is capped at run time. */
+
+static inline bool
+use_capped_vf (loop_vec_info loop_vinfo)
+{
+ return (loop_vinfo
+ && (LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)
+ != MAX_VECTORIZATION_FACTOR)
+ && may_lt (LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo),
+ LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
+}
+
/* Return true if the loop described by LOOP_VINFO is fully-masked and
if the first iteration should use a partial mask in order to achieve
alignment. */