Allow capped vectorisation factors

[Branch only patch -- not intended for trunk in its current state] This patch allows the controlling mask for a loop to be clamped to a compile-time maximum. We can then vectorise code that has a known dependence distance, even if that distance is (or might be) smaller than the number of elements in a vector. This is almost ready for trunk, but I'd prefer to do some more checking first.
author: Richard Sandiford <richard.sandiford@linaro.org> 2017-10-08 12:29:08 +0100
committer: Richard Sandiford <richard.sandiford@linaro.org> 2017-11-20 16:01:23 +0000
commit: f8d18515fd0e87fe9b68a23e6d73b80064baec97 (patch)
tree: bf7c7a9d43942db218b3786c21fea82242787adf
parent: f2ba9afa2bfb21956959a5ab48be1ef1ce2e6f2c (diff)
download: gcc-f8d18515fd0e87fe9b68a23e6d73b80064baec97.tar.gz
21 files changed, 751 insertions, 74 deletions
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 0ab9c7b0ae8..5d84b7fc595 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -2379,6 +2379,26 @@
   "<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %2.<Vetype>"
 )
 
+(define_expand "mask_popcount<mode>"
+  [(set (match_operand:DI 0 "register_operand")
+	(unspec:DI [(match_dup 2)
+		    (match_operand:PRED_ALL 1 "register_operand")]
+		   UNSPEC_CNTP))]
+  "TARGET_SVE"
+  {
+    operands[2] = force_reg (<MODE>mode, CONSTM1_RTX (<MODE>mode));
+  }
+)
+
+(define_insn "*mask_popcount<mode>"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(unspec:DI [(match_operand:PRED_ALL 1 "register_operand" "Upa")
+		    (match_operand:PRED_ALL 2 "register_operand" "Upa")]
+		   UNSPEC_CNTP))]
+  "TARGET_SVE"
+  "cntp\t%0, %1, %2.<Vetype>"
+)
+
 ;; Shift an SVE vector left and insert a scalar into element 0.
 (define_insn "vec_shl_insert_<mode>"
   [(set (match_operand:SVE_ALL 0 "register_operand" "=w, w")
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 0a4c90c1a39..581e6a753d2 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -167,6 +167,7 @@
     UNSPEC_INSR
     UNSPEC_CLASTB
     UNSPEC_FADDA
+    UNSPEC_CNTP
 ])
 
 (define_c_enum "unspecv" [
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 3cc62dd68a6..f28519837f2 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -125,6 +125,9 @@ DEF_INTERNAL_OPTAB_FN (STORE_LANES, ECF_CONST, vec_store_lanes, store_lanes)
 DEF_INTERNAL_OPTAB_FN (MASK_STORE_LANES, 0,
 		       vec_mask_store_lanes, mask_store_lanes)
 
+DEF_INTERNAL_OPTAB_FN (MASK_POPCOUNT, ECF_CONST | ECF_NOTHROW,
+		       mask_popcount, unary)
+
 DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while)
 
 DEF_INTERNAL_OPTAB_FN (VEC_INTERLEAVE_LO, ECF_CONST | ECF_NOTHROW,
diff --git a/gcc/optabs.def b/gcc/optabs.def
index 523906aa198..bf67dfca132 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -397,4 +397,6 @@ OPTAB_D (mask_scatter_store_optab, "mask_scatter_store$a")
 
 OPTAB_DC (vec_duplicate_optab, "vec_duplicate$a", VEC_DUPLICATE)
 OPTAB_DC (vec_series_optab, "vec_series$a", VEC_SERIES)
+
+OPTAB_D (mask_popcount_optab, "mask_popcount$a")
 OPTAB_D (vec_shl_insert_optab, "vec_shl_insert_$a")
diff --git a/gcc/testsuite/gcc.dg/vect/slp-28.c b/gcc/testsuite/gcc.dg/vect/slp-28.c
index 7778bad4465..4211b94ad7f 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-28.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-28.c
@@ -88,6 +88,6 @@ int main (void)
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect"  } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" } } */
   
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_cap_1.c b/gcc/testsuite/gcc.target/aarch64/sve_cap_1.c
new file mode 100644
index 00000000000..1051fd1f7f0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_cap_1.c
@@ -0,0 +1,44 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -fno-inline -march=armv8-a+sve -fno-vect-cost-model" } */
+
+#define LOOP(TYPE)				\
+  void						\
+  f_##TYPE##_1 (TYPE *a, int n)			\
+  {						\
+    for (int i = 0; i < n; ++i)			\
+      a[i] += a[i - 1];				\
+  }						\
+						\
+  void						\
+  f_##TYPE##_2 (TYPE *a, int n)			\
+  {						\
+    for (int i = 0; i < n; ++i)			\
+      a[i] += a[i - 2];				\
+  }						\
+						\
+  void						\
+  f_##TYPE##_5 (TYPE *a, int n)			\
+  {						\
+    for (int i = 0; i < n; ++i)			\
+      a[i] += a[i - 5];				\
+  }
+
+LOOP (char)
+LOOP (short)
+LOOP (float)
+LOOP (double)
+
+/* { dg-final { scan-assembler-times {\tstrb\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tstrh\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tstr\ts[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {\tstr\td[0-9]+} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.b, p[0-7]/z, \[x[0-9]+, x[0-9]+\]} 4 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.h, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 1\]} 4 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 2\]} 4 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 3\]} 4 } } */
+
+/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.b, p[0-7], \[x[0-9]+, x[0-9]+\]} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.h, p[0-7], \[x[0-9]+, x[0-9]+, lsl 1\]} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, x[0-9]+, lsl 2\]} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, x[0-9]+, lsl 3\]} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_cap_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve_cap_1_run.c
new file mode 100644
index 00000000000..0f280b04f0b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_cap_1_run.c
@@ -0,0 +1,37 @@
+/* { dg-do run { target { aarch64_sve_hw } } } */
+/* { dg-options "-O2 -ftree-vectorize -fno-inline -march=armv8-a+sve -fno-vect-cost-model" } */
+
+#include "sve_cap_1.c"
+
+#define N 129
+
+#define F(X) (((X) % 5) * (X))
+
+#define TEST_LOOP(TYPE, M)			\
+  {						\
+    TYPE a[N + M];				\
+    for (int i = 0; i < N + M; ++i)		\
+      a[i] = F (i);				\
+    f_##TYPE##_##M (a + M, N);			\
+    for (int i = 0; i < N; ++i)			\
+      {						\
+	TYPE x = a[i];				\
+	TYPE y = F (i + M);			\
+	if (a[i + M] != (TYPE) (x + y))		\
+	  __builtin_abort ();			\
+      }						\
+  }
+
+#define TEST_LOOPS(TYPE) \
+  TEST_LOOP (TYPE, 1) \
+  TEST_LOOP (TYPE, 2) \
+  TEST_LOOP (TYPE, 5)
+
+int
+main (void)
+{
+  TEST_LOOPS (char);
+  TEST_LOOPS (short);
+  TEST_LOOPS (float);
+  TEST_LOOPS (double);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_cap_2.c b/gcc/testsuite/gcc.target/aarch64/sve_cap_2.c
new file mode 100644
index 00000000000..d46a08c2ee1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_cap_2.c
@@ -0,0 +1,53 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -fno-inline -march=armv8-a+sve -fno-vect-cost-model" } */
+
+#define LOOP(TYPE)				\
+  void __attribute__ ((weak))			\
+  f_##TYPE##_1 (TYPE *a, int n)			\
+  {						\
+    for (int i = 0; i < n; ++i)			\
+      {						\
+	a[i * 2] += a[i * 2 - 2];		\
+	a[i * 2 + 1] += a[i * 2 - 1];		\
+      }						\
+  }						\
+						\
+  void __attribute__ ((weak))			\
+  f_##TYPE##_2 (TYPE *a, int n)			\
+  {						\
+    for (int i = 0; i < n; ++i)			\
+      {						\
+	a[i * 2] += a[i * 2 - 4];		\
+	a[i * 2 + 1] += a[i * 2 - 3];		\
+      }						\
+  }						\
+						\
+  void __attribute__ ((weak))			\
+  f_##TYPE##_5 (TYPE *a, int n)			\
+  {						\
+    for (int i = 0; i < n; ++i)			\
+      {						\
+	a[i * 2] += a[i * 2 - 10];		\
+	a[i * 2 + 1] += a[i * 2 - 9];		\
+      }						\
+  }
+
+LOOP (char)
+LOOP (short)
+LOOP (float)
+LOOP (double)
+
+/* { dg-final { scan-assembler-times {\tstrb\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tstrh\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tstp\ts[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {\tstp\td[0-9]+} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tld1b\t} 4 } } */
+/* { dg-final { scan-assembler-times {\tld1h\t} 4 } } */
+/* { dg-final { scan-assembler-times {\tld1w\t} 4 } } */
+/* { dg-final { scan-assembler-times {\tld1d\t} 4 } } */
+
+/* { dg-final { scan-assembler-times {\tst1b\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1h\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1w\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1d\t} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_cap_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve_cap_2_run.c
new file mode 100644
index 00000000000..0f8ed957c79
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_cap_2_run.c
@@ -0,0 +1,44 @@
+/* { dg-do run { target { aarch64_sve_hw } } } */
+/* { dg-options "-O2 -ftree-vectorize -fno-inline -march=armv8-a+sve -fno-vect-cost-model" } */
+
+#include "sve_cap_2.c"
+
+#define N 129
+
+#define F(X) (((X) % 5) * (X))
+#define G(X) ((X) ^ 39)
+
+#define TEST_LOOP(TYPE, M)				\
+  {							\
+    TYPE a[(N + M) * 2];				\
+    for (int i = 0; i < N + M; ++i)			\
+      {							\
+        a[i * 2] = F (i);				\
+        a[i * 2 + 1] = G (i);				\
+      }							\
+    f_##TYPE##_##M (a + M * 2, N);			\
+    for (int i = 0; i < N; ++i)				\
+      {							\
+	TYPE x0 = a[i * 2];				\
+	TYPE y0 = F (i + M);				\
+	TYPE x1 = a[i * 2 + 1];				\
+	TYPE y1 = G (i + M);				\
+	if (a[(i + M) * 2] != (TYPE) (x0 + y0)		\
+	    || a[(i + M) * 2 + 1] != (TYPE) (x1 + y1))	\
+	  __builtin_abort ();				\
+      }							\
+  }
+
+#define TEST_LOOPS(TYPE) \
+  TEST_LOOP (TYPE, 1) \
+  TEST_LOOP (TYPE, 2) \
+  TEST_LOOP (TYPE, 5)
+
+int
+main (void)
+{
+  TEST_LOOPS (char);
+  TEST_LOOPS (short);
+  TEST_LOOPS (float);
+  TEST_LOOPS (double);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_cap_3.c b/gcc/testsuite/gcc.target/aarch64/sve_cap_3.c
new file mode 100644
index 00000000000..6515465b7f6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_cap_3.c
@@ -0,0 +1,53 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -fno-inline -march=armv8-a+sve -fno-vect-cost-model -msve-vector-bits=scalable" } */
+
+#define LOOP(TYPE)				\
+  void __attribute__ ((weak))			\
+  f_##TYPE##_1 (TYPE *a, int n)			\
+  {						\
+    for (int i = 0; i < n; ++i)			\
+      {						\
+	a[i * 2] += a[i * 2 - 2];		\
+	a[i * 2 + 1] -= a[i * 2 - 1];		\
+      }						\
+  }						\
+						\
+  void __attribute__ ((weak))			\
+  f_##TYPE##_2 (TYPE *a, int n)			\
+  {						\
+    for (int i = 0; i < n; ++i)			\
+      {						\
+	a[i * 2] += a[i * 2 - 4];		\
+	a[i * 2 + 1] -= a[i * 2 - 3];		\
+      }						\
+  }						\
+						\
+  void __attribute__ ((weak))			\
+  f_##TYPE##_5 (TYPE *a, int n)			\
+  {						\
+    for (int i = 0; i < n; ++i)			\
+      {						\
+	a[i * 2] += a[i * 2 - 10];		\
+	a[i * 2 + 1] -= a[i * 2 - 9];		\
+      }						\
+  }
+
+LOOP (char)
+LOOP (short)
+LOOP (float)
+LOOP (double)
+
+/* { dg-final { scan-assembler-times {\tstrb\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tstrh\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tstp\ts[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {\tstp\td[0-9]+} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tld2b\t} 4 } } */
+/* { dg-final { scan-assembler-times {\tld2h\t} 4 } } */
+/* { dg-final { scan-assembler-times {\tld2w\t} 4 } } */
+/* { dg-final { scan-assembler-times {\tld2d\t} 4 } } */
+
+/* { dg-final { scan-assembler-times {\tst2b\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tst2h\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tst2w\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tst2d\t} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_cap_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve_cap_3_run.c
new file mode 100644
index 00000000000..fe26162a812
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_cap_3_run.c
@@ -0,0 +1,44 @@
+/* { dg-do run { target { aarch64_sve_hw } } } */
+/* { dg-options "-O2 -ftree-vectorize -fno-inline -march=armv8-a+sve -fno-vect-cost-model" } */
+
+#include "sve_cap_3.c"
+
+#define N 129
+
+#define F(X) (((X) % 5) * (X))
+#define G(X) ((X) ^ 39)
+
+#define TEST_LOOP(TYPE, M)				\
+  {							\
+    TYPE a[(N + M) * 2];				\
+    for (int i = 0; i < N + M; ++i)			\
+      {							\
+        a[i * 2] = F (i);				\
+        a[i * 2 + 1] = G (i);				\
+      }							\
+    f_##TYPE##_##M (a + M * 2, N);			\
+    for (int i = 0; i < N; ++i)				\
+      {							\
+	TYPE x0 = a[i * 2];				\
+	TYPE y0 = F (i + M);				\
+	TYPE x1 = a[i * 2 + 1];				\
+	TYPE y1 = G (i + M);				\
+	if (a[(i + M) * 2] != (TYPE) (y0 + x0)		\
+	    || a[(i + M) * 2 + 1] != (TYPE) (y1 - x1))	\
+	  __builtin_abort ();				\
+      }							\
+  }
+
+#define TEST_LOOPS(TYPE) \
+  TEST_LOOP (TYPE, 1) \
+  TEST_LOOP (TYPE, 2) \
+  TEST_LOOP (TYPE, 5)
+
+int
+main (void)
+{
+  TEST_LOOPS (char);
+  TEST_LOOPS (short);
+  TEST_LOOPS (float);
+  TEST_LOOPS (double);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_cap_4.c b/gcc/testsuite/gcc.target/aarch64/sve_cap_4.c
new file mode 100644
index 00000000000..c3bf2f326d3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_cap_4.c
@@ -0,0 +1,42 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -fno-inline -march=armv8-a+sve -fno-vect-cost-model" } */
+
+#define LOOP(TYPE)				\
+  void						\
+  f_##TYPE##_1 (TYPE *a, int n)			\
+  {						\
+    for (int i = 0; i < n; ++i)			\
+      a[i * 5] += a[i * 5 - 5];			\
+  }						\
+						\
+  void						\
+  f_##TYPE##_2 (TYPE *a, int n)			\
+  {						\
+    for (int i = 0; i < n; ++i)			\
+      a[i * 5] += a[i * 5 - 10];		\
+  }						\
+						\
+  void						\
+  f_##TYPE##_5 (TYPE *a, int n)			\
+  {						\
+    for (int i = 0; i < n; ++i)			\
+      a[i * 5] += a[i * 5 - 25];		\
+  }
+
+LOOP (char)
+LOOP (short)
+LOOP (float)
+LOOP (double)
+
+/* At the moment we can't use extending loads and truncating stores.
+   Please add ld and st scan-assemblers below if that changes.  */
+/* { dg-final { scan-assembler-times {\tstrb\t} 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tstrh\t} 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tstr\ts[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {\tstr\td[0-9]+} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+\.s, sxtw 2\]} 4 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+\.d, lsl 3\]} 4 } } */
+
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+\.s, sxtw 2\]} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+\.d, lsl 3\]} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_cap_4_run.c b/gcc/testsuite/gcc.target/aarch64/sve_cap_4_run.c
new file mode 100644
index 00000000000..f39bc7fc3cb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_cap_4_run.c
@@ -0,0 +1,37 @@
+/* { dg-do run { target { aarch64_sve_hw } } } */
+/* { dg-options "-O2 -ftree-vectorize -fno-inline -march=armv8-a+sve -fno-vect-cost-model" } */
+
+#include "sve_cap_4.c"
+
+#define N 129
+
+#define F(X) (((X) % 5) * (X))
+
+#define TEST_LOOP(TYPE, M)			\
+  {						\
+    TYPE a[(N + M) * 5];			\
+    for (int i = 0; i < N + M; ++i)		\
+      a[i * 5] = F (i);				\
+    f_##TYPE##_##M (a + M * 5, N);		\
+    for (int i = 0; i < N; ++i)			\
+      {						\
+	TYPE x = a[i * 5];			\
+	TYPE y = F (i + M);			\
+	if (a[(i + M) * 5] != (TYPE) (x + y))	\
+	  __builtin_abort ();			\
+      }						\
+  }
+
+#define TEST_LOOPS(TYPE) \
+  TEST_LOOP (TYPE, 1) \
+  TEST_LOOP (TYPE, 2) \
+  TEST_LOOP (TYPE, 5)
+
+int
+main (void)
+{
+  TEST_LOOPS (char);
+  TEST_LOOPS (short);
+  TEST_LOOPS (float);
+  TEST_LOOPS (double);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_cap_5.c b/gcc/testsuite/gcc.target/aarch64/sve_cap_5.c
new file mode 100644
index 00000000000..4d4987773b6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_cap_5.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -fno-inline -march=armv8-a+sve -fno-vect-cost-model" } */
+
+void __attribute__ ((noinline, noclone))
+f (double *x, float *y, int n)
+{
+  for (int i = 0; i < n; ++i)
+    {
+      x[i * 3 + 18] = x[i * 3 + 0] + y[i];
+      x[i * 3 + 19] = x[i * 3 + 1] - y[i];
+      x[i * 3 + 20] = x[i * 3 + 2];
+    }
+}
+
+/* { dg-final { scan-assembler-times {\tld1w\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld3d\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tst3d\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tpunpklo\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tpunpkhi\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tuunpklo\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tuunpkhi\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tfcvt\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tfadd\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tfsub\t} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_cap_5_run.c b/gcc/testsuite/gcc.target/aarch64/sve_cap_5_run.c
new file mode 100644
index 00000000000..57fd625d20a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_cap_5_run.c
@@ -0,0 +1,40 @@
+/* { dg-do run { target { aarch64_sve_hw } } } */
+/* { dg-options "-O2 -ftree-vectorize -fno-inline -march=armv8-a+sve -fno-vect-cost-model" } */
+
+#include "sve_cap_5.c"
+
+#define COUNT 5
+#define N ((COUNT + 2) * 6)
+
+int
+main (void)
+{
+  double x[N * 3];
+  float y[N];
+  for (int i = 0; i < N; ++i)
+    {
+      x[i * 3 + 0] = i * 2;
+      x[i * 3 + 1] = i * 3;
+      x[i * 3 + 2] = i * 5;
+      y[i] = i * 4;
+    }
+  f (x, y, COUNT * 6);
+  for (int i = 0; i < N; ++i)
+    {
+      if (i >= 6 && i < (COUNT + 1) * 6)
+	{
+	  if (x[i * 3 + 0] != x[i * 3 - 18] + (i - 6) * 4
+	      || x[i * 3 + 1] != x[i * 3 - 17] - (i - 6) * 4
+	      || x[i * 3 + 2] != x[i * 3 - 16])
+	    __builtin_abort ();
+	}
+      else
+	{
+	  if (x[i * 3 + 0] != i * 2
+	      || x[i * 3 + 1] != i * 3
+	      || x[i * 3 + 2] != i * 5)
+	    __builtin_abort ();
+	}
+    }
+  return 0;
+}
diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c
index e0ba500fce2..224732e7059 100644
--- a/gcc/tree-ssa-loop-ivopts.c
+++ b/gcc/tree-ssa-loop-ivopts.c
@@ -2143,6 +2143,7 @@ constant_multiple_of (tree top, tree bot, widest_int *mul)
   enum tree_code code;
   unsigned precision = TYPE_PRECISION (TREE_TYPE (top));
   widest_int res, p0, p1;
+  gassign *assign;
 
   STRIP_NOPS (top);
   STRIP_NOPS (bot);
@@ -2189,6 +2190,24 @@ constant_multiple_of (tree top, tree bot, widest_int *mul)
       *mul = wi::sext (wi::divmod_trunc (p0, p1, SIGNED, &res), precision);
       return res == 0;
 
+    case SSA_NAME:
+      /* Handle one important special case: TOP is an SSA_NAME defined
+	 to be X * CST, where X is BOT or a constant multiple of BOT.
+	 This triggers in vector loops with variable vectorization factors.  */
+      assign = dyn_cast <gassign *> (SSA_NAME_DEF_STMT (top));
+      if (assign && gimple_assign_rhs_code (assign) == MULT_EXPR)
+	{
+	  tree new_top = gimple_assign_rhs1 (assign);
+	  mby = gimple_assign_rhs2 (assign);
+	  if (TREE_CODE (mby) == INTEGER_CST
+	      && constant_multiple_of (new_top, bot, &res))
+	    {
+	      *mul = wi::sext (res * wi::to_widest (mby), precision);
+	      return true;
+	    }
+	}
+      return false;
+
     default:
       if (POLY_INT_CST_P (top)
 	  && POLY_INT_CST_P (bot)
diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
index 373d4be6581..8b2cf1fbafe 100644
--- a/gcc/tree-vect-data-refs.c
+++ b/gcc/tree-vect-data-refs.c
@@ -53,6 +53,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-cfg.h"
 #include "tree-hash-traits.h"
 #include "internal-fn.h"
+#include "gimple-fold.h"
 
 /* Return true if load- or store-lanes optab OPTAB is implemented for
    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
@@ -4782,7 +4783,6 @@ vect_create_addr_base_for_vector_ref (gimple *stmt,
   return entry->final_addr;
 }
 
-
 /* Function vect_create_data_ref_ptr.
 
    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
@@ -5013,6 +5013,9 @@ vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
     {
       if (iv_step == NULL_TREE)
 	{
+	  /* The caller must provide an IV_STEP for capped VF.  */
+	  gcc_assert (!use_capped_vf (loop_vinfo));
+
 	  /* The step of the aggregate pointer is the type size.  */
 	  iv_step = TYPE_SIZE_UNIT (aggr_type);
 	  /* One exception to the above is when the scalar step of the load in
@@ -5143,7 +5146,7 @@ bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
     }
 
-  if (!ptr_incr)
+  if (!ptr_incr || use_capped_vf (STMT_VINFO_LOOP_VINFO (stmt_info)))
     return new_dataref_ptr;
 
   /* Update the vector-pointer's cross-iteration increment.  */
diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
index 9d9a93e00d5..57aab1b764f 100644
--- a/gcc/tree-vect-loop-manip.c
+++ b/gcc/tree-vect-loop-manip.c
@@ -254,15 +254,31 @@ adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
 
 /* Define one loop mask MASK from loop LOOP.  INIT_MASK is the value that
    the mask should have during the first iteration and NEXT_MASK is the
-   value that it should have on subsequent iterations.  */
+   value that it should have on subsequent iterations.  CAP_MASK, if
+   nonnull, is a cap that should be applied to each value of the mask
+   before the mask is used; add the statement that does so to HEADER_SEQ.  */
 
 static void
-vect_set_loop_mask (struct loop *loop, tree mask, tree init_mask,
-		    tree next_mask)
+vect_set_loop_mask (struct loop *loop, gimple_seq *header_seq, tree mask,
+		    tree init_mask, tree next_mask, tree cap_mask)
 {
-  gphi *phi = create_phi_node (mask, loop->header);
+  tree mask_type = TREE_TYPE (mask);
+  tree uncapped_mask;
+  if (cap_mask)
+    uncapped_mask = make_temp_ssa_name (mask_type, NULL, "uncapped_mask");
+  else
+    uncapped_mask = mask;
+  gphi *phi = create_phi_node (uncapped_mask, loop->header);
   add_phi_arg (phi, init_mask, loop_preheader_edge (loop), UNKNOWN_LOCATION);
   add_phi_arg (phi, next_mask, loop_latch_edge (loop), UNKNOWN_LOCATION);
+
+  /* Apply the cap mask, if any.  */
+  if (cap_mask)
+    {
+      gimple *stmt = gimple_build_assign (mask, BIT_AND_EXPR,
+					  uncapped_mask, cap_mask);
+      gimple_seq_add_stmt (header_seq, stmt);
+    }
 }
 
 /* Add SEQ to the end of LOOP's preheader block.  */
@@ -355,12 +371,14 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm,
 
 /* Helper for vect_set_loop_condition_masked.  Generate definitions for
    all the masks in RGM and return a mask that is nonzero when the loop
-   needs to iterate.  Add any new preheader statements to PREHEADER_SEQ.
-   Use LOOP_COND_GSI to insert code before the exit gcond.
+   needs to iterate.  Add any new preheader statements to PREHEADER_SEQ
+   and any new header statements to HEADER_SEQ.  Use LOOP_COND_GSI to
+   insert code before the exit gcond.
 
    RGM belongs to loop LOOP.  The loop originally iterated NITERS
    times and has been vectorized according to LOOP_VINFO.  Each iteration
-   of the vectorized loop handles VF iterations of the scalar loop.
+   of the vectorized loop handles CAPPED_VF iterations of the scalar loop,
+   where CAPPED_VF is bounded by the compile-time vectorization factor.
 
    If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
    starts with NITERS_SKIP dummy iterations of the scalar loop before
@@ -374,7 +392,7 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm,
    does not overflow.  However, MIGHT_WRAP_P says whether an induction
    variable that starts at 0 and has step:
 
-     VF * RGM->max_nscalars_per_iter
+     CAPPED_VF * RGM->max_nscalars_per_iter
 
    might overflow before hitting a value above:
 
@@ -386,8 +404,9 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm,
 static tree
 vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
 			      gimple_seq *preheader_seq,
+			      gimple_seq *header_seq,
 			      gimple_stmt_iterator loop_cond_gsi,
-			      rgroup_masks *rgm, tree vf,
+			      rgroup_masks *rgm, tree capped_vf,
 			      tree niters, tree niters_skip,
 			      bool might_wrap_p)
 {
@@ -401,7 +420,7 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
      of the vector loop, and the number that it should skip during the
      first iteration of the vector loop.  */
   tree nscalars_total = niters;
-  tree nscalars_step = vf;
+  tree nscalars_step = capped_vf;
   tree nscalars_skip = niters_skip;
   if (nscalars_per_iter != 1)
     {
@@ -553,8 +572,8 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
 	  else
 	    {
 	      /* FIRST_LIMIT is the maximum number of scalars handled by the
-		 first iteration of the vector loop.  Test the portion
-		 associated with this mask.  */
+		 first iteration of the vector loop (before any cap mask
+		 is applied).  Test the portion associated with this mask.  */
 	      start = bias_tree;
 	      end = first_limit;
 	    }
@@ -589,7 +608,18 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
       gcall *call = vect_gen_while (next_mask, test_index, this_test_limit);
       gsi_insert_before (test_gsi, call, GSI_SAME_STMT);
 
-      vect_set_loop_mask (loop, mask, init_mask, next_mask);
+      /* Get the cap that needs to be ANDed with every mask.  */
+      tree cap_mask = LOOP_VINFO_CAP (loop_vinfo).mask;
+      if (use_capped_vf (loop_vinfo)
+	  && (!cap_mask || nscalars_per_iter != 1))
+	{
+	  cap_mask = make_temp_ssa_name (mask_type, NULL, "cap_mask");
+	  call = vect_gen_while (cap_mask, bias_tree, nscalars_step);
+	  gimple_seq_add_stmt (preheader_seq, call);
+	}
+
+      vect_set_loop_mask (loop, header_seq, mask, init_mask,
+			  next_mask, cap_mask);
     }
   return next_mask;
 }
@@ -658,16 +688,26 @@ vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
 	  else
 	    iv_limit += max_vf - 1;
 	}
-      /* IV_LIMIT is the maximum number of latch iterations, which is also
-	 the maximum in-range IV value.  Round this value down to the previous
-	 vector alignment boundary and then add an extra full iteration.  */
-      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
-      iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
+      if (use_capped_vf (loop_vinfo))
+	/* In the worst case the final vector iteration will handle a single
+	   scalar iteration, so we'll have up to MAX_VF - 1 inactive
+	   iterations.  Add 1 to this to get the number of loop iterations
+	   instead of the number of latch iterations.  */
+	iv_limit += max_vf;
+      else
+	{
+	  /* IV_LIMIT is the maximum number of latch iterations, which
+	     is also the maximum in-range IV value.  Round this value
+	     down to the previous vector alignment boundary and then add
+	     an extra full iteration.  */
+	  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+	  iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
+	}
     }
 
-  /* Get the vectorization factor in tree form.  */
-  tree vf = build_int_cst (compare_type,
-			   LOOP_VINFO_VECT_FACTOR (loop_vinfo));
+  /* Convert the runtime vectorization factor to the appropriate type.  */
+  tree capped_vf = gimple_convert (&preheader_seq, compare_type,
+				   LOOP_VINFO_CAP (loop_vinfo).niters);
 
   /* Iterate over all the rgroups and fill in their masks.  We could use
      the first mask from any rgroup for the loop condition; here we
@@ -701,9 +741,10 @@ vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
 
 	/* Set up all masks for this group.  */
 	test_mask = vect_set_loop_masks_directly (loop, loop_vinfo,
-						  &preheader_seq,
-						  loop_cond_gsi, rgm, vf,
-						  niters, niters_skip,
+						  &preheader_seq, &header_seq,
+						  loop_cond_gsi, rgm,
+						  capped_vf, niters,
+						  niters_skip,
 						  might_wrap_p);
       }
 
@@ -2384,15 +2425,16 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
   basic_block guard_bb, guard_to;
   profile_probability prob_prolog, prob_vector, prob_epilog;
   int estimated_vf;
+  tree vf = LOOP_VINFO_CAP (loop_vinfo).niters;
+  poly_uint64 max_vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
   int prolog_peeling = 0;
   if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
     prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
 
-  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
   poly_uint64 bound_epilog = 0;
   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
       && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
-    bound_epilog += vf - 1;
+    bound_epilog += max_vf - 1;
   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
     bound_epilog += 1;
   bool epilog_peeling = may_ne (bound_epilog, 0U);
@@ -2449,7 +2491,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
      the end of vector loop and skip to the end of epilog loop.  */
   bool skip_epilog = (prolog_peeling < 0
 		      || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-		      || !vf.is_constant ());
+		      || TREE_CODE (vf) != INTEGER_CST);
   /* PEELING_FOR_GAPS is special because epilog loop must be executed.  */
   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
     skip_epilog = false;
@@ -2530,9 +2572,11 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
       niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
       /* It's guaranteed that vector loop bound before vectorization is at
 	 least VF, so set range information for newly generated var.  */
-      if (new_var_p)
+      poly_uint64 const_vf;
+      if (new_var_p && poly_int_tree_p (vf, &const_vf))
 	set_range_info (niters, VR_RANGE,
-			wi::to_wide (build_int_cst (type, vf)),
+			wi::to_wide (build_int_cstu
+				     (type, constant_lower_bound (const_vf))),
 			wi::to_wide (TYPE_MAX_VALUE (type)));
 
       /* Prolog iterates at most bound_prolog times, latch iterates at
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index a39682108f1..e33a83bfa6b 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -1344,6 +1344,85 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
   return true;
 }
 
+/* LOOP_VINFO uses a fully-masked loop and needs to use a capped
+   vectorization factor.  Decide whether the best way of doing that is:
+
+     cap_mask = IFN_WHILE_ULT (0, max_vf)
+     actual_vf = IFN_MASK_POPCOUNT (cap_mask)
+
+   CAP_MASK can then be used for an rgroup for which nS == 1 and nV == 1
+   (see the comment above rgroup_masks for details).
+
+   If so, append both statements to SEQ, record CAP_MASK and ACTUAL_VF in
+   LOOP_VINFO_CAP and return true.  Otherwise change nothing; return false.  */
+
+static bool
+vect_maybe_build_capped_vf_via_while (loop_vec_info loop_vinfo,
+				      gimple_seq *seq)
+{
+  poly_uint64 nunits = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+  if (nunits.is_constant ())
+    /* In this case the capped number of iterations is known at compile
+       time, so a POPCOUNT would be pointless.  */
+    return false;
+
+  if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
+    return false;
+
+  rgroup_masks *rgm = &LOOP_VINFO_MASKS (loop_vinfo)[0];
+  if (rgm->max_nscalars_per_iter != 1)
+    /* There's no nS == 1 && nV == 1 mask that would benefit from
+       having a precomputed cap mask.  */
+    return false;
+
+  if (!direct_internal_fn_supported_p (IFN_MASK_POPCOUNT, rgm->mask_type,
+				       OPTIMIZE_FOR_SPEED))
+    return false;
+
+  tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
+  tree zero_index = build_int_cst (compare_type, 0);
+  tree limit = build_int_cstu (compare_type,
+			       LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
+
+  tree cap_mask = make_temp_ssa_name (rgm->mask_type, NULL, "cap_mask");
+  gcall *stmt = vect_gen_while (cap_mask, zero_index, limit);
+  gimple_seq_add_stmt (seq, stmt);
+  LOOP_VINFO_CAP (loop_vinfo).mask = cap_mask;
+
+  tree vf = make_temp_ssa_name (sizetype, NULL, "vf");
+  stmt = gimple_build_call_internal (IFN_MASK_POPCOUNT, 1, cap_mask);
+  gimple_call_set_lhs (stmt, vf);
+  gimple_seq_add_stmt (seq, stmt);
+  LOOP_VINFO_CAP (loop_vinfo).niters = vf;
+
+  return true;
+}
+
+/* Initialize LOOP_VINFO_CAP (LOOP_VINFO), emitting any statements that
+   compute a capped VF on the loop preheader edge.  */
+
+static void
+vect_build_cap (loop_vec_info loop_vinfo)
+{
+  tree full_vf = size_int (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
+  if (!use_capped_vf (loop_vinfo))
+    {
+      LOOP_VINFO_CAP (loop_vinfo).niters = full_vf;
+      return;
+    }
+  gimple_seq stmts = NULL;
+  if (!vect_maybe_build_capped_vf_via_while (loop_vinfo, &stmts))
+    {
+      /* Fall back to MIN (VF, MAX_VF), which sets only the niters field.  */
+      tree limit = size_int (LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
+      LOOP_VINFO_CAP (loop_vinfo).niters
+	= gimple_build (&stmts, MIN_EXPR, sizetype, full_vf, limit);
+    }
+  if (stmts)
+    gsi_insert_seq_on_edge_immediate
+      (loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo)), stmts);
+}
+
 /* Calculate the cost of one scalar iteration of the loop.  */
 static void
 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
@@ -2128,12 +2207,15 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
   /* Analyze data dependences between the data-refs in the loop
      and adjust the maximum vectorization factor according to
      the dependences.
-     FORNOW: fail at the first data dependence that we encounter.  */
+
+     We might be able to cope with max_vf that are smaller than the full
+     vector width by using a fully-masked loop.  Postpone that decision
+     until we know whether full masking is possible.  Of course, it might
+     not be a win to use vectors in this situation even if it is supported,
+     but that's a decision for the cost model.  */
 
   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
-  if (!ok
-      || (max_vf != MAX_VECTORIZATION_FACTOR
-	  && may_lt (max_vf, min_vf)))
+  if (!ok || max_vf <= 1)
     {
       if (dump_enabled_p ())
 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -2150,14 +2232,6 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
 			 "can't determine vectorization factor.\n");
       return false;
     }
-  if (max_vf != MAX_VECTORIZATION_FACTOR
-      && may_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
-    {
-      if (dump_enabled_p ())
-	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-			 "bad data dependence.\n");
-      return false;
-    }
 
   /* Compute the scalar iteration cost.  */
   vect_compute_single_scalar_iteration_cost (loop_vinfo);
@@ -2278,6 +2352,19 @@ start_over:
 			 "not using a fully-masked loop.\n");
     }
 
+  if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+      && use_capped_vf (loop_vinfo))
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			 "Need to cap the runtime vectorization factor to "
+			 HOST_WIDE_INT_PRINT_DEC " but cannot fully mask"
+			 " the loop.\n",
+			 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
+      /* Undoing SLP might allow us to use a mask.  */
+      goto again;
+    }
+
   /* If epilog loop is required because of data accesses with gaps,
      one additional iteration needs to be peeled.  Check if there is
      enough iterations for vectorization.  */
@@ -7347,7 +7434,7 @@ vectorizable_induction (gimple *phi,
   gphi *induction_phi;
   tree induc_def, vec_dest;
   tree init_expr, step_expr;
-  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+  tree vf = LOOP_VINFO_CAP (loop_vinfo).niters;
   unsigned i;
   tree expr;
   gimple_seq stmts;
@@ -7526,12 +7613,9 @@ vectorizable_induction (gimple *phi,
 
       /* Generate [VF*S, VF*S, ... ].  */
       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
-	{
-	  expr = build_int_cst (integer_type_node, vf);
-	  expr = fold_convert (TREE_TYPE (step_expr), expr);
-	}
+	expr = fold_convert (TREE_TYPE (step_expr), vf);
       else
-	expr = build_int_cst (TREE_TYPE (step_expr), vf);
+	expr = fold_convert (TREE_TYPE (step_expr), vf);
       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
 			      expr, step_expr);
       if (! CONSTANT_CLASS_P (new_name))
@@ -7726,12 +7810,9 @@ vectorizable_induction (gimple *phi,
 	  vec_step = [VF*S, VF*S, VF*S, VF*S]  */
       gimple_seq seq = NULL;
       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
-	{
-	  expr = build_int_cst (integer_type_node, vf);
-	  expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
-	}
+	expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), vf);
       else
-	expr = build_int_cst (TREE_TYPE (step_expr), vf);
+	expr = gimple_convert (&seq, TREE_TYPE (step_expr), vf);
       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
 			       expr, step_expr);
       if (seq)
@@ -8315,7 +8396,7 @@ vect_transform_loop (loop_vec_info loop_vinfo)
   tree step_vector = NULL_TREE;
   tree niters_vector_mult_vf = NULL_TREE;
   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
-  unsigned int lowest_vf = constant_lower_bound (vf);
+  unsigned int lowest_vf;
   bool grouped_store;
   bool slp_scheduled = false;
   gimple *stmt, *pattern_stmt;
@@ -8325,6 +8406,9 @@ vect_transform_loop (loop_vec_info loop_vinfo)
   bool check_profitability = false;
   unsigned int th;
 
+  lowest_vf = constant_lower_bound (vf);
+  lowest_vf = MIN (lowest_vf, LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
+
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
 
@@ -8389,6 +8473,8 @@ vect_transform_loop (loop_vec_info loop_vinfo)
 	}
     }
 
+  vect_build_cap (loop_vinfo);
+
   tree niters = vect_build_loop_niters (loop_vinfo);
   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index df7ae274047..c0a87dc9275 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -110,6 +110,27 @@ record_stmt_cost (stmt_vector_for_cost *body_cost_vec, int count,
 			  count, kind, stmt_info, misalign, where);
 }
 
+/* Return STEP times LOOP_VINFO_CAP (LOOP_VINFO).niters, cached per STEP.  */
+
+static tree
+vect_mult_by_vf (loop_vec_info loop_vinfo, tree step)
+{
+  hash_map<tree, tree> *map = &LOOP_VINFO_VF_MULT_MAP (loop_vinfo);
+  bool existed;
+  tree &entry = map->get_or_insert (step, &existed);
+  if (!existed)
+    {
+      gimple_seq seq = NULL;
+      tree vf = LOOP_VINFO_CAP (loop_vinfo).niters;
+      vf = gimple_convert (&seq, TREE_TYPE (step), vf);
+      entry = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step), vf, step);
+      edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
+      if (seq)	/* Nothing to insert if no statements were needed.  */
+	gsi_insert_seq_on_edge_immediate (pe, seq);
+    }
+  return entry;
+}
+
 /* Return a variable of type ELEM_TYPE[NELEMS].  */
 
 static tree
@@ -2812,7 +2833,8 @@ vect_get_gather_scatter_ops (struct loop *loop, gimple *stmt,
 static void
 vect_get_strided_load_store_ops (gimple *stmt, loop_vec_info loop_vinfo,
 				 gather_scatter_info *gs_info,
-				 tree *dataref_bump, tree *vec_offset)
+				 tree *iv_step, tree *dataref_bump,
+				 tree *vec_offset)
 {
   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
@@ -2827,6 +2849,12 @@ vect_get_strided_load_store_ops (gimple *stmt, loop_vec_info loop_vinfo,
   if (stmts)
     gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
 
+  if (use_capped_vf (loop_vinfo))
+    *iv_step = vect_mult_by_vf (loop_vinfo,
+				fold_convert (sizetype, DR_STEP (dr)));
+  else
+    *iv_step = *dataref_bump;
+
   /* The offset given in GS_INFO can have pointer type, so use the element
      type of the vector instead.  */
   tree offset_type = TREE_TYPE (gs_info->offset);
@@ -2851,18 +2879,32 @@ vect_get_strided_load_store_ops (gimple *stmt, loop_vec_info loop_vinfo,
    being vectorized and MEMORY_ACCESS_TYPE describes the type of
    vectorization.  */
 
-static tree
-vect_get_data_ptr_increment (data_reference *dr, tree aggr_type,
-			     vect_memory_access_type memory_access_type)
+static void
+vect_get_data_ptr_increment (loop_vec_info loop_vinfo, data_reference *dr,
+			     tree aggr_type, unsigned int group_size,
+			     vect_memory_access_type memory_access_type,
+			     tree *iv_step, tree *bump)
 {
   if (memory_access_type == VMAT_INVARIANT)
-    return size_zero_node;
+    {
+      *iv_step = *bump = size_zero_node;
+      return;
+    }
 
-  tree iv_step = TYPE_SIZE_UNIT (aggr_type);
+  *bump = TYPE_SIZE_UNIT (aggr_type);
   tree step = vect_dr_behavior (dr)->step;
   if (tree_int_cst_sgn (step) == -1)
-    iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
-  return iv_step;
+    *bump = fold_build1 (NEGATE_EXPR, TREE_TYPE (*bump), *bump);
+
+  if (loop_vinfo && use_capped_vf (loop_vinfo))
+    {
+      tree elt_type = TREE_TYPE (DR_REF (dr));
+      tree bytes_per_iter = size_binop (MULT_EXPR, TYPE_SIZE_UNIT (elt_type),
+					size_int (group_size));
+      *iv_step = vect_mult_by_vf (loop_vinfo, bytes_per_iter);
+    }
+  else
+    *iv_step = *bump;
 }
 
 /* Check and perform vectorization of BUILT_IN_BSWAP{16,32,64}.  */
@@ -6717,18 +6759,19 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
       || memory_access_type == VMAT_CONTIGUOUS_REVERSE)
     offset = size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1);
 
-  tree bump;
+  tree bump, iv_step;
   tree vec_offset = NULL_TREE;
   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
     {
       aggr_type = NULL_TREE;
+      iv_step = NULL_TREE;
       bump = NULL_TREE;
     }
   else if (memory_access_type == VMAT_GATHER_SCATTER)
     {
       aggr_type = elem_type;
       vect_get_strided_load_store_ops (stmt, loop_vinfo, &gs_info,
-				       &bump, &vec_offset);
+				       &iv_step, &bump, &vec_offset);
     }
   else
     {
@@ -6736,7 +6779,8 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
 	aggr_type = build_array_type_nelts (elem_type, vec_num * nunits);
       else
 	aggr_type = vectype;
-      bump = vect_get_data_ptr_increment (dr, aggr_type, memory_access_type);
+      vect_get_data_ptr_increment (loop_vinfo, dr, aggr_type, group_size,
+				   memory_access_type, &iv_step, &bump);
     }
 
   if (mask)
@@ -6854,7 +6898,7 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
 					  simd_lane_access_p ? loop : NULL,
 					  offset, &dummy, gsi, &ptr_incr,
 					  simd_lane_access_p, &inv_p,
-					  NULL_TREE, bump);
+					  NULL_TREE, iv_step);
 	  gcc_assert (bb_vinfo || !inv_p);
 	}
       else
@@ -7917,18 +7961,19 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
   if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
     offset = size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1);
 
-  tree bump;
+  tree bump, iv_step;
   tree vec_offset = NULL_TREE;
   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
     {
       aggr_type = NULL_TREE;
+      iv_step = NULL_TREE;
       bump = NULL_TREE;
     }
   else if (memory_access_type == VMAT_GATHER_SCATTER)
     {
       aggr_type = elem_type;
       vect_get_strided_load_store_ops (stmt, loop_vinfo, &gs_info,
-				       &bump, &vec_offset);
+				       &iv_step, &bump, &vec_offset);
     }
   else
     {
@@ -7936,7 +7981,8 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
 	aggr_type = build_array_type_nelts (elem_type, vec_num * nunits);
       else
 	aggr_type = vectype;
-      bump = vect_get_data_ptr_increment (dr, aggr_type, memory_access_type);
+      vect_get_data_ptr_increment (loop_vinfo, dr, aggr_type, group_size,
+				   memory_access_type, &iv_step, &bump);
     }
 
   tree vec_mask = NULL_TREE;
@@ -7971,7 +8017,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
 		= vect_create_data_ref_ptr (first_stmt_for_drptr, aggr_type,
 					    at_loop, offset, &dummy, gsi,
 					    &ptr_incr, simd_lane_access_p,
-					    &inv_p, byte_offset, bump);
+					    &inv_p, byte_offset, iv_step);
 	      /* Adjust the pointer by the difference to first_stmt.  */
 	      data_reference_p ptrdr
 		= STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt_for_drptr));
@@ -7993,7 +8039,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
 	      = vect_create_data_ref_ptr (first_stmt, aggr_type, at_loop,
 					  offset, &dummy, gsi, &ptr_incr,
 					  simd_lane_access_p, &inv_p,
-					  byte_offset, bump);
+					  byte_offset, iv_step);
 	  if (mask)
 	    vec_mask = vect_get_vec_def_for_operand (mask, stmt,
 						     mask_vectype);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 023594771bb..8073ba05a83 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -357,6 +357,18 @@ struct rgroup_masks {
 
 typedef auto_vec<rgroup_masks> vec_loop_masks;
 
+/* Represents a scalar iteration count <= VF as both an integer count and a
+   vector mask.  Both fields are NULL_TREE until they are filled in.  */
+struct vec_niters_and_mask {
+  vec_niters_and_mask () : niters (NULL_TREE), mask (NULL_TREE) {}
+
+  /* The number of scalar iterations as a sizetype integer.  */
+  tree niters;
+
+  /* The equivalent mask, one element per iteration, or NULL_TREE.  */
+  tree mask;
+};
+
 /*-----------------------------------------------------------------*/
 /* Info on vectorized loops.                                       */
 /*-----------------------------------------------------------------*/
@@ -397,6 +409,10 @@ typedef struct _loop_vec_info : public vec_info {
      if there is no particular limit.  */
   unsigned HOST_WIDE_INT max_vectorization_factor;
 
+  /* The actual runtime vectorization factor, which is the minimum of
+     VECTORIZATION_FACTOR and MAX_VECTORIZATION_FACTOR.  */
+  vec_niters_and_mask cap;
+
   /* The masks that a fully-masked loop should use to avoid operating
      on inactive scalars.  */
   vec_loop_masks masks;
@@ -526,6 +542,10 @@ typedef struct _loop_vec_info : public vec_info {
 
   /* A hash table used for caching vector base addresses.  */
   hash_table<vect_addr_base_hasher> vect_addr_base_htab;
+
+  /* A map from X to a precomputed gimple_val containing
+     CAPPED_VECTORIZATION_FACTOR * X.  */
+  hash_map<tree, tree> vf_mult_map;
 } *loop_vec_info;
 
 /* Access Functions.  */
@@ -545,6 +565,7 @@ typedef struct _loop_vec_info : public vec_info {
 #define LOOP_VINFO_FULLY_MASKED_P(L)       (L)->fully_masked_p
 #define LOOP_VINFO_VECT_FACTOR(L)          (L)->vectorization_factor
 #define LOOP_VINFO_MAX_VECT_FACTOR(L)      (L)->max_vectorization_factor
+#define LOOP_VINFO_CAP(L)                  (L)->cap
 #define LOOP_VINFO_MASKS(L)                (L)->masks
 #define LOOP_VINFO_MASK_SKIP_NITERS(L)     (L)->mask_skip_niters
 #define LOOP_VINFO_MASK_COMPARE_TYPE(L)    (L)->mask_compare_type
@@ -577,6 +598,7 @@ typedef struct _loop_vec_info : public vec_info {
 #define LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST(L) (L)->single_scalar_iteration_cost
 #define LOOP_VINFO_ORIG_LOOP_INFO(L)       (L)->orig_loop_info
 #define LOOP_VINFO_ADDR_CACHE(L)	   (L)->vect_addr_base_htab
+#define LOOP_VINFO_VF_MULT_MAP(L)          (L)->vf_mult_map
 
 #define LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT(L)	\
   ((L)->may_misalign_stmts.length () > 0)
@@ -1351,6 +1373,19 @@ unlimited_cost_model (loop_p loop)
   return (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED);
 }
 
+/* Return true if LOOP_VINFO is nonnull and has a maximum VF limit that
+   might be smaller than its vectorization factor.  */
+
+static inline bool
+use_capped_vf (loop_vec_info loop_vinfo)
+{
+  if (!loop_vinfo)
+    return false;
+  unsigned HOST_WIDE_INT limit = LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo);
+  return (limit != MAX_VECTORIZATION_FACTOR
+	  && may_lt (limit, LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
+}
+
 /* Return true if the loop described by LOOP_VINFO is fully-masked and
    if the first iteration should use a partial mask in order to achieve
    alignment.  */
author	Richard Sandiford <richard.sandiford@linaro.org>	2017-10-08 12:29:08 +0100
committer	Richard Sandiford <richard.sandiford@linaro.org>	2017-11-20 16:01:23 +0000
commit	f8d18515fd0e87fe9b68a23e6d73b80064baec97 (patch)
tree	bf7c7a9d43942db218b3786c21fea82242787adf
parent	f2ba9afa2bfb21956959a5ab48be1ef1ce2e6f2c (diff)
download	gcc-f8d18515fd0e87fe9b68a23e6d73b80064baec97.tar.gz