Add support for speculative loads

[Branch only patch -- not intended for trunk in its current state] This patch adds support for speculative loads in cases where the loads are (or can be made to be) aligned to a full vector size. Such loads can never partially fault and they should be more efficient than first-faulting loads for the cases that they can handle.
author: Richard Sandiford <richard.sandiford@linaro.org> 2017-06-23 17:52:44 +0100
committer: Richard Sandiford <richard.sandiford@linaro.org> 2017-11-20 16:01:23 +0000
commit: 02cf0942b05e2278c7e251b969092b64f06b915d (patch)
tree: f9344ed179868f44cee2a9cdb6d31a3b0d38dfe6
parent: 655e3625f9c65f2c9d4e8c76eeca5edf9254afeb (diff)
download: gcc-02cf0942b05e2278c7e251b969092b64f06b915d.tar.gz
41 files changed, 2138 insertions, 96 deletions
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 5d84b7fc595..e381cfcabe2 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -2379,6 +2379,16 @@
   "<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %2.<Vetype>"
 )
 
+(define_insn "break_after_<mode>"
+  [(set (match_operand:PRED_ALL 0 "register_operand" "=Upa")
+	(unspec:PRED_ALL
+	  [(match_operand:PRED_ALL 1 "register_operand" "Upa")
+	   (match_operand:PRED_ALL 2 "register_operand" "Upa")]
+	  UNSPEC_BRKA))]
+  "TARGET_SVE"
+  "brka\t%0.b, %1/z, %2.b"
+)
+
 (define_expand "mask_popcount<mode>"
   [(set (match_operand:DI 0 "register_operand")
 	(unspec:DI [(match_dup 2)
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 581e6a753d2..37dcd85440e 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -168,6 +168,7 @@
     UNSPEC_CLASTB
     UNSPEC_FADDA
     UNSPEC_CNTP
+    UNSPEC_BRKA
 ])
 
 (define_c_enum "unspecv" [
diff --git a/gcc/gimple-iterator.h b/gcc/gimple-iterator.h
index 70f18beceff..8e9fe1f087d 100644
--- a/gcc/gimple-iterator.h
+++ b/gcc/gimple-iterator.h
@@ -152,6 +152,22 @@ gsi_last_1 (gimple_seq *seq)
 
 #define gsi_last(x) gsi_last_1 (&(x))
 
+/* Return a new iterator initially pointing at the end of SEQ.  */
+
+static inline gimple_stmt_iterator
+gsi_end_1 (gimple_seq *seq)
+{
+  gimple_stmt_iterator i;
+
+  i.ptr = NULL;
+  i.seq = seq;
+  /* There is no current statement, so there is no basic block to record.  */
+  i.bb = NULL;
+  return i;
+}
+
+#define gsi_end(x) gsi_end_1 (&(x))
+
 /* Return a new iterator pointing to the last statement in basic block BB.  */
 
 static inline gimple_stmt_iterator
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index f28519837f2..1ff1d832eeb 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -155,6 +155,16 @@ DEF_INTERNAL_COND_OPTAB_FN (XOR, ECF_CONST | ECF_NOTHROW, xor, binary)
 
 DEF_INTERNAL_OPTAB_FN (RSQRT, ECF_CONST, rsqrt, unary)
 
+/* IFN_BREAK_AFTER (A, B):
+
+   - If A & B is all false, return A.
+   - Otherwise find the first true bit in A & B.  Copy bits of A up
+     to and including that bit and set the remaining bits to false.
+
+   A, B and the return value are all vector masks.  */
+DEF_INTERNAL_OPTAB_FN (BREAK_AFTER, ECF_CONST | ECF_NOTHROW,
+		       break_after, binary)
+
 /* Extract the last active element from a vector.  */
 DEF_INTERNAL_OPTAB_FN (EXTRACT_LAST, ECF_CONST | ECF_NOTHROW,
 		       extract_last, cond_unary)
diff --git a/gcc/optabs.def b/gcc/optabs.def
index bf67dfca132..d86dc803d5a 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -308,6 +308,7 @@ OPTAB_D (reduc_ior_scal_optab,  "reduc_ior_scal_$a")
 OPTAB_D (reduc_xor_scal_optab,  "reduc_xor_scal_$a")
 OPTAB_D (fold_left_plus_optab, "fold_left_plus_$a")
 
+OPTAB_D (break_after_optab, "break_after_$a")
 OPTAB_D (extract_last_optab, "extract_last_$a")
 OPTAB_D (fold_extract_last_optab, "fold_extract_last_$a")
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_1.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_1.c
new file mode 100644
index 00000000000..ba2f569fd5c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_1.c
@@ -0,0 +1,62 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-inline -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */
+
+#include <stdint.h>
+
+/* Speculative loop with no data references.  */
+
+#define SPEC_LOOP(ARGTYPE,INDUCTYPE)\
+INDUCTYPE spec_loop_##ARGTYPE##INDUCTYPE (ARGTYPE mask, ARGTYPE limit)\
+{\
+  INDUCTYPE i = 0;\
+  while ((i & mask) != limit)\
+    i += 1;\
+  return i;\
+}\
+
+#define SPEC_FP_LOOP(ARGTYPE,INDUCTYPE,FPTYPE)\
+FPTYPE spec_fp_loop_##ARGTYPE##INDUCTYPE##FPTYPE (ARGTYPE mask, ARGTYPE limit)\
+{\
+  INDUCTYPE i = 0;\
+  FPTYPE f = 0.0;\
+  while ((i & mask) != limit)\
+    {\
+      f += 1;\
+      i += 1;\
+    }\
+  return f;\
+}\
+
+SPEC_LOOP (uint8_t, uint8_t)
+SPEC_LOOP (uint16_t, uint16_t)
+SPEC_LOOP (uint32_t, uint32_t)
+SPEC_LOOP (uint64_t, uint64_t)
+
+SPEC_LOOP (int8_t, int8_t)
+SPEC_LOOP (int16_t, int16_t)
+SPEC_LOOP (int32_t, int32_t)
+SPEC_LOOP (int64_t, int64_t)
+
+/* Conversions.  */
+SPEC_LOOP (uint16_t, uint8_t)
+
+SPEC_LOOP (uint32_t, uint8_t)
+SPEC_LOOP (uint32_t, uint16_t)
+
+SPEC_LOOP (uint64_t, uint8_t)
+SPEC_LOOP (uint64_t, uint16_t)
+SPEC_LOOP (uint64_t, uint32_t)
+
+SPEC_FP_LOOP (uint32_t, uint32_t, float)
+SPEC_FP_LOOP (uint64_t, uint64_t, double)
+
+SPEC_FP_LOOP (uint64_t, uint64_t, float)
+
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 17 "vect" } } */
+/* { dg-final { scan-assembler-times {\tbrka\tp[0-9]+\.b, p[0-9]+/z, p[0-9]+\.b} 17 } } */
+/* { dg-final { scan-assembler-times {\tlastb\tw[0-9]+, p[0-7], z[0-9]+\.b} 5 } } */
+/* { dg-final { scan-assembler-times {\tlastb\tw[0-9]+, p[0-7], z[0-9]+\.h} 4 } } */
+/* { dg-final { scan-assembler-times {\tlastb\tw[0-9]+, p[0-7], z[0-9]+\.s} 3 } } */
+/* { dg-final { scan-assembler-times {\tlastb\tx[0-9]+, p[0-7], z[0-9]+\.d} 2 } } */
+/* { dg-final { scan-assembler-times {\tlastb\ts[0-9]+, p[0-7], z[0-9]+\.s} 2 } } */
+/* { dg-final { scan-assembler-times {\tlastb\td[0-9]+, p[0-7], z[0-9]+\.d} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_10.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_10.c
new file mode 100644
index 00000000000..c69164bb1ea
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_10.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */
+
+/* Speculative loop with two loads from global buffers which can be aligned,
+   but only by peeling the loop.  */
+
+int a[500];
+int b[500];
+
+int
+foo (int n)
+{
+  int i = 0; /* Incremented before the first load, so accesses start at 1.  */
+  do
+    i += 1;
+  while (a[i] + b[i] < n); /* Stop at the first sum that is not below N.  */
+  return i;
+}
+
+/* { dg-final { scan-tree-dump-not "loop versioned for vectorization to enhance alignment" "vect" } } */
+/* { dg-final { scan-tree-dump "Alignment of access forced using peeling" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 4 bytes of ref" "vect" } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_11.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_11.c
new file mode 100644
index 00000000000..92e4adc5571
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_11.c
@@ -0,0 +1,65 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */
+
+#include <stdint.h>
+
+/* Speculative loop with two loads from global buffers which can be aligned
+   without any peeling.  */
+
+#define MAX_ARRAY_SIZE 500
+
+#ifndef STRIDE_LEVEL
+#define STRIDE_LEVEL 1
+#endif
+
+#define SPEC_LOOP(DATATYPE, ARGTYPE)\
+DATATYPE a##DATATYPE[MAX_ARRAY_SIZE];\
+DATATYPE b##DATATYPE[MAX_ARRAY_SIZE];\
+ARGTYPE spec_loop_##DATATYPE##_##ARGTYPE (DATATYPE n)\
+{\
+  ARGTYPE i = -1; /* Incremented before each test, so the first index is 0.  */\
+  do\
+    i += 1;\
+  while (a##DATATYPE[i*STRIDE_LEVEL] + b##DATATYPE[i*STRIDE_LEVEL] < n);\
+  return i; /* Value of I at the first pair whose sum is not below N.  */\
+}
+
+/* TODO: Cannot yet vectorize due to gather load.  */
+SPEC_LOOP (int8_t, int8_t)
+SPEC_LOOP (int16_t, int16_t)
+
+SPEC_LOOP (int32_t, int32_t)
+SPEC_LOOP (int64_t, int64_t)
+SPEC_LOOP (float, int32_t)
+SPEC_LOOP (double, int64_t)
+
+/* { dg-final { scan-tree-dump-not "loop versioned for vectorization to enhance alignment" "vect" } } */
+/* { dg-final { scan-tree-dump-not "Alignment of access forced using peeling" "vect" } } */
+
+/* { dg-final { scan-tree-dump "force alignment of aint8_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "force alignment of bint8_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "force alignment of aint16_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "force alignment of bint16_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "force alignment of aint32_t" "vect" } } */
+/* { dg-final { scan-tree-dump "force alignment of bint32_t" "vect" } } */
+/* { dg-final { scan-tree-dump "force alignment of aint64_t" "vect" } } */
+/* { dg-final { scan-tree-dump "force alignment of bint64_t" "vect" } } */
+/* { dg-final { scan-tree-dump "force alignment of afloat" "vect" } } */
+/* { dg-final { scan-tree-dump "force alignment of bfloat" "vect" } } */
+/* { dg-final { scan-tree-dump "force alignment of adouble" "vect" } } */
+/* { dg-final { scan-tree-dump "force alignment of bdouble" "vect" } } */
+
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint8_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref bint8_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint16_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref bint16_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint32_t" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref bint32_t" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint64_t" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref bint64_t" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref afloat" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref bfloat" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref adouble" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref bdouble" "vect" } } */
+
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 4 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_11_run.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_11_run.c
new file mode 100644
index 00000000000..ebcefdb623c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_11_run.c
@@ -0,0 +1,61 @@
+/* { dg-do run { target { aarch64_sve_hw } } } */
+/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve" } */
+/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -msve-vector-bits=256" { target aarch64_sve256_hw } } */
+
+#include "sve_speculative_11.c"
+
+extern void abort (void);
+
+#ifndef FILL_DATA
+#define FILL_DATA 0
+#endif
+
+#ifndef EXIT_CONDITION
+#define EXIT_CONDITION 5
+#endif
+
+#ifndef LOOP_COUNTS
+#define LOOP_COUNTS {37,45,55,17,39,43}
+#endif
+int loop_counts[] = LOOP_COUNTS;
+
+/* Fill the arrays with the exit conditions.
+   Then refill at the correct strided accesses with fill data up to the end of
+   the loop count.  */
+
+#define TEST_SPEC_LOOP_FUNC(DATATYPE, ARGTYPE)\
+void test_spec_loop_##DATATYPE##_##ARGTYPE (ARGTYPE num_elements)\
+{\
+  int i;\
+  for (i=0; i<MAX_ARRAY_SIZE; i++) /* Every element exits by default.  */\
+    {\
+      a##DATATYPE[i] = EXIT_CONDITION;\
+      b##DATATYPE[i] = EXIT_CONDITION;\
+    }\
+  for (i=0; i<num_elements-1; i++) /* One strided slot per loop iteration.  */\
+    {\
+      a##DATATYPE[i*STRIDE_LEVEL] = FILL_DATA;\
+      b##DATATYPE[i*STRIDE_LEVEL] = FILL_DATA;\
+    }\
+  ARGTYPE ret = spec_loop_##DATATYPE##_##ARGTYPE (EXIT_CONDITION);\
+  if (ret != num_elements - 1) /* Slot num_elements - 1 still exits.  */\
+    abort ();\
+}
+
+TEST_SPEC_LOOP_FUNC (int8_t, int8_t)
+TEST_SPEC_LOOP_FUNC (int16_t, int16_t)
+TEST_SPEC_LOOP_FUNC (int32_t, int32_t)
+TEST_SPEC_LOOP_FUNC (int64_t, int64_t)
+TEST_SPEC_LOOP_FUNC (float, int32_t)
+TEST_SPEC_LOOP_FUNC (double, int64_t)
+
+int main (void)
+{
+  test_spec_loop_int8_t_int8_t (loop_counts[0]);
+  test_spec_loop_int16_t_int16_t (loop_counts[1]);
+  test_spec_loop_int32_t_int32_t (loop_counts[2]);
+  test_spec_loop_int64_t_int64_t (loop_counts[3]);
+  test_spec_loop_float_int32_t (loop_counts[4]);
+  test_spec_loop_double_int64_t (loop_counts[5]);
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_12.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_12.c
new file mode 100644
index 00000000000..d6caa8e7513
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_12.c
@@ -0,0 +1,32 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */
+
+/* Speculative loop with two loads from global buffers which can be aligned
+   without any peeling, and an access stride of 2.  */
+
+#define STRIDE_LEVEL 2
+
+#include "sve_speculative_11.c"
+
+/* { dg-final { scan-tree-dump-not "loop versioned for vectorization to enhance alignment" "vect" } } */
+/* { dg-final { scan-tree-dump-not "Alignment of access forced using peeling" "vect" } } */
+
+/* { dg-final { scan-tree-dump "force alignment of aint8_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "force alignment of bint8_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "force alignment of aint16_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "force alignment of bint16_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "force alignment of aint32_t" "vect" } } */
+/* { dg-final { scan-tree-dump "force alignment of bint32_t" "vect" } } */
+/* { dg-final { scan-tree-dump "force alignment of aint64_t" "vect" } } */
+/* { dg-final { scan-tree-dump "force alignment of bint64_t" "vect" } } */
+
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint8_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref bint8_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint16_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref bint16_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint32_t" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref bint32_t" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint64_t" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref bint64_t" "vect" } } */
+
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 4 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_12_run.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_12_run.c
new file mode 100644
index 00000000000..42c346073c6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_12_run.c
@@ -0,0 +1,9 @@
+/* { dg-do run { target { aarch64_sve_hw } } } */
+/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve" } */
+/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -msve-vector-bits=256" { target aarch64_sve256_hw } } */
+
+#define STRIDE_LEVEL 2
+#define EXIT_CONDITION 7
+#define LOOP_COUNTS {43,27,19,54,25,27}
+
+#include "sve_speculative_11_run.c"
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_13.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_13.c
new file mode 100644
index 00000000000..db95e81d3f2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_13.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */
+
+/* Speculative loop with two loads from global buffers and an access stride
+   of 3, for which the required alignment cannot be calculated.  */
+
+#define STRIDE_LEVEL 3
+
+#include "sve_speculative_11.c"
+
+/* { dg-final { scan-tree-dump-times "not vectorized: can't calculate required alignment for data ref" 10 "vect" } } */
+/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_13_run.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_13_run.c
new file mode 100644
index 00000000000..519ff21e168
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_13_run.c
@@ -0,0 +1,9 @@
+/* { dg-do run { target { aarch64_sve_hw } } } */
+/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve" } */
+/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -msve-vector-bits=256" { target aarch64_sve256_hw } } */
+
+#define STRIDE_LEVEL 3
+#define EXIT_CONDITION 9
+#define LOOP_COUNTS {19,47,15,35,23,33}
+
+#include "sve_speculative_11_run.c"
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_14.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_14.c
new file mode 100644
index 00000000000..218afb6c5ca
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_14.c
@@ -0,0 +1,32 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */
+
+/* Speculative loop with two loads from global buffers which can be aligned
+   without any peeling, and an access stride of 4.  */
+
+#define STRIDE_LEVEL 4
+
+#include "sve_speculative_11.c"
+
+/* { dg-final { scan-tree-dump-not "loop versioned for vectorization to enhance alignment" "vect" } } */
+/* { dg-final { scan-tree-dump-not "Alignment of access forced using peeling" "vect" } } */
+
+/* { dg-final { scan-tree-dump "force alignment of aint8_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "force alignment of bint8_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "force alignment of aint16_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "force alignment of bint16_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "force alignment of aint32_t" "vect" } } */
+/* { dg-final { scan-tree-dump "force alignment of bint32_t" "vect" } } */
+/* { dg-final { scan-tree-dump "force alignment of aint64_t" "vect" } } */
+/* { dg-final { scan-tree-dump "force alignment of bint64_t" "vect" } } */
+
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint8_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref bint8_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint16_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref bint16_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint32_t" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref bint32_t" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint64_t" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref bint64_t" "vect" } } */
+
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 4 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_14_run.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_14_run.c
new file mode 100644
index 00000000000..958e94fd822
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_14_run.c
@@ -0,0 +1,11 @@
+/* { dg-do run { target { aarch64_sve_hw } } } */
+/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve" } */
+/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -msve-vector-bits=256" { target aarch64_sve256_hw } } */
+
+#define STRIDE_LEVEL 4
+
+#define FILL_DATA 5
+#define EXIT_CONDITION 22
+#define LOOP_COUNTS {43,27,19,54,25,27}
+
+#include "sve_speculative_11_run.c"
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_15.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_15.c
new file mode 100644
index 00000000000..42ec564c90b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_15.c
@@ -0,0 +1,59 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */
+
+#include <stdint.h>
+
+/* Speculative loop with two consecutive loads from a single global buffer
+   which can be aligned without any peeling, and an access stride of 2.  */
+
+#define MAX_ARRAY_SIZE 500
+
+/* Minimum STRIDE_LEVEL is 2.  */
+#ifndef STRIDE_LEVEL
+#define STRIDE_LEVEL 2
+#endif
+
+#define SPEC_LOOP(DATATYPE, ARGTYPE)\
+DATATYPE a##DATATYPE[MAX_ARRAY_SIZE];\
+ARGTYPE spec_loop_##DATATYPE##_##ARGTYPE (DATATYPE n)\
+{\
+  ARGTYPE i = -1; /* Incremented before each test, so the first index is 0.  */\
+  do\
+    i += 1;\
+  while (a##DATATYPE[i*STRIDE_LEVEL] + a##DATATYPE[(i*STRIDE_LEVEL) + 1] < n);\
+  return i; /* Value of I at the first adjacent pair summing to N or more.  */\
+}
+
+/* TODO: Cannot yet vectorize due to gather load.  */
+SPEC_LOOP (int8_t, int8_t)
+SPEC_LOOP (int16_t, int16_t)
+
+SPEC_LOOP (int32_t, int32_t)
+SPEC_LOOP (int64_t, int64_t)
+SPEC_LOOP (float, int32_t)
+SPEC_LOOP (double, int64_t)
+
+/* { dg-final { scan-tree-dump-not "loop versioned for vectorization to enhance alignment" "vect" } } */
+/* { dg-final { scan-tree-dump-not "Alignment of access forced using peeling" "vect" } } */
+
+/* { dg-final { scan-tree-dump "force alignment of aint8_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "force alignment of aint16_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "force alignment of aint32_t" "vect" } } */
+/* { dg-final { scan-tree-dump "force alignment of aint64_t" "vect" } } */
+/* { dg-final { scan-tree-dump "force alignment of afloat" "vect" } } */
+/* { dg-final { scan-tree-dump "force alignment of adouble" "vect" } } */
+
+/* { dg-final { scan-tree-dump "misalign = 1 bytes of ref aint8_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint8_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint16_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "misalign = 2 bytes of ref aint16_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint32_t" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 4 bytes of ref aint32_t" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint64_t" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 8 bytes of ref aint64_t" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref afloat" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 4 bytes of ref afloat" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref adouble" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 8 bytes of ref adouble" "vect" } } */
+
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 4 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_15_run.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_15_run.c
new file mode 100644
index 00000000000..533f99467fd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_15_run.c
@@ -0,0 +1,56 @@
+/* { dg-do run { target { aarch64_sve_hw } } } */
+/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve" } */
+/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -msve-vector-bits=256" { target aarch64_sve256_hw } } */
+
+#include "sve_speculative_15.c"
+
+extern void abort (void);
+
+#ifndef FILL_DATA
+#define FILL_DATA 0
+#endif
+
+#ifndef EXIT_CONDITION
+#define EXIT_CONDITION 5
+#endif
+
+#ifndef LOOP_COUNTS
+#define LOOP_COUNTS {37,45,55,17,39,43}
+#endif
+int loop_counts[] = LOOP_COUNTS;
+
+/* Fill the array with the exit condition.  Then overwrite the first
+   (num_elements - 1) * STRIDE_LEVEL elements with fill data, so that the
+   loop under test exits on iteration num_elements - 1.  */
+
+#define TEST_SPEC_LOOP_FUNC(DATATYPE, ARGTYPE)				\
+void									\
+test_spec_loop_##DATATYPE##_##ARGTYPE (ARGTYPE num_elements)		\
+{									\
+  for (int i = 0; i < MAX_ARRAY_SIZE; ++i)				\
+    a##DATATYPE[i] = EXIT_CONDITION;					\
+  for (int i = 0; i < (num_elements - 1) * STRIDE_LEVEL; ++i)		\
+    a##DATATYPE[i] = FILL_DATA;						\
+  ARGTYPE ret = spec_loop_##DATATYPE##_##ARGTYPE (EXIT_CONDITION);	\
+  if (ret != num_elements - 1)						\
+    abort ();								\
+}
+
+TEST_SPEC_LOOP_FUNC (int8_t, int8_t)
+TEST_SPEC_LOOP_FUNC (int16_t, int16_t)
+TEST_SPEC_LOOP_FUNC (int32_t, int32_t)
+TEST_SPEC_LOOP_FUNC (int64_t, int64_t)
+TEST_SPEC_LOOP_FUNC (float, int32_t)
+TEST_SPEC_LOOP_FUNC (double, int64_t)
+
+int main (void)
+{
+  test_spec_loop_int8_t_int8_t (loop_counts[0]);
+  test_spec_loop_int16_t_int16_t (loop_counts[1]);
+  test_spec_loop_int32_t_int32_t (loop_counts[2]);
+  test_spec_loop_int64_t_int64_t (loop_counts[3]);
+  test_spec_loop_float_int32_t (loop_counts[4]);
+  test_spec_loop_double_int64_t (loop_counts[5]);
+  return 0;
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_16.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_16.c
new file mode 100644
index 00000000000..9affb766b2a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_16.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */
+
+/* Speculative loop with two consecutive loads from a single global buffer and
+   an access stride of 3, for which the alignment cannot be calculated.  */
+
+#define STRIDE_LEVEL 3
+
+#include "sve_speculative_15.c"
+
+/* { dg-final { scan-tree-dump-times "not vectorized: can't calculate required alignment for data ref" 10 "vect" } } */
+/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_16_run.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_16_run.c
new file mode 100644
index 00000000000..7c53e7aeed6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_16_run.c
@@ -0,0 +1,9 @@
+/* { dg-do run { target { aarch64_sve_hw } } } */
+/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve" } */
+/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -msve-vector-bits=256" { target aarch64_sve256_hw } } */
+
+#define STRIDE_LEVEL 3
+#define EXIT_CONDITION 7
+#define LOOP_COUNTS {43,27,19,54,25,27}
+
+#include "sve_speculative_15_run.c"
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_17.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_17.c
new file mode 100644
index 00000000000..b7e472e0deb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_17.c
@@ -0,0 +1,34 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */
+
+/* Speculative loop with two consecutive loads from a single global buffer
+   which can be aligned without any peeling, and an access stride of 4.  */
+
+#define STRIDE_LEVEL 4
+
+#include "sve_speculative_15.c"
+
+/* { dg-final { scan-tree-dump-not "loop versioned for vectorization to enhance alignment" "vect" } } */
+/* { dg-final { scan-tree-dump-not "Alignment of access forced using peeling" "vect" } } */
+
+/* { dg-final { scan-tree-dump "force alignment of aint8_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "force alignment of aint16_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "force alignment of aint32_t" "vect" } } */
+/* { dg-final { scan-tree-dump "force alignment of aint64_t" "vect" } } */
+/* { dg-final { scan-tree-dump "force alignment of afloat" "vect" } } */
+/* { dg-final { scan-tree-dump "force alignment of adouble" "vect" } } */
+
+/* { dg-final { scan-tree-dump "misalign = 1 bytes of ref aint8_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint8_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint16_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "misalign = 2 bytes of ref aint16_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint32_t" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 4 bytes of ref aint32_t" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint64_t" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 8 bytes of ref aint64_t" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref afloat" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 4 bytes of ref afloat" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref adouble" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 8 bytes of ref adouble" "vect" } } */
+
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 4 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_17_run.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_17_run.c
new file mode 100644
index 00000000000..5453116429a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_17_run.c
@@ -0,0 +1,9 @@
+/* { dg-do run { target { aarch64_sve_hw } } } */
+/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve" } */
+/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -msve-vector-bits=256" { target aarch64_sve256_hw } } */
+
+#define STRIDE_LEVEL 4
+#define EXIT_CONDITION 9
+#define LOOP_COUNTS {19,47,15,35,23,33}
+
+#include "sve_speculative_15_run.c"
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_1_run.c
new file mode 100644
index 00000000000..f4bb55ed6f8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_1_run.c
@@ -0,0 +1,47 @@
+/* { dg-do run { target { aarch64_sve_hw } } } */
+/* { dg-options "-O3 -fno-inline -march=armv8-a+sve -ffast-math" } */
+/* { dg-options "-O3 -fno-inline -march=armv8-a+sve -ffast-math -msve-vector-bits=256" { target aarch64_sve256_hw } } */
+
+#include "sve_speculative_1.c"
+
+extern void abort (void);
+
+#define TEST_LOOP(ARGTYPE,INDUCTYPE)\
+{\
+  INDUCTYPE res = spec_loop_##ARGTYPE##INDUCTYPE (0xFF, 0xAE);\
+  if (res != 0xAE)\
+    abort ();\
+}\
+
+#define TEST_FP_LOOP(ARGTYPE,INDUCTYPE,FPTYPE)\
+{\
+  FPTYPE res = spec_fp_loop_##ARGTYPE##INDUCTYPE##FPTYPE (0xFF, 0xAE);\
+  if (res != 0xAE)\
+    abort ();\
+}\
+
+int main ()
+{
+  TEST_LOOP (uint8_t, uint8_t);
+  TEST_LOOP (uint16_t, uint16_t);
+  TEST_LOOP (uint32_t, uint32_t);
+  TEST_LOOP (uint64_t, uint64_t);
+  TEST_LOOP (int32_t, int32_t);
+  TEST_LOOP (int64_t, int64_t);
+
+  TEST_LOOP (uint16_t, uint8_t)
+
+  TEST_LOOP (uint32_t, uint8_t)
+  TEST_LOOP (uint32_t, uint16_t)
+
+  TEST_LOOP (uint64_t, uint8_t)
+  TEST_LOOP (uint64_t, uint16_t)
+  TEST_LOOP (uint64_t, uint32_t)
+
+  TEST_FP_LOOP (uint32_t, uint32_t, float)
+  TEST_FP_LOOP (uint64_t, uint64_t, double)
+
+  TEST_FP_LOOP (uint64_t, uint64_t, float)
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_2.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_2.c
new file mode 100644
index 00000000000..108c5a6fbe6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_2.c
@@ -0,0 +1,72 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-inline -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */
+
+#include <stdint.h>
+#include <stdbool.h>
+
+/* Speculative loop with no data references.  */
+
+/* FIXME: dup of rhs into predicate register is made of horrible code.  */
+#define SPEC_LOOP(ARGTYPE,INDUCTYPE)\
+INDUCTYPE spec_loop_##ARGTYPE##INDUCTYPE (ARGTYPE mask, ARGTYPE limit,\
+					  bool rhs)\
+{\
+  INDUCTYPE i = 0;\
+  bool lhs = (i & mask) != limit;\
+  while (lhs == rhs)\
+    {\
+      i += 1;\
+      lhs = (i & mask) != limit;\
+    }\
+  return i;\
+}\
+
+#define SPEC_FP_LOOP(ARGTYPE,INDUCTYPE,FPTYPE)\
+INDUCTYPE spec_fp_loop_##ARGTYPE##INDUCTYPE##FPTYPE (ARGTYPE mask, ARGTYPE limit,\
+						     bool rhs)\
+{\
+  INDUCTYPE i = 0;\
+  FPTYPE f = 0.0;\
+  bool lhs = (i & mask) != limit;\
+  while (lhs == rhs)\
+    {\
+      f += 1;\
+      i += 1;\
+      lhs = (i & mask) != limit;\
+    }\
+  return f;\
+}\
+
+SPEC_LOOP (uint8_t, uint8_t)
+SPEC_LOOP (uint16_t, uint16_t)
+SPEC_LOOP (uint32_t, uint32_t)
+SPEC_LOOP (uint64_t, uint64_t)
+
+SPEC_LOOP (int8_t, int8_t)
+SPEC_LOOP (int16_t, int16_t)
+SPEC_LOOP (int32_t, int32_t)
+SPEC_LOOP (int64_t, int64_t)
+
+/* Conversions.  */
+SPEC_LOOP (uint16_t, uint8_t)
+
+SPEC_LOOP (uint32_t, uint8_t)
+SPEC_LOOP (uint32_t, uint16_t)
+
+SPEC_LOOP (uint64_t, uint8_t)
+SPEC_LOOP (uint64_t, uint16_t)
+SPEC_LOOP (uint64_t, uint32_t)
+
+SPEC_FP_LOOP (uint32_t, uint32_t, float)
+SPEC_FP_LOOP (uint64_t, uint64_t, double)
+
+SPEC_FP_LOOP (uint64_t, uint64_t, float)
+
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 17 "vect" } } */
+/* { dg-final { scan-assembler-times {\tbrka\tp[0-9]+\.b, p[0-9]+/z, p[0-9]+\.b} 17 } } */
+/* { dg-final { scan-assembler-times {\tlastb\tw[0-9]+, p[0-7], z[0-9]+\.b} 5 } } */
+/* { dg-final { scan-assembler-times {\tlastb\tw[0-9]+, p[0-7], z[0-9]+\.h} 4 } } */
+/* { dg-final { scan-assembler-times {\tlastb\tw[0-9]+, p[0-7], z[0-9]+\.s} 3 } } */
+/* { dg-final { scan-assembler-times {\tlastb\tx[0-9]+, p[0-7], z[0-9]+\.d} 2 } } */
+/* { dg-final { scan-assembler-times {\tlastb\ts[0-9]+, p[0-7], z[0-9]+\.s} 2 } } */
+/* { dg-final { scan-assembler-times {\tlastb\td[0-9]+, p[0-7], z[0-9]+\.d} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_2_run.c
new file mode 100644
index 00000000000..ad2c9c874b8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_2_run.c
@@ -0,0 +1,45 @@
+/* { dg-do run { target { aarch64_sve_hw } } } */
+/* { dg-options "-O3 -fno-inline -march=armv8-a+sve -ffast-math" } */
+/* { dg-options "-O3 -fno-inline -march=armv8-a+sve -ffast-math -msve-vector-bits=256" { target aarch64_sve256_hw } } */
+
+#include "sve_speculative_2.c"
+
+extern void abort (void);
+
+#define TEST_LOOP(ARGTYPE,INDUCTYPE)\
+{\
+  INDUCTYPE res = spec_loop_##ARGTYPE##INDUCTYPE (0xFF, 0xAE, true);\
+  if (res != 0xAE)\
+    abort ();\
+}\
+
+#define TEST_FP_LOOP(ARGTYPE,INDUCTYPE,FPTYPE)\
+{\
+  FPTYPE res = spec_fp_loop_##ARGTYPE##INDUCTYPE##FPTYPE (0xFF, 0xAE, true);\
+  if (res != 0xAE)\
+    abort ();\
+}\
+
+int main ()
+{
+  TEST_LOOP (uint8_t, uint8_t);
+  TEST_LOOP (uint16_t, uint16_t);
+  TEST_LOOP (uint32_t, uint32_t);
+  TEST_LOOP (uint64_t, uint64_t);
+  TEST_LOOP (int32_t, int32_t);
+  TEST_LOOP (int64_t, int64_t);
+
+  TEST_LOOP (uint16_t, uint8_t)
+
+  TEST_LOOP (uint32_t, uint8_t)
+  TEST_LOOP (uint32_t, uint16_t)
+
+  TEST_LOOP (uint64_t, uint8_t)
+  TEST_LOOP (uint64_t, uint16_t)
+  TEST_LOOP (uint64_t, uint32_t)
+
+  TEST_FP_LOOP (uint32_t, uint32_t, float)
+  TEST_FP_LOOP (uint64_t, uint64_t, double)
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_3.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_3.c
new file mode 100644
index 00000000000..db35711a193
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_3.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-inline -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */
+
+#include <stdint.h>
+
+/* Speculative loop with different sizes and no data references.
+   Cannot be vectorized.  */
+
+#define SPEC_FP_LOOP(ARGTYPE,INDUCTYPE,FPTYPE)\
+FPTYPE spec_fp_loop_##ARGTYPE##INDUCTYPE (ARGTYPE mask, ARGTYPE limit)\
+{\
+  INDUCTYPE i = 0;\
+  FPTYPE f = 0.0;\
+  while ((i & mask) != limit)\
+    {\
+      f += 1;\
+      i += 1;\
+    }\
+  return f;\
+}\
+
+SPEC_FP_LOOP (uint32_t, uint32_t, double)
+
+/* { dg-final { scan-tree-dump-times "not vectorized: ncopies is greater than 1" 1 "vect" } } */
+/* { dg-final { scan-assembler-not "brka\tp\[0-9\]*.b, p\[0-9\]*\/z, p\[0-9\]*.b" } } */
+
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_4.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_4.c
new file mode 100644
index 00000000000..32b8c71c92a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_4.c
@@ -0,0 +1,66 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-inline -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */
+
+#include <stdint.h>
+
+/* Speculative loop with a load.  */
+
+#define SPEC_LOOP(ARGTYPE,INDUCTYPE)\
+INDUCTYPE spec_loop_##ARGTYPE##INDUCTYPE (ARGTYPE mask, ARGTYPE limit, ARGTYPE * array)\
+{\
+  uint64_t i = 0;\
+  INDUCTYPE r = 0;\
+  while ((i & mask) != limit)\
+  {\
+    r = array[i];\
+    i++;\
+  }\
+  return r;\
+}
+
+#define SPEC_FP_LOOP(ARGTYPE,FPTYPE)\
+FPTYPE spec_fp_loop_##ARGTYPE##FPTYPE (ARGTYPE mask, ARGTYPE limit, FPTYPE * array)\
+{\
+  uint64_t i = 0;\
+  FPTYPE f = 0.0;\
+  while ((i & mask) != limit)\
+    {\
+      f = array[i];\
+      i++;\
+    }\
+  return f;\
+}
+
+SPEC_LOOP (uint8_t, uint8_t)
+SPEC_LOOP (uint16_t, uint16_t)
+SPEC_LOOP (uint32_t, uint32_t)
+SPEC_LOOP (uint64_t, uint64_t)
+
+SPEC_LOOP (int8_t, int8_t)
+SPEC_LOOP (int16_t, int16_t)
+SPEC_LOOP (int32_t, int32_t)
+SPEC_LOOP (int64_t, int64_t)
+
+/* Conversions.  */
+SPEC_LOOP (uint16_t, uint8_t)
+
+SPEC_LOOP (uint32_t, uint8_t)
+SPEC_LOOP (uint32_t, uint16_t)
+
+SPEC_LOOP (uint64_t, uint8_t)
+SPEC_LOOP (uint64_t, uint16_t)
+SPEC_LOOP (uint64_t, uint32_t)
+
+SPEC_FP_LOOP (uint32_t, float)
+SPEC_FP_LOOP (uint64_t, double)
+
+SPEC_FP_LOOP (uint64_t, float)
+
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 17 "vect" } } */
+/* { dg-final { scan-assembler-times {\tbrka\tp[0-9]+\.b, p[0-9]+/z, p[0-9]+\.b} 17 } } */
+/* { dg-final { scan-assembler-times {\tlastb\tw[0-9]+, p[0-7], z[0-9]+\.b} 2 } } */
+/* { dg-final { scan-assembler-times {\tlastb\tw[0-9]+, p[0-7], z[0-9]+\.h} 3 } } */
+/* { dg-final { scan-assembler-times {\tlastb\tw[0-9]+, p[0-7], z[0-9]+\.s} 4 } } */
+/* { dg-final { scan-assembler-times {\tlastb\tx[0-9]+, p[0-7], z[0-9]+\.d} 5 } } */
+/* { dg-final { scan-assembler-times {\tlastb\ts[0-9]+, p[0-7], z[0-9]+\.s} 2 } } */
+/* { dg-final { scan-assembler-times {\tlastb\td[0-9]+, p[0-7], z[0-9]+\.d} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_4_run.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_4_run.c
new file mode 100644
index 00000000000..96834ba51be
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_4_run.c
@@ -0,0 +1,56 @@
+/* { dg-do run { target { aarch64_sve_hw } } } */
+/* { dg-options "-O3 -fno-inline -march=armv8-a+sve -ffast-math" } */
+/* { dg-options "-O3 -fno-inline -march=armv8-a+sve -ffast-math -msve-vector-bits=256" { target aarch64_sve256_hw } } */
+
+#include "sve_speculative_4.c"
+
+extern void abort (void);
+#include <string.h>
+
+#define MAX 0xAE
+
+#define TEST_LOOP(ARGTYPE,INDUCTYPE)\
+{\
+  ARGTYPE array[MAX];\
+  memset (array, 0, sizeof (ARGTYPE) * MAX);\
+  array[MAX - 1] = 72;\
+  INDUCTYPE res = spec_loop_##ARGTYPE##INDUCTYPE (0xFF, MAX, array);\
+  if (res != 72)\
+    abort ();\
+}
+
+#define TEST_FP_LOOP(ARGTYPE,FPTYPE)\
+{\
+  FPTYPE array[MAX];\
+  memset (array, 0, sizeof (FPTYPE) * MAX);\
+  array[MAX - 1] = 54.5;\
+  FPTYPE res = spec_fp_loop_##ARGTYPE##FPTYPE (0xFF, MAX, array);\
+  if (res != 54.5)\
+    abort ();\
+}
+
+int main ()
+{
+  TEST_LOOP (uint8_t, uint8_t);
+  TEST_LOOP (uint16_t, uint16_t);
+  TEST_LOOP (uint32_t, uint32_t);
+  TEST_LOOP (uint64_t, uint64_t);
+  TEST_LOOP (int32_t, int32_t);
+  TEST_LOOP (int64_t, int64_t);
+
+  TEST_LOOP (uint16_t, uint8_t)
+
+  TEST_LOOP (uint32_t, uint8_t)
+  TEST_LOOP (uint32_t, uint16_t)
+
+  TEST_LOOP (uint64_t, uint8_t)
+  TEST_LOOP (uint64_t, uint16_t)
+  TEST_LOOP (uint64_t, uint32_t)
+
+  TEST_FP_LOOP (uint32_t, float)
+  TEST_FP_LOOP (uint64_t, double)
+
+  TEST_FP_LOOP (uint64_t, float)
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_5.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_5.c
new file mode 100644
index 00000000000..d1d8f8fbaaa
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_5.c
@@ -0,0 +1,54 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-inline -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */
+
+#include <stdint.h>
+
+/* Speculative loop with a load.  Exit condition in the array.  */
+
+#ifndef EXIT_CONDITION
+#define EXIT_CONDITION 1
+#endif
+
+#define SPEC_LOOP(ARGTYPE)\
+ARGTYPE spec_loop_##ARGTYPE (ARGTYPE * array)\
+{\
+  ARGTYPE i = 0;\
+  ARGTYPE r = EXIT_CONDITION + 1;\
+  while (r != EXIT_CONDITION)\
+  {\
+    r = array[i];\
+    i++;\
+  }\
+  return i;\
+}
+
+#define SPEC_FP_LOOP(FPTYPE, ARGTYPE)\
+ARGTYPE spec_loop_##ARGTYPE##FPTYPE (FPTYPE * array)\
+{\
+  ARGTYPE i = 0;\
+  ARGTYPE r = EXIT_CONDITION + 1;\
+  while (r != EXIT_CONDITION)\
+  {\
+    r = array[i];\
+    i++;\
+  }\
+  return i;\
+}
+
+/* TODO: Cannot yet vectorize due to gather load.  */
+SPEC_LOOP (int8_t)
+SPEC_LOOP (int16_t)
+
+SPEC_LOOP (int32_t)
+SPEC_LOOP (int64_t)
+SPEC_FP_LOOP (float, int32_t)
+SPEC_FP_LOOP (double, int64_t)
+
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 4 "vect" } } */
+/* { dg-final { scan-assembler-times {\tbrka\tp[0-9]+\.b, p[0-9]+/z, p[0-9]+\.b} 4 } } */
+/* { dg-final { scan-assembler-not {\tlastb\tw[0-9]+, p[0-7], z[0-9]+\.b} } } */
+/* { dg-final { scan-assembler-not {\tlastb\tw[0-9]+, p[0-7], z[0-9]+\.h} } } */
+/* { dg-final { scan-assembler-times {\tlastb\tw[0-9]+, p[0-7], z[0-9]+\.s} 2 } } */
+/* { dg-final { scan-assembler-times {\tlastb\tx[0-9]+, p[0-7], z[0-9]+\.d} 2 } } */
+/* { dg-final { scan-assembler-not {\tlastb\ts[0-9]+, p[0-7], z[0-9]+\.s} } } */
+/* { dg-final { scan-assembler-not {\tlastb\td[0-9]+, p[0-7], z[0-9]+\.d} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_5_run.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_5_run.c
new file mode 100644
index 00000000000..a8f7f9fff17
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_5_run.c
@@ -0,0 +1,104 @@
+/* { dg-do run { target { aarch64_sve_hw } } } */
+/* { dg-options "-O3 -fno-inline -march=armv8-a+sve" } */
+/* { dg-options "-O3 -fno-inline -march=armv8-a+sve -msve-vector-bits=256" { target aarch64_sve256_hw } } */
+
+#define _GNU_SOURCE
+/* Must come before the first system header, pulled in just below.  */
+#include "sve_speculative_5.c"
+#include <sys/mman.h>
+extern void abort (void);
+extern void *mremap (void *old_address, size_t old_size,
+		     size_t new_size, int flags, ... /* void *new_address */);
+
+#ifndef FILL_DATA
+#define FILL_DATA 0
+#endif
+
+#ifndef LOOP_COUNTS
+#define LOOP_COUNTS {22,20,13,17,29,19}
+#endif
+int loop_counts[] = LOOP_COUNTS;
+
+/* Program will fault if memory beyond the boundaries of BUF is accessed.  */
+
+#define SPACE_SIZE (4096*sizeof(int))  /* Bytes kept mapped by main.  */
+
+/* Enable to confirm program segfaults when accessing outside of BUF.  */
+#ifdef CHECK_SEGFAULT
+#define ADDITIONAL 1
+#else
+#define ADDITIONAL 0
+#endif
+
+/* BUF is an array of NUM_ELEMENTS size.
+   BUF_PRE points to 4 elements before BUF.
+   Before calling SPEC_LOOP, set the last element of BUF and the
+   four elements of BUF_PRE to the exit condition.
+   Fill the rest of BUF with the fill data.  */
+
+#define TEST_SPEC_LOOP_FUNC(ARGTYPE)\
+void test_spec_loop_##ARGTYPE (void *bufend, ARGTYPE num_elements)\
+{\
+  int i;\
+  ARGTYPE* buf = ((ARGTYPE*)bufend) - num_elements;\
+  ARGTYPE* buf_pre = ((ARGTYPE*)bufend) - num_elements - 4;\
+  for (i=0; i<num_elements-1; i++)\
+    buf[i] = FILL_DATA;\
+  buf[num_elements - 1 + ADDITIONAL] = EXIT_CONDITION;\
+  for (i=0; i<4; i++)\
+    buf_pre[i] = EXIT_CONDITION;\
+  ARGTYPE ret = spec_loop_##ARGTYPE (buf);\
+  if (ret != num_elements)\
+    abort ();\
+}
+
+#define TEST_SPEC_FP_LOOP_FUNC(FPTYPE, ARGTYPE)\
+void test_spec_loop_##ARGTYPE##FPTYPE (void *bufend, ARGTYPE num_elements)\
+{\
+  int i;\
+  FPTYPE* buf = ((FPTYPE*)bufend) - num_elements;\
+  FPTYPE* buf_pre = ((FPTYPE*)bufend) - num_elements - 4;\
+  for (i=0; i<num_elements-1; i++)\
+    buf[i] = FILL_DATA;\
+  buf[num_elements - 1 + ADDITIONAL] = EXIT_CONDITION;\
+  for (i=0; i<4; i++)\
+    buf_pre[i] = EXIT_CONDITION;\
+  ARGTYPE ret = spec_loop_##ARGTYPE##FPTYPE (buf);\
+  if (ret != num_elements)\
+    abort ();\
+}
+
+TEST_SPEC_LOOP_FUNC (int8_t)
+TEST_SPEC_LOOP_FUNC (int16_t)
+TEST_SPEC_LOOP_FUNC (int32_t)
+TEST_SPEC_LOOP_FUNC (int64_t)
+TEST_SPEC_FP_LOOP_FUNC (float, int32_t)
+TEST_SPEC_FP_LOOP_FUNC (double, int64_t)
+
+int main (void)
+{
+  /* Map in two pages worth of space.  Then reduce it down to a single page.
+     This will result in the second page of data being unmapped - ie it
+     will cause a segfault if accessed.  */
+
+  void *space = mmap (0, SPACE_SIZE * 2, PROT_READ|PROT_WRITE,
+		      MAP_ANON|MAP_PRIVATE, -1, 0);
+  if (space == (void*)-1)
+    abort ();
+
+  void *space_new = mremap (space, SPACE_SIZE * 2, SPACE_SIZE, 0);
+  if (space != space_new)
+    abort ();
+
+  /* set END to the start of the second (unmapped) page.  */
+  char *end = space + SPACE_SIZE;
+
+  test_spec_loop_int8_t (end, loop_counts[0]);
+  test_spec_loop_int16_t (end, loop_counts[1]);
+  test_spec_loop_int32_t (end, loop_counts[2]);
+  test_spec_loop_int64_t (end, loop_counts[3]);
+  test_spec_loop_int32_tfloat (end, loop_counts[4]);
+  test_spec_loop_int64_tdouble (end, loop_counts[5]);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_5_run_2.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_5_run_2.c
new file mode 100644
index 00000000000..ed12336f47d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_5_run_2.c
@@ -0,0 +1,8 @@
+/* { dg-do run { target { aarch64_sve_hw } } } */
+/* { dg-options "-O3 -fno-inline -march=armv8-a+sve" } */
+/* { dg-options "-O3 -fno-inline -march=armv8-a+sve -msve-vector-bits=256" { target aarch64_sve256_hw } } */
+
+/* Use exit condition of 0.  */
+#define EXIT_CONDITION 0
+#define FILL_DATA 1
+#include "sve_speculative_5_run.c"
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_5_run_3.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_5_run_3.c
new file mode 100644
index 00000000000..c6a5edf86b4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_5_run_3.c
@@ -0,0 +1,9 @@
+/* { dg-do run { target { aarch64_sve_hw } } } */
+/* { dg-options "-O3 -fno-inline -march=armv8-a+sve" } */
+/* { dg-options "-O3 -fno-inline -march=armv8-a+sve -msve-vector-bits=256" { target aarch64_sve256_hw } } */
+
+/* Use exit condition of 0 and less than a single iteration.  */
+#define EXIT_CONDITION 0
+#define FILL_DATA 1
+#define LOOP_COUNTS {3,5,3,1,5,1}
+#include "sve_speculative_5_run.c"
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_6.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_6.c
new file mode 100644
index 00000000000..1b71687a257
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_6.c
@@ -0,0 +1,44 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-inline -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */
+
+#include <stdint.h>
+
+/* Speculative loop with a conditional load.  */
+
+#define SPEC_LOOP(ARGTYPE,INDUCTYPE)\
+INDUCTYPE spec_loop_##ARGTYPE##INDUCTYPE (ARGTYPE mask, ARGTYPE limit,\
+					  ARGTYPE * array, ARGTYPE * cond)\
+{\
+  uint64_t i = 0;\
+  INDUCTYPE r = 0;\
+  while ((i & mask) != limit)\
+  {\
+    if (cond[i])\
+      r = array[i];\
+    i++;\
+  }\
+  return r;\
+}
+
+SPEC_LOOP (uint8_t, uint8_t)
+SPEC_LOOP (uint16_t, uint16_t)
+SPEC_LOOP (uint32_t, uint32_t)
+SPEC_LOOP (uint64_t, uint64_t)
+
+SPEC_LOOP (int8_t, int8_t)
+SPEC_LOOP (int16_t, int16_t)
+SPEC_LOOP (int32_t, int32_t)
+SPEC_LOOP (int64_t, int64_t)
+
+/* Conversions.  */
+SPEC_LOOP (uint16_t, uint8_t)
+
+SPEC_LOOP (uint32_t, uint8_t)
+SPEC_LOOP (uint32_t, uint16_t)
+
+SPEC_LOOP (uint64_t, uint8_t)
+SPEC_LOOP (uint64_t, uint16_t)
+SPEC_LOOP (uint64_t, uint32_t)
+
+/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "speculative mask loads not supported" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_7.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_7.c
new file mode 100644
index 00000000000..0c2d62387e2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_7.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-inline -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */
+
+#include <stdint.h>
+
+/* Speculative loop with a load and a test.  */
+
+uint32_t
+search (uint32_t *array)
+{
+  for (;;)
+    {
+      uint32_t x = *array++ >> 7;
+      if (x >= 200)
+        return x;
+    }
+}
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-assembler-times {\tbrka\tp[0-9]+\.b, p[0-9]+/z, p[0-9]+\.b} 1 } } */
+/* { dg-final { scan-assembler-times {\tlastb\tw[0-9]+, p[0-7], z[0-9]+\.s} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_8.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_8.c
new file mode 100644
index 00000000000..8c70e2f9012
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_8.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-inline -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */
+
+#include <stdint.h>
+
+/* Speculative loop with a load which requires multiple copies and a test.  */
+
+uint32_t
+search (uint64_t *array)
+{
+  for (;;)
+    {
+      uint32_t x = *array++ >> 7;
+      if (x >= 200)
+        return x;
+    }
+}
+
+/* { dg-final { scan-tree-dump "multiple copies not supported for speculative loops" "vect" } } */
+/* { dg-final { scan-tree-dump "not vectorized: relevant stmt not supported" "vect" } } */
+/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_9.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_9.c
new file mode 100644
index 00000000000..c21b44614c7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_9.c
@@ -0,0 +1,34 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */
+
+#include <stdint.h>
+
+/* Speculative loop with two loads which cannot both be aligned.  */
+
+#ifndef STRIDE_LEVEL
+#define STRIDE_LEVEL 1
+#endif
+
+#define SPEC_LOOP(DATATYPE, ARGTYPE)\
+ARGTYPE spec_loop_##DATATYPE##_##ARGTYPE (DATATYPE *a, DATATYPE*b, DATATYPE n)\
+{\
+  ARGTYPE i = -1;\
+  do\
+    i += 1;\
+  while (a[i*STRIDE_LEVEL] + b[i*STRIDE_LEVEL] < n);\
+  return i;\
+}
+
+/* TODO: Cannot yet vectorize due to gather load.  */
+SPEC_LOOP (int8_t, int8_t)
+SPEC_LOOP (int16_t, int16_t)
+
+SPEC_LOOP (int32_t, int32_t)
+SPEC_LOOP (int64_t, int64_t)
+SPEC_LOOP (float, int32_t)
+SPEC_LOOP (double, int64_t)
+
+
+/* { dg-final { scan-tree-dump-times "loop versioned for vectorization to enhance alignment" 4 "vect" } } */
+/* { dg-final { scan-tree-dump-not "Alignment of access forced using peeling" "vect" } } */
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 4 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_9_run.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_9_run.c
new file mode 100644
index 00000000000..f9470020fd0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_9_run.c
@@ -0,0 +1,67 @@
+/* { dg-do run { target { aarch64_sve_hw } } } */
+/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve" } */
+/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -msve-vector-bits=256" { target aarch64_sve256_hw } } */
+
+#include "sve_speculative_9.c"
+
+extern void abort (void);
+
+#ifndef MAX_ARRAY_SIZE
+#define MAX_ARRAY_SIZE 500
+#endif
+
+#ifndef FILL_DATA
+#define FILL_DATA 0
+#endif
+
+#ifndef EXIT_CONDITION
+#define EXIT_CONDITION 5
+#endif
+
+#ifndef LOOP_COUNTS
+#define LOOP_COUNTS {37,45,55,17,39,43}
+#endif
+int loop_counts[] = LOOP_COUNTS;
+
+/* Fill the arrays with the exit condition, then overwrite the first
+   NUM_ELEMENTS - 1 strided elements with fill data, so that the loop
+   is expected to stop at index NUM_ELEMENTS - 1.  */
+
+#define TEST_SPEC_LOOP_FUNC(DATATYPE, ARGTYPE)\
+void test_spec_loop_##DATATYPE##_##ARGTYPE (ARGTYPE num_elements)\
+{\
+  DATATYPE a[MAX_ARRAY_SIZE];\
+  DATATYPE b[MAX_ARRAY_SIZE];\
+  int i;\
+  for (i=0; i<MAX_ARRAY_SIZE; i++)\
+    {\
+      a[i] = EXIT_CONDITION;\
+      b[i] = EXIT_CONDITION;\
+    }\
+  for (i=0; i<num_elements-1; i++)\
+    {\
+      a[i*STRIDE_LEVEL] = FILL_DATA;\
+      b[i*STRIDE_LEVEL] = FILL_DATA;\
+    }\
+  ARGTYPE ret = spec_loop_##DATATYPE##_##ARGTYPE (a, b, EXIT_CONDITION);\
+  if (ret != num_elements - 1)\
+    abort ();\
+}
+
+TEST_SPEC_LOOP_FUNC (int8_t, int8_t)
+TEST_SPEC_LOOP_FUNC (int16_t, int16_t)
+TEST_SPEC_LOOP_FUNC (int32_t, int32_t)
+TEST_SPEC_LOOP_FUNC (int64_t, int64_t)
+TEST_SPEC_LOOP_FUNC (float, int32_t)
+TEST_SPEC_LOOP_FUNC (double, int64_t)
+
+int main (void)
+{
+  test_spec_loop_int8_t_int8_t (loop_counts[0]);
+  test_spec_loop_int16_t_int16_t (loop_counts[1]);
+  test_spec_loop_int32_t_int32_t (loop_counts[2]);
+  test_spec_loop_int64_t_int64_t (loop_counts[3]);
+  test_spec_loop_float_int32_t (loop_counts[4]);
+  test_spec_loop_double_int64_t (loop_counts[5]);
+  return 0;
+}
diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
index adb2af72573..3ef92c1d87d 100644
--- a/gcc/tree-vect-data-refs.c
+++ b/gcc/tree-vect-data-refs.c
@@ -834,15 +834,64 @@ vect_record_base_alignments (vec_info *vinfo)
       }
 }
 
+/* Function vect_can_calculate_target_alignment
+
+   Try to calculate the alignment (in bits) that data reference DR needs
+   once vectorized.  If successful, store it in *ALIGNMENT_P (when
+   ALIGNMENT_P is nonnull) and return true; otherwise return false.
+   For nonspeculative loops this always succeeds and gives
+   preferred_vector_alignment.  For speculative loops we align to the
+   vector size multiplied by the step divided by the element size.  */
+
+bool
+vect_can_calculate_target_alignment (struct data_reference *dr,
+				     unsigned int *alignment_p)
+{
+  gimple *stmt = DR_STMT (dr);
+  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
+
+  if (!loop_vinfo || !LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo))
+    {
+      tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+      if (alignment_p)
+	*alignment_p = targetm.vectorize.preferred_vector_alignment (vectype);
+      return true;
+    }
+
+  /* We have to assume that non-constant vector sizes might not be
+     a power of two.  */
+  unsigned HOST_WIDE_INT size;
+  if (!current_vector_size.is_constant (&size))
+    return false;
+
+  /* Step must be a positive constant that fits in a signed HWI.  */
+  if (!tree_fits_shwi_p (DR_STEP (dr))
+      || tree_int_cst_sgn (DR_STEP (dr)) <= 0)
+    return false;
+
+  unsigned HOST_WIDE_INT step = tree_to_uhwi (DR_STEP (dr));
+  unsigned HOST_WIDE_INT unit_size
+    = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr))));
+
+  /* Step must be a power of two and divisible by the unit size.  */
+  if (!pow2p_hwi (step) || step % unit_size != 0)
+    return false;
+
+  if (alignment_p)
+    *alignment_p = size * BITS_PER_UNIT * step / unit_size;
+  return true;
+}
+
 /* Return the target alignment for the vectorized form of DR.  */
 
 static unsigned int
 vect_calculate_target_alignment (struct data_reference *dr)
 {
-  gimple *stmt = DR_STMT (dr);
-  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
-  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
-  return targetm.vectorize.preferred_vector_alignment (vectype);
+  unsigned int ret;
+  if (!vect_can_calculate_target_alignment (dr, &ret))
+    gcc_unreachable ();
+  return ret;
 }
 
 /* Function vect_compute_data_ref_alignment
@@ -2288,11 +2337,11 @@ vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
   if (diff != 0)
     {
       /* Get the wider of the two alignments.  */
-      unsigned int align_a = (vect_calculate_target_alignment (dra)
-			      / BITS_PER_UNIT);
-      unsigned int align_b = (vect_calculate_target_alignment (drb)
-			      / BITS_PER_UNIT);
-      unsigned int max_align = MAX (align_a, align_b);
+      unsigned int align_a, align_b;
+      if (!vect_can_calculate_target_alignment (dra, &align_a)
+	  || !vect_can_calculate_target_alignment (drb, &align_b))
+	return;
+      unsigned int max_align = MAX (align_a, align_b) / BITS_PER_UNIT;
 
       /* Require the gap to be a multiple of the larger vector alignment.  */
       if (!wi::multiple_of_p (diff, max_align, SIGNED))
@@ -2341,6 +2390,17 @@ vect_analyze_data_refs_alignment (loop_vec_info vinfo)
   FOR_EACH_VEC_ELT (datarefs, i, dr)
     {
       stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
+
+      if (STMT_VINFO_VECTORIZABLE (stmt_info)
+	  && !vect_can_calculate_target_alignment (dr, NULL))
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "not vectorized: can't calculate required "
+			     "alignment for data ref.\n");
+	  return false;
+	}
+
       if (STMT_VINFO_VECTORIZABLE (stmt_info)
 	  && !vect_compute_data_ref_alignment (dr))
 	{
@@ -3484,7 +3544,17 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
       else
 	{
 	  if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
-	    length_factor = scalar_loop_iters;
+	    {
+	      if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo))
+		{
+		  if (dump_enabled_p ())
+		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+				     "Cannot vectorize speculative loops with "
+				     "differing data reference step sizes.\n");
+		  return false;
+		}
+	      length_factor = scalar_loop_iters;
+	    }
 	  else
 	    length_factor = size_int (vect_factor);
 	  segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
@@ -4466,6 +4536,9 @@ vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
   case vect_simple_var:
     prefix = "vect";
     break;
+  case vect_mask_var:
+    prefix = "mask";
+    break;
   case vect_scalar_var:
     prefix = "stmp";
     break;
@@ -6652,6 +6725,10 @@ vect_supportable_dr_alignment (struct data_reference *dr,
     {
       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
+
+      /* Speculative loops rely on aligned data refs.  */
+      if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo))
+	return dr_unaligned_unsupported;
     }
 
   /* Possibly unaligned access.  */
diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
index 57aab1b764f..901113fcf03 100644
--- a/gcc/tree-vect-loop-manip.c
+++ b/gcc/tree-vect-loop-manip.c
@@ -369,6 +369,242 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm,
   return false;
 }
 
+/* Helper for vect_set_speculative_masks.  Set the masks in RGM directly
+   from the corresponding scalar values.  RGM belongs to LOOP, which has
+   been vectorized according to LOOP_VINFO.  NSCALARITERS_SKIP is the
+   number of scalar iterations that we should skip during the first
+   iteration of the vector loop (because the start point has been
+   brought forward by that amount to achieve alignment).
+
+   Add any new preheader statements to PREHEADER_SEQ and any new header
+   statements to HEADER_SEQ.  */
+
+static void
+vect_set_speculative_masks_directly (struct loop *loop,
+				     loop_vec_info loop_vinfo,
+				     gimple_seq *preheader_seq,
+				     gimple_seq *header_seq,
+				     rgroup_masks *rgm,
+				     tree nscalariters_skip)
+{
+  /* It doesn't make sense to align for speculation when we have a
+     capped VF.  */
+  gcc_assert (!use_capped_vf (loop_vinfo));
+
+  tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
+  tree mask_type = rgm->mask_type;
+  poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
+  unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
+
+  tree nscalars_skip = nscalariters_skip;
+  if (nscalars_per_iter != 1)
+    {
+      tree factor = build_int_cst (compare_type, nscalars_per_iter);
+      nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
+				    nscalars_skip, factor);
+    }
+
+  tree full_mask = build_minus_one_cst (mask_type);
+  tree mask;
+  unsigned int i;
+  FOR_EACH_VEC_ELT (rgm->masks, i, mask)
+    {
+      /* Previous masks covered START scalars.  This mask covers the
+	 next batch.  */
+      tree start = build_int_cst (compare_type, nscalars_per_mask * i);
+      tree init_mask = vect_gen_while_not (preheader_seq, mask_type,
+					   start, nscalars_skip);
+
+      /* Always use a full mask for subsequent iterations of the loop.  */
+      vect_set_loop_mask (loop, header_seq, mask, init_mask,
+			  full_mask, NULL_TREE);
+    }
+}
+
+/* Set up the controlling masks for LOOP, which is a speculative loop that
+   has been vectorized according to LOOP_VINFO.  */
+
+static void
+vect_set_speculative_masks (struct loop *loop, loop_vec_info loop_vinfo)
+{
+  gimple_seq preheader_seq = NULL;
+  gimple_seq header_seq = NULL;
+
+  vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
+  tree nscalariters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
+  rgroup_masks *rgm;
+  unsigned int i;
+  FOR_EACH_VEC_ELT (*masks, i, rgm)
+    if (!rgm->masks.is_empty ())
+      {
+	/* We shouldn't be using masks if there are no elements to skip
+	   on the first iteration.  */
+	gcc_assert (nscalariters_skip != NULL_TREE);
+
+	/* First try using permutes.  */
+	unsigned int nmasks = i + 1;
+	if ((nmasks & 1) == 0)
+	  {
+	    rgroup_masks *half_rgm = &(*masks)[nmasks / 2 - 1];
+	    if (!half_rgm->masks.is_empty ()
+		&& vect_maybe_permute_loop_masks (&header_seq, rgm, half_rgm))
+	      continue;
+	  }
+
+	vect_set_speculative_masks_directly (loop, loop_vinfo,
+					     &preheader_seq, &header_seq,
+					     rgm, nscalariters_skip);
+      }
+
+  /* Emit all accumulated statements.  */
+  add_preheader_seq (loop, preheader_seq);
+  add_header_seq (loop, header_seq);
+}
+
+/* RGM belongs to the nonspeculative masks of LOOP_VINFO.  Set up the masks
+   in RGM so that the active bits corresponding to the first NSCALARITERS
+   scalar iterations are true and every other bit is false.  Add any new
+   statements before GSI.
+
+   NSCALARITERS is multiplied by RGM's max_nscalars_per_iter (in the
+   loop's mask comparison type) to give the number of scalars covered by
+   the rgroup.  Each mask in RGM is then defined by a vect_gen_while call
+   whose start value is the number of scalars covered by the preceding
+   masks.  When LOOP_VINFO_MASK_SKIP_NITERS is set, that result goes to a
+   temporary and the final mask is the AND of the temporary with the
+   corresponding mask from LOOP_VINFO_MASKS.  */
+
+static void
+vect_set_nonspeculative_masks_directly (loop_vec_info loop_vinfo,
+					gimple_stmt_iterator *gsi,
+					rgroup_masks *rgm, tree nscalariters)
+{
+  tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
+  tree mask_type = rgm->mask_type;
+  poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
+  unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
+
+  /* Calculate the number of scalars covered by the rgroup.  No
+     multiplication is needed (and nothing is inserted) when each scalar
+     iteration contributes exactly one scalar.  */
+  gimple_seq seq = NULL;
+  tree nscalars = nscalariters;
+  if (nscalars_per_iter != 1)
+    nscalars = gimple_build (&seq, MULT_EXPR, compare_type, nscalars,
+			     build_int_cst (compare_type, nscalars_per_iter));
+  if (seq)
+    gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
+
+  tree mask;
+  unsigned int i;
+  FOR_EACH_VEC_ELT (rgm->masks, i, mask)
+    {
+      /* Previous masks covered START scalars.  This mask covers the
+	 next batch.  */
+      tree start = build_int_cst (compare_type, nscalars_per_mask * i);
+      if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
+	{
+	  /* First get a mask that ignores whether bits are active.  */
+	  tree temp = make_ssa_name (mask_type);
+	  gcall *call = vect_gen_while (temp, start, nscalars);
+	  gsi_insert_before (gsi, call, GSI_SAME_STMT);
+
+	  /* Now AND the result with the active lanes.  The active lanes
+	     come from mask I of the rgroup in LOOP_VINFO_MASKS that has
+	     the same number of masks and the same mask type as RGM.  */
+	  tree active
+	    = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
+				  rgm->masks.length (), mask_type, i);
+	  gassign *assign = gimple_build_assign (mask, BIT_AND_EXPR,
+						 temp, active);
+	  gsi_insert_before (gsi, assign, GSI_SAME_STMT);
+	}
+      else
+	{
+	  /* All lanes are active.  */
+	  gcall *call = vect_gen_while (mask, start, nscalars);
+	  gsi_insert_before (gsi, call, GSI_SAME_STMT);
+	}
+    }
+}
+
+/* Set MASK to the mask of active elements up to and including the
+   first iteration for which the exit condition of LOOP_VINFO is true.
+   Insert any new statements before GSI.  ALL_ACTIVE_P is true if we
+   should treat all elements as active, false if we should get the
+   mask of active elements from the main loop mask.
+
+   MASK supplies both the mask type (through its TREE_TYPE) and the lhs
+   of the IFN_BREAK_AFTER call that this function emits.  When
+   ALL_ACTIVE_P is true the first call argument is an all-ones constant
+   of that type; otherwise it is mask 0 of the single-mask rgroup in
+   LOOP_VINFO_MASKS.  */
+
+static void
+vect_add_break_after (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
+		      tree mask, bool all_active_p)
+{
+  tree mask_type = TREE_TYPE (mask);
+
+  tree active;
+  if (all_active_p)
+    active = build_minus_one_cst (mask_type);
+  else
+    active = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
+				 1, mask_type, 0);
+
+  /* Break the mask after the first true exit condition.  The exit
+     condition is read from LOOP_VINFO_EXIT_TEST_MASK.  */
+  tree exit_mask = LOOP_VINFO_EXIT_TEST_MASK (loop_vinfo);
+  gcall *call = gimple_build_call_internal (IFN_BREAK_AFTER, 2,
+					    active, exit_mask);
+  gimple_call_set_lhs (call, mask);
+  gsi_insert_before (gsi, call, GSI_SAME_STMT);
+}
+
+/* Set up the nonspeculative masks in LOOP_VINFO.  Emit any new statements
+   before GSI.
+
+   Each nonempty rgroup in LOOP_VINFO_NONSPECULATIVE_MASKS is handled by
+   the first of these strategies that applies:
+
+   - a single IFN_BREAK_AFTER call, for the rgroup that has one mask and
+     one scalar per iteration;
+   - vect_maybe_permute_loop_masks, for an rgroup with an even number of
+     masks when the rgroup with half as many masks is also in use;
+   - vect_set_nonspeculative_masks_directly, using a scalar iteration
+     count that is computed at most once and then shared by every rgroup
+     that reaches this fallback.  */
+
+static void
+vect_set_nonspeculative_masks (loop_vec_info loop_vinfo,
+			       gimple_stmt_iterator *gsi)
+{
+  vec_niters_and_mask nim;
+  vec_loop_masks *masks = &LOOP_VINFO_NONSPECULATIVE_MASKS (loop_vinfo);
+  tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
+  tree niters = NULL_TREE;
+  rgroup_masks *rgm;
+  unsigned int i;
+  FOR_EACH_VEC_ELT (*masks, i, rgm)
+    if (!rgm->masks.is_empty ())
+      {
+	unsigned int nmasks = i + 1;
+
+	/* Try to set the mask directly with a BREAK_AFTER.  */
+	if (nmasks == 1 && rgm->max_nscalars_per_iter == 1)
+	  {
+	    /* All elements are active unless we're peeling for
+	       alignment.  */
+	    vect_add_break_after (loop_vinfo, gsi, rgm->masks[0],
+				  !LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
+	    continue;
+	  }
+
+	/* Try using permutes.  The candidate source is the rgroup with
+	   half as many masks, which lives at index NMASKS / 2 - 1.  */
+	if ((nmasks & 1) == 0)
+	  {
+	    gimple_seq seq = NULL;
+	    rgroup_masks *half_rgm = &(*masks)[nmasks / 2 - 1];
+	    if (!half_rgm->masks.is_empty ()
+		&& vect_maybe_permute_loop_masks (&seq, rgm, half_rgm))
+	      {
+		gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
+		continue;
+	      }
+	  }
+
+	if (niters == NULL_TREE)
+	  {
+	    /* Get the mask of elements up to and including the first
+	       iteration for which the exit condition is true.
+	       Include any inactive starting elements at this stage.  */
+	    tree mask_type = vect_mask_type_for_speculation (loop_vinfo);
+	    nim.mask = make_ssa_name (mask_type);
+	    vect_add_break_after (loop_vinfo, gsi, nim.mask, true);
+
+	    /* Convert the mask to a scalar count, then convert the
+	       sizetype result to the mask comparison type.
+	       NOTE(review): NIM has no explicit initializer in this
+	       function, while vect_get_niters_from_mask tests nim->niters
+	       against NULL_TREE; presumably vec_niters_and_mask has a
+	       constructor that clears it -- confirm.  */
+	    gimple_seq seq = NULL;
+	    niters = vect_get_niters_from_mask (&seq, &nim);
+	    niters = gimple_convert (&seq, compare_type, niters);
+	    if (seq)
+	      gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
+	  }
+	vect_set_nonspeculative_masks_directly (loop_vinfo, gsi, rgm, niters);
+      }
+}
+
 /* Helper for vect_set_loop_condition_masked.  Generate definitions for
    all the masks in RGM and return a mask that is nonzero when the loop
    needs to iterate.  Add any new preheader statements to PREHEADER_SEQ
@@ -939,11 +1175,29 @@ vect_set_loop_condition (struct loop *loop, loop_vec_info loop_vinfo,
 			 tree niters, tree step, tree final_iv,
 			 bool niters_maybe_zero)
 {
-  gcond *cond_stmt;
+  gcond *cond_stmt = NULL;
   gcond *orig_cond = get_loop_exit_condition (loop);
   gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
+  bool masked_p = (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
+  bool speculation_p
+    = (loop_vinfo && LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo));
 
-  if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+  if (speculation_p)
+    {
+      /* Set the masks that control statements that cannot be speculatively
+	 executed.  */
+      vect_set_nonspeculative_masks (loop_vinfo, &loop_cond_gsi);
+
+      /* ...then add the statements themselves.  */
+      gimple_seq late_seq = LOOP_VINFO_NONSPECULATIVE_SEQ (loop_vinfo);
+      if (late_seq)
+	gsi_insert_seq_before (&loop_cond_gsi, late_seq, GSI_SAME_STMT);
+
+      /* Set up the masks that control the speculative statements.  */
+      if (masked_p)
+	vect_set_speculative_masks (loop, loop_vinfo);
+    }
+  else if (masked_p)
     cond_stmt = vect_set_loop_condition_masked (loop, loop_vinfo, niters,
 						final_iv, niters_maybe_zero,
 						loop_cond_gsi);
@@ -952,11 +1206,14 @@ vect_set_loop_condition (struct loop *loop, loop_vec_info loop_vinfo,
 						  final_iv, niters_maybe_zero,
 						  loop_cond_gsi);
 
-  /* Remove old loop exit test.  */
-  gsi_remove (&loop_cond_gsi, true);
-  free_stmt_vec_info (orig_cond);
+  if (!speculation_p)
+    {
+      /* Remove old loop exit test.  */
+      gsi_remove (&loop_cond_gsi, true);
+      free_stmt_vec_info (orig_cond);
+    }
 
-  if (dump_enabled_p ())
+  if (dump_enabled_p () && cond_stmt)
     {
       dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: ");
       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, cond_stmt, 0);
@@ -1644,13 +1901,15 @@ vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
 {
   struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
   tree var;
-  tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
   gimple_seq stmts = NULL, new_stmts = NULL;
   tree iters, iters_name;
   gimple *dr_stmt = DR_STMT (dr);
   stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
   unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
+  tree niters_type = (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)
+		      ? size_type_node
+		      : TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
 
   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
     {
@@ -1829,6 +2088,12 @@ vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
 tree
 vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
 {
+  if (!LOOP_VINFO_NITERS (loop_vinfo))
+    {
+      gcc_assert (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo));
+      return NULL;
+    }
+
   tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
   if (TREE_CODE (ni) == INTEGER_CST)
     return ni;
@@ -2421,7 +2686,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
 		 bool check_profitability, bool niters_no_overflow)
 {
   edge e, guard_e;
-  tree type = TREE_TYPE (niters), guard_cond;
+  tree guard_cond;
   basic_block guard_bb, guard_to;
   profile_probability prob_prolog, prob_vector, prob_epilog;
   int estimated_vf;
@@ -2469,6 +2734,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
 
   /* Generate the number of iterations for the prolog loop.  We do this here
      so that we can also get the upper bound on the number of iterations.  */
+  tree type = TREE_TYPE (niters);
   tree niters_prolog;
   int bound_prolog = 0;
   if (prolog_peeling)
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index e33a83bfa6b..c6269a95815 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -369,7 +369,12 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 		analyze_pattern_stmt = false;
 	    }
 
+	  bool is_gcond = gimple_code (stmt) == GIMPLE_COND;
+	  gcc_assert (!is_gcond
+		      || LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo));
+
 	  if (gimple_get_lhs (stmt) == NULL_TREE
+	      && !is_gcond
 	      /* MASK_STORE has no lhs, but is ok.  */
 	      && (!is_gimple_call (stmt)
 		  || !gimple_call_internal_p (stmt)
@@ -427,27 +432,31 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 	      gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
 	      if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
 		scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
+	      else if (is_gcond)
+		scalar_type = TREE_TYPE (gimple_cond_lhs (stmt));
 	      else
 		scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
 
 	      /* Bool ops don't participate in vectorization factor
 		 computation.  For comparison use compared types to
 		 compute a factor.  */
-	      if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type)
-		  && is_gimple_assign (stmt)
-		  && gimple_assign_rhs_code (stmt) != COND_EXPR)
+	      if (is_gcond
+		  || (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type)
+		      && is_gimple_assign (stmt)
+		      && gimple_assign_rhs_code (stmt) != COND_EXPR))
 		{
 		  if (STMT_VINFO_RELEVANT_P (stmt_info)
 		      || STMT_VINFO_LIVE_P (stmt_info))
 		    mask_producers.safe_push (stmt_info);
 		  bool_result = true;
 
-		  if (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
-		      == tcc_comparison
+		  if (is_gimple_assign (stmt)
+		      && (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
+			  == tcc_comparison)
 		      && !VECT_SCALAR_BOOLEAN_TYPE_P
 			    (TREE_TYPE (gimple_assign_rhs1 (stmt))))
 		    scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
-		  else
+		  else if (TREE_CODE (scalar_type) == BOOLEAN_TYPE)
 		    {
 		      if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 			{
@@ -589,13 +598,28 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
       tree mask_type = NULL;
 
       stmt = STMT_VINFO_STMT (mask_producers[i]);
+      bool is_gcond = gimple_code (stmt) == GIMPLE_COND;
 
+      bool ops_are_booleans = true;
       if (is_gimple_assign (stmt)
 	  && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
 	  && !VECT_SCALAR_BOOLEAN_TYPE_P
 				      (TREE_TYPE (gimple_assign_rhs1 (stmt))))
 	{
 	  scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
+	  ops_are_booleans = false;
+
+	}
+      else if (is_gcond
+	       && TREE_CODE (TREE_TYPE (gimple_cond_lhs (stmt)))
+		  != BOOLEAN_TYPE)
+	{
+	  scalar_type = TREE_TYPE (gimple_cond_lhs (stmt));
+	  ops_are_booleans = false;
+	}
+
+      if (!ops_are_booleans)
+	{
 	  mask_type = get_mask_type_for_scalar_type (scalar_type);
 
 	  if (!mask_type)
@@ -1131,6 +1155,7 @@ _loop_vec_info::_loop_vec_info (struct loop *loop_in)
     slp_unrolling_factor (1),
     single_scalar_iteration_cost (0),
     vectorizable (false),
+    speculative_execution (false),
     can_fully_mask_p (true),
     fully_masked_p (false),
     peeling_for_gaps (false),
@@ -1140,7 +1165,10 @@ _loop_vec_info::_loop_vec_info (struct loop *loop_in)
     has_mask_store (false),
     scalar_loop (NULL),
     orig_loop_info (NULL),
-    vect_addr_base_htab (31)
+    vect_addr_base_htab (31),
+    exit_test_mask (NULL_TREE),
+    exit_mask (NULL_TREE),
+    nonspeculative_seq (NULL)
 {
   /* Create/Update stmt_info for all stmts in the loop.  */
   basic_block *body = get_loop_body (loop);
@@ -1252,6 +1280,7 @@ _loop_vec_info::~_loop_vec_info ()
   free (bbs);
 
   release_vec_loop_masks (&masks);
+  release_vec_loop_masks (&nonspeculative_masks);
 
   loop->aux = NULL;
 }
@@ -1296,22 +1325,40 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
 {
   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   unsigned int min_ni_width;
+  unsigned HOST_WIDE_INT const_vf;
 
-  /* Get the maximum number of iterations that is representable
-     in the counter type.  */
-  tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
-  widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
+  /* Get the number of bits needed to hold the number of iterations
+     as an unsigned value.  */
+  if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo))
+    {
+      /* For speculative loops, we only need to count the number of iterations
+	 before the vector loop.  */
+      if (LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf))
+	{
+	  unsigned int factor = vect_get_max_nscalars_per_iter (loop_vinfo);
+	  min_ni_width = wi::min_precision (const_vf * factor, UNSIGNED);
+	}
+      else
+	min_ni_width = POINTER_SIZE;
+    }
+  else
+    {
+      /* Get the maximum number of iterations that is representable
+	 in the counter type.  */
+      tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
+      widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
 
-  /* Get a more refined estimate for the number of iterations.  */
-  widest_int max_back_edges;
-  if (max_loop_iterations (loop, &max_back_edges))
-    max_ni = wi::smin (max_ni, max_back_edges + 1);
+      /* Get a more refined estimate for the number of iterations.  */
+      widest_int max_back_edges;
+      if (max_loop_iterations (loop, &max_back_edges))
+	max_ni = wi::smin (max_ni, max_back_edges + 1);
 
-  /* Account for rgroup masks, in which each bit is replicated N times.  */
-  max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
+      /* Account for rgroup masks, in which each bit is replicated N times.  */
+      max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
 
-  /* Work out how many bits we need to represent the limit.  */
-  min_ni_width = wi::min_precision (max_ni, UNSIGNED);
+      /* Work out how many bits we need to represent the limit.  */
+      min_ni_width = wi::min_precision (max_ni, UNSIGNED);
+    }
 
   /* Find a scalar mode for which WHILE_ULT is supported.  */
   opt_scalar_int_mode cmp_mode_iter;
@@ -1672,7 +1719,8 @@ vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
 
   if (integer_zerop (*assumptions)
       || !*number_of_iterations
-      || chrec_contains_undetermined (*number_of_iterations))
+      || (loop->inner
+	  && chrec_contains_undetermined (*number_of_iterations)))
     {
       if (dump_enabled_p ())
 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -1680,6 +1728,15 @@ vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
 			 "computed.\n");
       return false;
     }
+  else if (!loop->inner
+	   && chrec_contains_undetermined (*number_of_iterations))
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_NOTE, vect_location,
+			 "number of iterations cannot be computed, "
+			 "relying upon speculative execution\n");
+      return true;
+    }
 
   if (integer_zerop (*number_of_iterations))
     {
@@ -1706,6 +1763,21 @@ vect_analyze_loop_form (struct loop *loop)
     return NULL;
 
   loop_vec_info loop_vinfo = new _loop_vec_info (loop);
+
+  if (number_of_iterations
+      && chrec_contains_undetermined (number_of_iterations))
+    {
+      /* Nested loops are not supported for speculative execution.  */
+      gcc_assert (!loop->inner);
+
+      LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo) = true;
+
+      /* Since we don't know what the number of iterations is, there seems
+	 little point in having anything other than NULL.  */
+      number_of_iterations = NULL;
+      number_of_iterationsm1 = NULL;
+    }
+
   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
@@ -2158,6 +2230,25 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
 	  }
       }
 
+  /* TODO: We can't currently support stores for speculative loops.  */
+  if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)
+      && LOOP_VINFO_DATAREFS (loop_vinfo).length () > 0)
+    {
+      vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
+      struct data_reference *dr;
+      unsigned int i;
+
+      FOR_EACH_VEC_ELT (datarefs, i, dr)
+	if (!DR_IS_READ (dr))
+	  {
+	    if (dump_enabled_p ())
+	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			       "Stores not supported for speculative "
+			       "loops.\n");
+	    return false;
+	  }
+    }
+
   /* Analyze the data references and also adjust the minimal
      vectorization factor according to the loads and stores.  */
 
@@ -2259,7 +2350,8 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
 
   /* We don't expect to have to roll back to anything other than an empty
      set of rgroups.  */
-  gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
+  gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
+	      && LOOP_VINFO_NONSPECULATIVE_MASKS (loop_vinfo).is_empty ());
 
   /* This is the point where we can re-start analysis with SLP forced off.  */
 start_over:
@@ -2337,6 +2429,19 @@ start_over:
       return false;
     }
 
+  if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
+      && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
+      && LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)
+      && !LOOP_VINFO_NEEDS_NONSPECULATIVE_MASKS (loop_vinfo)
+      && !use_capped_vf (loop_vinfo))
+    {
+      LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			 "No need to predicate speculative loops without "
+			 "alignment peeling.\n");
+    }
+
   /* Decide whether to use a fully-masked loop for this vectorization
      factor.  */
   LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
@@ -2352,17 +2457,41 @@ start_over:
 			 "not using a fully-masked loop.\n");
     }
 
-  if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
-      && use_capped_vf (loop_vinfo))
+  if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
     {
-      if (dump_enabled_p ())
-	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-			 "Need to cap the runtime vectorization factor to "
-			 HOST_WIDE_INT_PRINT_DEC " but cannot fully mask"
-			 " the loop.\n",
-			 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
-      /* Undoing SLP might allow us to use a mask.  */
-      goto again;
+      if (LOOP_VINFO_NEEDS_NONSPECULATIVE_MASKS (loop_vinfo))
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "Not vectorized: non-speculative operations "
+			     "need a fully-masked loop.\n");
+	  return false;
+	}
+
+      if (use_capped_vf (loop_vinfo))
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "Need to cap the runtime vectorization factor to "
+			     HOST_WIDE_INT_PRINT_DEC " but cannot fully mask"
+			     " the loop.\n",
+			     LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
+	  /* Undoing SLP might allow us to use a mask.  */
+	  goto again;
+	}
+    }
+
+  if (LOOP_VINFO_NEEDS_NONSPECULATIVE_MASKS (loop_vinfo))
+    {
+      tree mask_type = vect_mask_type_for_speculation (loop_vinfo);
+      if (!direct_internal_fn_supported_p (IFN_BREAK_AFTER, mask_type,
+					   OPTIMIZE_FOR_SPEED))
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "Not vectorized: BREAK_AFTER not supported.\n");
+	  return false;
+	}
     }
 
   /* If epilog loop is required because of data accesses with gaps,
@@ -2385,6 +2514,17 @@ start_over:
 	}
     }
 
+  if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+      && LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)
+      && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			 "Not supported: peeling speculative vectorization"
+			 " without a fully-masked loop.\n");
+      return false;
+    }
+
   /* Check the costings of the loop make vectorizing worthwhile.  */
   res = vect_analyze_loop_costing (loop_vinfo);
   if (res < 0)
@@ -2402,7 +2542,9 @@ start_over:
   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
 
   unsigned HOST_WIDE_INT const_vf;
-  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+  if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo))
+    LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
+  else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
     /* The main loop handles all iterations.  */
     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
   else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
@@ -2448,7 +2590,8 @@ start_over:
      enough for both peeled prolog loop and vector loop.  This check
      can be merged along with threshold check of loop versioning, so
      increase threshold for this case if necessary.  */
-  if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
+  if (LOOP_REQUIRES_VERSIONING (loop_vinfo)
+      && !LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo))
     {
       poly_uint64 niters_th = 0;
 
@@ -2574,6 +2717,7 @@ again:
     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
   /* Reset accumulated rgroup information.  */
   release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
+  release_vec_loop_masks (&LOOP_VINFO_NONSPECULATIVE_MASKS (loop_vinfo));
   /* Reset assorted flags.  */
   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
@@ -6147,11 +6291,19 @@ vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
 	  SSA_NAME_DEF_STMT (reduc_var) = new_stmt;
 	  /* For chained SLP stmt is the first statement in the group and
 	     gsi points to the last statement in the group.  For non SLP stmt
-	     points to the same location as gsi. In either case tmp_gsi and gsi
-	     should both point to the same insertion point.  */
-	  gcc_assert (scalar_dest_def == gsi_stmt (*gsi));
-	  vect_finish_replace_stmt (scalar_dest_def, new_stmt);
-	}
+	     points to the same location as gsi.  */
+	  if (scalar_dest_def == gsi_stmt (*gsi))
+	    vect_finish_replace_stmt (scalar_dest_def, new_stmt);
+	  else
+	    {
+	      /* In this case we're moving the definition to later in the
+		 block.  That doesn't matter because the only uses of the
+		 lhs are in phi statements.  */
+	      gimple_stmt_iterator old_gsi = gsi_for_stmt (scalar_dest_def);
+	      gsi_remove (&old_gsi, true);
+	      vect_finish_stmt_generation (stmt, new_stmt, gsi);
+	    }
+        }
       else
 	{
 	  reduc_var = make_ssa_name (reduc_var, new_stmt);
@@ -7144,7 +7296,13 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
     vec_num = 1;
 
   internal_fn cond_fn = get_conditional_internal_fn (code, scalar_type);
+
+  /* In a speculative loop, the update must be predicated on the
+     nonspeculative masks, so that we don't include speculatively
+     loaded elements from beyond the end of the original loop.  */
   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
+  if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo))
+    masks = &LOOP_VINFO_NONSPECULATIVE_MASKS (loop_vinfo);
 
   if (!vec_stmt) /* transformation not required.  */
     {
@@ -7190,6 +7348,12 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
 
   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
 
+  gimple_stmt_iterator nonspeculative_gsi
+    = gsi_end (LOOP_VINFO_NONSPECULATIVE_SEQ (loop_vinfo));
+  if (masked_loop_p
+      && masks == &LOOP_VINFO_NONSPECULATIVE_MASKS (loop_vinfo))
+    gsi = &nonspeculative_gsi;
+
   if (reduction_type == FOLD_LEFT_REDUCTION)
     return vectorize_fold_left_reduction
       (stmt, gsi, vec_stmt, slp_node, reduc_def_stmt, code,
@@ -8036,6 +8200,37 @@ vectorizable_live_operation (gimple *stmt,
 	}
     }
 
+  if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo))
+    {
+      /* Need to construct the type because on the checking stage we don't
+	 yet have the speculative exit phi.  */
+      tree mask_type = build_same_sized_truth_vector_type (vectype);
+
+      if (!direct_internal_fn_supported_p (IFN_BREAK_AFTER, mask_type,
+					   OPTIMIZE_FOR_SPEED))
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "not vectorized: break after not supported.\n");
+	  return false;
+	}
+      if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
+					   OPTIMIZE_FOR_SPEED))
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "not vectorized: extract last not supported.\n");
+	  return false;
+	}
+      if (ncopies > 1)
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "not vectorized: ncopies is greater than 1.\n");
+	  return false;
+	}
+    }
+
   if (!vec_stmt)
     {
       /* No transformation required.  */
@@ -8122,19 +8317,37 @@ vectorizable_live_operation (gimple *stmt,
 
   gimple_seq stmts = NULL;
   tree new_tree;
-  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+      || LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo))
     {
+      tree mask;
+      if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo))
+	{
+	  gcc_assert (ncopies == 1);
+	  tree orig_mask = LOOP_VINFO_EXIT_MASK (loop_vinfo);
+	  tree all_ones = build_minus_one_cst (TREE_TYPE (orig_mask));
+
+	  mask = make_ssa_name (TREE_TYPE (orig_mask));
+	  gcall *new_stmt = gimple_build_call_internal (IFN_BREAK_AFTER, 2,
+							all_ones, orig_mask);
+	  gimple_call_set_lhs (new_stmt, mask);
+	  gimple_seq_add_stmt (&stmts, new_stmt);
+	}
+      else
+	{
+	  gcc_assert (ncopies == 1 && !slp_node);
+	  mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
+				     1, vectype, 0);
+	}
+
       /* Emit:
 
 	   SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
 
 	 where VEC_LHS is the vectorized live-out result and MASK is
 	 the loop mask for the final iteration.  */
-      gcc_assert (ncopies == 1 && !slp_node);
       tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
       tree scalar_res = make_ssa_name (scalar_type);
-      tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
-				      1, vectype, 0);
       gcall *new_stmt = gimple_build_call_internal (IFN_EXTRACT_LAST,
 						    2, mask, vec_lhs);
       gimple_call_set_lhs (new_stmt, scalar_res);
@@ -8226,6 +8439,9 @@ vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
 static bool
 loop_niters_no_overflow (loop_vec_info loop_vinfo)
 {
+  if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo))
+    return false;
+
   /* Constant case.  */
   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
     {
@@ -8292,6 +8508,14 @@ vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
       rgm->max_nscalars_per_iter = nscalars_per_iter;
       rgm->mask_type = build_same_sized_truth_vector_type (vectype);
     }
+
+  /* Ensure that the required nonspeculative masks are a subset of
+     the speculative ones.  This has two benefits: it means that we
+     can test for target support in one go, and that we can AND in
+     the speculative masks when setting up the nonspeculative ones.  */
+  if (masks == &LOOP_VINFO_NONSPECULATIVE_MASKS (loop_vinfo))
+    vect_record_loop_mask (loop_vinfo, &LOOP_VINFO_MASKS (loop_vinfo),
+			   nvectors, vectype);
 }
 
 /* Given a complete set of masks MASKS, extract mask number INDEX
@@ -8343,6 +8567,52 @@ vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
   return mask;
 }
 
+/* Get the mask to use for loads in LOOP_VINFO, or null if loads don't
+   need to be masked.  GSI, NVECTORS, VECTYPE and INDEX are as for
+   vect_get_loop_mask; the masks are always taken from LOOP_VINFO_MASKS.  */
+
+tree
+vect_get_load_mask (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
+		    unsigned int nvectors, tree vectype, unsigned int index)
+{
+  /* At present all loads in a speculative loop are speculative.
+     They need to be masked iff we are using masking to reach
+     alignment.  Returning NULL_TREE here means "no mask"; in every
+     other case we defer to the normal loop mask.  */
+  if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)
+      && !LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
+    return NULL_TREE;
+
+  return vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
+			     nvectors, vectype, index);
+}
+
+/* Return the mask type to use when computing which scalar iterations
+   are active in speculative loop LOOP_VINFO.  The type is a truth vector
+   with LOOP_VINFO_VECT_FACTOR elements, built for current_vector_size.
+   LOOP_VINFO is asserted (via gcc_checking_assert) to be using
+   speculative execution.  */
+
+tree
+vect_mask_type_for_speculation (loop_vec_info loop_vinfo)
+{
+  gcc_checking_assert (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo));
+  return build_truth_vector_type (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
+				  current_vector_size);
+}
+
+/* Calculate the scalar number of iterations in NIM from its mask,
+   adding any new statements to SEQ.  Return the number of iterations.
+
+   The count is the result of an IFN_MASK_POPCOUNT call on NIM->mask and
+   has type sizetype.  It is cached in NIM->niters, so the call is only
+   emitted (and SEQ only changed) if NIM->niters is still NULL_TREE on
+   entry; later calls for the same NIM just return the cached value.  */
+
+tree
+vect_get_niters_from_mask (gimple_seq *seq, vec_niters_and_mask *nim)
+{
+  if (nim->niters == NULL_TREE)
+    {
+      nim->niters = make_temp_ssa_name (sizetype, NULL, "niters");
+      gcall *call = gimple_build_call_internal (IFN_MASK_POPCOUNT,
+						1, nim->mask);
+      gimple_call_set_lhs (call, nim->niters);
+      gimple_seq_add_stmt (seq, call);
+    }
+  return nim->niters;
+}
+
 /* Scale profiling counters by estimation for LOOP which is vectorized
    by factor VF.  */
 
@@ -8419,7 +8689,8 @@ vect_transform_loop (loop_vec_info loop_vinfo)
      checking is pointless, too.  */
   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
   if (th >= vect_vf_for_cost (loop_vinfo)
-      && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
+      && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+      && !LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo))
     {
       if (dump_enabled_p ())
 	dump_printf_loc (MSG_NOTE, vect_location,
@@ -8483,8 +8754,10 @@ vect_transform_loop (loop_vec_info loop_vinfo)
 			      &step_vector, &niters_vector_mult_vf, th,
 			      check_profitability, niters_no_overflow);
 
-  if (niters_vector == NULL_TREE)
+  if (niters_vector == NULL_TREE
+      && !LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo))
     {
+      gcc_assert (!LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
 	  && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
 	  && must_eq (lowest_vf, vf))
@@ -8511,6 +8784,16 @@ vect_transform_loop (loop_vec_info loop_vinfo)
     /* This will deal with any possible peeling.  */
     vect_prepare_for_masked_peels (loop_vinfo);
 
+  if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo))
+    {
+      tree mask_type = vect_mask_type_for_speculation (loop_vinfo);
+      /* Create a dummy definition of the exit mask.  We'll fill in the
+	 real definition later.  */
+      tree mask = make_temp_ssa_name (mask_type, NULL, "exit_mask");
+      SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
+      LOOP_VINFO_EXIT_MASK (loop_vinfo) = mask;
+    }
+
   /* FORNOW: the vectorizer supports only loops which body consist
      of one basic block (header + empty latch). When the vectorizer will
      support more involved loop forms, the order by which the BBs are
@@ -8770,9 +9053,18 @@ vect_transform_loop (loop_vec_info loop_vinfo)
 	}
     }				/* BBs in loop */
 
+  /* Provide the real definition of LOOP_VINFO_EXIT_MASK.  */
+  if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo))
+    {
+      tree imask = LOOP_VINFO_EXIT_TEST_MASK (loop_vinfo);
+      tree omask = LOOP_VINFO_EXIT_MASK (loop_vinfo);
+      gphi *new_phi = create_phi_node (omask, single_exit (loop)->dest);
+      add_phi_arg (new_phi, imask, single_exit (loop), UNKNOWN_LOCATION);
+    }
+
   /* The vectorization factor is always > 1, so if we use an IV increment of 1.
      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
-  if (integer_onep (step_vector))
+  if (step_vector && integer_onep (step_vector))
     niters_no_overflow = true;
   vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
 			   niters_vector_mult_vf, !niters_no_overflow);
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index c0a87dc9275..36443cff685 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -304,7 +304,7 @@ is_simple_and_all_uses_invariant (gimple *stmt, loop_vec_info loop_vinfo)
    A stmt is considered "relevant for vectorization" if:
    - it has uses outside the loop.
    - it has vdefs (it alters memory).
-   - control stmts in the loop (except for the exit condition).
+   - control stmts in the loop (the exit condition only for speculative loops).
 
    CHECKME: what other side effects would the vectorizer allow?  */
 
@@ -323,8 +323,9 @@ vect_stmt_relevant_p (gimple *stmt, loop_vec_info loop_vinfo,
 
   /* cond stmt other than loop exit cond.  */
   if (is_ctrl_stmt (stmt)
-      && STMT_VINFO_TYPE (vinfo_for_stmt (stmt))
-         != loop_exit_ctrl_vec_info_type)
+      && (STMT_VINFO_TYPE (vinfo_for_stmt (stmt))
+	  != loop_exit_ctrl_vec_info_type
+	  || LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)))
     *relevant = vect_used_in_scope;
 
   /* changing memory.  */
@@ -688,6 +689,12 @@ vect_mark_stmts_to_be_vectorized (loop_vec_info loop_vinfo)
 	}
     }
 
+  /* The exit condition is relevant for speculative loops.  */
+  if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)
+      && !vect_stmt_relevant_p (get_loop_exit_condition (loop),
+				loop_vinfo, &relevant, &live_p))
+    gcc_unreachable ();
+
   /* 2. Process_worklist */
   while (worklist.length () > 0)
     {
@@ -2137,7 +2144,8 @@ get_group_load_store_type (gimple *stmt, tree vectype, bool slp,
   bool can_overrun_p = (!masked_p
 			&& vls_type == VLS_LOAD
 			&& loop_vinfo
-			&& !loop->inner);
+			&& !loop->inner
+			&& !LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo));
 
   /* There can only be a gap at the end of the group if the stride is
      known at compile time.  */
@@ -4506,6 +4514,30 @@ vect_create_vectorized_promotion_stmts (vec<tree> *vec_oprnds0,
   *vec_oprnds0 = vec_tmp;
 }
 
+/* Pairwise-pack MASKS with VEC_PACK_TRUNC_EXPR until one mask is left and
+   return it.  Insert any new statements before GSI.  MASKS holds just the
+   returned value on exit; an unpaired trailing mask is silently dropped.  */
+
+static tree
+vect_demote_masks (gimple_stmt_iterator *gsi, vec<tree> *masks)
+{
+  while (masks->length () > 1)
+    {
+      unsigned int nresults = masks->length () / 2;
+      tree dest_type = vect_double_mask_nunits (TREE_TYPE ((*masks)[0]));
+      for (unsigned int i = 0; i < nresults; ++i)
+	{
+	  tree dest = make_ssa_name (dest_type);
+	  gimple *stmt = gimple_build_assign (dest, VEC_PACK_TRUNC_EXPR,
+					      (*masks)[i * 2],
+					      (*masks)[i * 2 + 1]);
+	  gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
+	  (*masks)[i] = dest;
+	}
+      masks->truncate (nresults);
+    }
+  return (*masks)[0];
+}
 
 /* Check if STMT performs a conversion operation, that can be vectorized.
    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
@@ -6203,6 +6235,7 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
 
   if (loop_vinfo)
     {
+      gcc_assert (!LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo));
       loop = LOOP_VINFO_LOOP (loop_vinfo);
       vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
     }
@@ -7335,6 +7368,14 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
 	  return false;
 	}
 
+      if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo))
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "speculative mask loads not supported\n");
+	  return false;
+	}
+
       int mask_index = internal_fn_mask_index (ifn);
       if (mask_index >= 0)
 	{
@@ -7370,12 +7411,24 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
   gcc_assert (ncopies >= 1);
 
   /* FORNOW. This restriction should be relaxed.  */
-  if (nested_in_vect_loop && ncopies > 1)
+  if (ncopies > 1)
     {
-      if (dump_enabled_p ())
-        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                         "multiple types in nested loop.\n");
-      return false;
+      if (nested_in_vect_loop)
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "multiple types in nested loop.\n");
+	  return false;
+	}
+
+      if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo))
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "multiple copies not supported for speculative "
+			     "loops.\n");
+	  return false;
+	}
     }
 
   /* Invalidate assumptions made by dependence analysis when vectorization
@@ -7988,7 +8041,6 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
   tree vec_mask = NULL_TREE;
   prev_stmt_info = NULL;
   poly_uint64 group_elt = 0;
-  vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
   for (j = 0; j < ncopies; j++)
     {
       /* 1. Create the vector or array pointer update chain.  */
@@ -8079,7 +8131,8 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
 
 	  tree final_mask = NULL_TREE;
 	  if (masked_loop_p)
-	    final_mask = vect_get_loop_mask (gsi, masks, ncopies, vectype, j);
+	    final_mask = vect_get_load_mask (loop_vinfo, gsi, ncopies,
+					     vectype, j);
 	  if (vec_mask)
 	    final_mask = prepare_load_store_mask (mask_vectype, final_mask,
 						  vec_mask, gsi);
@@ -8126,7 +8179,8 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
 	      tree final_mask = NULL_TREE;
 	      if (masked_loop_p
 		  && memory_access_type != VMAT_INVARIANT)
-		final_mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
+		final_mask = vect_get_load_mask (loop_vinfo, gsi,
+						 vec_num * ncopies,
 						 vectype, vec_num * j + i);
 	      if (vec_mask)
 		final_mask = prepare_load_store_mask (mask_vectype, final_mask,
@@ -8162,10 +8216,10 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
 			break;
 		      }
 
-		    align = DR_TARGET_ALIGNMENT (dr);
 		    if (alignment_support_scheme == dr_aligned)
 		      {
 			gcc_assert (aligned_access_p (first_dr));
+			align = DR_TARGET_ALIGNMENT (first_dr);
 			misalign = 0;
 		      }
 		    else if (DR_MISALIGNMENT (first_dr) == -1)
@@ -8174,7 +8228,10 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
 			misalign = 0;
 		      }
 		    else
-		      misalign = DR_MISALIGNMENT (first_dr);
+		      {
+			align = DR_TARGET_ALIGNMENT (first_dr);
+			misalign = DR_MISALIGNMENT (first_dr);
+		      }
 		    if (dataref_offset == NULL_TREE
 			&& TREE_CODE (dataref_ptr) == SSA_NAME)
 		      set_ptr_info_alignment (get_ptr_info (dataref_ptr),
@@ -8934,12 +8991,11 @@ vectorizable_comparison (gimple *stmt, gimple_stmt_iterator *gsi,
 			 gimple **vec_stmt, tree reduc_def,
 			 slp_tree slp_node)
 {
-  tree lhs, rhs1, rhs2;
+  tree rhs1, rhs2;
   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
   tree vectype1 = NULL_TREE, vectype2 = NULL_TREE;
   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
   tree vec_rhs1 = NULL_TREE, vec_rhs2 = NULL_TREE;
-  tree new_temp;
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
   enum vect_def_type dts[2] = {vect_unknown_def_type, vect_unknown_def_type};
   int ndts = 2;
@@ -8983,16 +9039,55 @@ vectorizable_comparison (gimple *stmt, gimple_stmt_iterator *gsi,
       return false;
     }
 
-  if (!is_gimple_assign (stmt))
-    return false;
+  if (is_gimple_assign (stmt))
+    {
+      code = gimple_assign_rhs_code (stmt);
+      rhs1 = gimple_assign_rhs1 (stmt);
+      rhs2 = gimple_assign_rhs2 (stmt);
+    }
+  else if (gimple_code (stmt) == GIMPLE_COND)
+    {
+      gcc_assert (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo));
 
-  code = gimple_assign_rhs_code (stmt);
+      /* TODO: Support more complex loops with more than one gcond stmt.  */
+      struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+      gcc_assert (stmt == get_loop_exit_condition (loop));
 
-  if (TREE_CODE_CLASS (code) != tcc_comparison)
+      rhs1 = gimple_cond_lhs (stmt);
+      rhs2 = gimple_cond_rhs (stmt);
+
+      code = gimple_cond_code (stmt);
+      edge exit_edge = single_exit (loop);
+      if (exit_edge->flags & EDGE_FALSE_VALUE)
+	{
+	  /* We want to invert the code and generate a mask such that if any
+	     bit is true the exit condition is met.  */
+	  bool honor_nans = FLOAT_TYPE_P (TREE_TYPE (rhs1));
+	  code = invert_tree_comparison (code, honor_nans);
+	  if (code == ERROR_MARK)
+	    {
+	      if (dump_enabled_p ())
+		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+				 "Cannot invert condition code.  Loop cannot "
+				 "be speculatively executed.\n");
+	      return false;
+	    }
+	}
+
+      if (optab_handler (cbranch_optab, TYPE_MODE (vectype))
+	  == CODE_FOR_nothing)
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "Target does not support testing a mask.\n");
+	  return false;
+	}
+    }
+  else
     return false;
 
-  rhs1 = gimple_assign_rhs1 (stmt);
-  rhs2 = gimple_assign_rhs2 (stmt);
+  if (TREE_CODE_CLASS (code) != tcc_comparison)
+    return false;
 
   if (!vect_is_simple_use (rhs1, stmt_info->vinfo, &def_stmt,
 			   &dts[0], &vectype1))
@@ -9070,6 +9165,17 @@ vectorizable_comparison (gimple *stmt, gimple_stmt_iterator *gsi,
       STMT_VINFO_TYPE (stmt_info) = comparison_vec_info_type;
       vect_model_simple_cost (stmt_info, ncopies * (1 + (bitop2 != NOP_EXPR)),
 			      dts, ndts, NULL, NULL);
+
+      /* Speculative loops need to AND the comparison result with the
+	 mask of active values.  */
+      if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
+	  && LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo))
+	{
+	  tree final_type = vect_mask_type_for_speculation (loop_vinfo);
+	  vect_record_loop_mask (loop_vinfo, &LOOP_VINFO_MASKS (loop_vinfo),
+				 1, final_type);
+	}
+
       if (bitop1 == NOP_EXPR)
 	return expand_vec_cmp_expr_p (vectype, mask_type, code);
       else
@@ -9099,8 +9205,26 @@ vectorizable_comparison (gimple *stmt, gimple_stmt_iterator *gsi,
     }
 
   /* Handle def.  */
-  lhs = gimple_assign_lhs (stmt);
-  mask = vect_create_destination_var (lhs, mask_type);
+  if (is_gimple_assign (stmt))
+    {
+      tree lhs = gimple_assign_lhs (stmt);
+      mask = vect_create_destination_var (lhs, mask_type);
+    }
+  else
+    mask = NULL_TREE;
+
+  bool masked_speculative_p
+    = (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)
+       && LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
+
+  /* The comparison result of each vector copy is collected here.  If
+     STMT is a GIMPLE_COND, the results are packed into a single mask
+     once all copies have been generated.  If MASKED_SPECULATIVE_P,
+     that mask is then ANDed with the loop mask, and the final value
+     becomes LOOP_VINFO_EXIT_TEST_MASK.
+
+     This is ignored (and cheap) if the statement isn't a GIMPLE_COND.  */
+  auto_vec<tree, 16> cmp_results;
 
   /* Handle cmp expr.  */
   for (j = 0; j < ncopies; j++)
@@ -9144,34 +9268,42 @@ vectorizable_comparison (gimple *stmt, gimple_stmt_iterator *gsi,
 	{
 	  vec_rhs2 = vec_oprnds1[i];
 
-	  new_temp = make_ssa_name (mask);
+	  tree cmp_res = (mask != NULL_TREE
+			  ? make_ssa_name (mask)
+			  : make_ssa_name (mask_type));
 	  if (bitop1 == NOP_EXPR)
 	    {
-	      new_stmt = gimple_build_assign (new_temp, code,
+	      new_stmt = gimple_build_assign (cmp_res, code,
 					      vec_rhs1, vec_rhs2);
 	      vect_finish_stmt_generation (stmt, new_stmt, gsi);
 	    }
 	  else
 	    {
+	      tree bitop1_res = (bitop2 == NOP_EXPR
+				 ? cmp_res
+				 : make_ssa_name (TREE_TYPE (cmp_res)));
 	      if (bitop1 == BIT_NOT_EXPR)
-		new_stmt = gimple_build_assign (new_temp, bitop1, vec_rhs2);
+		new_stmt = gimple_build_assign (bitop1_res, bitop1, vec_rhs2);
 	      else
-		new_stmt = gimple_build_assign (new_temp, bitop1, vec_rhs1,
+		new_stmt = gimple_build_assign (bitop1_res, bitop1, vec_rhs1,
 						vec_rhs2);
 	      vect_finish_stmt_generation (stmt, new_stmt, gsi);
 	      if (bitop2 != NOP_EXPR)
 		{
-		  tree res = make_ssa_name (mask);
 		  if (bitop2 == BIT_NOT_EXPR)
-		    new_stmt = gimple_build_assign (res, bitop2, new_temp);
+		    new_stmt = gimple_build_assign (cmp_res, bitop2,
+						    bitop1_res);
 		  else
-		    new_stmt = gimple_build_assign (res, bitop2, vec_rhs1,
-						    new_temp);
+		    new_stmt = gimple_build_assign (cmp_res, bitop2,
+						    vec_rhs1, bitop1_res);
 		  vect_finish_stmt_generation (stmt, new_stmt, gsi);
 		}
 	    }
+
 	  if (slp_node)
 	    SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
+
+	  cmp_results.safe_push (cmp_res);
 	}
 
       if (slp_node)
@@ -9188,6 +9320,42 @@ vectorizable_comparison (gimple *stmt, gimple_stmt_iterator *gsi,
   vec_oprnds0.release ();
   vec_oprnds1.release ();
 
+  if (gimple_code (stmt) == GIMPLE_COND)
+    {
+      gcc_assert (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo));
+
+      struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+      gcond *cond = get_loop_exit_condition (loop);
+      gcc_assert (cond);
+      gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (cond);
+
+      tree cmp_res = vect_demote_masks (&loop_cond_gsi, &cmp_results);
+      mask_type = TREE_TYPE (cmp_res);
+      if (masked_speculative_p)
+	{
+	  /* Work out which elements of the unmasked result are valid.  */
+	  mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
+				     1, mask_type, 0);
+
+	  /* Get the mask of values that actually matter.  */
+	  tree masked_res = make_ssa_name (mask_type);
+	  gimple *tmp_stmt = gimple_build_assign (masked_res, BIT_AND_EXPR,
+						  cmp_res, mask);
+	  gsi_insert_before (&loop_cond_gsi, tmp_stmt, GSI_SAME_STMT);
+	  cmp_res = masked_res;
+	}
+      LOOP_VINFO_EXIT_TEST_MASK (loop_vinfo) = cmp_res;
+
+      /* Get a boolean result that tells us whether to iterate.  It's easier
+	 to modify the condition in-place than to generate a new one and
+	 delete the old one.  */
+      edge exit_edge = single_exit (loop);
+      tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? NE_EXPR : EQ_EXPR;
+      tree zero_mask = build_zero_cst (mask_type);
+      gimple_cond_set_condition (cond, code, cmp_res, zero_mask);
+      update_stmt (cond);
+    }
+
   return true;
 }
 
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 8073ba05a83..2afafda6b25 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -414,7 +414,8 @@ typedef struct _loop_vec_info : public vec_info {
   vec_niters_and_mask cap;
 
   /* The masks that a fully-masked loop should use to avoid operating
-     on inactive scalars.  */
+     on inactive scalars.  In a speculative loop, these masks control
+     the operations that can be executed speculatively.  */
   vec_loop_masks masks;
 
   /* If we are using a loop mask to align memory addresses, this variable
@@ -489,6 +490,9 @@ typedef struct _loop_vec_info : public vec_info {
   /* Is the loop vectorizable? */
   bool vectorizable;
 
+  /* Is this a speculative loop?  */
+  bool speculative_execution;
+
   /* Records whether we still have the option of using a fully-masked loop.  */
   bool can_fully_mask_p;
 
@@ -546,6 +550,22 @@ typedef struct _loop_vec_info : public vec_info {
   /* A map from X to a precomputed gimple_val containing
      CAPPED_VECTORIZATION_FACTOR * X.  */
   hash_map<tree, tree> vf_mult_map;
+
+  /* In a speculative loop, this is the result of the exit comparison.
+     It is a vector mask with one element for each scalar iteration.  */
+  tree exit_test_mask;
+
+  /* A value equal to EXIT_TEST_MASK for use outside the loop.  */
+  tree exit_mask;
+
+  /* In a speculative loop, these masks are used to control operations
+     that cannot be speculatively executed.  */
+  vec_loop_masks nonspeculative_masks;
+
+  /* Statements in a speculative loop that depend on nonspeculative masks.
+     These statements can only be executed after the exit condition has
+     been evaluated.  */
+  gimple_seq nonspeculative_seq;
 } *loop_vec_info;
 
 /* Access Functions.  */
@@ -599,6 +619,14 @@ typedef struct _loop_vec_info : public vec_info {
 #define LOOP_VINFO_ORIG_LOOP_INFO(L)       (L)->orig_loop_info
 #define LOOP_VINFO_ADDR_CACHE(L)	   (L)->vect_addr_base_htab
 #define LOOP_VINFO_VF_MULT_MAP(L)          (L)->vf_mult_map
+#define LOOP_VINFO_SPECULATIVE_EXECUTION(L) (L)->speculative_execution
+#define LOOP_VINFO_EXIT_TEST_MASK(L)        (L)->exit_test_mask
+#define LOOP_VINFO_EXIT_MASK(L)             (L)->exit_mask
+#define LOOP_VINFO_NONSPECULATIVE(L)          (L)->nonspeculative
+#define LOOP_VINFO_NEEDS_NONSPECULATIVE_MASKS(L) \
+  (!(L)->nonspeculative_masks.is_empty ())
+#define LOOP_VINFO_NONSPECULATIVE_MASKS(L)    (L)->nonspeculative_masks
+#define LOOP_VINFO_NONSPECULATIVE_SEQ(L)      (L)->nonspeculative_seq
 
 #define LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT(L)	\
   ((L)->may_misalign_stmts.length () > 0)
@@ -1625,6 +1653,10 @@ extern void vect_record_loop_mask (loop_vec_info, vec_loop_masks *,
 				   unsigned int, tree);
 extern tree vect_get_loop_mask (gimple_stmt_iterator *, vec_loop_masks *,
 				unsigned int, tree, unsigned int);
+extern tree vect_get_load_mask (loop_vec_info, gimple_stmt_iterator *,
+				unsigned int, tree, unsigned int);
+extern tree vect_mask_type_for_speculation (loop_vec_info);
+extern tree vect_get_niters_from_mask (gimple_seq *, vec_niters_and_mask *);
 
 /* Drive for loop transformation stage.  */
 extern struct loop *vect_transform_loop (loop_vec_info);
author	Richard Sandiford <richard.sandiford@linaro.org>	2017-06-23 17:52:44 +0100
committer	Richard Sandiford <richard.sandiford@linaro.org>	2017-11-20 16:01:23 +0000
commit	02cf0942b05e2278c7e251b969092b64f06b915d (patch)
tree	f9344ed179868f44cee2a9cdb6d31a3b0d38dfe6
parent	655e3625f9c65f2c9d4e8c76eeca5edf9254afeb (diff)
download	gcc-02cf0942b05e2278c7e251b969092b64f06b915d.tar.gz