author     Richard Sandiford <richard.sandiford@linaro.org>   2017-06-23 17:52:44 +0100
committer  Richard Sandiford <richard.sandiford@linaro.org>   2017-11-20 16:01:23 +0000
commit     02cf0942b05e2278c7e251b969092b64f06b915d (patch)
tree       f9344ed179868f44cee2a9cdb6d31a3b0d38dfe6
parent     655e3625f9c65f2c9d4e8c76eeca5edf9254afeb (diff)
download   gcc-02cf0942b05e2278c7e251b969092b64f06b915d.tar.gz
Add support for speculative loads
[Branch only patch -- not intended for trunk in its current state]

This patch adds support for speculative loads in cases where the loads are (or can be made to be) aligned to a full vector size.  Such loads can never partially fault and they should be more efficient than first-faulting loads for the cases that they can handle.
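For illustration only (not part of the patch), a condensed example of the kind of loop this targets, in the spirit of the new tests sve_speculative_10.c and sve_speculative_11.c.  The exit condition depends on the loaded values, so the trip count is unknown and every vector load is speculative; because the arrays are global, the vectorizer can force their alignment to a full vector, and such aligned loads never partially fault.  The function name first_match is invented for the example.

int a[500];
int b[500];

int
first_match (int n)
{
  int i = -1;
  do
    i += 1;
  while (a[i] + b[i] < n);
  return i;
}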
-rw-r--r--  gcc/config/aarch64/aarch64-sve.md | 10
-rw-r--r--  gcc/config/aarch64/aarch64.md | 1
-rw-r--r--  gcc/gimple-iterator.h | 16
-rw-r--r--  gcc/internal-fn.def | 10
-rw-r--r--  gcc/optabs.def | 1
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_speculative_1.c | 62
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_speculative_10.c | 23
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_speculative_11.c | 65
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_speculative_11_run.c | 61
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_speculative_12.c | 32
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_speculative_12_run.c | 9
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_speculative_13.c | 12
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_speculative_13_run.c | 9
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_speculative_14.c | 32
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_speculative_14_run.c | 11
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_speculative_15.c | 59
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_speculative_15_run.c | 56
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_speculative_16.c | 12
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_speculative_16_run.c | 9
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_speculative_17.c | 34
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_speculative_17_run.c | 9
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_speculative_1_run.c | 47
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_speculative_2.c | 72
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_speculative_2_run.c | 45
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_speculative_3.c | 26
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_speculative_4.c | 66
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_speculative_4_run.c | 56
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_speculative_5.c | 54
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_speculative_5_run.c | 104
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_speculative_5_run_2.c | 8
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_speculative_5_run_3.c | 9
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_speculative_6.c | 44
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_speculative_7.c | 21
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_speculative_8.c | 21
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_speculative_9.c | 34
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_speculative_9_run.c | 67
-rw-r--r--  gcc/tree-vect-data-refs.c | 97
-rw-r--r--  gcc/tree-vect-loop-manip.c | 282
-rw-r--r--  gcc/tree-vect-loop.c | 382
-rw-r--r--  gcc/tree-vect-stmts.c | 232
-rw-r--r--  gcc/tree-vectorizer.h | 34
41 files changed, 2138 insertions(+), 96 deletions(-)
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 5d84b7fc595..e381cfcabe2 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -2379,6 +2379,16 @@
"<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %2.<Vetype>"
)
+(define_insn "break_after_<mode>"
+ [(set (match_operand:PRED_ALL 0 "register_operand" "=Upa")
+ (unspec:PRED_ALL
+ [(match_operand:PRED_ALL 1 "register_operand" "Upa")
+ (match_operand:PRED_ALL 2 "register_operand" "Upa")]
+ UNSPEC_BRKA))]
+ "TARGET_SVE"
+ "brka\t%0.b, %1/z, %2.b"
+)
+
(define_expand "mask_popcount<mode>"
[(set (match_operand:DI 0 "register_operand")
(unspec:DI [(match_dup 2)
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 581e6a753d2..37dcd85440e 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -168,6 +168,7 @@
UNSPEC_CLASTB
UNSPEC_FADDA
UNSPEC_CNTP
+ UNSPEC_BRKA
])
(define_c_enum "unspecv" [
diff --git a/gcc/gimple-iterator.h b/gcc/gimple-iterator.h
index 70f18beceff..8e9fe1f087d 100644
--- a/gcc/gimple-iterator.h
+++ b/gcc/gimple-iterator.h
@@ -152,6 +152,22 @@ gsi_last_1 (gimple_seq *seq)
#define gsi_last(x) gsi_last_1 (&(x))
+/* Return a new iterator initially pointing at the end of SEQ. */
+
+static inline gimple_stmt_iterator
+gsi_end_1 (gimple_seq *seq)
+{
+ gimple_stmt_iterator i;
+
+ i.ptr = NULL;
+ i.seq = seq;
+ i.bb = i.ptr ? gimple_bb (i.ptr) : NULL;
+
+ return i;
+}
+
+#define gsi_end(x) gsi_end_1 (&(x))
+
/* Return a new iterator pointing to the last statement in basic block BB. */
static inline gimple_stmt_iterator
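A minimal usage sketch for the new gsi_end (illustration only, not part of the patch).  Unlike gsi_last, which points at the last statement of a non-empty sequence, gsi_end yields a past-the-end position whose ptr is NULL, so gsi_end_p holds for it immediately:

gimple_seq seq = NULL;
gimple_stmt_iterator it = gsi_end (seq);
gcc_assert (gsi_end_p (it));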
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index f28519837f2..1ff1d832eeb 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -155,6 +155,16 @@ DEF_INTERNAL_COND_OPTAB_FN (XOR, ECF_CONST | ECF_NOTHROW, xor, binary)
DEF_INTERNAL_OPTAB_FN (RSQRT, ECF_CONST, rsqrt, unary)
+/* IFN_BREAK_AFTER (A, B):
+
+ - If A & B is all false, return A.
+ - Otherwise find the first true bit in A & B. Copy bits of A up
+ to and including that bit and set the remaining bits to false.
+
+ A, B and the return value are all vector masks. */
+DEF_INTERNAL_OPTAB_FN (BREAK_AFTER, ECF_CONST | ECF_NOTHROW,
+ break_after, binary)
+
/* Extract the last active element from a vector. */
DEF_INTERNAL_OPTAB_FN (EXTRACT_LAST, ECF_CONST | ECF_NOTHROW,
extract_last, cond_unary)
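For illustration only (not part of the patch): the aarch64-sve.md pattern added above maps this internal function onto the SVE BRKA instruction, and the brka scan patterns in the new tests check that the vectorized exit test expands to it.  A scalar model of the documented semantics, treating bit I of each value as element I of the mask, might look like this:

#include <stdint.h>

/* Scalar model of IFN_BREAK_AFTER (A, B) as documented above; an
   illustration, not the vectorizer's implementation.  */
static uint64_t
break_after_model (uint64_t a, uint64_t b)
{
  uint64_t ab = a & b;
  if (ab == 0)
    return a;                                 /* A & B all false: return A.  */
  unsigned int first = __builtin_ctzll (ab);  /* First true bit of A & B.  */
  uint64_t keep = (first == 63
                   ? UINT64_MAX
                   : (UINT64_C (1) << (first + 1)) - 1);
  return a & keep;                            /* Keep bits of A up to and
                                                 including that bit; clear
                                                 the rest.  */
}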
diff --git a/gcc/optabs.def b/gcc/optabs.def
index bf67dfca132..d86dc803d5a 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -308,6 +308,7 @@ OPTAB_D (reduc_ior_scal_optab, "reduc_ior_scal_$a")
OPTAB_D (reduc_xor_scal_optab, "reduc_xor_scal_$a")
OPTAB_D (fold_left_plus_optab, "fold_left_plus_$a")
+OPTAB_D (break_after_optab, "break_after_$a")
OPTAB_D (extract_last_optab, "extract_last_$a")
OPTAB_D (fold_extract_last_optab, "fold_extract_last_$a")
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_1.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_1.c
new file mode 100644
index 00000000000..ba2f569fd5c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_1.c
@@ -0,0 +1,62 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-inline -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */
+
+#include <stdint.h>
+
+/* Speculative loop with no data references. */
+
+#define SPEC_LOOP(ARGTYPE,INDUCTYPE)\
+INDUCTYPE spec_loop_##ARGTYPE##INDUCTYPE (ARGTYPE mask, ARGTYPE limit)\
+{\
+ INDUCTYPE i = 0;\
+ while ((i & mask) != limit)\
+ i += 1;\
+ return i;\
+}\
+
+#define SPEC_FP_LOOP(ARGTYPE,INDUCTYPE,FPTYPE)\
+FPTYPE spec_fp_loop_##ARGTYPE##INDUCTYPE##FPTYPE (ARGTYPE mask, ARGTYPE limit)\
+{\
+ INDUCTYPE i = 0;\
+ FPTYPE f = 0.0;\
+ while ((i & mask) != limit)\
+ {\
+ f += 1;\
+ i += 1;\
+ }\
+ return f;\
+}\
+
+SPEC_LOOP (uint8_t, uint8_t)
+SPEC_LOOP (uint16_t, uint16_t)
+SPEC_LOOP (uint32_t, uint32_t)
+SPEC_LOOP (uint64_t, uint64_t)
+
+SPEC_LOOP (int8_t, int8_t)
+SPEC_LOOP (int16_t, int16_t)
+SPEC_LOOP (int32_t, int32_t)
+SPEC_LOOP (int64_t, int64_t)
+
+/* Conversions. */
+SPEC_LOOP (uint16_t, uint8_t)
+
+SPEC_LOOP (uint32_t, uint8_t)
+SPEC_LOOP (uint32_t, uint16_t)
+
+SPEC_LOOP (uint64_t, uint8_t)
+SPEC_LOOP (uint64_t, uint16_t)
+SPEC_LOOP (uint64_t, uint32_t)
+
+SPEC_FP_LOOP (uint32_t, uint32_t, float)
+SPEC_FP_LOOP (uint64_t, uint64_t, double)
+
+SPEC_FP_LOOP (uint64_t, uint64_t, float)
+
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 17 "vect" } } */
+/* { dg-final { scan-assembler-times {\tbrka\tp[0-9]+\.b, p[0-9]+/z, p[0-9]+\.b} 17 } } */
+/* { dg-final { scan-assembler-times {\tlastb\tw[0-9]+, p[0-7], z[0-9]+\.b} 5 } } */
+/* { dg-final { scan-assembler-times {\tlastb\tw[0-9]+, p[0-7], z[0-9]+\.h} 4 } } */
+/* { dg-final { scan-assembler-times {\tlastb\tw[0-9]+, p[0-7], z[0-9]+\.s} 3 } } */
+/* { dg-final { scan-assembler-times {\tlastb\tx[0-9]+, p[0-7], z[0-9]+\.d} 2 } } */
+/* { dg-final { scan-assembler-times {\tlastb\ts[0-9]+, p[0-7], z[0-9]+\.s} 2 } } */
+/* { dg-final { scan-assembler-times {\tlastb\td[0-9]+, p[0-7], z[0-9]+\.d} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_10.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_10.c
new file mode 100644
index 00000000000..c69164bb1ea
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_10.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */
+
+/* Speculative loop with two loads from global buffers which can be aligned,
+ but does require peeling. */
+
+int a[500];
+int b[500];
+
+int
+foo (int n)
+{
+ int i = 0;
+ do
+ i += 1;
+ while (a[i] + b[i] < n);
+ return i;
+}
+
+/* { dg-final { scan-tree-dump-not "loop versioned for vectorization to enhance alignment" "vect" } } */
+/* { dg-final { scan-tree-dump "Alignment of access forced using peeling" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 4 bytes of ref" "vect" } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_11.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_11.c
new file mode 100644
index 00000000000..92e4adc5571
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_11.c
@@ -0,0 +1,65 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */
+
+#include <stdint.h>
+
+/* Speculative loop with two loads from global buffers which can be aligned
+ without any peeling. */
+
+#define MAX_ARRAY_SIZE 500
+
+#ifndef STRIDE_LEVEL
+#define STRIDE_LEVEL 1
+#endif
+
+#define SPEC_LOOP(DATATYPE, ARGTYPE)\
+DATATYPE a##DATATYPE[MAX_ARRAY_SIZE];\
+DATATYPE b##DATATYPE[MAX_ARRAY_SIZE];\
+ARGTYPE spec_loop_##DATATYPE##_##ARGTYPE (DATATYPE n)\
+{\
+ ARGTYPE i = -1;\
+ do\
+ i += 1;\
+ while (a##DATATYPE[i*STRIDE_LEVEL] + b##DATATYPE[i*STRIDE_LEVEL] < n);\
+ return i;\
+}
+
+/* TODO: Cannot yet vectorize due to gather load. */
+SPEC_LOOP (int8_t, int8_t)
+SPEC_LOOP (int16_t, int16_t)
+
+SPEC_LOOP (int32_t, int32_t)
+SPEC_LOOP (int64_t, int64_t)
+SPEC_LOOP (float, int32_t)
+SPEC_LOOP (double, int64_t)
+
+/* { dg-final { scan-tree-dump-not "loop versioned for vectorization to enhance alignment" "vect" } } */
+/* { dg-final { scan-tree-dump-not "Alignment of access forced using peeling" "vect" } } */
+
+/* { dg-final { scan-tree-dump "force alignment of aint8_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "force alignment of bint8_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "force alignment of aint16_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "force alignment of bint16_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "force alignment of aint32_t" "vect" } } */
+/* { dg-final { scan-tree-dump "force alignment of bint32_t" "vect" } } */
+/* { dg-final { scan-tree-dump "force alignment of aint64_t" "vect" } } */
+/* { dg-final { scan-tree-dump "force alignment of bint64_t" "vect" } } */
+/* { dg-final { scan-tree-dump "force alignment of afloat" "vect" } } */
+/* { dg-final { scan-tree-dump "force alignment of bfloat" "vect" } } */
+/* { dg-final { scan-tree-dump "force alignment of adouble" "vect" } } */
+/* { dg-final { scan-tree-dump "force alignment of bdouble" "vect" } } */
+
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint8_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref bint8_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint16_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref bint16_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint32_t" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref bint32_t" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint64_t" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref bint64_t" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref afloat" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref bfloat" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref adouble" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref bdouble" "vect" } } */
+
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 4 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_11_run.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_11_run.c
new file mode 100644
index 00000000000..ebcefdb623c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_11_run.c
@@ -0,0 +1,61 @@
+/* { dg-do run { target { aarch64_sve_hw } } } */
+/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve" } */
+/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -msve-vector-bits=256" { target aarch64_sve256_hw } } */
+
+#include "sve_speculative_11.c"
+
+extern void abort (void);
+
+#ifndef FILL_DATA
+#define FILL_DATA 0
+#endif
+
+#ifndef EXIT_CONDITION
+#define EXIT_CONDITION 5
+#endif
+
+#ifndef LOOP_COUNTS
+#define LOOP_COUNTS {37,45,55,17,39,43}
+#endif
+int loop_counts[] = LOOP_COUNTS;
+
+/* Fill the arrays with the exit conditions.
+ Then refill at the correct strided accesses with fill data up to the end of
+ the loop count. */
+
+#define TEST_SPEC_LOOP_FUNC(DATATYPE, ARGTYPE)\
+void test_spec_loop_##DATATYPE##_##ARGTYPE (ARGTYPE num_elements)\
+{\
+ int i;\
+ for (i=0; i<MAX_ARRAY_SIZE; i++)\
+ {\
+ a##DATATYPE[i] = EXIT_CONDITION;\
+ b##DATATYPE[i] = EXIT_CONDITION;\
+ }\
+ for (i=0; (i<num_elements-1)*STRIDE_LEVEL; i++)\
+ {\
+ a##DATATYPE[i*STRIDE_LEVEL] = FILL_DATA;\
+ b##DATATYPE[i*STRIDE_LEVEL] = FILL_DATA;\
+ }\
+ ARGTYPE ret = spec_loop_##DATATYPE##_##ARGTYPE (EXIT_CONDITION);\
+ if (ret != num_elements - 1)\
+ abort ();\
+}
+
+TEST_SPEC_LOOP_FUNC (int8_t, int8_t)
+TEST_SPEC_LOOP_FUNC (int16_t, int16_t)
+TEST_SPEC_LOOP_FUNC (int32_t, int32_t)
+TEST_SPEC_LOOP_FUNC (int64_t, int64_t)
+TEST_SPEC_LOOP_FUNC (float, int32_t)
+TEST_SPEC_LOOP_FUNC (double, int64_t)
+
+int main (void)
+{
+ test_spec_loop_int8_t_int8_t (loop_counts[0]);
+ test_spec_loop_int16_t_int16_t (loop_counts[1]);
+ test_spec_loop_int32_t_int32_t (loop_counts[2]);
+ test_spec_loop_int64_t_int64_t (loop_counts[3]);
+ test_spec_loop_float_int32_t (loop_counts[4]);
+ test_spec_loop_double_int64_t (loop_counts[5]);
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_12.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_12.c
new file mode 100644
index 00000000000..d6caa8e7513
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_12.c
@@ -0,0 +1,32 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */
+
+/* Speculative loop with two loads from global buffers which can be aligned
+ without any peeling, and an access stride of 2. */
+
+#define STRIDE_LEVEL 2
+
+#include "sve_speculative_11.c"
+
+/* { dg-final { scan-tree-dump-not "loop versioned for vectorization to enhance alignment" "vect" } } */
+/* { dg-final { scan-tree-dump-not "Alignment of access forced using peeling" "vect" } } */
+
+/* { dg-final { scan-tree-dump "force alignment of aint8_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "force alignment of bint8_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "force alignment of aint16_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "force alignment of bint16_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "force alignment of aint32_t" "vect" } } */
+/* { dg-final { scan-tree-dump "force alignment of bint32_t" "vect" } } */
+/* { dg-final { scan-tree-dump "force alignment of aint64_t" "vect" } } */
+/* { dg-final { scan-tree-dump "force alignment of bint64_t" "vect" } } */
+
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint8_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref bint8_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint16_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref bint16_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint32_t" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref bint32_t" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint64_t" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref bint64_t" "vect" } } */
+
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 4 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_12_run.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_12_run.c
new file mode 100644
index 00000000000..42c346073c6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_12_run.c
@@ -0,0 +1,9 @@
+/* { dg-do run { target { aarch64_sve_hw } } } */
+/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve" } */
+/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -msve-vector-bits=256" { target aarch64_sve256_hw } } */
+
+#define STRIDE_LEVEL 2
+#define EXIT_CONDITION 7
+#define LOOP_COUNTS {43,27,19,54,25,27}
+
+#include "sve_speculative_11_run.c"
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_13.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_13.c
new file mode 100644
index 00000000000..db95e81d3f2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_13.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */
+
+/* Speculative loop with two loads from global buffers which can be aligned
+ without any peeling, and an access stride of 3. */
+
+#define STRIDE_LEVEL 3
+
+#include "sve_speculative_11.c"
+
+/* { dg-final { scan-tree-dump-times "not vectorized: can't calculate required alignment for data ref" 10 "vect" } } */
+/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_13_run.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_13_run.c
new file mode 100644
index 00000000000..519ff21e168
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_13_run.c
@@ -0,0 +1,9 @@
+/* { dg-do run { target { aarch64_sve_hw } } } */
+/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve" } */
+/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -msve-vector-bits=256" { target aarch64_sve256_hw } } */
+
+#define STRIDE_LEVEL 3
+#define EXIT_CONDITION 9
+#define LOOP_COUNTS {19,47,15,35,23,33}
+
+#include "sve_speculative_11_run.c"
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_14.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_14.c
new file mode 100644
index 00000000000..218afb6c5ca
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_14.c
@@ -0,0 +1,32 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */
+
+/* Speculative loop with two loads from global buffers which can be aligned
+ without any peeling, and an access stride of 4. */
+
+#define STRIDE_LEVEL 4
+
+#include "sve_speculative_11.c"
+
+/* { dg-final { scan-tree-dump-not "loop versioned for vectorization to enhance alignment" "vect" } } */
+/* { dg-final { scan-tree-dump-not "Alignment of access forced using peeling" "vect" } } */
+
+/* { dg-final { scan-tree-dump "force alignment of aint8_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "force alignment of bint8_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "force alignment of aint16_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "force alignment of bint16_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "force alignment of aint32_t" "vect" } } */
+/* { dg-final { scan-tree-dump "force alignment of bint32_t" "vect" } } */
+/* { dg-final { scan-tree-dump "force alignment of aint64_t" "vect" } } */
+/* { dg-final { scan-tree-dump "force alignment of bint64_t" "vect" } } */
+
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint8_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref bint8_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint16_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref bint16_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint32_t" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref bint32_t" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint64_t" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref bint64_t" "vect" } } */
+
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 4 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_14_run.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_14_run.c
new file mode 100644
index 00000000000..958e94fd822
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_14_run.c
@@ -0,0 +1,11 @@
+/* { dg-do run { target { aarch64_sve_hw } } } */
+/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve" } */
+/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -msve-vector-bits=256" { target aarch64_sve256_hw } } */
+
+#define STRIDE_LEVEL 4
+
+#define FILL_DATA 5
+#define EXIT_CONDITION 22
+#define LOOP_COUNTS {43,27,19,54,25,27}
+
+#include "sve_speculative_11_run.c"
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_15.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_15.c
new file mode 100644
index 00000000000..42ec564c90b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_15.c
@@ -0,0 +1,59 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */
+
+#include <stdint.h>
+
+/* Speculative loop with two consecutive loads from a single global buffer
+ which can be aligned without any peeling, and an access stride of 2. */
+
+#define MAX_ARRAY_SIZE 500
+
+/* Minimum STRIDE_LEVEL is 2. */
+#ifndef STRIDE_LEVEL
+#define STRIDE_LEVEL 2
+#endif
+
+#define SPEC_LOOP(DATATYPE, ARGTYPE)\
+DATATYPE a##DATATYPE[MAX_ARRAY_SIZE];\
+ARGTYPE spec_loop_##DATATYPE##_##ARGTYPE (DATATYPE n)\
+{\
+ ARGTYPE i = -1;\
+ do\
+ i += 1;\
+ while (a##DATATYPE[i*STRIDE_LEVEL] + a##DATATYPE[(i*STRIDE_LEVEL) + 1] < n);\
+ return i;\
+}
+
+/* TODO: Cannot yet vectorize due to gather load. */
+SPEC_LOOP (int8_t, int8_t)
+SPEC_LOOP (int16_t, int16_t)
+
+SPEC_LOOP (int32_t, int32_t)
+SPEC_LOOP (int64_t, int64_t)
+SPEC_LOOP (float, int32_t)
+SPEC_LOOP (double, int64_t)
+
+/* { dg-final { scan-tree-dump-not "loop versioned for vectorization to enhance alignment" "vect" } } */
+/* { dg-final { scan-tree-dump-not "Alignment of access forced using peeling" "vect" } } */
+
+/* { dg-final { scan-tree-dump "force alignment of aint8_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "force alignment of aint16_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "force alignment of aint32_t" "vect" } } */
+/* { dg-final { scan-tree-dump "force alignment of aint64_t" "vect" } } */
+/* { dg-final { scan-tree-dump "force alignment of afloat" "vect" } } */
+/* { dg-final { scan-tree-dump "force alignment of adouble" "vect" } } */
+
+/* { dg-final { scan-tree-dump "misalign = 1 bytes of ref aint8_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint8_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint16_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "misalign = 2 bytes of ref aint16_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint32_t" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 4 bytes of ref aint32_t" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint64_t" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 8 bytes of ref aint64_t" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref afloat" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 4 bytes of ref afloat" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref adouble" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 8 bytes of ref adouble" "vect" } } */
+
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 4 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_15_run.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_15_run.c
new file mode 100644
index 00000000000..533f99467fd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_15_run.c
@@ -0,0 +1,56 @@
+/* { dg-do run { target { aarch64_sve_hw } } } */
+/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve" } */
+/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -msve-vector-bits=256" { target aarch64_sve256_hw } } */
+
+#include "sve_speculative_15.c"
+
+extern void abort (void);
+
+#ifndef FILL_DATA
+#define FILL_DATA 0
+#endif
+
+#ifndef EXIT_CONDITION
+#define EXIT_CONDITION 5
+#endif
+
+#ifndef LOOP_COUNTS
+#define LOOP_COUNTS {37,45,55,17,39,43}
+#endif
+int loop_counts[] = LOOP_COUNTS;
+
+/* Fill the arrays with the exit conditions.
+ Then refill at the correct strided accesses with fill data up to the end of
+ the loop count. */
+
+#define TEST_SPEC_LOOP_FUNC(DATATYPE, ARGTYPE) \
+void \
+test_spec_loop_##DATATYPE##_##ARGTYPE (ARGTYPE num_elements) \
+{ \
+ for (int i = 0; i < MAX_ARRAY_SIZE; ++i) \
+ a##DATATYPE[i] = EXIT_CONDITION; \
+ for (int i = 0; i < (num_elements - 1) * STRIDE_LEVEL; ++i) \
+ a##DATATYPE[i] = FILL_DATA; \
+ ARGTYPE ret = spec_loop_##DATATYPE##_##ARGTYPE (EXIT_CONDITION); \
+ if (ret != num_elements - 1) \
+ abort (); \
+}
+
+TEST_SPEC_LOOP_FUNC (int8_t, int8_t)
+TEST_SPEC_LOOP_FUNC (int16_t, int16_t)
+TEST_SPEC_LOOP_FUNC (int32_t, int32_t)
+TEST_SPEC_LOOP_FUNC (int64_t, int64_t)
+TEST_SPEC_LOOP_FUNC (float, int32_t)
+TEST_SPEC_LOOP_FUNC (double, int64_t)
+
+int main (void)
+{
+ test_spec_loop_int8_t_int8_t (loop_counts[0]);
+ test_spec_loop_int16_t_int16_t (loop_counts[1]);
+ test_spec_loop_int32_t_int32_t (loop_counts[2]);
+ test_spec_loop_int64_t_int64_t (loop_counts[3]);
+ test_spec_loop_float_int32_t (loop_counts[4]);
+ test_spec_loop_double_int64_t (loop_counts[5]);
+ return 0;
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_16.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_16.c
new file mode 100644
index 00000000000..9affb766b2a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_16.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */
+
+/* Speculative loop with two consecutive loads from a single global buffer
+ which can be aligned without any peeling, and an access stride of 3. */
+
+#define STRIDE_LEVEL 3
+
+#include "sve_speculative_15.c"
+
+/* { dg-final { scan-tree-dump-times "not vectorized: can't calculate required alignment for data ref" 10 "vect" } } */
+/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_16_run.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_16_run.c
new file mode 100644
index 00000000000..7c53e7aeed6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_16_run.c
@@ -0,0 +1,9 @@
+/* { dg-do run { target { aarch64_sve_hw } } } */
+/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve" } */
+/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -msve-vector-bits=256" { target aarch64_sve256_hw } } */
+
+#define STRIDE_LEVEL 3
+#define EXIT_CONDITION 7
+#define LOOP_COUNTS {43,27,19,54,25,27}
+
+#include "sve_speculative_15_run.c"
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_17.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_17.c
new file mode 100644
index 00000000000..b7e472e0deb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_17.c
@@ -0,0 +1,34 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */
+
+/* Speculative loop with two consecutive loads from a single global buffer
+ which can be aligned without any peeling, and an access stride of 4. */
+
+#define STRIDE_LEVEL 4
+
+#include "sve_speculative_15.c"
+
+/* { dg-final { scan-tree-dump-not "loop versioned for vectorization to enhance alignment" "vect" } } */
+/* { dg-final { scan-tree-dump-not "Alignment of access forced using peeling" "vect" } } */
+
+/* { dg-final { scan-tree-dump "force alignment of aint8_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "force alignment of aint16_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "force alignment of aint32_t" "vect" } } */
+/* { dg-final { scan-tree-dump "force alignment of aint64_t" "vect" } } */
+/* { dg-final { scan-tree-dump "force alignment of afloat" "vect" } } */
+/* { dg-final { scan-tree-dump "force alignment of adouble" "vect" } } */
+
+/* { dg-final { scan-tree-dump "misalign = 1 bytes of ref aint8_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint8_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint16_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "misalign = 2 bytes of ref aint16_t" "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint32_t" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 4 bytes of ref aint32_t" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint64_t" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 8 bytes of ref aint64_t" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref afloat" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 4 bytes of ref afloat" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref adouble" "vect" } } */
+/* { dg-final { scan-tree-dump "misalign = 8 bytes of ref adouble" "vect" } } */
+
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 4 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_17_run.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_17_run.c
new file mode 100644
index 00000000000..5453116429a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_17_run.c
@@ -0,0 +1,9 @@
+/* { dg-do run { target { aarch64_sve_hw } } } */
+/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve" } */
+/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -msve-vector-bits=256" { target aarch64_sve256_hw } } */
+
+#define STRIDE_LEVEL 4
+#define EXIT_CONDITION 9
+#define LOOP_COUNTS {19,47,15,35,23,33}
+
+#include "sve_speculative_15_run.c"
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_1_run.c
new file mode 100644
index 00000000000..f4bb55ed6f8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_1_run.c
@@ -0,0 +1,47 @@
+/* { dg-do run { target { aarch64_sve_hw } } } */
+/* { dg-options "-O3 -fno-inline -march=armv8-a+sve -ffast-math" } */
+/* { dg-options "-O3 -fno-inline -march=armv8-a+sve -ffast-math -msve-vector-bits=256" { target aarch64_sve256_hw } } */
+
+#include "sve_speculative_1.c"
+
+extern void abort (void);
+
+#define TEST_LOOP(ARGTYPE,INDUCTYPE)\
+{\
+ INDUCTYPE res = spec_loop_##ARGTYPE##INDUCTYPE (0xFF, 0xAE);\
+ if (res != 0xAE)\
+ abort ();\
+}\
+
+#define TEST_FP_LOOP(ARGTYPE,INDUCTYPE,FPTYPE)\
+{\
+ FPTYPE res = spec_fp_loop_##ARGTYPE##INDUCTYPE##FPTYPE (0xFF, 0xAE);\
+ if (res != 0xAE)\
+ abort ();\
+}\
+
+int main ()
+{
+ TEST_LOOP (uint8_t, uint8_t);
+ TEST_LOOP (uint16_t, uint16_t);
+ TEST_LOOP (uint32_t, uint32_t);
+ TEST_LOOP (uint64_t, uint64_t);
+ TEST_LOOP (int32_t, int32_t);
+ TEST_LOOP (int64_t, int64_t);
+
+ TEST_LOOP (uint16_t, uint8_t)
+
+ TEST_LOOP (uint32_t, uint8_t)
+ TEST_LOOP (uint32_t, uint16_t)
+
+ TEST_LOOP (uint64_t, uint8_t)
+ TEST_LOOP (uint64_t, uint16_t)
+ TEST_LOOP (uint64_t, uint32_t)
+
+ TEST_FP_LOOP (uint32_t, uint32_t, float)
+ TEST_FP_LOOP (uint64_t, uint64_t, double)
+
+ TEST_FP_LOOP (uint64_t, uint64_t, float)
+
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_2.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_2.c
new file mode 100644
index 00000000000..108c5a6fbe6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_2.c
@@ -0,0 +1,72 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-inline -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */
+
+#include <stdint.h>
+#include <stdbool.h>
+
+/* Speculative loop with no data references. */
+
+/* FIXME: dup of rhs into predicate register generates horrible code. */
+#define SPEC_LOOP(ARGTYPE,INDUCTYPE)\
+INDUCTYPE spec_loop_##ARGTYPE##INDUCTYPE (ARGTYPE mask, ARGTYPE limit,\
+ bool rhs)\
+{\
+ INDUCTYPE i = 0;\
+ bool lhs = (i & mask) != limit;\
+ while (lhs == rhs)\
+ {\
+ i += 1;\
+ lhs = (i & mask) != limit;\
+ }\
+ return i;\
+}\
+
+#define SPEC_FP_LOOP(ARGTYPE,INDUCTYPE,FPTYPE)\
+INDUCTYPE spec_fp_loop_##ARGTYPE##INDUCTYPE##FPTYPE (ARGTYPE mask, ARGTYPE limit,\
+ bool rhs)\
+{\
+ INDUCTYPE i = 0;\
+ FPTYPE f = 0.0;\
+ bool lhs = (i & mask) != limit;\
+ while (lhs == rhs)\
+ {\
+ f += 1;\
+ i += 1;\
+ lhs = (i & mask) != limit;\
+ }\
+ return f;\
+}\
+
+SPEC_LOOP (uint8_t, uint8_t)
+SPEC_LOOP (uint16_t, uint16_t)
+SPEC_LOOP (uint32_t, uint32_t)
+SPEC_LOOP (uint64_t, uint64_t)
+
+SPEC_LOOP (int8_t, int8_t)
+SPEC_LOOP (int16_t, int16_t)
+SPEC_LOOP (int32_t, int32_t)
+SPEC_LOOP (int64_t, int64_t)
+
+/* Conversions. */
+SPEC_LOOP (uint16_t, uint8_t)
+
+SPEC_LOOP (uint32_t, uint8_t)
+SPEC_LOOP (uint32_t, uint16_t)
+
+SPEC_LOOP (uint64_t, uint8_t)
+SPEC_LOOP (uint64_t, uint16_t)
+SPEC_LOOP (uint64_t, uint32_t)
+
+SPEC_FP_LOOP (uint32_t, uint32_t, float)
+SPEC_FP_LOOP (uint64_t, uint64_t, double)
+
+SPEC_FP_LOOP (uint64_t, uint64_t, float)
+
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 17 "vect" } } */
+/* { dg-final { scan-assembler-times {\tbrka\tp[0-9]+\.b, p[0-9]+/z, p[0-9]+\.b} 17 } } */
+/* { dg-final { scan-assembler-times {\tlastb\tw[0-9]+, p[0-7], z[0-9]+\.b} 5 } } */
+/* { dg-final { scan-assembler-times {\tlastb\tw[0-9]+, p[0-7], z[0-9]+\.h} 4 } } */
+/* { dg-final { scan-assembler-times {\tlastb\tw[0-9]+, p[0-7], z[0-9]+\.s} 3 } } */
+/* { dg-final { scan-assembler-times {\tlastb\tx[0-9]+, p[0-7], z[0-9]+\.d} 2 } } */
+/* { dg-final { scan-assembler-times {\tlastb\ts[0-9]+, p[0-7], z[0-9]+\.s} 2 } } */
+/* { dg-final { scan-assembler-times {\tlastb\td[0-9]+, p[0-7], z[0-9]+\.d} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_2_run.c
new file mode 100644
index 00000000000..ad2c9c874b8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_2_run.c
@@ -0,0 +1,45 @@
+/* { dg-do run { target { aarch64_sve_hw } } } */
+/* { dg-options "-O3 -fno-inline -march=armv8-a+sve -ffast-math" } */
+/* { dg-options "-O3 -fno-inline -march=armv8-a+sve -ffast-math -msve-vector-bits=256" { target aarch64_sve256_hw } } */
+
+#include "sve_speculative_2.c"
+
+extern void abort (void);
+
+#define TEST_LOOP(ARGTYPE,INDUCTYPE)\
+{\
+ INDUCTYPE res = spec_loop_##ARGTYPE##INDUCTYPE (0xFF, 0xAE, true);\
+ if (res != 0xAE)\
+ abort ();\
+}\
+
+#define TEST_FP_LOOP(ARGTYPE,INDUCTYPE,FPTYPE)\
+{\
+ FPTYPE res = spec_fp_loop_##ARGTYPE##INDUCTYPE##FPTYPE (0xFF, 0xAE, true);\
+ if (res != 0xAE)\
+ abort ();\
+}\
+
+int main ()
+{
+ TEST_LOOP (uint8_t, uint8_t);
+ TEST_LOOP (uint16_t, uint16_t);
+ TEST_LOOP (uint32_t, uint32_t);
+ TEST_LOOP (uint64_t, uint64_t);
+ TEST_LOOP (int32_t, int32_t);
+ TEST_LOOP (int64_t, int64_t);
+
+ TEST_LOOP (uint16_t, uint8_t)
+
+ TEST_LOOP (uint32_t, uint8_t)
+ TEST_LOOP (uint32_t, uint16_t)
+
+ TEST_LOOP (uint64_t, uint8_t)
+ TEST_LOOP (uint64_t, uint16_t)
+ TEST_LOOP (uint64_t, uint32_t)
+
+ TEST_FP_LOOP (uint32_t, uint32_t, float)
+ TEST_FP_LOOP (uint64_t, uint64_t, double)
+
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_3.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_3.c
new file mode 100644
index 00000000000..db35711a193
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_3.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-inline -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */
+
+#include <stdint.h>
+
+/* Speculative loop with different sizes and no data references.
+ Cannot be vectorized. */
+
+#define SPEC_FP_LOOP(ARGTYPE,INDUCTYPE,FPTYPE)\
+FPTYPE spec_fp_loop_##ARGTYPE##INDUCTYPE (ARGTYPE mask, ARGTYPE limit)\
+{\
+ INDUCTYPE i = 0;\
+ FPTYPE f = 0.0;\
+ while ((i & mask) != limit)\
+ {\
+ f += 1;\
+ i += 1;\
+ }\
+ return f;\
+}\
+
+SPEC_FP_LOOP (uint32_t, uint32_t, double)
+
+/* { dg-final { scan-tree-dump-times "not vectorized: ncopies is greater than 1" 1 "vect" } } */
+/* { dg-final { scan-assembler-not "brka\tp\[0-9\]*.b, p\[0-9\]*\/z, p\[0-9\]*.b" } } */
+
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_4.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_4.c
new file mode 100644
index 00000000000..32b8c71c92a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_4.c
@@ -0,0 +1,66 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-inline -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */
+
+#include <stdint.h>
+
+/* Speculative loop with a load. */
+
+#define SPEC_LOOP(ARGTYPE,INDUCTYPE)\
+INDUCTYPE spec_loop_##ARGTYPE##INDUCTYPE (ARGTYPE mask, ARGTYPE limit, ARGTYPE * array)\
+{\
+ uint64_t i = 0;\
+ INDUCTYPE r = 0;\
+ while ((i & mask) != limit)\
+ {\
+ r = array[i];\
+ i++;\
+ }\
+ return r;\
+}
+
+#define SPEC_FP_LOOP(ARGTYPE,FPTYPE)\
+FPTYPE spec_fp_loop_##ARGTYPE##FPTYPE (ARGTYPE mask, ARGTYPE limit, FPTYPE * array)\
+{\
+ uint64_t i = 0;\
+ FPTYPE f = 0.0;\
+ while ((i & mask) != limit)\
+ {\
+ f = array[i];\
+ i++;\
+ }\
+ return f;\
+}
+
+SPEC_LOOP (uint8_t, uint8_t)
+SPEC_LOOP (uint16_t, uint16_t)
+SPEC_LOOP (uint32_t, uint32_t)
+SPEC_LOOP (uint64_t, uint64_t)
+
+SPEC_LOOP (int8_t, int8_t)
+SPEC_LOOP (int16_t, int16_t)
+SPEC_LOOP (int32_t, int32_t)
+SPEC_LOOP (int64_t, int64_t)
+
+/* Conversions. */
+SPEC_LOOP (uint16_t, uint8_t)
+
+SPEC_LOOP (uint32_t, uint8_t)
+SPEC_LOOP (uint32_t, uint16_t)
+
+SPEC_LOOP (uint64_t, uint8_t)
+SPEC_LOOP (uint64_t, uint16_t)
+SPEC_LOOP (uint64_t, uint32_t)
+
+SPEC_FP_LOOP (uint32_t, float)
+SPEC_FP_LOOP (uint64_t, double)
+
+SPEC_FP_LOOP (uint64_t, float)
+
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 17 "vect" } } */
+/* { dg-final { scan-assembler-times {\tbrka\tp[0-9]+\.b, p[0-9]+/z, p[0-9]+\.b} 17 } } */
+/* { dg-final { scan-assembler-times {\tlastb\tw[0-9]+, p[0-7], z[0-9]+\.b} 2 } } */
+/* { dg-final { scan-assembler-times {\tlastb\tw[0-9]+, p[0-7], z[0-9]+\.h} 3 } } */
+/* { dg-final { scan-assembler-times {\tlastb\tw[0-9]+, p[0-7], z[0-9]+\.s} 4 } } */
+/* { dg-final { scan-assembler-times {\tlastb\tx[0-9]+, p[0-7], z[0-9]+\.d} 5 } } */
+/* { dg-final { scan-assembler-times {\tlastb\ts[0-9]+, p[0-7], z[0-9]+\.s} 2 } } */
+/* { dg-final { scan-assembler-times {\tlastb\td[0-9]+, p[0-7], z[0-9]+\.d} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_4_run.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_4_run.c
new file mode 100644
index 00000000000..96834ba51be
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_4_run.c
@@ -0,0 +1,56 @@
+/* { dg-do run { target { aarch64_sve_hw } } } */
+/* { dg-options "-O3 -fno-inline -march=armv8-a+sve -ffast-math" } */
+/* { dg-options "-O3 -fno-inline -march=armv8-a+sve -ffast-math -msve-vector-bits=256" { target aarch64_sve256_hw } } */
+
+#include "sve_speculative_4.c"
+
+extern void abort (void);
+#include <string.h>
+
+#define MAX 0xAE
+
+#define TEST_LOOP(ARGTYPE,INDUCTYPE)\
+{\
+ ARGTYPE array[MAX];\
+ memset (array, 0, sizeof (ARGTYPE) * MAX);\
+ array[MAX - 1] = 72;\
+ INDUCTYPE res = spec_loop_##ARGTYPE##INDUCTYPE (0xFF, MAX, array);\
+ if (res != 72)\
+ abort ();\
+}
+
+#define TEST_FP_LOOP(ARGTYPE,FPTYPE)\
+{\
+ FPTYPE array[MAX];\
+ memset (array, 0, sizeof (FPTYPE) * MAX);\
+ array[MAX - 1] = 54.5;\
+ FPTYPE res = spec_fp_loop_##ARGTYPE##FPTYPE (0xFF, MAX, array);\
+ if (res != 54.5)\
+ abort ();\
+}
+
+int main ()
+{
+ TEST_LOOP (uint8_t, uint8_t);
+ TEST_LOOP (uint16_t, uint16_t);
+ TEST_LOOP (uint32_t, uint32_t);
+ TEST_LOOP (uint64_t, uint64_t);
+ TEST_LOOP (int32_t, int32_t);
+ TEST_LOOP (int64_t, int64_t);
+
+ TEST_LOOP (uint16_t, uint8_t)
+
+ TEST_LOOP (uint32_t, uint8_t)
+ TEST_LOOP (uint32_t, uint16_t)
+
+ TEST_LOOP (uint64_t, uint8_t)
+ TEST_LOOP (uint64_t, uint16_t)
+ TEST_LOOP (uint64_t, uint32_t)
+
+ TEST_FP_LOOP (uint32_t, float)
+ TEST_FP_LOOP (uint64_t, double)
+
+ TEST_FP_LOOP (uint64_t, float)
+
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_5.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_5.c
new file mode 100644
index 00000000000..d1d8f8fbaaa
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_5.c
@@ -0,0 +1,54 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-inline -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */
+
+#include <stdint.h>
+
+/* Speculative loop with a load. Exit condition in the array. */
+
+#ifndef EXIT_CONDITION
+#define EXIT_CONDITION 1
+#endif
+
+#define SPEC_LOOP(ARGTYPE)\
+ARGTYPE spec_loop_##ARGTYPE (ARGTYPE * array)\
+{\
+ ARGTYPE i = 0;\
+ ARGTYPE r = EXIT_CONDITION + 1;\
+ while (r != EXIT_CONDITION)\
+ {\
+ r = array[i];\
+ i++;\
+ }\
+ return i;\
+}
+
+#define SPEC_FP_LOOP(FPTYPE, ARGTYPE)\
+ARGTYPE spec_loop_##ARGTYPE##FPTYPE (FPTYPE * array)\
+{\
+ ARGTYPE i = 0;\
+ ARGTYPE r = EXIT_CONDITION + 1;\
+ while (r != EXIT_CONDITION)\
+ {\
+ r = array[i];\
+ i++;\
+ }\
+ return i;\
+}
+
+/* TODO: Cannot yet vectorize due to gather load. */
+SPEC_LOOP (int8_t)
+SPEC_LOOP (int16_t)
+
+SPEC_LOOP (int32_t)
+SPEC_LOOP (int64_t)
+SPEC_FP_LOOP (float, int32_t)
+SPEC_FP_LOOP (double, int64_t)
+
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 4 "vect" } } */
+/* { dg-final { scan-assembler-times {\tbrka\tp[0-9]+\.b, p[0-9]+/z, p[0-9]+\.b} 4 } } */
+/* { dg-final { scan-assembler-not {\tlastb\tw[0-9]+, p[0-7], z[0-9]+\.b} } } */
+/* { dg-final { scan-assembler-not {\tlastb\tw[0-9]+, p[0-7], z[0-9]+\.h} } } */
+/* { dg-final { scan-assembler-times {\tlastb\tw[0-9]+, p[0-7], z[0-9]+\.s} 2 } } */
+/* { dg-final { scan-assembler-times {\tlastb\tx[0-9]+, p[0-7], z[0-9]+\.d} 2 } } */
+/* { dg-final { scan-assembler-not {\tlastb\ts[0-9]+, p[0-7], z[0-9]+\.s} } } */
+/* { dg-final { scan-assembler-not {\tlastb\td[0-9]+, p[0-7], z[0-9]+\.d} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_5_run.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_5_run.c
new file mode 100644
index 00000000000..a8f7f9fff17
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_5_run.c
@@ -0,0 +1,104 @@
+/* { dg-do run { target { aarch64_sve_hw } } } */
+/* { dg-options "-O3 -fno-inline -march=armv8-a+sve" } */
+/* { dg-options "-O3 -fno-inline -march=armv8-a+sve -msve-vector-bits=256" { target aarch64_sve256_hw } } */
+
+#include "sve_speculative_5.c"
+
+#define _GNU_SOURCE
+#include <sys/mman.h>
+extern void abort (void);
+extern void *mremap (void *old_address, size_t old_size,
+ size_t new_size, int flags, ... /* void *new_address */);
+
+#ifndef FILL_DATA
+#define FILL_DATA 0
+#endif
+
+#ifndef LOOP_COUNTS
+#define LOOP_COUNTS {22,20,13,17,29,19}
+#endif
+int loop_counts[] = LOOP_COUNTS;
+
+/* Program will fault if memory beyond the boundaries of BUF is accessed. */
+
+#define SPACE_SIZE 4096*sizeof(int)
+
+/* Enable to confirm program segfaults when accessing outside of BUF. */
+#ifdef CHECK_SEGFAULT
+#define ADDITIONAL 1
+#else
+#define ADDITIONAL 0
+#endif
+
+/* BUF is an array of NUM_ELEMENTS size.
+ BUF_PRE points to 4 elements before BUF.
+ Before calling SPEC_LOOP, set the last element of BUF and the
+ four elements of BUF_PRE to the exit condition.
+ Fill the rest of BUF to the fill data. */
+
+#define TEST_SPEC_LOOP_FUNC(ARGTYPE)\
+void test_spec_loop_##ARGTYPE (void *bufend, ARGTYPE num_elements)\
+{\
+ int i;\
+ ARGTYPE* buf = ((ARGTYPE*)bufend) - num_elements;\
+ ARGTYPE* buf_pre = ((ARGTYPE*)bufend) - num_elements - 4;\
+ for (i=0; i<num_elements-1; i++)\
+ buf[i] = FILL_DATA;\
+ buf[num_elements - 1 + ADDITIONAL] = EXIT_CONDITION;\
+ for (i=0; i<4; i++)\
+ buf_pre[i] = EXIT_CONDITION;\
+ ARGTYPE ret = spec_loop_##ARGTYPE (buf);\
+ if (ret != num_elements)\
+ abort ();\
+}
+
+#define TEST_SPEC_FP_LOOP_FUNC(FPTYPE, ARGTYPE)\
+void test_spec_loop_##ARGTYPE##FPTYPE (void *bufend, ARGTYPE num_elements)\
+{\
+ int i;\
+ FPTYPE* buf = ((FPTYPE*)bufend) - num_elements;\
+ FPTYPE* buf_pre = ((FPTYPE*)bufend) - num_elements - 4;\
+ for (i=0; i<num_elements-1; i++)\
+ buf[i] = FILL_DATA;\
+ buf[num_elements - 1 + ADDITIONAL] = EXIT_CONDITION;\
+ for (i=0; i<4; i++)\
+ buf_pre[i] = EXIT_CONDITION;\
+ ARGTYPE ret = spec_loop_##ARGTYPE##FPTYPE (buf);\
+ if (ret != num_elements)\
+ abort ();\
+}
+
+TEST_SPEC_LOOP_FUNC (int8_t)
+TEST_SPEC_LOOP_FUNC (int16_t)
+TEST_SPEC_LOOP_FUNC (int32_t)
+TEST_SPEC_LOOP_FUNC (int64_t)
+TEST_SPEC_FP_LOOP_FUNC (float, int32_t)
+TEST_SPEC_FP_LOOP_FUNC (double, int64_t)
+
+int main (void)
+{
+ /* Map in two pages worth of space. Then reduce it down to a single page.
+ This will result in the second page of data being unmapped - ie it
+ will cause a segfault if accessed. */
+
+ void *space = mmap (0, SPACE_SIZE * 2, PROT_READ|PROT_WRITE,
+ MAP_ANON|MAP_PRIVATE, -1, 0);
+ if (space == (void*)-1)
+ abort ();
+
+ void *space_new = mremap (space, SPACE_SIZE * 2, SPACE_SIZE, 0);
+ if (space != space_new)
+ abort ();
+
+ /* set END to the start of the second (unmapped) page. */
+ char *end = space + SPACE_SIZE;
+
+ test_spec_loop_int8_t (end, loop_counts[0]);
+ test_spec_loop_int16_t (end, loop_counts[1]);
+ test_spec_loop_int32_t (end, loop_counts[2]);
+ test_spec_loop_int64_t (end, loop_counts[3]);
+ test_spec_loop_int32_tfloat (end, loop_counts[4]);
+ test_spec_loop_int64_tdouble (end, loop_counts[5]);
+
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_5_run_2.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_5_run_2.c
new file mode 100644
index 00000000000..ed12336f47d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_5_run_2.c
@@ -0,0 +1,8 @@
+/* { dg-do run { target { aarch64_sve_hw } } } */
+/* { dg-options "-O3 -fno-inline -march=armv8-a+sve" } */
+/* { dg-options "-O3 -fno-inline -march=armv8-a+sve -msve-vector-bits=256" { target aarch64_sve256_hw } } */
+
+/* Use exit condition of 0. */
+#define EXIT_CONDITION 0
+#define FILL_DATA 1
+#include "sve_speculative_5_run.c"
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_5_run_3.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_5_run_3.c
new file mode 100644
index 00000000000..c6a5edf86b4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_5_run_3.c
@@ -0,0 +1,9 @@
+/* { dg-do run { target { aarch64_sve_hw } } } */
+/* { dg-options "-O3 -fno-inline -march=armv8-a+sve" } */
+/* { dg-options "-O3 -fno-inline -march=armv8-a+sve -msve-vector-bits=256" { target aarch64_sve256_hw } } */
+
+/* Use exit condition of 0 and less than a single iteration. */
+#define EXIT_CONDITION 0
+#define FILL_DATA 1
+#define LOOP_COUNTS {3,5,3,1,5,1}
+#include "sve_speculative_5_run.c"
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_6.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_6.c
new file mode 100644
index 00000000000..1b71687a257
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_6.c
@@ -0,0 +1,44 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-inline -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */
+
+#include <stdint.h>
+
+/* Speculative loop with a conditional load. */
+
+#define SPEC_LOOP(ARGTYPE,INDUCTYPE)\
+INDUCTYPE spec_loop_##ARGTYPE##INDUCTYPE (ARGTYPE mask, ARGTYPE limit,\
+ ARGTYPE * array, ARGTYPE * cond)\
+{\
+ uint64_t i = 0;\
+ INDUCTYPE r = 0;\
+ while ((i & mask) != limit)\
+ {\
+ if (cond[i])\
+ r = array[i];\
+ i++;\
+ }\
+ return r;\
+}
+
+SPEC_LOOP (uint8_t, uint8_t)
+SPEC_LOOP (uint16_t, uint16_t)
+SPEC_LOOP (uint32_t, uint32_t)
+SPEC_LOOP (uint64_t, uint64_t)
+
+SPEC_LOOP (int8_t, int8_t)
+SPEC_LOOP (int16_t, int16_t)
+SPEC_LOOP (int32_t, int32_t)
+SPEC_LOOP (int64_t, int64_t)
+
+/* Conversions. */
+SPEC_LOOP (uint16_t, uint8_t)
+
+SPEC_LOOP (uint32_t, uint8_t)
+SPEC_LOOP (uint32_t, uint16_t)
+
+SPEC_LOOP (uint64_t, uint8_t)
+SPEC_LOOP (uint64_t, uint16_t)
+SPEC_LOOP (uint64_t, uint32_t)
+
+/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "speculative mask loads not supported" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_7.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_7.c
new file mode 100644
index 00000000000..0c2d62387e2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_7.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-inline -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */
+
+#include <stdint.h>
+
+/* Speculative loop with a load and a test. */
+
+uint32_t
+search (uint32_t *array)
+{
+ for (;;)
+ {
+ uint32_t x = *array++ >> 7;
+ if (x >= 200)
+ return x;
+ }
+}
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-assembler-times {\tbrka\tp[0-9]+\.b, p[0-9]+/z, p[0-9]+\.b} 1 } } */
+/* { dg-final { scan-assembler-times {\tlastb\tw[0-9]+, p[0-7], z[0-9]+\.s} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_8.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_8.c
new file mode 100644
index 00000000000..8c70e2f9012
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_8.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-inline -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */
+
+#include <stdint.h>
+
+/* Speculative loop with a load which requires multiple copies and a test. */
+
+uint32_t
+search (uint64_t *array)
+{
+ for (;;)
+ {
+ uint32_t x = *array++ >> 7;
+ if (x >= 200)
+ return x;
+ }
+}
+
+/* { dg-final { scan-tree-dump "multiple copies not supported for speculative loops" "vect" } } */
+/* { dg-final { scan-tree-dump "not vectorized: relevant stmt not supported" "vect" } } */
+/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_9.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_9.c
new file mode 100644
index 00000000000..c21b44614c7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_9.c
@@ -0,0 +1,34 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */
+
+#include <stdint.h>
+
+/* Speculative loop with two loads which cannot both be aligned. */
+
+#ifndef STRIDE_LEVEL
+#define STRIDE_LEVEL 1
+#endif
+
+#define SPEC_LOOP(DATATYPE, ARGTYPE)\
+ARGTYPE spec_loop_##DATATYPE##_##ARGTYPE (DATATYPE *a, DATATYPE*b, DATATYPE n)\
+{\
+ ARGTYPE i = -1;\
+ do\
+ i += 1;\
+ while (a[i*STRIDE_LEVEL] + b[i*STRIDE_LEVEL] < n);\
+ return i;\
+}
+
+/* TODO: Cannot yet vectorize due to gather load. */
+SPEC_LOOP (int8_t, int8_t)
+SPEC_LOOP (int16_t, int16_t)
+
+SPEC_LOOP (int32_t, int32_t)
+SPEC_LOOP (int64_t, int64_t)
+SPEC_LOOP (float, int32_t)
+SPEC_LOOP (double, int64_t)
+
+
+/* { dg-final { scan-tree-dump-times "loop versioned for vectorization to enhance alignment" 4 "vect" } } */
+/* { dg-final { scan-tree-dump-not "Alignment of access forced using peeling" "vect" } } */
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 4 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_9_run.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_9_run.c
new file mode 100644
index 00000000000..f9470020fd0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_9_run.c
@@ -0,0 +1,67 @@
+/* { dg-do run { target { aarch64_sve_hw } } } */
+/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve" } */
+/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -msve-vector-bits=256" { target aarch64_sve256_hw } } */
+
+#include "sve_speculative_9.c"
+
+extern void abort (void);
+
+#ifndef MAX_ARRAY_SIZE
+#define MAX_ARRAY_SIZE 500
+#endif
+
+#ifndef FILL_DATA
+#define FILL_DATA 0
+#endif
+
+#ifndef EXIT_CONDITION
+#define EXIT_CONDITION 5
+#endif
+
+#ifndef LOOP_COUNTS
+#define LOOP_COUNTS {37,45,55,17,39,43}
+#endif
+int loop_counts[] = LOOP_COUNTS;
+
+/* Fill the arrays with the exit conditions.
+ Then refill at the correct strided accesses with fill data up to the end of
+ the loop count. */
+
+#define TEST_SPEC_LOOP_FUNC(DATATYPE, ARGTYPE)\
+void test_spec_loop_##DATATYPE##_##ARGTYPE (ARGTYPE num_elements)\
+{\
+ DATATYPE a[MAX_ARRAY_SIZE];\
+ DATATYPE b[MAX_ARRAY_SIZE];\
+ int i;\
+ for (i=0; i<MAX_ARRAY_SIZE; i++)\
+ {\
+ a[i] = EXIT_CONDITION;\
+ b[i] = EXIT_CONDITION;\
+ }\
+ for (i=0; (i<num_elements-1)*STRIDE_LEVEL; i++)\
+ {\
+ a[i*STRIDE_LEVEL] = FILL_DATA;\
+ b[i*STRIDE_LEVEL] = FILL_DATA;\
+ }\
+ ARGTYPE ret = spec_loop_##DATATYPE##_##ARGTYPE (a, b, EXIT_CONDITION);\
+ if (ret != num_elements - 1)\
+ abort ();\
+}
+
+TEST_SPEC_LOOP_FUNC (int8_t, int8_t)
+TEST_SPEC_LOOP_FUNC (int16_t, int16_t)
+TEST_SPEC_LOOP_FUNC (int32_t, int32_t)
+TEST_SPEC_LOOP_FUNC (int64_t, int64_t)
+TEST_SPEC_LOOP_FUNC (float, int32_t)
+TEST_SPEC_LOOP_FUNC (double, int64_t)
+
+int main (void)
+{
+ test_spec_loop_int8_t_int8_t (loop_counts[0]);
+ test_spec_loop_int16_t_int16_t (loop_counts[1]);
+ test_spec_loop_int32_t_int32_t (loop_counts[2]);
+ test_spec_loop_int64_t_int64_t (loop_counts[3]);
+ test_spec_loop_float_int32_t (loop_counts[4]);
+ test_spec_loop_double_int64_t (loop_counts[5]);
+ return 0;
+}
diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
index adb2af72573..3ef92c1d87d 100644
--- a/gcc/tree-vect-data-refs.c
+++ b/gcc/tree-vect-data-refs.c
@@ -834,15 +834,64 @@ vect_record_base_alignments (vec_info *vinfo)
}
}
+/* Function vect_can_calculate_target_alignment
+
+ Try to calculate the alignment that data reference DR will require once
+ vectorized. If successful, store the alignment in *ALIGNMENT_P (when
+ ALIGNMENT_P is nonnull).
+
+ For non-speculative loops the alignment is always calculable and is given
+ by preferred_vector_alignment. For speculative loops we align to the
+ vector size multiplied by the step. */
+
+bool
+vect_can_calculate_target_alignment (struct data_reference *dr,
+ unsigned int *alignment_p)
+{
+ gimple *stmt = DR_STMT (dr);
+ stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+ loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
+
+ if (!loop_vinfo || !LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo))
+ {
+ tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+ if (alignment_p)
+ *alignment_p = targetm.vectorize.preferred_vector_alignment (vectype);
+ return true;
+ }
+
+ /* We have to assume that non-constant vector sizes might not be
+ a power of two. */
+ unsigned HOST_WIDE_INT size;
+ if (!current_vector_size.is_constant (&size))
+ return false;
+
+ /* Step must be a positive integer. */
+ if (!tree_fits_shwi_p (DR_STEP (dr))
+ || tree_int_cst_sgn (DR_STEP (dr)) <= 0)
+ return false;
+
+ unsigned int step = tree_to_uhwi (DR_STEP (dr));
+ unsigned int unit_size =
+ tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr))));
+
+ /* Step must be a power of two and divisible by the unit size. */
+ if (!pow2p_hwi (step) || step % unit_size != 0)
+ return false;
+
+ if (alignment_p)
+ *alignment_p = size * BITS_PER_UNIT * step / unit_size;
+ return true;
+}
+
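(To make the formula above concrete, here is a minimal standalone sketch,
not part of the patch; speculative_alignment_bits and its parameters are
hypothetical stand-ins for the byte-sized quantities read from the data
reference, with BITS_PER_UNIT taken to be 8:

   /* Alignment in bits required for a speculative load: the vector size
      scaled by the number of elements the step jumps over.  */
   static unsigned int
   speculative_alignment_bits (unsigned int vector_size_bytes,
                               unsigned int step_bytes,
                               unsigned int unit_size_bytes)
   {
     return vector_size_bytes * 8 * step_bytes / unit_size_bytes;
   }

For 32-byte (256-bit) vectors, 4-byte elements and an 8-byte step this
gives 32 * 8 * 8 / 4 = 512 bits, i.e. a 64-byte alignment requirement;
for a unit-stride access, where the step equals the unit size, it
degenerates to the plain 256-bit vector alignment.)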
/* Return the target alignment for the vectorized form of DR. */
static unsigned int
vect_calculate_target_alignment (struct data_reference *dr)
{
- gimple *stmt = DR_STMT (dr);
- stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
- tree vectype = STMT_VINFO_VECTYPE (stmt_info);
- return targetm.vectorize.preferred_vector_alignment (vectype);
+ unsigned int ret;
+ if (!vect_can_calculate_target_alignment (dr, &ret))
+ gcc_unreachable ();
+ return ret;
}
/* Function vect_compute_data_ref_alignment
@@ -2288,11 +2337,11 @@ vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
if (diff != 0)
{
/* Get the wider of the two alignments. */
- unsigned int align_a = (vect_calculate_target_alignment (dra)
- / BITS_PER_UNIT);
- unsigned int align_b = (vect_calculate_target_alignment (drb)
- / BITS_PER_UNIT);
- unsigned int max_align = MAX (align_a, align_b);
+ unsigned int align_a, align_b;
+ if (!vect_can_calculate_target_alignment (dra, &align_a)
+ || !vect_can_calculate_target_alignment (drb, &align_b))
+ return;
+ unsigned int max_align = MAX (align_a, align_b) / BITS_PER_UNIT;
/* Require the gap to be a multiple of the larger vector alignment. */
if (!wi::multiple_of_p (diff, max_align, SIGNED))
@@ -2341,6 +2390,17 @@ vect_analyze_data_refs_alignment (loop_vec_info vinfo)
FOR_EACH_VEC_ELT (datarefs, i, dr)
{
stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
+
+ if (STMT_VINFO_VECTORIZABLE (stmt_info)
+ && !vect_can_calculate_target_alignment (dr, NULL))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "not vectorized: can't calculate required "
+ "alignment for data ref.\n");
+ return false;
+ }
+
if (STMT_VINFO_VECTORIZABLE (stmt_info)
&& !vect_compute_data_ref_alignment (dr))
{
@@ -3484,7 +3544,17 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
else
{
if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
- length_factor = scalar_loop_iters;
+ {
+ if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "Cannot vectorize speculative loops with "
+ "differing data reference step sizes.\n");
+ return false;
+ }
+ length_factor = scalar_loop_iters;
+ }
else
length_factor = size_int (vect_factor);
segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
@@ -4466,6 +4536,9 @@ vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
case vect_simple_var:
prefix = "vect";
break;
+ case vect_mask_var:
+ prefix = "mask";
+ break;
case vect_scalar_var:
prefix = "stmp";
break;
@@ -6652,6 +6725,10 @@ vect_supportable_dr_alignment (struct data_reference *dr,
{
vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
+
+ /* Speculative loops rely on aligned data refs. */
+ if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo))
+ return dr_unaligned_unsupported;
}
/* Possibly unaligned access. */
diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
index 57aab1b764f..901113fcf03 100644
--- a/gcc/tree-vect-loop-manip.c
+++ b/gcc/tree-vect-loop-manip.c
@@ -369,6 +369,242 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm,
return false;
}
+/* Helper for vect_set_speculative_masks. Set the masks in RGM directly
+ from the corresponding scalar values. RGM belongs to LOOP, which has
+ been vectorized according to LOOP_VINFO. NSCALARITERS_SKIP is the
+ number of scalar iterations that we should skip during the first
+ iteration of the vector loop (because the start point has been
+ brought forward by that amount to achieve alignment).
+
+ Add any new preheader statements to PREHEADER_SEQ and any new header
+ statements to HEADER_SEQ. */
+
+static void
+vect_set_speculative_masks_directly (struct loop *loop,
+ loop_vec_info loop_vinfo,
+ gimple_seq *preheader_seq,
+ gimple_seq *header_seq,
+ rgroup_masks *rgm,
+ tree nscalariters_skip)
+{
+ /* It doesn't make sense to align for speculation when we have a
+ capped VF. */
+ gcc_assert (!use_capped_vf (loop_vinfo));
+
+ tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
+ tree mask_type = rgm->mask_type;
+ poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
+ unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
+
+ tree nscalars_skip = nscalariters_skip;
+ if (nscalars_per_iter != 1)
+ {
+ tree factor = build_int_cst (compare_type, nscalars_per_iter);
+ nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
+ nscalars_skip, factor);
+ }
+
+ tree full_mask = build_minus_one_cst (mask_type);
+ tree mask;
+ unsigned int i;
+ FOR_EACH_VEC_ELT (rgm->masks, i, mask)
+ {
+ /* Previous masks covered START scalars. This mask covers the
+ next batch. */
+ tree start = build_int_cst (compare_type, nscalars_per_mask * i);
+ tree init_mask = vect_gen_while_not (preheader_seq, mask_type,
+ start, nscalars_skip);
+
+ /* Always use a full mask for subsequent iterations of the loop. */
+ vect_set_loop_mask (loop, header_seq, mask, init_mask,
+ full_mask, NULL_TREE);
+ }
+}
+
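(A scalar model of the first-iteration mask computed above may help; this
is an editorial sketch, not part of the patch, and it assumes that
vect_gen_while_not produces the complement of a WHILE_ULT, i.e. a lane is
active once its scalar index is at or beyond the skip count:

   /* Lane J of mask MASK_INDEX in the rgroup is active on the first
      vector iteration iff the scalar element it covers was not skipped
      to reach alignment.  Later iterations use a full mask.  */
   static void
   model_first_iteration_mask (unsigned char *mask,
                               unsigned int mask_index,
                               unsigned int nscalars_per_mask,
                               unsigned int nscalars_skip)
   {
     unsigned int start = mask_index * nscalars_per_mask;
     for (unsigned int j = 0; j < nscalars_per_mask; j++)
       /* Active once we are past the elements skipped for alignment.  */
       mask[j] = (start + j >= nscalars_skip);
   }
)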
+/* Set up the controlling masks for LOOP, which is a speculative loop that
+ has been vectorized according to LOOP_VINFO. */
+
+static void
+vect_set_speculative_masks (struct loop *loop, loop_vec_info loop_vinfo)
+{
+ gimple_seq preheader_seq = NULL;
+ gimple_seq header_seq = NULL;
+
+ vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
+ tree nscalariters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
+ rgroup_masks *rgm;
+ unsigned int i;
+ FOR_EACH_VEC_ELT (*masks, i, rgm)
+ if (!rgm->masks.is_empty ())
+ {
+ /* We shouldn't be using masks if there are no elements to skip
+ on the first iteration. */
+ gcc_assert (nscalariters_skip != NULL_TREE);
+
+ /* First try using permutes. */
+ unsigned int nmasks = i + 1;
+ if ((nmasks & 1) == 0)
+ {
+ rgroup_masks *half_rgm = &(*masks)[nmasks / 2 - 1];
+ if (!half_rgm->masks.is_empty ()
+ && vect_maybe_permute_loop_masks (&header_seq, rgm, half_rgm))
+ continue;
+ }
+
+ vect_set_speculative_masks_directly (loop, loop_vinfo,
+ &preheader_seq, &header_seq,
+ rgm, nscalariters_skip);
+ }
+
+ /* Emit all accumulated statements. */
+ add_preheader_seq (loop, preheader_seq);
+ add_header_seq (loop, header_seq);
+}
+
+/* RGM belongs to the nonspeculative masks of LOOP_VINFO. Set up the masks
+ in RGM so that the active bits corresponding to the first NSCALARITERS
+ scalar iterations are true and every other bit is false. Add any new
+ statements before GSI. */
+
+static void
+vect_set_nonspeculative_masks_directly (loop_vec_info loop_vinfo,
+ gimple_stmt_iterator *gsi,
+ rgroup_masks *rgm, tree nscalariters)
+{
+ tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
+ tree mask_type = rgm->mask_type;
+ poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
+ unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
+
+ /* Calculate the number of scalars covered by the rgroup. */
+ gimple_seq seq = NULL;
+ tree nscalars = nscalariters;
+ if (nscalars_per_iter != 1)
+ nscalars = gimple_build (&seq, MULT_EXPR, compare_type, nscalars,
+ build_int_cst (compare_type, nscalars_per_iter));
+ if (seq)
+ gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
+
+ tree mask;
+ unsigned int i;
+ FOR_EACH_VEC_ELT (rgm->masks, i, mask)
+ {
+ /* Previous masks covered START scalars. This mask covers the
+ next batch. */
+ tree start = build_int_cst (compare_type, nscalars_per_mask * i);
+ if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
+ {
+ /* First get a mask that ignores whether bits are active. */
+ tree temp = make_ssa_name (mask_type);
+ gcall *call = vect_gen_while (temp, start, nscalars);
+ gsi_insert_before (gsi, call, GSI_SAME_STMT);
+
+ /* Now AND the result with the active lanes. */
+ tree active
+ = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
+ rgm->masks.length (), mask_type, i);
+ gassign *assign = gimple_build_assign (mask, BIT_AND_EXPR,
+ temp, active);
+ gsi_insert_before (gsi, assign, GSI_SAME_STMT);
+ }
+ else
+ {
+ /* All lanes are active. */
+ gcall *call = vect_gen_while (mask, start, nscalars);
+ gsi_insert_before (gsi, call, GSI_SAME_STMT);
+ }
+ }
+}
+
+/* Set MASK to the mask of active elements up to and including the
+ first iteration for which the exit condition of LOOP_VINFO is true.
+ Insert any new statements before GSI. ALL_ACTIVE_P is true if we
+ should treat all elements as active, false if we should get the
+ mask of active elements from the main loop mask. */
+
+static void
+vect_add_break_after (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
+ tree mask, bool all_active_p)
+{
+ tree mask_type = TREE_TYPE (mask);
+
+ tree active;
+ if (all_active_p)
+ active = build_minus_one_cst (mask_type);
+ else
+ active = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
+ 1, mask_type, 0);
+
+ /* Break the mask after the first true exit condition. */
+ tree exit_mask = LOOP_VINFO_EXIT_TEST_MASK (loop_vinfo);
+ gcall *call = gimple_build_call_internal (IFN_BREAK_AFTER, 2,
+ active, exit_mask);
+ gimple_call_set_lhs (call, mask);
+ gsi_insert_before (gsi, call, GSI_SAME_STMT);
+}
+
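(The BREAK_AFTER semantics relied on here can be modelled in scalar form;
an editorial sketch based on the comments above rather than a definitive
definition of the internal function:

   /* Keep the ACTIVE lanes up to and including the first lane at which
      EXIT_MASK is true; clear every lane after that point.  */
   static void
   model_break_after (unsigned char *result, const unsigned char *active,
                      const unsigned char *exit_mask, unsigned int nlanes)
   {
     int seen_exit = 0;
     for (unsigned int j = 0; j < nlanes; j++)
       {
         result[j] = !seen_exit && active[j];
         if (exit_mask[j])
           seen_exit = 1;
       }
   }

With ACTIVE set to all-ones this yields the mask of every scalar iteration
up to and including the one that first satisfied the exit condition, which
is what vect_get_niters_from_mask later converts into a popcount.)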
+/* Set up the nonspeculative masks in LOOP_VINFO. Emit any new statements
+ before GSI. */
+
+static void
+vect_set_nonspeculative_masks (loop_vec_info loop_vinfo,
+ gimple_stmt_iterator *gsi)
+{
+ vec_niters_and_mask nim;
+ vec_loop_masks *masks = &LOOP_VINFO_NONSPECULATIVE_MASKS (loop_vinfo);
+ tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
+ tree niters = NULL_TREE;
+ rgroup_masks *rgm;
+ unsigned int i;
+ FOR_EACH_VEC_ELT (*masks, i, rgm)
+ if (!rgm->masks.is_empty ())
+ {
+ unsigned int nmasks = i + 1;
+
+ /* Try to set the mask directly with a BREAK_AFTER. */
+ if (nmasks == 1 && rgm->max_nscalars_per_iter == 1)
+ {
+ /* All elements are active unless we're peeling for
+ alignment. */
+ vect_add_break_after (loop_vinfo, gsi, rgm->masks[0],
+ !LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
+ continue;
+ }
+
+ /* Try using permutes. */
+ if ((nmasks & 1) == 0)
+ {
+ gimple_seq seq = NULL;
+ rgroup_masks *half_rgm = &(*masks)[nmasks / 2 - 1];
+ if (!half_rgm->masks.is_empty ()
+ && vect_maybe_permute_loop_masks (&seq, rgm, half_rgm))
+ {
+ gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
+ continue;
+ }
+ }
+
+ if (niters == NULL_TREE)
+ {
+ /* Get the mask of elements up to and including the first
+ iteration for which the exit condition is true.
+ Include any inactive starting elements at this stage. */
+ tree mask_type = vect_mask_type_for_speculation (loop_vinfo);
+ nim.mask = make_ssa_name (mask_type);
+ vect_add_break_after (loop_vinfo, gsi, nim.mask, true);
+
+ /* Convert the mask to a scalar count, then convert the
+ sizetype result to the mask comparison type. */
+ gimple_seq seq = NULL;
+ niters = vect_get_niters_from_mask (&seq, &nim);
+ niters = gimple_convert (&seq, compare_type, niters);
+ if (seq)
+ gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
+ }
+ vect_set_nonspeculative_masks_directly (loop_vinfo, gsi, rgm, niters);
+ }
+}
+
/* Helper for vect_set_loop_condition_masked. Generate definitions for
all the masks in RGM and return a mask that is nonzero when the loop
needs to iterate. Add any new preheader statements to PREHEADER_SEQ
@@ -939,11 +1175,29 @@ vect_set_loop_condition (struct loop *loop, loop_vec_info loop_vinfo,
tree niters, tree step, tree final_iv,
bool niters_maybe_zero)
{
- gcond *cond_stmt;
+ gcond *cond_stmt = NULL;
gcond *orig_cond = get_loop_exit_condition (loop);
gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
+ bool masked_p = (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
+ bool speculation_p
+ = (loop_vinfo && LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo));
- if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+ if (speculation_p)
+ {
+ /* Set the masks that control statements that cannot be speculatively
+ executed. */
+ vect_set_nonspeculative_masks (loop_vinfo, &loop_cond_gsi);
+
+ /* ...then add the statements themselves. */
+ gimple_seq late_seq = LOOP_VINFO_NONSPECULATIVE_SEQ (loop_vinfo);
+ if (late_seq)
+ gsi_insert_seq_before (&loop_cond_gsi, late_seq, GSI_SAME_STMT);
+
+ /* Set up the masks that control the speculative statements. */
+ if (masked_p)
+ vect_set_speculative_masks (loop, loop_vinfo);
+ }
+ else if (masked_p)
cond_stmt = vect_set_loop_condition_masked (loop, loop_vinfo, niters,
final_iv, niters_maybe_zero,
loop_cond_gsi);
@@ -952,11 +1206,14 @@ vect_set_loop_condition (struct loop *loop, loop_vec_info loop_vinfo,
final_iv, niters_maybe_zero,
loop_cond_gsi);
- /* Remove old loop exit test. */
- gsi_remove (&loop_cond_gsi, true);
- free_stmt_vec_info (orig_cond);
+ if (!speculation_p)
+ {
+ /* Remove old loop exit test. */
+ gsi_remove (&loop_cond_gsi, true);
+ free_stmt_vec_info (orig_cond);
+ }
- if (dump_enabled_p ())
+ if (dump_enabled_p () && cond_stmt)
{
dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: ");
dump_gimple_stmt (MSG_NOTE, TDF_SLIM, cond_stmt, 0);
@@ -1644,13 +1901,15 @@ vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
{
struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
tree var;
- tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
gimple_seq stmts = NULL, new_stmts = NULL;
tree iters, iters_name;
gimple *dr_stmt = DR_STMT (dr);
stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
tree vectype = STMT_VINFO_VECTYPE (stmt_info);
unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
+ tree niters_type = (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)
+ ? size_type_node
+ : TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
{
@@ -1829,6 +2088,12 @@ vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
tree
vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
{
+ if (!LOOP_VINFO_NITERS (loop_vinfo))
+ {
+ gcc_assert (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo));
+ return NULL;
+ }
+
tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
if (TREE_CODE (ni) == INTEGER_CST)
return ni;
@@ -2421,7 +2686,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
bool check_profitability, bool niters_no_overflow)
{
edge e, guard_e;
- tree type = TREE_TYPE (niters), guard_cond;
+ tree guard_cond;
basic_block guard_bb, guard_to;
profile_probability prob_prolog, prob_vector, prob_epilog;
int estimated_vf;
@@ -2469,6 +2734,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
/* Generate the number of iterations for the prolog loop. We do this here
so that we can also get the upper bound on the number of iterations. */
+ tree type = TREE_TYPE (niters);
tree niters_prolog;
int bound_prolog = 0;
if (prolog_peeling)
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index e33a83bfa6b..c6269a95815 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -369,7 +369,12 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
analyze_pattern_stmt = false;
}
+ bool is_gcond = gimple_code (stmt) == GIMPLE_COND;
+ gcc_assert (!is_gcond
+ || LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo));
+
if (gimple_get_lhs (stmt) == NULL_TREE
+ && !is_gcond
/* MASK_STORE has no lhs, but is ok. */
&& (!is_gimple_call (stmt)
|| !gimple_call_internal_p (stmt)
@@ -427,27 +432,31 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
+ else if (is_gcond)
+ scalar_type = TREE_TYPE (gimple_cond_lhs (stmt));
else
scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
/* Bool ops don't participate in vectorization factor
computation. For comparison use compared types to
compute a factor. */
- if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type)
- && is_gimple_assign (stmt)
- && gimple_assign_rhs_code (stmt) != COND_EXPR)
+ if (is_gcond
+ || (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type)
+ && is_gimple_assign (stmt)
+ && gimple_assign_rhs_code (stmt) != COND_EXPR))
{
if (STMT_VINFO_RELEVANT_P (stmt_info)
|| STMT_VINFO_LIVE_P (stmt_info))
mask_producers.safe_push (stmt_info);
bool_result = true;
- if (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
- == tcc_comparison
+ if (is_gimple_assign (stmt)
+ && (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
+ == tcc_comparison)
&& !VECT_SCALAR_BOOLEAN_TYPE_P
(TREE_TYPE (gimple_assign_rhs1 (stmt))))
scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
- else
+ else if (TREE_CODE (scalar_type) == BOOLEAN_TYPE)
{
if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
{
@@ -589,13 +598,28 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
tree mask_type = NULL;
stmt = STMT_VINFO_STMT (mask_producers[i]);
+ bool is_gcond = gimple_code (stmt) == GIMPLE_COND;
+ bool ops_are_booleans = true;
if (is_gimple_assign (stmt)
&& TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
&& !VECT_SCALAR_BOOLEAN_TYPE_P
(TREE_TYPE (gimple_assign_rhs1 (stmt))))
{
scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
+ ops_are_booleans = false;
+
+ }
+ else if (is_gcond
+ && TREE_CODE (TREE_TYPE (gimple_cond_lhs (stmt)))
+ != BOOLEAN_TYPE)
+ {
+ scalar_type = TREE_TYPE (gimple_cond_lhs (stmt));
+ ops_are_booleans = false;
+ }
+
+ if (!ops_are_booleans)
+ {
mask_type = get_mask_type_for_scalar_type (scalar_type);
if (!mask_type)
@@ -1131,6 +1155,7 @@ _loop_vec_info::_loop_vec_info (struct loop *loop_in)
slp_unrolling_factor (1),
single_scalar_iteration_cost (0),
vectorizable (false),
+ speculative_execution (false),
can_fully_mask_p (true),
fully_masked_p (false),
peeling_for_gaps (false),
@@ -1140,7 +1165,10 @@ _loop_vec_info::_loop_vec_info (struct loop *loop_in)
has_mask_store (false),
scalar_loop (NULL),
orig_loop_info (NULL),
- vect_addr_base_htab (31)
+ vect_addr_base_htab (31),
+ exit_test_mask (NULL_TREE),
+ exit_mask (NULL_TREE),
+ nonspeculative_seq (NULL)
{
/* Create/Update stmt_info for all stmts in the loop. */
basic_block *body = get_loop_body (loop);
@@ -1252,6 +1280,7 @@ _loop_vec_info::~_loop_vec_info ()
free (bbs);
release_vec_loop_masks (&masks);
+ release_vec_loop_masks (&nonspeculative_masks);
loop->aux = NULL;
}
@@ -1296,22 +1325,40 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
{
struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
unsigned int min_ni_width;
+ unsigned HOST_WIDE_INT const_vf;
- /* Get the maximum number of iterations that is representable
- in the counter type. */
- tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
- widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
+ /* Get the number of bits needed to hold the number of iterations
+ as an unsigned value. */
+ if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo))
+ {
+ /* For speculative loops, we only need to count the number of iterations
+ before the vector loop. */
+ if (LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf))
+ {
+ unsigned int factor = vect_get_max_nscalars_per_iter (loop_vinfo);
+ min_ni_width = wi::min_precision (const_vf * factor, UNSIGNED);
+ }
+ else
+ min_ni_width = POINTER_SIZE;
+ }
+ else
+ {
+ /* Get the maximum number of iterations that is representable
+ in the counter type. */
+ tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
+ widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
- /* Get a more refined estimate for the number of iterations. */
- widest_int max_back_edges;
- if (max_loop_iterations (loop, &max_back_edges))
- max_ni = wi::smin (max_ni, max_back_edges + 1);
+ /* Get a more refined estimate for the number of iterations. */
+ widest_int max_back_edges;
+ if (max_loop_iterations (loop, &max_back_edges))
+ max_ni = wi::smin (max_ni, max_back_edges + 1);
- /* Account for rgroup masks, in which each bit is replicated N times. */
- max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
+ /* Account for rgroup masks, in which each bit is replicated N times. */
+ max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
- /* Work out how many bits we need to represent the limit. */
- min_ni_width = wi::min_precision (max_ni, UNSIGNED);
+ /* Work out how many bits we need to represent the limit. */
+ min_ni_width = wi::min_precision (max_ni, UNSIGNED);
+ }
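(As a worked example of the speculative branch above, not taken from the
patch itself: with a constant vectorization factor of 16 and a maximum of
2 scalars per iteration in the largest rgroup, the count that has to be
representable is at most 16 * 2 = 32 scalar iterations before the vector
loop, so min_ni_width becomes wi::min_precision (32, UNSIGNED) = 6 bits,
far narrower than what the full iteration count would require.)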
/* Find a scalar mode for which WHILE_ULT is supported. */
opt_scalar_int_mode cmp_mode_iter;
@@ -1672,7 +1719,8 @@ vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
if (integer_zerop (*assumptions)
|| !*number_of_iterations
- || chrec_contains_undetermined (*number_of_iterations))
+ || (loop->inner
+ && chrec_contains_undetermined (*number_of_iterations)))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -1680,6 +1728,15 @@ vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
"computed.\n");
return false;
}
+ else if (!loop->inner
+ && chrec_contains_undetermined (*number_of_iterations))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "number of iterations cannot be computed, "
+ "relying upon speculative execution\n");
+ return true;
+ }
if (integer_zerop (*number_of_iterations))
{
@@ -1706,6 +1763,21 @@ vect_analyze_loop_form (struct loop *loop)
return NULL;
loop_vec_info loop_vinfo = new _loop_vec_info (loop);
+
+ if (number_of_iterations
+ && chrec_contains_undetermined (number_of_iterations))
+ {
+ /* Nested loops are not supported for speculative execution. */
+ gcc_assert (!loop->inner);
+
+ LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo) = true;
+
+      /* Since we don't know the number of iterations, there seems little
+ point in having anything other than NULL. */
+ number_of_iterations = NULL;
+ number_of_iterationsm1 = NULL;
+ }
+
LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
@@ -2158,6 +2230,25 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
}
}
+ /* TODO: We can't currently support stores for speculative loops. */
+ if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)
+ && LOOP_VINFO_DATAREFS (loop_vinfo).length () > 0)
+ {
+ vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
+ struct data_reference *dr;
+ unsigned int i;
+
+ FOR_EACH_VEC_ELT (datarefs, i, dr)
+ if (!DR_IS_READ (dr))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "Stores not supported for speculative "
+ "loops.\n");
+ return false;
+ }
+ }
+
/* Analyze the data references and also adjust the minimal
vectorization factor according to the loads and stores. */
@@ -2259,7 +2350,8 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
/* We don't expect to have to roll back to anything other than an empty
set of rgroups. */
- gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
+ gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
+ && LOOP_VINFO_NONSPECULATIVE_MASKS (loop_vinfo).is_empty ());
/* This is the point where we can re-start analysis with SLP forced off. */
start_over:
@@ -2337,6 +2429,19 @@ start_over:
return false;
}
+ if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
+ && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
+ && LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)
+ && !LOOP_VINFO_NEEDS_NONSPECULATIVE_MASKS (loop_vinfo)
+ && !use_capped_vf (loop_vinfo))
+ {
+ LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "No need to predicate speculative loops without "
+ "alignment peeling.\n");
+ }
+
/* Decide whether to use a fully-masked loop for this vectorization
factor. */
LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
@@ -2352,17 +2457,41 @@ start_over:
"not using a fully-masked loop.\n");
}
- if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
- && use_capped_vf (loop_vinfo))
+ if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
{
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "Need to cap the runtime vectorization factor to "
- HOST_WIDE_INT_PRINT_DEC " but cannot fully mask"
- " the loop.\n",
- LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
- /* Undoing SLP might allow us to use a mask. */
- goto again;
+ if (LOOP_VINFO_NEEDS_NONSPECULATIVE_MASKS (loop_vinfo))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "Not vectorized: non-speculative operations "
+ "need a fully-masked loop.\n");
+ return false;
+ }
+
+ if (use_capped_vf (loop_vinfo))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "Need to cap the runtime vectorization factor to "
+ HOST_WIDE_INT_PRINT_DEC " but cannot fully mask"
+ " the loop.\n",
+ LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
+ /* Undoing SLP might allow us to use a mask. */
+ goto again;
+ }
+ }
+
+ if (LOOP_VINFO_NEEDS_NONSPECULATIVE_MASKS (loop_vinfo))
+ {
+ tree mask_type = vect_mask_type_for_speculation (loop_vinfo);
+ if (!direct_internal_fn_supported_p (IFN_BREAK_AFTER, mask_type,
+ OPTIMIZE_FOR_SPEED))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "Not vectorized: BREAK_AFTER not supported.\n");
+ return false;
+ }
}
/* If epilog loop is required because of data accesses with gaps,
@@ -2385,6 +2514,17 @@ start_over:
}
}
+ if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+ && LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)
+ && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "Not supported: peeling speculative vectorization"
+ " without a fully-masked loop.\n");
+ return false;
+ }
+
/* Check the costings of the loop make vectorizing worthwhile. */
res = vect_analyze_loop_costing (loop_vinfo);
if (res < 0)
@@ -2402,7 +2542,9 @@ start_over:
th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
unsigned HOST_WIDE_INT const_vf;
- if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+ if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo))
+ LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
+ else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
/* The main loop handles all iterations. */
LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
@@ -2448,7 +2590,8 @@ start_over:
enough for both peeled prolog loop and vector loop. This check
can be merged along with threshold check of loop versioning, so
increase threshold for this case if necessary. */
- if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
+ if (LOOP_REQUIRES_VERSIONING (loop_vinfo)
+ && !LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo))
{
poly_uint64 niters_th = 0;
@@ -2574,6 +2717,7 @@ again:
= init_cost (LOOP_VINFO_LOOP (loop_vinfo));
/* Reset accumulated rgroup information. */
release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
+ release_vec_loop_masks (&LOOP_VINFO_NONSPECULATIVE_MASKS (loop_vinfo));
/* Reset assorted flags. */
LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
@@ -6147,11 +6291,19 @@ vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
SSA_NAME_DEF_STMT (reduc_var) = new_stmt;
/* For chained SLP stmt is the first statement in the group and
gsi points to the last statement in the group. For non SLP stmt
- points to the same location as gsi. In either case tmp_gsi and gsi
- should both point to the same insertion point. */
- gcc_assert (scalar_dest_def == gsi_stmt (*gsi));
- vect_finish_replace_stmt (scalar_dest_def, new_stmt);
- }
+ points to the same location as gsi. */
+ if (scalar_dest_def == gsi_stmt (*gsi))
+ vect_finish_replace_stmt (scalar_dest_def, new_stmt);
+ else
+ {
+ /* In this case we're moving the definition to later in the
+ block. That doesn't matter because the only uses of the
+ lhs are in phi statements. */
+ gimple_stmt_iterator old_gsi = gsi_for_stmt (scalar_dest_def);
+ gsi_remove (&old_gsi, true);
+ vect_finish_stmt_generation (stmt, new_stmt, gsi);
+ }
+ }
else
{
reduc_var = make_ssa_name (reduc_var, new_stmt);
@@ -7144,7 +7296,13 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
vec_num = 1;
internal_fn cond_fn = get_conditional_internal_fn (code, scalar_type);
+
+ /* In a speculative loop, the update must be predicated on the
+ nonspeculative masks, so that we don't include speculatively
+ loaded elements from beyond the end of the original loop. */
vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
+ if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo))
+ masks = &LOOP_VINFO_NONSPECULATIVE_MASKS (loop_vinfo);
if (!vec_stmt) /* transformation not required. */
{
@@ -7190,6 +7348,12 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
+ gimple_stmt_iterator nonspeculative_gsi
+ = gsi_end (LOOP_VINFO_NONSPECULATIVE_SEQ (loop_vinfo));
+ if (masked_loop_p
+ && masks == &LOOP_VINFO_NONSPECULATIVE_MASKS (loop_vinfo))
+ gsi = &nonspeculative_gsi;
+
if (reduction_type == FOLD_LEFT_REDUCTION)
return vectorize_fold_left_reduction
(stmt, gsi, vec_stmt, slp_node, reduc_def_stmt, code,
@@ -8036,6 +8200,37 @@ vectorizable_live_operation (gimple *stmt,
}
}
+ if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo))
+ {
+      /* Need to construct the type because at the checking stage we don't
+ yet have the speculative exit phi. */
+ tree mask_type = build_same_sized_truth_vector_type (vectype);
+
+ if (!direct_internal_fn_supported_p (IFN_BREAK_AFTER, mask_type,
+ OPTIMIZE_FOR_SPEED))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "not vectorized: break after not supported.\n");
+ return false;
+ }
+ if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
+ OPTIMIZE_FOR_SPEED))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "not vectorized: extract last not supported.\n");
+ return false;
+ }
+ if (ncopies > 1)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "not vectorized: ncopies is greater than 1.\n");
+ return false;
+ }
+ }
+
if (!vec_stmt)
{
/* No transformation required. */
@@ -8122,19 +8317,37 @@ vectorizable_live_operation (gimple *stmt,
gimple_seq stmts = NULL;
tree new_tree;
- if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+ if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+ || LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo))
{
+ tree mask;
+ if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo))
+ {
+ gcc_assert (ncopies == 1);
+ tree orig_mask = LOOP_VINFO_EXIT_MASK (loop_vinfo);
+ tree all_ones = build_minus_one_cst (TREE_TYPE (orig_mask));
+
+ mask = make_ssa_name (TREE_TYPE (orig_mask));
+ gcall *new_stmt = gimple_build_call_internal (IFN_BREAK_AFTER, 2,
+ all_ones, orig_mask);
+ gimple_call_set_lhs (new_stmt, mask);
+ gimple_seq_add_stmt (&stmts, new_stmt);
+ }
+ else
+ {
+ gcc_assert (ncopies == 1 && !slp_node);
+ mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
+ 1, vectype, 0);
+ }
+
/* Emit:
SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
where VEC_LHS is the vectorized live-out result and MASK is
the loop mask for the final iteration. */
- gcc_assert (ncopies == 1 && !slp_node);
tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
tree scalar_res = make_ssa_name (scalar_type);
- tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
- 1, vectype, 0);
gcall *new_stmt = gimple_build_call_internal (IFN_EXTRACT_LAST,
2, mask, vec_lhs);
gimple_call_set_lhs (new_stmt, scalar_res);
@@ -8226,6 +8439,9 @@ vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
static bool
loop_niters_no_overflow (loop_vec_info loop_vinfo)
{
+ if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo))
+ return false;
+
/* Constant case. */
if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
{
@@ -8292,6 +8508,14 @@ vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
rgm->max_nscalars_per_iter = nscalars_per_iter;
rgm->mask_type = build_same_sized_truth_vector_type (vectype);
}
+
+ /* Ensure that the required nonspeculative masks are a subset of
+ the speculative ones. This has two benefits: it means that we
+ can test for target support in one go, and that we can AND in
+ the speculative masks when setting up the nonspeculative ones. */
+ if (masks == &LOOP_VINFO_NONSPECULATIVE_MASKS (loop_vinfo))
+ vect_record_loop_mask (loop_vinfo, &LOOP_VINFO_MASKS (loop_vinfo),
+ nvectors, vectype);
}
/* Given a complete set of masks MASKS, extract mask number INDEX
@@ -8343,6 +8567,52 @@ vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
return mask;
}
+/* Get the mask to use for loads in LOOP_VINFO, or null if loads don't
+ need to be masked. The arguments are as for vec_get_loop_mask. */
+
+tree
+vect_get_load_mask (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
+ unsigned int nvectors, tree vectype, unsigned int index)
+{
+ /* At present all loads in a speculative loop are speculative.
+ They need to be masked iff we are using masking to reach
+ alignment. */
+ if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)
+ && !LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
+ return NULL_TREE;
+
+ return vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
+ nvectors, vectype, index);
+}
+
+/* Return the mask type to use when computing which scalar iterations
+ are active in speculative loop LOOP_VINFO. */
+
+tree
+vect_mask_type_for_speculation (loop_vec_info loop_vinfo)
+{
+ gcc_checking_assert (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo));
+ return build_truth_vector_type (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
+ current_vector_size);
+}
+
+/* Calculate the scalar number of iterations in NIM from its mask,
+ adding any new statements to SEQ. Return the number of iterations. */
+
+tree
+vect_get_niters_from_mask (gimple_seq *seq, vec_niters_and_mask *nim)
+{
+ if (nim->niters == NULL_TREE)
+ {
+ nim->niters = make_temp_ssa_name (sizetype, NULL, "niters");
+ gcall *call = gimple_build_call_internal (IFN_MASK_POPCOUNT,
+ 1, nim->mask);
+ gimple_call_set_lhs (call, nim->niters);
+ gimple_seq_add_stmt (seq, call);
+ }
+ return nim->niters;
+}
+
/* Scale profiling counters by estimation for LOOP which is vectorized
by factor VF. */
@@ -8419,7 +8689,8 @@ vect_transform_loop (loop_vec_info loop_vinfo)
checking is pointless, too. */
th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
if (th >= vect_vf_for_cost (loop_vinfo)
- && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
+ && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+ && !LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
@@ -8483,8 +8754,10 @@ vect_transform_loop (loop_vec_info loop_vinfo)
&step_vector, &niters_vector_mult_vf, th,
check_profitability, niters_no_overflow);
- if (niters_vector == NULL_TREE)
+ if (niters_vector == NULL_TREE
+ && !LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo))
{
+ gcc_assert (!LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
&& !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
&& must_eq (lowest_vf, vf))
@@ -8511,6 +8784,16 @@ vect_transform_loop (loop_vec_info loop_vinfo)
/* This will deal with any possible peeling. */
vect_prepare_for_masked_peels (loop_vinfo);
+ if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo))
+ {
+ tree mask_type = vect_mask_type_for_speculation (loop_vinfo);
+ /* Create a dummy definition of the exit mask. We'll fill in the
+ real definition later. */
+ tree mask = make_temp_ssa_name (mask_type, NULL, "exit_mask");
+ SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
+ LOOP_VINFO_EXIT_MASK (loop_vinfo) = mask;
+ }
+
/* FORNOW: the vectorizer supports only loops which body consist
of one basic block (header + empty latch). When the vectorizer will
support more involved loop forms, the order by which the BBs are
@@ -8770,9 +9053,18 @@ vect_transform_loop (loop_vec_info loop_vinfo)
}
} /* BBs in loop */
+ /* Provide the real definition of LOOP_VINFO_EXIT_MASK. */
+ if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo))
+ {
+ tree imask = LOOP_VINFO_EXIT_TEST_MASK (loop_vinfo);
+ tree omask = LOOP_VINFO_EXIT_MASK (loop_vinfo);
+ gphi *new_phi = create_phi_node (omask, single_exit (loop)->dest);
+ add_phi_arg (new_phi, imask, single_exit (loop), UNKNOWN_LOCATION);
+ }
+
/* The vectorization factor is always > 1, so if we use an IV increment of 1.
a zero NITERS becomes a nonzero NITERS_VECTOR. */
- if (integer_onep (step_vector))
+ if (step_vector && integer_onep (step_vector))
niters_no_overflow = true;
vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
niters_vector_mult_vf, !niters_no_overflow);
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index c0a87dc9275..36443cff685 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -304,7 +304,7 @@ is_simple_and_all_uses_invariant (gimple *stmt, loop_vec_info loop_vinfo)
A stmt is considered "relevant for vectorization" if:
- it has uses outside the loop.
- it has vdefs (it alters memory).
- - control stmts in the loop (except for the exit condition).
+ - control stmts in the loop (including the exit condition).
CHECKME: what other side effects would the vectorizer allow? */
@@ -323,8 +323,9 @@ vect_stmt_relevant_p (gimple *stmt, loop_vec_info loop_vinfo,
/* cond stmt other than loop exit cond. */
if (is_ctrl_stmt (stmt)
- && STMT_VINFO_TYPE (vinfo_for_stmt (stmt))
- != loop_exit_ctrl_vec_info_type)
+ && (STMT_VINFO_TYPE (vinfo_for_stmt (stmt))
+ != loop_exit_ctrl_vec_info_type
+ || LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)))
*relevant = vect_used_in_scope;
/* changing memory. */
@@ -688,6 +689,12 @@ vect_mark_stmts_to_be_vectorized (loop_vec_info loop_vinfo)
}
}
+ /* The exit condition is relevant for speculative loops. */
+ if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)
+ && !vect_stmt_relevant_p (get_loop_exit_condition (loop),
+ loop_vinfo, &relevant, &live_p))
+ gcc_unreachable ();
+
/* 2. Process_worklist */
while (worklist.length () > 0)
{
@@ -2137,7 +2144,8 @@ get_group_load_store_type (gimple *stmt, tree vectype, bool slp,
bool can_overrun_p = (!masked_p
&& vls_type == VLS_LOAD
&& loop_vinfo
- && !loop->inner);
+ && !loop->inner
+ && !LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo));
/* There can only be a gap at the end of the group if the stride is
known at compile time. */
@@ -4506,6 +4514,30 @@ vect_create_vectorized_promotion_stmts (vec<tree> *vec_oprnds0,
*vec_oprnds0 = vec_tmp;
}
+/* Pack the masks in MASKS to a single mask and return it. Insert any
+ new statements before GSI. Leave MASKS with just the returned value
+ on exit. */
+
+static tree
+vect_demote_masks (gimple_stmt_iterator *gsi, vec<tree> *masks)
+{
+ while (masks->length () > 1)
+ {
+ unsigned int nresults = masks->length () / 2;
+ tree dest_type = vect_double_mask_nunits (TREE_TYPE ((*masks)[0]));
+ for (unsigned int i = 0; i < nresults; ++i)
+ {
+ tree dest = make_ssa_name (dest_type);
+ gimple *stmt = gimple_build_assign (dest, VEC_PACK_TRUNC_EXPR,
+ (*masks)[i * 2],
+ (*masks)[i * 2 + 1]);
+ gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
+ (*masks)[i] = dest;
+ }
+ masks->truncate (nresults);
+ }
+ return (*masks)[0];
+}
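(A scalar sketch of the packing step used above, not part of the patch; it
assumes that VEC_PACK_TRUNC_EXPR applied to two mask vectors behaves as a
concatenation of their lanes into one mask with twice as many, narrower,
elements:

   /* Concatenate the NLANES lanes of LO followed by those of HI into
      DEST, which has 2 * NLANES lanes.  Repeating this pairwise, as the
      loop above does, reduces any power-of-two number of masks to one.  */
   static void
   model_vec_pack_trunc (unsigned char *dest, const unsigned char *lo,
                         const unsigned char *hi, unsigned int nlanes)
   {
     for (unsigned int j = 0; j < nlanes; j++)
       {
         dest[j] = lo[j];
         dest[nlanes + j] = hi[j];
       }
   }
)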
/* Check if STMT performs a conversion operation, that can be vectorized.
If VEC_STMT is also passed, vectorize the STMT: create a vectorized
@@ -6203,6 +6235,7 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
if (loop_vinfo)
{
+ gcc_assert (!LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo));
loop = LOOP_VINFO_LOOP (loop_vinfo);
vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
}
@@ -7335,6 +7368,14 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
return false;
}
+ if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "speculative mask loads not supported\n");
+ return false;
+ }
+
int mask_index = internal_fn_mask_index (ifn);
if (mask_index >= 0)
{
@@ -7370,12 +7411,24 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
gcc_assert (ncopies >= 1);
/* FORNOW. This restriction should be relaxed. */
- if (nested_in_vect_loop && ncopies > 1)
+ if (ncopies > 1)
{
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "multiple types in nested loop.\n");
- return false;
+ if (nested_in_vect_loop)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "multiple types in nested loop.\n");
+ return false;
+ }
+
+ if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "multiple copies not supported for speculative "
+ "loops.\n");
+ return false;
+ }
}
/* Invalidate assumptions made by dependence analysis when vectorization
@@ -7988,7 +8041,6 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
tree vec_mask = NULL_TREE;
prev_stmt_info = NULL;
poly_uint64 group_elt = 0;
- vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
for (j = 0; j < ncopies; j++)
{
/* 1. Create the vector or array pointer update chain. */
@@ -8079,7 +8131,8 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
tree final_mask = NULL_TREE;
if (masked_loop_p)
- final_mask = vect_get_loop_mask (gsi, masks, ncopies, vectype, j);
+ final_mask = vect_get_load_mask (loop_vinfo, gsi, ncopies,
+ vectype, j);
if (vec_mask)
final_mask = prepare_load_store_mask (mask_vectype, final_mask,
vec_mask, gsi);
@@ -8126,7 +8179,8 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
tree final_mask = NULL_TREE;
if (masked_loop_p
&& memory_access_type != VMAT_INVARIANT)
- final_mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
+ final_mask = vect_get_load_mask (loop_vinfo, gsi,
+ vec_num * ncopies,
vectype, vec_num * j + i);
if (vec_mask)
final_mask = prepare_load_store_mask (mask_vectype, final_mask,
@@ -8162,10 +8216,10 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
break;
}
- align = DR_TARGET_ALIGNMENT (dr);
if (alignment_support_scheme == dr_aligned)
{
gcc_assert (aligned_access_p (first_dr));
+ align = DR_TARGET_ALIGNMENT (first_dr);
misalign = 0;
}
else if (DR_MISALIGNMENT (first_dr) == -1)
@@ -8174,7 +8228,10 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
misalign = 0;
}
else
- misalign = DR_MISALIGNMENT (first_dr);
+ {
+ align = DR_TARGET_ALIGNMENT (first_dr);
+ misalign = DR_MISALIGNMENT (first_dr);
+ }
if (dataref_offset == NULL_TREE
&& TREE_CODE (dataref_ptr) == SSA_NAME)
set_ptr_info_alignment (get_ptr_info (dataref_ptr),
@@ -8934,12 +8991,11 @@ vectorizable_comparison (gimple *stmt, gimple_stmt_iterator *gsi,
gimple **vec_stmt, tree reduc_def,
slp_tree slp_node)
{
- tree lhs, rhs1, rhs2;
+ tree rhs1, rhs2;
stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
tree vectype1 = NULL_TREE, vectype2 = NULL_TREE;
tree vectype = STMT_VINFO_VECTYPE (stmt_info);
tree vec_rhs1 = NULL_TREE, vec_rhs2 = NULL_TREE;
- tree new_temp;
loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
enum vect_def_type dts[2] = {vect_unknown_def_type, vect_unknown_def_type};
int ndts = 2;
@@ -8983,16 +9039,55 @@ vectorizable_comparison (gimple *stmt, gimple_stmt_iterator *gsi,
return false;
}
- if (!is_gimple_assign (stmt))
- return false;
+ if (is_gimple_assign (stmt))
+ {
+ code = gimple_assign_rhs_code (stmt);
+ rhs1 = gimple_assign_rhs1 (stmt);
+ rhs2 = gimple_assign_rhs2 (stmt);
+ }
+ else if (gimple_code (stmt) == GIMPLE_COND)
+ {
+ gcc_assert (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo));
- code = gimple_assign_rhs_code (stmt);
+ /* TODO: Support more complex loops with more than one gcond stmt. */
+ struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+ gcc_assert (stmt == get_loop_exit_condition (loop));
- if (TREE_CODE_CLASS (code) != tcc_comparison)
+ rhs1 = gimple_cond_lhs (stmt);
+ rhs2 = gimple_cond_rhs (stmt);
+
+ code = gimple_cond_code (stmt);
+ edge exit_edge = single_exit (loop);
+ if (exit_edge->flags & EDGE_FALSE_VALUE)
+ {
+ /* We want to invert the code and generate a mask such that if any
+ bit is true the exit condition is met. */
+ bool honor_nans = FLOAT_TYPE_P (TREE_TYPE (rhs1));
+ code = invert_tree_comparison (code, honor_nans);
+ if (code == ERROR_MARK)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "Cannot invert condition code. Loop cannot "
+ "be speculatively executed.\n");
+ return false;
+ }
+ }
+
+ if (optab_handler (cbranch_optab, TYPE_MODE (vectype))
+ == CODE_FOR_nothing)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "Target does not support testing a mask.\n");
+ return false;
+ }
+ }
+ else
return false;
- rhs1 = gimple_assign_rhs1 (stmt);
- rhs2 = gimple_assign_rhs2 (stmt);
+ if (TREE_CODE_CLASS (code) != tcc_comparison)
+ return false;
if (!vect_is_simple_use (rhs1, stmt_info->vinfo, &def_stmt,
&dts[0], &vectype1))
@@ -9070,6 +9165,17 @@ vectorizable_comparison (gimple *stmt, gimple_stmt_iterator *gsi,
STMT_VINFO_TYPE (stmt_info) = comparison_vec_info_type;
vect_model_simple_cost (stmt_info, ncopies * (1 + (bitop2 != NOP_EXPR)),
dts, ndts, NULL, NULL);
+
+  /* Speculative loops need to AND the comparison result with the
+ mask of active values. */
+ if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
+ && LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo))
+ {
+ tree final_type = vect_mask_type_for_speculation (loop_vinfo);
+ vect_record_loop_mask (loop_vinfo, &LOOP_VINFO_MASKS (loop_vinfo),
+ 1, final_type);
+ }
+
if (bitop1 == NOP_EXPR)
return expand_vec_cmp_expr_p (vectype, mask_type, code);
else
@@ -9099,8 +9205,26 @@ vectorizable_comparison (gimple *stmt, gimple_stmt_iterator *gsi,
}
/* Handle def. */
- lhs = gimple_assign_lhs (stmt);
- mask = vect_create_destination_var (lhs, mask_type);
+ if (is_gimple_assign (stmt))
+ {
+ tree lhs = gimple_assign_lhs (stmt);
+ mask = vect_create_destination_var (lhs, mask_type);
+ }
+ else
+ mask = NULL_TREE;
+
+ bool masked_speculative_p
+ = (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)
+ && LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
+
+  /* Collect the comparison results that feed a GIMPLE_COND.  If all
+     input elements are valid, the packed results can be used directly
+     as the exit test mask.  If masking is needed, collect the unmasked
+     results first and apply the mask to the packed result afterwards.
+
+     This is ignored (and cheap) if the statement isn't a GIMPLE_COND. */
+ auto_vec<tree, 16> cmp_results;
/* Handle cmp expr. */
for (j = 0; j < ncopies; j++)
@@ -9144,34 +9268,42 @@ vectorizable_comparison (gimple *stmt, gimple_stmt_iterator *gsi,
{
vec_rhs2 = vec_oprnds1[i];
- new_temp = make_ssa_name (mask);
+ tree cmp_res = (mask != NULL_TREE
+ ? make_ssa_name (mask)
+ : make_ssa_name (mask_type));
if (bitop1 == NOP_EXPR)
{
- new_stmt = gimple_build_assign (new_temp, code,
+ new_stmt = gimple_build_assign (cmp_res, code,
vec_rhs1, vec_rhs2);
vect_finish_stmt_generation (stmt, new_stmt, gsi);
}
else
{
+ tree bitop1_res = (bitop2 == NOP_EXPR
+ ? cmp_res
+ : make_ssa_name (TREE_TYPE (cmp_res)));
if (bitop1 == BIT_NOT_EXPR)
- new_stmt = gimple_build_assign (new_temp, bitop1, vec_rhs2);
+ new_stmt = gimple_build_assign (bitop1_res, bitop1, vec_rhs2);
else
- new_stmt = gimple_build_assign (new_temp, bitop1, vec_rhs1,
+ new_stmt = gimple_build_assign (bitop1_res, bitop1, vec_rhs1,
vec_rhs2);
vect_finish_stmt_generation (stmt, new_stmt, gsi);
if (bitop2 != NOP_EXPR)
{
- tree res = make_ssa_name (mask);
if (bitop2 == BIT_NOT_EXPR)
- new_stmt = gimple_build_assign (res, bitop2, new_temp);
+ new_stmt = gimple_build_assign (cmp_res, bitop2,
+ bitop1_res);
else
- new_stmt = gimple_build_assign (res, bitop2, vec_rhs1,
- new_temp);
+ new_stmt = gimple_build_assign (cmp_res, bitop2,
+ vec_rhs1, bitop1_res);
vect_finish_stmt_generation (stmt, new_stmt, gsi);
}
}
+
if (slp_node)
SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
+
+ cmp_results.safe_push (cmp_res);
}
if (slp_node)
@@ -9188,6 +9320,42 @@ vectorizable_comparison (gimple *stmt, gimple_stmt_iterator *gsi,
vec_oprnds0.release ();
vec_oprnds1.release ();
+ if (gimple_code (stmt) == GIMPLE_COND)
+ {
+ gcc_assert (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo));
+
+ struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+ gcond *cond = get_loop_exit_condition (loop);
+ gcc_assert (cond);
+ gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (cond);
+
+ tree cmp_res = vect_demote_masks (&loop_cond_gsi, &cmp_results);
+ mask_type = TREE_TYPE (cmp_res);
+ if (masked_speculative_p)
+ {
+ /* Work out which elements of the unmasked result are valid. */
+ mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
+ 1, mask_type, 0);
+
+ /* Get the mask of values that actually matter. */
+ tree masked_res = make_ssa_name (mask_type);
+ gimple *tmp_stmt = gimple_build_assign (masked_res, BIT_AND_EXPR,
+ cmp_res, mask);
+ gsi_insert_before (&loop_cond_gsi, tmp_stmt, GSI_SAME_STMT);
+ cmp_res = masked_res;
+ }
+ LOOP_VINFO_EXIT_TEST_MASK (loop_vinfo) = cmp_res;
+
+ /* Get a boolean result that tells us whether to iterate. It's easier
+ to modify the condition in-place than to generate a new one and
+ delete the old one. */
+ edge exit_edge = single_exit (loop);
+ tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? NE_EXPR : EQ_EXPR;
+ tree zero_mask = build_zero_cst (mask_type);
+ gimple_cond_set_condition (cond, code, cmp_res, zero_mask);
+ update_stmt (cond);
+ }
+
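(To summarise the rewrite performed above with a scalar sketch, not part of
the patch and using hypothetical names: for a loop whose scalar exit test
was "if (a[i] == limit) break", the vector body now computes a per-lane
comparison, optionally ANDs it with the mask of lanes that really belong to
the original loop, and branches on whether any lane survived:

   /* Return nonzero if the (possibly masked) vector comparison has any
      true lane, i.e. if the rewritten GIMPLE_COND should take the exit
      edge in the EDGE_TRUE_VALUE case.  */
   static int
   model_vector_exit_test (const int *a, unsigned int vf, int limit,
                           const unsigned char *active /* NULL if all */)
   {
     for (unsigned int j = 0; j < vf; j++)
       if ((!active || active[j]) && a[j] == limit)
         return 1;
     return 0;
   }

The surviving mask itself is stored in LOOP_VINFO_EXIT_TEST_MASK so that
the nonspeculative masks and the live-out extraction can later recover
which scalar iteration actually triggered the exit.)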
return true;
}
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 8073ba05a83..2afafda6b25 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -414,7 +414,8 @@ typedef struct _loop_vec_info : public vec_info {
vec_niters_and_mask cap;
/* The masks that a fully-masked loop should use to avoid operating
- on inactive scalars. */
+ on inactive scalars. In a speculative loop, these masks control
+ the operations that can be executed speculatively. */
vec_loop_masks masks;
/* If we are using a loop mask to align memory addresses, this variable
@@ -489,6 +490,9 @@ typedef struct _loop_vec_info : public vec_info {
/* Is the loop vectorizable? */
bool vectorizable;
+ /* Is this a speculative loop? */
+ bool speculative_execution;
+
/* Records whether we still have the option of using a fully-masked loop. */
bool can_fully_mask_p;
@@ -546,6 +550,22 @@ typedef struct _loop_vec_info : public vec_info {
/* A map from X to a precomputed gimple_val containing
CAPPED_VECTORIZATION_FACTOR * X. */
hash_map<tree, tree> vf_mult_map;
+
+ /* In a speculative loop, this is the result of the exit comparison.
+ It is a vector mask with one element for each scalar iteration. */
+ tree exit_test_mask;
+
+ /* A value equal to EXIT_TEST_MASK for use outside the loop. */
+ tree exit_mask;
+
+ /* In a speculative loop, these masks are used to control operations
+ that cannot be speculatively executed. */
+ vec_loop_masks nonspeculative_masks;
+
+ /* Statements in a speculative loop that depend on nonspeculative masks.
+ These statements can only be executed after the exit condition has
+ been evaluated. */
+ gimple_seq nonspeculative_seq;
} *loop_vec_info;
/* Access Functions. */
@@ -599,6 +619,14 @@ typedef struct _loop_vec_info : public vec_info {
#define LOOP_VINFO_ORIG_LOOP_INFO(L) (L)->orig_loop_info
#define LOOP_VINFO_ADDR_CACHE(L) (L)->vect_addr_base_htab
#define LOOP_VINFO_VF_MULT_MAP(L) (L)->vf_mult_map
+#define LOOP_VINFO_SPECULATIVE_EXECUTION(L) (L)->speculative_execution
+#define LOOP_VINFO_EXIT_TEST_MASK(L) (L)->exit_test_mask
+#define LOOP_VINFO_EXIT_MASK(L) (L)->exit_mask
+#define LOOP_VINFO_NONSPECULATIVE(L) (L)->nonspeculative
+#define LOOP_VINFO_NEEDS_NONSPECULATIVE_MASKS(L) \
+ (!(L)->nonspeculative_masks.is_empty ())
+#define LOOP_VINFO_NONSPECULATIVE_MASKS(L) (L)->nonspeculative_masks
+#define LOOP_VINFO_NONSPECULATIVE_SEQ(L) (L)->nonspeculative_seq
#define LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT(L) \
((L)->may_misalign_stmts.length () > 0)
@@ -1625,6 +1653,10 @@ extern void vect_record_loop_mask (loop_vec_info, vec_loop_masks *,
unsigned int, tree);
extern tree vect_get_loop_mask (gimple_stmt_iterator *, vec_loop_masks *,
unsigned int, tree, unsigned int);
+extern tree vect_get_load_mask (loop_vec_info, gimple_stmt_iterator *,
+ unsigned int, tree, unsigned int);
+extern tree vect_mask_type_for_speculation (loop_vec_info);
+extern tree vect_get_niters_from_mask (gimple_seq *, vec_niters_and_mask *);
/* Drive for loop transformation stage. */
extern struct loop *vect_transform_loop (loop_vec_info);