diff options
author | Richard Sandiford <richard.sandiford@linaro.org> | 2017-06-23 17:52:44 +0100 |
---|---|---|
committer | Richard Sandiford <richard.sandiford@linaro.org> | 2017-11-20 16:01:23 +0000 |
commit | 02cf0942b05e2278c7e251b969092b64f06b915d (patch) | |
tree | f9344ed179868f44cee2a9cdb6d31a3b0d38dfe6 | |
parent | 655e3625f9c65f2c9d4e8c76eeca5edf9254afeb (diff) | |
download | gcc-02cf0942b05e2278c7e251b969092b64f06b915d.tar.gz |
Add support for speculative loads
[Branch only patch -- not intended for trunk in its current state]
This patch adds support for speculative loads in cases where the loads
are (or can be made to be) aligned to a full vector size. Such loads
can never partially fault and they should be more efficient than
first-faulting loads for the cases that they can handle.
41 files changed, 2138 insertions, 96 deletions
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index 5d84b7fc595..e381cfcabe2 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -2379,6 +2379,16 @@ "<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %2.<Vetype>" ) +(define_insn "break_after_<mode>" + [(set (match_operand:PRED_ALL 0 "register_operand" "=Upa") + (unspec:PRED_ALL + [(match_operand:PRED_ALL 1 "register_operand" "Upa") + (match_operand:PRED_ALL 2 "register_operand" "Upa")] + UNSPEC_BRKA))] + "TARGET_SVE" + "brka\t%0.b, %1/z, %2.b" +) + (define_expand "mask_popcount<mode>" [(set (match_operand:DI 0 "register_operand") (unspec:DI [(match_dup 2) diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index 581e6a753d2..37dcd85440e 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -168,6 +168,7 @@ UNSPEC_CLASTB UNSPEC_FADDA UNSPEC_CNTP + UNSPEC_BRKA ]) (define_c_enum "unspecv" [ diff --git a/gcc/gimple-iterator.h b/gcc/gimple-iterator.h index 70f18beceff..8e9fe1f087d 100644 --- a/gcc/gimple-iterator.h +++ b/gcc/gimple-iterator.h @@ -152,6 +152,22 @@ gsi_last_1 (gimple_seq *seq) #define gsi_last(x) gsi_last_1 (&(x)) +/* Return a new iterator initially pointing at the end of SEQ. */ + +static inline gimple_stmt_iterator +gsi_end_1 (gimple_seq *seq) +{ + gimple_stmt_iterator i; + + i.ptr = NULL; + i.seq = seq; + i.bb = i.ptr ? gimple_bb (i.ptr) : NULL; + + return i; +} + +#define gsi_end(x) gsi_end_1 (&(x)) + /* Return a new iterator pointing to the last statement in basic block BB. */ static inline gimple_stmt_iterator diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def index f28519837f2..1ff1d832eeb 100644 --- a/gcc/internal-fn.def +++ b/gcc/internal-fn.def @@ -155,6 +155,16 @@ DEF_INTERNAL_COND_OPTAB_FN (XOR, ECF_CONST | ECF_NOTHROW, xor, binary) DEF_INTERNAL_OPTAB_FN (RSQRT, ECF_CONST, rsqrt, unary) +/* IFN_BREAK_AFTER (A, B): + + - If A & B is all false, return A. 
+ - Otherwise find the first true bit in A & B. Copy bits of A up + to and including that bit and set the remaining bits to false. + + A, B and the return value are all vector masks. */ +DEF_INTERNAL_OPTAB_FN (BREAK_AFTER, ECF_CONST | ECF_NOTHROW, + break_after, binary) + /* Extract the last active element from a vector. */ DEF_INTERNAL_OPTAB_FN (EXTRACT_LAST, ECF_CONST | ECF_NOTHROW, extract_last, cond_unary) diff --git a/gcc/optabs.def b/gcc/optabs.def index bf67dfca132..d86dc803d5a 100644 --- a/gcc/optabs.def +++ b/gcc/optabs.def @@ -308,6 +308,7 @@ OPTAB_D (reduc_ior_scal_optab, "reduc_ior_scal_$a") OPTAB_D (reduc_xor_scal_optab, "reduc_xor_scal_$a") OPTAB_D (fold_left_plus_optab, "fold_left_plus_$a") +OPTAB_D (break_after_optab, "break_after_$a") OPTAB_D (extract_last_optab, "extract_last_$a") OPTAB_D (fold_extract_last_optab, "fold_extract_last_$a") diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_1.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_1.c new file mode 100644 index 00000000000..ba2f569fd5c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_1.c @@ -0,0 +1,62 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -fno-inline -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */ + +#include <stdint.h> + +/* Speculative loop with no data references. 
*/ + +#define SPEC_LOOP(ARGTYPE,INDUCTYPE)\ +INDUCTYPE spec_loop_##ARGTYPE##INDUCTYPE (ARGTYPE mask, ARGTYPE limit)\ +{\ + INDUCTYPE i = 0;\ + while ((i & mask) != limit)\ + i += 1;\ + return i;\ +}\ + +#define SPEC_FP_LOOP(ARGTYPE,INDUCTYPE,FPTYPE)\ +FPTYPE spec_fp_loop_##ARGTYPE##INDUCTYPE##FPTYPE (ARGTYPE mask, ARGTYPE limit)\ +{\ + INDUCTYPE i = 0;\ + FPTYPE f = 0.0;\ + while ((i & mask) != limit)\ + {\ + f += 1;\ + i += 1;\ + }\ + return f;\ +}\ + +SPEC_LOOP (uint8_t, uint8_t) +SPEC_LOOP (uint16_t, uint16_t) +SPEC_LOOP (uint32_t, uint32_t) +SPEC_LOOP (uint64_t, uint64_t) + +SPEC_LOOP (int8_t, int8_t) +SPEC_LOOP (int16_t, int16_t) +SPEC_LOOP (int32_t, int32_t) +SPEC_LOOP (int64_t, int64_t) + +/* Conversions. */ +SPEC_LOOP (uint16_t, uint8_t) + +SPEC_LOOP (uint32_t, uint8_t) +SPEC_LOOP (uint32_t, uint16_t) + +SPEC_LOOP (uint64_t, uint8_t) +SPEC_LOOP (uint64_t, uint16_t) +SPEC_LOOP (uint64_t, uint32_t) + +SPEC_FP_LOOP (uint32_t, uint32_t, float) +SPEC_FP_LOOP (uint64_t, uint64_t, double) + +SPEC_FP_LOOP (uint64_t, uint64_t, float) + +/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 17 "vect" } } */ +/* { dg-final { scan-assembler-times {\tbrka\tp[0-9]+\.b, p[0-9]+/z, p[0-9]+\.b} 17 } } */ +/* { dg-final { scan-assembler-times {\tlastb\tw[0-9]+, p[0-7], z[0-9]+\.b} 5 } } */ +/* { dg-final { scan-assembler-times {\tlastb\tw[0-9]+, p[0-7], z[0-9]+\.h} 4 } } */ +/* { dg-final { scan-assembler-times {\tlastb\tw[0-9]+, p[0-7], z[0-9]+\.s} 3 } } */ +/* { dg-final { scan-assembler-times {\tlastb\tx[0-9]+, p[0-7], z[0-9]+\.d} 2 } } */ +/* { dg-final { scan-assembler-times {\tlastb\ts[0-9]+, p[0-7], z[0-9]+\.s} 2 } } */ +/* { dg-final { scan-assembler-times {\tlastb\td[0-9]+, p[0-7], z[0-9]+\.d} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_10.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_10.c new file mode 100644 index 00000000000..c69164bb1ea --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_10.c @@ -0,0 +1,23 @@ 
+/* { dg-do compile } */ +/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */ + +/* Speculative loop with two loads from global buffers which can be aligned, + but does require peeling. */ + +int a[500]; +int b[500]; + +int +foo (int n) +{ + int i = 0; + do + i += 1; + while (a[i] + b[i] < n); + return i; +} + +/* { dg-final { scan-tree-dump-not "loop versioned for vectorization to enhance alignment" "vect" } } */ +/* { dg-final { scan-tree-dump "Alignment of access forced using peeling" "vect" } } */ +/* { dg-final { scan-tree-dump "misalign = 4 bytes of ref" "vect" } } */ +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_11.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_11.c new file mode 100644 index 00000000000..92e4adc5571 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_11.c @@ -0,0 +1,65 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */ + +#include <stdint.h> + +/* Speculative loop with two loads from global buffers which can be aligned + without any peeling. */ + +#define MAX_ARRAY_SIZE 500 + +#ifndef STRIDE_LEVEL +#define STRIDE_LEVEL 1 +#endif + +#define SPEC_LOOP(DATATYPE, ARGTYPE)\ +DATATYPE a##DATATYPE[MAX_ARRAY_SIZE];\ +DATATYPE b##DATATYPE[MAX_ARRAY_SIZE];\ +ARGTYPE spec_loop_##DATATYPE##_##ARGTYPE (DATATYPE n)\ +{\ + ARGTYPE i = -1;\ + do\ + i += 1;\ + while (a##DATATYPE[i*STRIDE_LEVEL] + b##DATATYPE[i*STRIDE_LEVEL] < n);\ + return i;\ +} + +/* TODO: Cannot yet vectorize due to gather load. 
*/ +SPEC_LOOP (int8_t, int8_t) +SPEC_LOOP (int16_t, int16_t) + +SPEC_LOOP (int32_t, int32_t) +SPEC_LOOP (int64_t, int64_t) +SPEC_LOOP (float, int32_t) +SPEC_LOOP (double, int64_t) + +/* { dg-final { scan-tree-dump-not "loop versioned for vectorization to enhance alignment" "vect" } } */ +/* { dg-final { scan-tree-dump-not "Alignment of access forced using peeling" "vect" } } */ + +/* { dg-final { scan-tree-dump "force alignment of aint8_t" "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump "force alignment of bint8_t" "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump "force alignment of aint16_t" "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump "force alignment of bint16_t" "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump "force alignment of aint32_t" "vect" } } */ +/* { dg-final { scan-tree-dump "force alignment of bint32_t" "vect" } } */ +/* { dg-final { scan-tree-dump "force alignment of aint64_t" "vect" } } */ +/* { dg-final { scan-tree-dump "force alignment of bint64_t" "vect" } } */ +/* { dg-final { scan-tree-dump "force alignment of afloat" "vect" } } */ +/* { dg-final { scan-tree-dump "force alignment of bfloat" "vect" } } */ +/* { dg-final { scan-tree-dump "force alignment of adouble" "vect" } } */ +/* { dg-final { scan-tree-dump "force alignment of bdouble" "vect" } } */ + +/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint8_t" "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref bint8_t" "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint16_t" "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref bint16_t" "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint32_t" "vect" } } */ +/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref bint32_t" "vect" } } */ +/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint64_t" "vect" } } */ +/* 
{ dg-final { scan-tree-dump "misalign = 0 bytes of ref bint64_t" "vect" } } */ +/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref afloat" "vect" } } */ +/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref bfloat" "vect" } } */ +/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref adouble" "vect" } } */ +/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref bdouble" "vect" } } */ + +/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 4 "vect" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_11_run.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_11_run.c new file mode 100644 index 00000000000..ebcefdb623c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_11_run.c @@ -0,0 +1,61 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve" } */ +/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -msve-vector-bits=256" { target aarch64_sve256_hw } } */ + +#include "sve_speculative_11.c" + +extern void abort (void); + +#ifndef FILL_DATA +#define FILL_DATA 0 +#endif + +#ifndef EXIT_CONDITION +#define EXIT_CONDITION 5 +#endif + +#ifndef LOOP_COUNTS +#define LOOP_COUNTS {37,45,55,17,39,43} +#endif +int loop_counts[] = LOOP_COUNTS; + +/* Fill the arrays with the exit conditions. + Then refill at the correct strided accesses with fill data up to the end of + the loop count. 
*/ + +#define TEST_SPEC_LOOP_FUNC(DATATYPE, ARGTYPE)\ +void test_spec_loop_##DATATYPE##_##ARGTYPE (ARGTYPE num_elements)\ +{\ + int i;\ + for (i=0; i<MAX_ARRAY_SIZE; i++)\ + {\ + a##DATATYPE[i] = EXIT_CONDITION;\ + b##DATATYPE[i] = EXIT_CONDITION;\ + }\ + for (i=0; (i<num_elements-1)*STRIDE_LEVEL; i++)\ + {\ + a##DATATYPE[i*STRIDE_LEVEL] = FILL_DATA;\ + b##DATATYPE[i*STRIDE_LEVEL] = FILL_DATA;\ + }\ + ARGTYPE ret = spec_loop_##DATATYPE##_##ARGTYPE (EXIT_CONDITION);\ + if (ret != num_elements - 1)\ + abort ();\ +} + +TEST_SPEC_LOOP_FUNC (int8_t, int8_t) +TEST_SPEC_LOOP_FUNC (int16_t, int16_t) +TEST_SPEC_LOOP_FUNC (int32_t, int32_t) +TEST_SPEC_LOOP_FUNC (int64_t, int64_t) +TEST_SPEC_LOOP_FUNC (float, int32_t) +TEST_SPEC_LOOP_FUNC (double, int64_t) + +int main (void) +{ + test_spec_loop_int8_t_int8_t (loop_counts[0]); + test_spec_loop_int16_t_int16_t (loop_counts[1]); + test_spec_loop_int32_t_int32_t (loop_counts[2]); + test_spec_loop_int64_t_int64_t (loop_counts[3]); + test_spec_loop_float_int32_t (loop_counts[4]); + test_spec_loop_double_int64_t (loop_counts[5]); + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_12.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_12.c new file mode 100644 index 00000000000..d6caa8e7513 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_12.c @@ -0,0 +1,32 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */ + +/* Speculative loop with two loads from global buffers which can be aligned + without any peeling, and an access stride of 2. 
*/ + +#define STRIDE_LEVEL 2 + +#include "sve_speculative_11.c" + +/* { dg-final { scan-tree-dump-not "loop versioned for vectorization to enhance alignment" "vect" } } */ +/* { dg-final { scan-tree-dump-not "Alignment of access forced using peeling" "vect" } } */ + +/* { dg-final { scan-tree-dump "force alignment of aint8_t" "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump "force alignment of bint8_t" "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump "force alignment of aint16_t" "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump "force alignment of bint16_t" "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump "force alignment of aint32_t" "vect" } } */ +/* { dg-final { scan-tree-dump "force alignment of bint32_t" "vect" } } */ +/* { dg-final { scan-tree-dump "force alignment of aint64_t" "vect" } } */ +/* { dg-final { scan-tree-dump "force alignment of bint64_t" "vect" } } */ + +/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint8_t" "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref bint8_t" "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint16_t" "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref bint16_t" "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint32_t" "vect" } } */ +/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref bint32_t" "vect" } } */ +/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint64_t" "vect" } } */ +/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref bint64_t" "vect" } } */ + +/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 4 "vect" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_12_run.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_12_run.c new file mode 100644 index 00000000000..42c346073c6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_12_run.c 
@@ -0,0 +1,9 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve" } */ +/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -msve-vector-bits=256" { target aarch64_sve256_hw } } */ + +#define STRIDE_LEVEL 2 +#define EXIT_CONDITION 7 +#define LOOP_COUNTS {43,27,19,54,25,27} + +#include "sve_speculative_11_run.c" diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_13.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_13.c new file mode 100644 index 00000000000..db95e81d3f2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_13.c @@ -0,0 +1,12 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */ + +/* Speculative loop with two loads from global buffers and an access stride + of 3. Not vectorized: the required alignment cannot be calculated. */ + +#define STRIDE_LEVEL 3 + +#include "sve_speculative_11.c" + +/* { dg-final { scan-tree-dump-times "not vectorized: can't calculate required alignment for data ref" 10 "vect" } } */ +/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_13_run.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_13_run.c new file mode 100644 index 00000000000..519ff21e168 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_13_run.c @@ -0,0 +1,9 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve" } */ +/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -msve-vector-bits=256" { target aarch64_sve256_hw } } */ + +#define STRIDE_LEVEL 3 +#define EXIT_CONDITION 9 +#define LOOP_COUNTS {19,47,15,35,23,33} + +#include "sve_speculative_11_run.c" diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_14.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_14.c new file mode 100644 index 
00000000000..218afb6c5ca --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_14.c @@ -0,0 +1,32 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */ + +/* Speculative loop with two loads from global buffers which can be aligned + without any peeling, and an access stride of 4. */ + +#define STRIDE_LEVEL 4 + +#include "sve_speculative_11.c" + +/* { dg-final { scan-tree-dump-not "loop versioned for vectorization to enhance alignment" "vect" } } */ +/* { dg-final { scan-tree-dump-not "Alignment of access forced using peeling" "vect" } } */ + +/* { dg-final { scan-tree-dump "force alignment of aint8_t" "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump "force alignment of bint8_t" "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump "force alignment of aint16_t" "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump "force alignment of bint16_t" "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump "force alignment of aint32_t" "vect" } } */ +/* { dg-final { scan-tree-dump "force alignment of bint32_t" "vect" } } */ +/* { dg-final { scan-tree-dump "force alignment of aint64_t" "vect" } } */ +/* { dg-final { scan-tree-dump "force alignment of bint64_t" "vect" } } */ + +/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint8_t" "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref bint8_t" "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint16_t" "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref bint16_t" "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint32_t" "vect" } } */ +/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref bint32_t" "vect" } } */ +/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint64_t" "vect" } } */ +/* { dg-final { 
scan-tree-dump "misalign = 0 bytes of ref bint64_t" "vect" } } */ + +/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 4 "vect" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_14_run.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_14_run.c new file mode 100644 index 00000000000..958e94fd822 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_14_run.c @@ -0,0 +1,11 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve" } */ +/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -msve-vector-bits=256" { target aarch64_sve256_hw } } */ + +#define STRIDE_LEVEL 4 + +#define FILL_DATA 5 +#define EXIT_CONDITION 22 +#define LOOP_COUNTS {43,27,19,54,25,27} + +#include "sve_speculative_11_run.c" diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_15.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_15.c new file mode 100644 index 00000000000..42ec564c90b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_15.c @@ -0,0 +1,59 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */ + +#include <stdint.h> + +/* Speculative loop with two consecutive loads from a single global buffer + which can be aligned without any peeling, and an access stride of 2. */ + +#define MAX_ARRAY_SIZE 500 + +/* Minimum STRIDE_LEVEL is 2. */ +#ifndef STRIDE_LEVEL +#define STRIDE_LEVEL 2 +#endif + +#define SPEC_LOOP(DATATYPE, ARGTYPE)\ +DATATYPE a##DATATYPE[MAX_ARRAY_SIZE];\ +ARGTYPE spec_loop_##DATATYPE##_##ARGTYPE (DATATYPE n)\ +{\ + ARGTYPE i = -1;\ + do\ + i += 1;\ + while (a##DATATYPE[i*STRIDE_LEVEL] + a##DATATYPE[(i*STRIDE_LEVEL) + 1] < n);\ + return i;\ +} + +/* TODO: Cannot yet vectorize due to gather load. 
*/ +SPEC_LOOP (int8_t, int8_t) +SPEC_LOOP (int16_t, int16_t) + +SPEC_LOOP (int32_t, int32_t) +SPEC_LOOP (int64_t, int64_t) +SPEC_LOOP (float, int32_t) +SPEC_LOOP (double, int64_t) + +/* { dg-final { scan-tree-dump-not "loop versioned for vectorization to enhance alignment" "vect" } } */ +/* { dg-final { scan-tree-dump-not "Alignment of access forced using peeling" "vect" } } */ + +/* { dg-final { scan-tree-dump "force alignment of aint8_t" "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump "force alignment of aint16_t" "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump "force alignment of aint32_t" "vect" } } */ +/* { dg-final { scan-tree-dump "force alignment of aint64_t" "vect" } } */ +/* { dg-final { scan-tree-dump "force alignment of afloat" "vect" } } */ +/* { dg-final { scan-tree-dump "force alignment of adouble" "vect" } } */ + +/* { dg-final { scan-tree-dump "misalign = 1 bytes of ref aint8_t" "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint8_t" "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint16_t" "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump "misalign = 2 bytes of ref aint16_t" "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint32_t" "vect" } } */ +/* { dg-final { scan-tree-dump "misalign = 4 bytes of ref aint32_t" "vect" } } */ +/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint64_t" "vect" } } */ +/* { dg-final { scan-tree-dump "misalign = 8 bytes of ref aint64_t" "vect" } } */ +/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref afloat" "vect" } } */ +/* { dg-final { scan-tree-dump "misalign = 4 bytes of ref afloat" "vect" } } */ +/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref adouble" "vect" } } */ +/* { dg-final { scan-tree-dump "misalign = 8 bytes of ref adouble" "vect" } } */ + +/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 4 "vect" } } */ diff 
--git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_15_run.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_15_run.c new file mode 100644 index 00000000000..533f99467fd --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_15_run.c @@ -0,0 +1,56 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve" } */ +/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -msve-vector-bits=256" { target aarch64_sve256_hw } } */ + +#include "sve_speculative_15.c" + +extern void abort (void); + +#ifndef FILL_DATA +#define FILL_DATA 0 +#endif + +#ifndef EXIT_CONDITION +#define EXIT_CONDITION 5 +#endif + +#ifndef LOOP_COUNTS +#define LOOP_COUNTS {37,45,55,17,39,43} +#endif +int loop_counts[] = LOOP_COUNTS; + +/* Fill the arrays with the exit conditions. + Then refill at the correct strided accesses with fill data up to the end of + the loop count. */ + +#define TEST_SPEC_LOOP_FUNC(DATATYPE, ARGTYPE) \ +void \ +test_spec_loop_##DATATYPE##_##ARGTYPE (ARGTYPE num_elements) \ +{ \ + for (int i = 0; i < MAX_ARRAY_SIZE; ++i) \ + a##DATATYPE[i] = EXIT_CONDITION; \ + for (int i = 0; i < (num_elements - 1) * STRIDE_LEVEL; ++i) \ + a##DATATYPE[i] = FILL_DATA; \ + ARGTYPE ret = spec_loop_##DATATYPE##_##ARGTYPE (EXIT_CONDITION); \ + if (ret != num_elements - 1) \ + abort (); \ +} + +TEST_SPEC_LOOP_FUNC (int8_t, int8_t) +TEST_SPEC_LOOP_FUNC (int16_t, int16_t) +TEST_SPEC_LOOP_FUNC (int32_t, int32_t) +TEST_SPEC_LOOP_FUNC (int64_t, int64_t) +TEST_SPEC_LOOP_FUNC (float, int32_t) +TEST_SPEC_LOOP_FUNC (double, int64_t) + +int main (void) +{ + test_spec_loop_int8_t_int8_t (loop_counts[0]); + test_spec_loop_int16_t_int16_t (loop_counts[1]); + test_spec_loop_int32_t_int32_t (loop_counts[2]); + test_spec_loop_int64_t_int64_t (loop_counts[3]); + test_spec_loop_float_int32_t (loop_counts[4]); + test_spec_loop_double_int64_t (loop_counts[5]); + return 0; +} + diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve_speculative_16.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_16.c new file mode 100644 index 00000000000..9affb766b2a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_16.c @@ -0,0 +1,12 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */ + +/* Speculative loop with two consecutive loads from a single global buffer and + a stride of 3. Not vectorized: required alignment cannot be calculated. */ + +#define STRIDE_LEVEL 3 + +#include "sve_speculative_15.c" + +/* { dg-final { scan-tree-dump-times "not vectorized: can't calculate required alignment for data ref" 10 "vect" } } */ +/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_16_run.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_16_run.c new file mode 100644 index 00000000000..7c53e7aeed6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_16_run.c @@ -0,0 +1,9 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve" } */ +/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -msve-vector-bits=256" { target aarch64_sve256_hw } } */ + +#define STRIDE_LEVEL 3 +#define EXIT_CONDITION 7 +#define LOOP_COUNTS {43,27,19,54,25,27} + +#include "sve_speculative_15_run.c" diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_17.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_17.c new file mode 100644 index 00000000000..b7e472e0deb --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_17.c @@ -0,0 +1,34 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */ + +/* Speculative loop with two consecutive loads from a single global buffer + which can be aligned without any peeling, and an access 
stride of 4. */ + +#define STRIDE_LEVEL 4 + +#include "sve_speculative_15.c" + +/* { dg-final { scan-tree-dump-not "loop versioned for vectorization to enhance alignment" "vect" } } */ +/* { dg-final { scan-tree-dump-not "Alignment of access forced using peeling" "vect" } } */ + +/* { dg-final { scan-tree-dump "force alignment of aint8_t" "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump "force alignment of aint16_t" "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump "force alignment of aint32_t" "vect" } } */ +/* { dg-final { scan-tree-dump "force alignment of aint64_t" "vect" } } */ +/* { dg-final { scan-tree-dump "force alignment of afloat" "vect" } } */ +/* { dg-final { scan-tree-dump "force alignment of adouble" "vect" } } */ + +/* { dg-final { scan-tree-dump "misalign = 1 bytes of ref aint8_t" "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint8_t" "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint16_t" "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump "misalign = 2 bytes of ref aint16_t" "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint32_t" "vect" } } */ +/* { dg-final { scan-tree-dump "misalign = 4 bytes of ref aint32_t" "vect" } } */ +/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref aint64_t" "vect" } } */ +/* { dg-final { scan-tree-dump "misalign = 8 bytes of ref aint64_t" "vect" } } */ +/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref afloat" "vect" } } */ +/* { dg-final { scan-tree-dump "misalign = 4 bytes of ref afloat" "vect" } } */ +/* { dg-final { scan-tree-dump "misalign = 0 bytes of ref adouble" "vect" } } */ +/* { dg-final { scan-tree-dump "misalign = 8 bytes of ref adouble" "vect" } } */ + +/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 4 "vect" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_17_run.c 
b/gcc/testsuite/gcc.target/aarch64/sve_speculative_17_run.c new file mode 100644 index 00000000000..5453116429a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_17_run.c @@ -0,0 +1,9 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve" } */ +/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -msve-vector-bits=256" { target aarch64_sve256_hw } } */ + +#define STRIDE_LEVEL 4 +#define EXIT_CONDITION 9 +#define LOOP_COUNTS {19,47,15,35,23,33} + +#include "sve_speculative_15_run.c" diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_1_run.c new file mode 100644 index 00000000000..f4bb55ed6f8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_1_run.c @@ -0,0 +1,47 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O3 -fno-inline -march=armv8-a+sve -ffast-math" } */ +/* { dg-options "-O3 -fno-inline -march=armv8-a+sve -ffast-math -msve-vector-bits=256" { target aarch64_sve256_hw } } */ + +#include "sve_speculative_1.c" + +extern void abort (void); + +#define TEST_LOOP(ARGTYPE,INDUCTYPE)\ +{\ + INDUCTYPE res = spec_loop_##ARGTYPE##INDUCTYPE (0xFF, 0xAE);\ + if (res != 0xAE)\ + abort ();\ +}\ + +#define TEST_FP_LOOP(ARGTYPE,INDUCTYPE,FPTYPE)\ +{\ + FPTYPE res = spec_fp_loop_##ARGTYPE##INDUCTYPE##FPTYPE (0xFF, 0xAE);\ + if (res != 0xAE)\ + abort ();\ +}\ + +int main () +{ + TEST_LOOP (uint8_t, uint8_t); + TEST_LOOP (uint16_t, uint16_t); + TEST_LOOP (uint32_t, uint32_t); + TEST_LOOP (uint64_t, uint64_t); + TEST_LOOP (int32_t, int32_t); + TEST_LOOP (int64_t, int64_t); + + TEST_LOOP (uint16_t, uint8_t) + + TEST_LOOP (uint32_t, uint8_t) + TEST_LOOP (uint32_t, uint16_t) + + TEST_LOOP (uint64_t, uint8_t) + TEST_LOOP (uint64_t, uint16_t) + TEST_LOOP (uint64_t, uint32_t) + + TEST_FP_LOOP (uint32_t, uint32_t, float) + TEST_FP_LOOP (uint64_t, uint64_t, double) + + TEST_FP_LOOP 
(uint64_t, uint64_t, float) + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_2.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_2.c new file mode 100644 index 00000000000..108c5a6fbe6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_2.c @@ -0,0 +1,72 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -fno-inline -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */ + +#include <stdint.h> +#include <stdbool.h> + +/* Speculative loop with no data references. */ + +/* FIXME: dup of rhs into predicate register is made of horrible code. */ +#define SPEC_LOOP(ARGTYPE,INDUCTYPE)\ +INDUCTYPE spec_loop_##ARGTYPE##INDUCTYPE (ARGTYPE mask, ARGTYPE limit,\ + bool rhs)\ +{\ + INDUCTYPE i = 0;\ + bool lhs = (i & mask) != limit;\ + while (lhs == rhs)\ + {\ + i += 1;\ + lhs = (i & mask) != limit;\ + }\ + return i;\ +}\ + +#define SPEC_FP_LOOP(ARGTYPE,INDUCTYPE,FPTYPE)\ +INDUCTYPE spec_fp_loop_##ARGTYPE##INDUCTYPE##FPTYPE (ARGTYPE mask, ARGTYPE limit,\ + bool rhs)\ +{\ + INDUCTYPE i = 0;\ + FPTYPE f = 0.0;\ + bool lhs = (i & mask) != limit;\ + while (lhs == rhs)\ + {\ + f += 1;\ + i += 1;\ + lhs = (i & mask) != limit;\ + }\ + return f;\ +}\ + +SPEC_LOOP (uint8_t, uint8_t) +SPEC_LOOP (uint16_t, uint16_t) +SPEC_LOOP (uint32_t, uint32_t) +SPEC_LOOP (uint64_t, uint64_t) + +SPEC_LOOP (int8_t, int8_t) +SPEC_LOOP (int16_t, int16_t) +SPEC_LOOP (int32_t, int32_t) +SPEC_LOOP (int64_t, int64_t) + +/* Conversions. 
*/ +SPEC_LOOP (uint16_t, uint8_t) + +SPEC_LOOP (uint32_t, uint8_t) +SPEC_LOOP (uint32_t, uint16_t) + +SPEC_LOOP (uint64_t, uint8_t) +SPEC_LOOP (uint64_t, uint16_t) +SPEC_LOOP (uint64_t, uint32_t) + +SPEC_FP_LOOP (uint32_t, uint32_t, float) +SPEC_FP_LOOP (uint64_t, uint64_t, double) + +SPEC_FP_LOOP (uint64_t, uint64_t, float) + +/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 17 "vect" } } */ +/* { dg-final { scan-assembler-times {\tbrka\tp[0-9]+\.b, p[0-9]+/z, p[0-9]+\.b} 17 } } */ +/* { dg-final { scan-assembler-times {\tlastb\tw[0-9]+, p[0-7], z[0-9]+\.b} 5 } } */ +/* { dg-final { scan-assembler-times {\tlastb\tw[0-9]+, p[0-7], z[0-9]+\.h} 4 } } */ +/* { dg-final { scan-assembler-times {\tlastb\tw[0-9]+, p[0-7], z[0-9]+\.s} 3 } } */ +/* { dg-final { scan-assembler-times {\tlastb\tx[0-9]+, p[0-7], z[0-9]+\.d} 2 } } */ +/* { dg-final { scan-assembler-times {\tlastb\ts[0-9]+, p[0-7], z[0-9]+\.s} 2 } } */ +/* { dg-final { scan-assembler-times {\tlastb\td[0-9]+, p[0-7], z[0-9]+\.d} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_2_run.c new file mode 100644 index 00000000000..ad2c9c874b8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_2_run.c @@ -0,0 +1,45 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O3 -fno-inline -march=armv8-a+sve -ffast-math" } */ +/* { dg-options "-O3 -fno-inline -march=armv8-a+sve -ffast-math -msve-vector-bits=256" { target aarch64_sve256_hw } } */ + +#include "sve_speculative_2.c" + +extern void abort (void); + +#define TEST_LOOP(ARGTYPE,INDUCTYPE)\ +{\ + INDUCTYPE res = spec_loop_##ARGTYPE##INDUCTYPE (0xFF, 0xAE, true);\ + if (res != 0xAE)\ + abort ();\ +}\ + +#define TEST_FP_LOOP(ARGTYPE,INDUCTYPE,FPTYPE)\ +{\ + FPTYPE res = spec_fp_loop_##ARGTYPE##INDUCTYPE##FPTYPE (0xFF, 0xAE, true);\ + if (res != 0xAE)\ + abort ();\ +}\ + +int main () +{ + TEST_LOOP (uint8_t, uint8_t); + TEST_LOOP (uint16_t, uint16_t); + 
TEST_LOOP (uint32_t, uint32_t); + TEST_LOOP (uint64_t, uint64_t); + TEST_LOOP (int32_t, int32_t); + TEST_LOOP (int64_t, int64_t); + + TEST_LOOP (uint16_t, uint8_t) + + TEST_LOOP (uint32_t, uint8_t) + TEST_LOOP (uint32_t, uint16_t) + + TEST_LOOP (uint64_t, uint8_t) + TEST_LOOP (uint64_t, uint16_t) + TEST_LOOP (uint64_t, uint32_t) + + TEST_FP_LOOP (uint32_t, uint32_t, float) + TEST_FP_LOOP (uint64_t, uint64_t, double) + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_3.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_3.c new file mode 100644 index 00000000000..db35711a193 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_3.c @@ -0,0 +1,26 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -fno-inline -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */ + +#include <stdint.h> + +/* Speculative loop with different sizes and no data references . + Cannot be vectorized. */ + +#define SPEC_FP_LOOP(ARGTYPE,INDUCTYPE,FPTYPE)\ +FPTYPE spec_fp_loop_##ARGTYPE##INDUCTYPE (ARGTYPE mask, ARGTYPE limit)\ +{\ + INDUCTYPE i = 0;\ + FPTYPE f = 0.0;\ + while ((i & mask) != limit)\ + {\ + f += 1;\ + i += 1;\ + }\ + return f;\ +}\ + +SPEC_FP_LOOP (uint32_t, uint32_t, double) + +/* { dg-final { scan-tree-dump-times "not vectorized: ncopies is greater than 1" 1 "vect" } } */ +/* { dg-final { scan-assembler-not "brka\tp\[0-9\]*.b, p\[0-9\]*\/z, p\[0-9\]*.b" } } */ + diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_4.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_4.c new file mode 100644 index 00000000000..32b8c71c92a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_4.c @@ -0,0 +1,66 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -fno-inline -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */ + +#include <stdint.h> + +/* Speculative loop with a load. 
*/ + +#define SPEC_LOOP(ARGTYPE,INDUCTYPE)\ +INDUCTYPE spec_loop_##ARGTYPE##INDUCTYPE (ARGTYPE mask, ARGTYPE limit, ARGTYPE * array)\ +{\ + uint64_t i = 0;\ + INDUCTYPE r = 0;\ + while ((i & mask) != limit)\ + {\ + r = array[i];\ + i++;\ + }\ + return r;\ +} + +#define SPEC_FP_LOOP(ARGTYPE,FPTYPE)\ +FPTYPE spec_fp_loop_##ARGTYPE##FPTYPE (ARGTYPE mask, ARGTYPE limit, FPTYPE * array)\ +{\ + uint64_t i = 0;\ + FPTYPE f = 0.0;\ + while ((i & mask) != limit)\ + {\ + f = array[i];\ + i++;\ + }\ + return f;\ +} + +SPEC_LOOP (uint8_t, uint8_t) +SPEC_LOOP (uint16_t, uint16_t) +SPEC_LOOP (uint32_t, uint32_t) +SPEC_LOOP (uint64_t, uint64_t) + +SPEC_LOOP (int8_t, int8_t) +SPEC_LOOP (int16_t, int16_t) +SPEC_LOOP (int32_t, int32_t) +SPEC_LOOP (int64_t, int64_t) + +/* Conversions. */ +SPEC_LOOP (uint16_t, uint8_t) + +SPEC_LOOP (uint32_t, uint8_t) +SPEC_LOOP (uint32_t, uint16_t) + +SPEC_LOOP (uint64_t, uint8_t) +SPEC_LOOP (uint64_t, uint16_t) +SPEC_LOOP (uint64_t, uint32_t) + +SPEC_FP_LOOP (uint32_t, float) +SPEC_FP_LOOP (uint64_t, double) + +SPEC_FP_LOOP (uint64_t, float) + +/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 17 "vect" } } */ +/* { dg-final { scan-assembler-times {\tbrka\tp[0-9]+\.b, p[0-9]+/z, p[0-9]+\.b} 17 } } */ +/* { dg-final { scan-assembler-times {\tlastb\tw[0-9]+, p[0-7], z[0-9]+\.b} 2 } } */ +/* { dg-final { scan-assembler-times {\tlastb\tw[0-9]+, p[0-7], z[0-9]+\.h} 3 } } */ +/* { dg-final { scan-assembler-times {\tlastb\tw[0-9]+, p[0-7], z[0-9]+\.s} 4 } } */ +/* { dg-final { scan-assembler-times {\tlastb\tx[0-9]+, p[0-7], z[0-9]+\.d} 5 } } */ +/* { dg-final { scan-assembler-times {\tlastb\ts[0-9]+, p[0-7], z[0-9]+\.s} 2 } } */ +/* { dg-final { scan-assembler-times {\tlastb\td[0-9]+, p[0-7], z[0-9]+\.d} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_4_run.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_4_run.c new file mode 100644 index 00000000000..96834ba51be --- /dev/null +++ 
b/gcc/testsuite/gcc.target/aarch64/sve_speculative_4_run.c @@ -0,0 +1,56 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O3 -fno-inline -march=armv8-a+sve -ffast-math" } */ +/* { dg-options "-O3 -fno-inline -march=armv8-a+sve -ffast-math -msve-vector-bits=256" { target aarch64_sve256_hw } } */ + +#include "sve_speculative_4.c" + +extern void abort (void); +#include <string.h> + +#define MAX 0xAE + +#define TEST_LOOP(ARGTYPE,INDUCTYPE)\ +{\ + ARGTYPE array[MAX];\ + memset (array, 0, sizeof (ARGTYPE) * MAX);\ + array[MAX - 1] = 72;\ + INDUCTYPE res = spec_loop_##ARGTYPE##INDUCTYPE (0xFF, MAX, array);\ + if (res != 72)\ + abort ();\ +} + +#define TEST_FP_LOOP(ARGTYPE,FPTYPE)\ +{\ + FPTYPE array[MAX];\ + memset (array, 0, sizeof (FPTYPE) * MAX);\ + array[MAX - 1] = 54.5;\ + FPTYPE res = spec_fp_loop_##ARGTYPE##FPTYPE (0xFF, MAX, array);\ + if (res != 54.5)\ + abort ();\ +} + +int main () +{ + TEST_LOOP (uint8_t, uint8_t); + TEST_LOOP (uint16_t, uint16_t); + TEST_LOOP (uint32_t, uint32_t); + TEST_LOOP (uint64_t, uint64_t); + TEST_LOOP (int32_t, int32_t); + TEST_LOOP (int64_t, int64_t); + + TEST_LOOP (uint16_t, uint8_t) + + TEST_LOOP (uint32_t, uint8_t) + TEST_LOOP (uint32_t, uint16_t) + + TEST_LOOP (uint64_t, uint8_t) + TEST_LOOP (uint64_t, uint16_t) + TEST_LOOP (uint64_t, uint32_t) + + TEST_FP_LOOP (uint32_t, float) + TEST_FP_LOOP (uint64_t, double) + + TEST_FP_LOOP (uint64_t, float) + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_5.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_5.c new file mode 100644 index 00000000000..d1d8f8fbaaa --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_5.c @@ -0,0 +1,54 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -fno-inline -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */ + +#include <stdint.h> + +/* Speculative loop with a load. Exit condition in the array. 
*/ + +#ifndef EXIT_CONDITION +#define EXIT_CONDITION 1 +#endif + +#define SPEC_LOOP(ARGTYPE)\ +ARGTYPE spec_loop_##ARGTYPE (ARGTYPE * array)\ +{\ + ARGTYPE i = 0;\ + ARGTYPE r = EXIT_CONDITION + 1;\ + while (r != EXIT_CONDITION)\ + {\ + r = array[i];\ + i++;\ + }\ + return i;\ +} + +#define SPEC_FP_LOOP(FPTYPE, ARGTYPE)\ +ARGTYPE spec_loop_##ARGTYPE##FPTYPE (FPTYPE * array)\ +{\ + ARGTYPE i = 0;\ + ARGTYPE r = EXIT_CONDITION + 1;\ + while (r != EXIT_CONDITION)\ + {\ + r = array[i];\ + i++;\ + }\ + return i;\ +} + +/* TODO: Cannot yet vectorize due to gather load. */ +SPEC_LOOP (int8_t) +SPEC_LOOP (int16_t) + +SPEC_LOOP (int32_t) +SPEC_LOOP (int64_t) +SPEC_FP_LOOP (float, int32_t) +SPEC_FP_LOOP (double, int64_t) + +/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 4 "vect" } } */ +/* { dg-final { scan-assembler-times {\tbrka\tp[0-9]+\.b, p[0-9]+/z, p[0-9]+\.b} 4 } } */ +/* { dg-final { scan-assembler-not {\tlastb\tw[0-9]+, p[0-7], z[0-9]+\.b} } } */ +/* { dg-final { scan-assembler-not {\tlastb\tw[0-9]+, p[0-7], z[0-9]+\.h} } } */ +/* { dg-final { scan-assembler-times {\tlastb\tw[0-9]+, p[0-7], z[0-9]+\.s} 2 } } */ +/* { dg-final { scan-assembler-times {\tlastb\tx[0-9]+, p[0-7], z[0-9]+\.d} 2 } } */ +/* { dg-final { scan-assembler-not {\tlastb\ts[0-9]+, p[0-7], z[0-9]+\.s} } } */ +/* { dg-final { scan-assembler-not {\tlastb\td[0-9]+, p[0-7], z[0-9]+\.d} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_5_run.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_5_run.c new file mode 100644 index 00000000000..a8f7f9fff17 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_5_run.c @@ -0,0 +1,104 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O3 -fno-inline -march=armv8-a+sve" } */ +/* { dg-options "-O3 -fno-inline -march=armv8-a+sve -msve-vector-bits=256" { target aarch64_sve256_hw } } */ + +#include "sve_speculative_5.c" + +#define _GNU_SOURCE +#include <sys/mman.h> +extern void abort (void); +extern 
void *mremap (void *old_address, size_t old_size, + size_t new_size, int flags, ... /* void *new_address */); + +#ifndef FILL_DATA +#define FILL_DATA 0 +#endif + +#ifndef LOOP_COUNTS +#define LOOP_COUNTS {22,20,13,17,29,19} +#endif +int loop_counts[] = LOOP_COUNTS; + +/* Program will fault if memory beyond the boundaries of BUF is accessed. */ + +#define SPACE_SIZE 4096*sizeof(int) + +/* Enable to confirm program segfaults when accessing outside of BUF. */ +#ifdef CHECK_SEGFAULT +#define ADDITIONAL 1 +#else +#define ADDITIONAL 0 +#endif + +/* BUF is an array of NUM_ELEMENTS size. + BUF_PRE points to 4 elements before BUF. + Before calling SPEC_LOOP, set the last element of BUF and the + four elements of BUF_PRE to the exit condition. + Fill the rest of BUF to the fill data. */ + +#define TEST_SPEC_LOOP_FUNC(ARGTYPE)\ +void test_spec_loop_##ARGTYPE (void *bufend, ARGTYPE num_elements)\ +{\ + int i;\ + ARGTYPE* buf = ((ARGTYPE*)bufend) - num_elements;\ + ARGTYPE* buf_pre = ((ARGTYPE*)bufend) - num_elements - 4;\ + for (i=0; i<num_elements-1; i++)\ + buf[i] = FILL_DATA;\ + buf[num_elements - 1 + ADDITIONAL] = EXIT_CONDITION;\ + for (i=0; i<4; i++)\ + buf_pre[i] = EXIT_CONDITION;\ + ARGTYPE ret = spec_loop_##ARGTYPE (buf);\ + if (ret != num_elements)\ + abort ();\ +} + +#define TEST_SPEC_FP_LOOP_FUNC(FPTYPE, ARGTYPE)\ +void test_spec_loop_##ARGTYPE##FPTYPE (void *bufend, ARGTYPE num_elements)\ +{\ + int i;\ + FPTYPE* buf = ((FPTYPE*)bufend) - num_elements;\ + FPTYPE* buf_pre = ((FPTYPE*)bufend) - num_elements - 4;\ + for (i=0; i<num_elements-1; i++)\ + buf[i] = FILL_DATA;\ + buf[num_elements - 1 + ADDITIONAL] = EXIT_CONDITION;\ + for (i=0; i<4; i++)\ + buf_pre[i] = EXIT_CONDITION;\ + ARGTYPE ret = spec_loop_##ARGTYPE##FPTYPE (buf);\ + if (ret != num_elements)\ + abort ();\ +} + +TEST_SPEC_LOOP_FUNC (int8_t) +TEST_SPEC_LOOP_FUNC (int16_t) +TEST_SPEC_LOOP_FUNC (int32_t) +TEST_SPEC_LOOP_FUNC (int64_t) +TEST_SPEC_FP_LOOP_FUNC (float, int32_t) +TEST_SPEC_FP_LOOP_FUNC 
(double, int64_t) + +int main (void) +{ + /* Map in two pages worth of space. Then reduce it down to a single page. + This will result in the second page of data being unmapped - ie it + will cause a segfault if accessed. */ + + void *space = mmap (0, SPACE_SIZE * 2, PROT_READ|PROT_WRITE, + MAP_ANON|MAP_PRIVATE, -1, 0); + if (space == (void*)-1) + abort (); + + void *space_new = mremap (space, SPACE_SIZE * 2, SPACE_SIZE, 0); + if (space != space_new) + abort (); + + /* set END to the start of the second (unmapped) page. */ + char *end = space + SPACE_SIZE; + + test_spec_loop_int8_t (end, loop_counts[0]); + test_spec_loop_int16_t (end, loop_counts[1]); + test_spec_loop_int32_t (end, loop_counts[2]); + test_spec_loop_int64_t (end, loop_counts[3]); + test_spec_loop_int32_tfloat (end, loop_counts[4]); + test_spec_loop_int64_tdouble (end, loop_counts[5]); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_5_run_2.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_5_run_2.c new file mode 100644 index 00000000000..ed12336f47d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_5_run_2.c @@ -0,0 +1,8 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O3 -fno-inline -march=armv8-a+sve" } */ +/* { dg-options "-O3 -fno-inline -march=armv8-a+sve -msve-vector-bits=256" { target aarch64_sve256_hw } } */ + +/* Use exit condition of 0. 
*/ +#define EXIT_CONDITION 0 +#define FILL_DATA 1 +#include "sve_speculative_5_run.c" diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_5_run_3.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_5_run_3.c new file mode 100644 index 00000000000..c6a5edf86b4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_5_run_3.c @@ -0,0 +1,9 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O3 -fno-inline -march=armv8-a+sve" } */ +/* { dg-options "-O3 -fno-inline -march=armv8-a+sve -msve-vector-bits=256" { target aarch64_sve256_hw } } */ + +/* Use exit condition of 0 and less than a single iteration. */ +#define EXIT_CONDITION 0 +#define FILL_DATA 1 +#define LOOP_COUNTS {3,5,3,1,5,1} +#include "sve_speculative_5_run.c" diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_6.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_6.c new file mode 100644 index 00000000000..1b71687a257 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_6.c @@ -0,0 +1,44 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -fno-inline -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */ + +#include <stdint.h> + +/* Speculative loop with a conditional load. */ + +#define SPEC_LOOP(ARGTYPE,INDUCTYPE)\ +INDUCTYPE spec_loop_##ARGTYPE##INDUCTYPE (ARGTYPE mask, ARGTYPE limit,\ + ARGTYPE * array, ARGTYPE * cond)\ +{\ + uint64_t i = 0;\ + INDUCTYPE r = 0;\ + while ((i & mask) != limit)\ + {\ + if (cond[i])\ + r = array[i];\ + i++;\ + }\ + return r;\ +} + +SPEC_LOOP (uint8_t, uint8_t) +SPEC_LOOP (uint16_t, uint16_t) +SPEC_LOOP (uint32_t, uint32_t) +SPEC_LOOP (uint64_t, uint64_t) + +SPEC_LOOP (int8_t, int8_t) +SPEC_LOOP (int16_t, int16_t) +SPEC_LOOP (int32_t, int32_t) +SPEC_LOOP (int64_t, int64_t) + +/* Conversions. 
*/ +SPEC_LOOP (uint16_t, uint8_t) + +SPEC_LOOP (uint32_t, uint8_t) +SPEC_LOOP (uint32_t, uint16_t) + +SPEC_LOOP (uint64_t, uint8_t) +SPEC_LOOP (uint64_t, uint16_t) +SPEC_LOOP (uint64_t, uint32_t) + +/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */ +/* { dg-final { scan-tree-dump "speculative mask loads not supported" "vect" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_7.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_7.c new file mode 100644 index 00000000000..0c2d62387e2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_7.c @@ -0,0 +1,21 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -fno-inline -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */ + +#include <stdint.h> + +/* Speculative loop with a load and a test. */ + +uint32_t +search (uint32_t *array) +{ + for (;;) + { + uint32_t x = *array++ >> 7; + if (x >= 200) + return x; + } +} + +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ +/* { dg-final { scan-assembler-times {\tbrka\tp[0-9]+\.b, p[0-9]+/z, p[0-9]+\.b} 1 } } */ +/* { dg-final { scan-assembler-times {\tlastb\tw[0-9]+, p[0-7], z[0-9]+\.s} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_8.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_8.c new file mode 100644 index 00000000000..8c70e2f9012 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_8.c @@ -0,0 +1,21 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -fno-inline -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */ + +#include <stdint.h> + +/* Speculative loop with a load which requires multiple copies and a test. 
*/ + +uint32_t +search (uint64_t *array) +{ + for (;;) + { + uint32_t x = *array++ >> 7; + if (x >= 200) + return x; + } +} + +/* { dg-final { scan-tree-dump "multiple copies not supported for speculative loops" "vect" } } */ +/* { dg-final { scan-tree-dump "not vectorized: relevant stmt not supported" "vect" } } */ +/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_9.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_9.c new file mode 100644 index 00000000000..c21b44614c7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_9.c @@ -0,0 +1,34 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -fdump-tree-vect-details -msve-vector-bits=256" } */ + +#include <stdint.h> + +/* Speculative loop with two loads which cannot both be aligned. */ + +#ifndef STRIDE_LEVEL +#define STRIDE_LEVEL 1 +#endif + +#define SPEC_LOOP(DATATYPE, ARGTYPE)\ +ARGTYPE spec_loop_##DATATYPE##_##ARGTYPE (DATATYPE *a, DATATYPE*b, DATATYPE n)\ +{\ + ARGTYPE i = -1;\ + do\ + i += 1;\ + while (a[i*STRIDE_LEVEL] + b[i*STRIDE_LEVEL] < n);\ + return i;\ +} + +/* TODO: Cannot yet vectorize due to gather load. 
*/ +SPEC_LOOP (int8_t, int8_t) +SPEC_LOOP (int16_t, int16_t) + +SPEC_LOOP (int32_t, int32_t) +SPEC_LOOP (int64_t, int64_t) +SPEC_LOOP (float, int32_t) +SPEC_LOOP (double, int64_t) + + +/* { dg-final { scan-tree-dump-times "loop versioned for vectorization to enhance alignment" 4 "vect" } } */ +/* { dg-final { scan-tree-dump-not "Alignment of access forced using peeling" "vect" } } */ +/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 4 "vect" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve_speculative_9_run.c b/gcc/testsuite/gcc.target/aarch64/sve_speculative_9_run.c new file mode 100644 index 00000000000..f9470020fd0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve_speculative_9_run.c @@ -0,0 +1,67 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve" } */ +/* { dg-options "-O3 -fno-common -ffast-math -march=armv8-a+sve -msve-vector-bits=256" { target aarch64_sve256_hw } } */ + +#include "sve_speculative_9.c" + +extern void abort (void); + +#ifndef MAX_ARRAY_SIZE +#define MAX_ARRAY_SIZE 500 +#endif + +#ifndef FILL_DATA +#define FILL_DATA 0 +#endif + +#ifndef EXIT_CONDITION +#define EXIT_CONDITION 5 +#endif + +#ifndef LOOP_COUNTS +#define LOOP_COUNTS {37,45,55,17,39,43} +#endif +int loop_counts[] = LOOP_COUNTS; + +/* Fill the arrays with the exit conditions. + Then refill at the correct strided accesses with fill data up to the end of + the loop count. 
*/ + +#define TEST_SPEC_LOOP_FUNC(DATATYPE, ARGTYPE)\ +void test_spec_loop_##DATATYPE##_##ARGTYPE (ARGTYPE num_elements)\ +{\ + DATATYPE a[MAX_ARRAY_SIZE];\ + DATATYPE b[MAX_ARRAY_SIZE];\ + int i;\ + for (i=0; i<MAX_ARRAY_SIZE; i++)\ + {\ + a[i] = EXIT_CONDITION;\ + b[i] = EXIT_CONDITION;\ + }\ + for (i=0; (i<num_elements-1)*STRIDE_LEVEL; i++)\ + {\ + a[i*STRIDE_LEVEL] = FILL_DATA;\ + b[i*STRIDE_LEVEL] = FILL_DATA;\ + }\ + ARGTYPE ret = spec_loop_##DATATYPE##_##ARGTYPE (a, b, EXIT_CONDITION);\ + if (ret != num_elements - 1)\ + abort ();\ +} + +TEST_SPEC_LOOP_FUNC (int8_t, int8_t) +TEST_SPEC_LOOP_FUNC (int16_t, int16_t) +TEST_SPEC_LOOP_FUNC (int32_t, int32_t) +TEST_SPEC_LOOP_FUNC (int64_t, int64_t) +TEST_SPEC_LOOP_FUNC (float, int32_t) +TEST_SPEC_LOOP_FUNC (double, int64_t) + +int main (void) +{ + test_spec_loop_int8_t_int8_t (loop_counts[0]); + test_spec_loop_int16_t_int16_t (loop_counts[1]); + test_spec_loop_int32_t_int32_t (loop_counts[2]); + test_spec_loop_int64_t_int64_t (loop_counts[3]); + test_spec_loop_float_int32_t (loop_counts[4]); + test_spec_loop_double_int64_t (loop_counts[5]); + return 0; +} diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c index adb2af72573..3ef92c1d87d 100644 --- a/gcc/tree-vect-data-refs.c +++ b/gcc/tree-vect-data-refs.c @@ -834,15 +834,64 @@ vect_record_base_alignments (vec_info *vinfo) } } +/* Function can_get_vect_data_ref_required_alignment + + Try to calculate the alignment for the given data reference DR once + vectorised. If successful store the alignment to ALIGNMENT_P. + + For non speculative loops, the alignment is always calculable and is given + by preferred_vector_alignment. For speculative loops we align to the + vector size multiplied by the step. 
*/ + +bool +vect_can_calculate_target_alignment (struct data_reference *dr, + unsigned int *alignment_p) +{ + gimple *stmt = DR_STMT (dr); + stmt_vec_info stmt_info = vinfo_for_stmt (stmt); + loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); + + if (!loop_vinfo || !LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)) + { + tree vectype = STMT_VINFO_VECTYPE (stmt_info); + if (alignment_p) + *alignment_p = targetm.vectorize.preferred_vector_alignment (vectype); + return true; + } + + /* We have to assume that non-constant vector sizes might not be + a power of two. */ + unsigned HOST_WIDE_INT size; + if (!current_vector_size.is_constant (&size)) + return false; + + /* Step must be a positive integer. */ + if (!tree_fits_shwi_p (DR_STEP (dr)) + || tree_int_cst_sgn (DR_STEP (dr)) <= 0) + return false; + + unsigned int step = tree_to_uhwi (DR_STEP (dr)); + unsigned int unit_size = + tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)))); + + /* Step must be a power of two and divisible by the unit size. */ + if (!pow2p_hwi (step) || step % unit_size != 0) + return false; + + if (alignment_p) + *alignment_p = size * BITS_PER_UNIT * step / unit_size; + return true; +} + /* Return the target alignment for the vectorized form of DR. */ static unsigned int vect_calculate_target_alignment (struct data_reference *dr) { - gimple *stmt = DR_STMT (dr); - stmt_vec_info stmt_info = vinfo_for_stmt (stmt); - tree vectype = STMT_VINFO_VECTYPE (stmt_info); - return targetm.vectorize.preferred_vector_alignment (vectype); + unsigned int ret; + if (!vect_can_calculate_target_alignment (dr, &ret)) + gcc_unreachable (); + return ret; } /* Function vect_compute_data_ref_alignment @@ -2288,11 +2337,11 @@ vect_find_same_alignment_drs (struct data_dependence_relation *ddr) if (diff != 0) { /* Get the wider of the two alignments. 
*/ - unsigned int align_a = (vect_calculate_target_alignment (dra) - / BITS_PER_UNIT); - unsigned int align_b = (vect_calculate_target_alignment (drb) - / BITS_PER_UNIT); - unsigned int max_align = MAX (align_a, align_b); + unsigned int align_a, align_b; + if (!vect_can_calculate_target_alignment (dra, &align_a) + || !vect_can_calculate_target_alignment (drb, &align_b)) + return; + unsigned int max_align = MAX (align_a, align_b) / BITS_PER_UNIT; /* Require the gap to be a multiple of the larger vector alignment. */ if (!wi::multiple_of_p (diff, max_align, SIGNED)) @@ -2341,6 +2390,17 @@ vect_analyze_data_refs_alignment (loop_vec_info vinfo) FOR_EACH_VEC_ELT (datarefs, i, dr) { stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr)); + + if (STMT_VINFO_VECTORIZABLE (stmt_info) + && !vect_can_calculate_target_alignment (dr, NULL)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "not vectorized: can't calculate required " + "alignment for data ref.\n"); + return false; + } + if (STMT_VINFO_VECTORIZABLE (stmt_info) && !vect_compute_data_ref_alignment (dr)) { @@ -3484,7 +3544,17 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo) else { if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)) - length_factor = scalar_loop_iters; + { + if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "Cannot vectorize speculative loops with " + "differing data reference step sizes.\n"); + return false; + } + length_factor = scalar_loop_iters; + } else length_factor = size_int (vect_factor); segment_length_a = vect_vfa_segment_size (dr_a, length_factor); @@ -4466,6 +4536,9 @@ vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name) case vect_simple_var: prefix = "vect"; break; + case vect_mask_var: + prefix = "mask"; + break; case vect_scalar_var: prefix = "stmp"; break; @@ -6652,6 +6725,10 @@ 
vect_supportable_dr_alignment (struct data_reference *dr, { vect_loop = LOOP_VINFO_LOOP (loop_vinfo); nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt); + + /* Speculative loops rely on aligned data refs. */ + if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)) + return dr_unaligned_unsupported; } /* Possibly unaligned access. */ diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c index 57aab1b764f..901113fcf03 100644 --- a/gcc/tree-vect-loop-manip.c +++ b/gcc/tree-vect-loop-manip.c @@ -369,6 +369,242 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm, return false; } +/* Helper for vect_set_speculative_masks. Set the masks in RGM directly + from the corresponding scalar values. RGM belongs to LOOP, which has + been vectorized according to LOOP_VINFO. NSCALARITERS_SKIP is the + number of scalar iterations that we should skip during the first + iteration of the vector loop (because the start point has been + brought forward by that amount to achieve alignment). + + Add any new preheader statements to PREHEADER_SEQ and any new header + statements to HEADER_SEQ. */ + +static void +vect_set_speculative_masks_directly (struct loop *loop, + loop_vec_info loop_vinfo, + gimple_seq *preheader_seq, + gimple_seq *header_seq, + rgroup_masks *rgm, + tree nscalariters_skip) +{ + /* It doesn't make sense to align for speculation when we have a + capped VF. 
*/ + gcc_assert (!use_capped_vf (loop_vinfo)); + + tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo); + tree mask_type = rgm->mask_type; + poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type); + unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter; + + tree nscalars_skip = nscalariters_skip; + if (nscalars_per_iter != 1) + { + tree factor = build_int_cst (compare_type, nscalars_per_iter); + nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type, + nscalars_skip, factor); + } + + tree full_mask = build_minus_one_cst (mask_type); + tree mask; + unsigned int i; + FOR_EACH_VEC_ELT (rgm->masks, i, mask) + { + /* Previous masks covered START scalars. This mask covers the + next batch. */ + tree start = build_int_cst (compare_type, nscalars_per_mask * i); + tree init_mask = vect_gen_while_not (preheader_seq, mask_type, + start, nscalars_skip); + + /* Always use a full mask for subsequent iterations of the loop. */ + vect_set_loop_mask (loop, header_seq, mask, init_mask, + full_mask, NULL_TREE); + } +} + +/* Set up the controlling masks for LOOP, which is a speculative loop that + has been vectorized according to LOOP_VINFO. */ + +static void +vect_set_speculative_masks (struct loop *loop, loop_vec_info loop_vinfo) +{ + gimple_seq preheader_seq = NULL; + gimple_seq header_seq = NULL; + + vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo); + tree nscalariters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo); + rgroup_masks *rgm; + unsigned int i; + FOR_EACH_VEC_ELT (*masks, i, rgm) + if (!rgm->masks.is_empty ()) + { + /* We shouldn't be using masks if there are no elements to skip + on the first iteration. */ + gcc_assert (nscalariters_skip != NULL_TREE); + + /* First try using permutes. 
*/ + unsigned int nmasks = i + 1; + if ((nmasks & 1) == 0) + { + rgroup_masks *half_rgm = &(*masks)[nmasks / 2 - 1]; + if (!half_rgm->masks.is_empty () + && vect_maybe_permute_loop_masks (&header_seq, rgm, half_rgm)) + continue; + } + + vect_set_speculative_masks_directly (loop, loop_vinfo, + &preheader_seq, &header_seq, + rgm, nscalariters_skip); + } + + /* Emit all accumulated statements. */ + add_preheader_seq (loop, preheader_seq); + add_header_seq (loop, header_seq); +} + +/* RGM belongs to the nonspeculative masks of LOOP_VINFO. Set up the masks + in RGM so that the active bits corresponding to the first NSCALARITERS + scalar iterations are true and every other bit is false. Add any new + statements before GSI. */ + +static void +vect_set_nonspeculative_masks_directly (loop_vec_info loop_vinfo, + gimple_stmt_iterator *gsi, + rgroup_masks *rgm, tree nscalariters) +{ + tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo); + tree mask_type = rgm->mask_type; + poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type); + unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter; + + /* Calculate the number of scalars covered by the rgroup. */ + gimple_seq seq = NULL; + tree nscalars = nscalariters; + if (nscalars_per_iter != 1) + nscalars = gimple_build (&seq, MULT_EXPR, compare_type, nscalars, + build_int_cst (compare_type, nscalars_per_iter)); + if (seq) + gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT); + + tree mask; + unsigned int i; + FOR_EACH_VEC_ELT (rgm->masks, i, mask) + { + /* Previous masks covered START scalars. This mask covers the + next batch. */ + tree start = build_int_cst (compare_type, nscalars_per_mask * i); + if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo)) + { + /* First get a mask that ignores whether bits are active. */ + tree temp = make_ssa_name (mask_type); + gcall *call = vect_gen_while (temp, start, nscalars); + gsi_insert_before (gsi, call, GSI_SAME_STMT); + + /* Now AND the result with the active lanes. 
*/ + tree active + = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo), + rgm->masks.length (), mask_type, i); + gassign *assign = gimple_build_assign (mask, BIT_AND_EXPR, + temp, active); + gsi_insert_before (gsi, assign, GSI_SAME_STMT); + } + else + { + /* All lanes are active. */ + gcall *call = vect_gen_while (mask, start, nscalars); + gsi_insert_before (gsi, call, GSI_SAME_STMT); + } + } +} + +/* Set MASK to the mask of active elements up to and including the + first iteration for which the exit condition of LOOP_VINFO is true. + Insert any new statements before GSI. ALL_ACTIVE_P is true if we + should treat all elements as active, false if we should get the + mask of active elements from the main loop mask. */ + +static void +vect_add_break_after (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi, + tree mask, bool all_active_p) +{ + tree mask_type = TREE_TYPE (mask); + + tree active; + if (all_active_p) + active = build_minus_one_cst (mask_type); + else + active = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo), + 1, mask_type, 0); + + /* Break the mask after the first true exit condition. */ + tree exit_mask = LOOP_VINFO_EXIT_TEST_MASK (loop_vinfo); + gcall *call = gimple_build_call_internal (IFN_BREAK_AFTER, 2, + active, exit_mask); + gimple_call_set_lhs (call, mask); + gsi_insert_before (gsi, call, GSI_SAME_STMT); +} + +/* Set up the nonspeculative masks in LOOP_VINFO. Emit any new statements + before GSI. */ + +static void +vect_set_nonspeculative_masks (loop_vec_info loop_vinfo, + gimple_stmt_iterator *gsi) +{ + vec_niters_and_mask nim; + vec_loop_masks *masks = &LOOP_VINFO_NONSPECULATIVE_MASKS (loop_vinfo); + tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo); + tree niters = NULL_TREE; + rgroup_masks *rgm; + unsigned int i; + FOR_EACH_VEC_ELT (*masks, i, rgm) + if (!rgm->masks.is_empty ()) + { + unsigned int nmasks = i + 1; + + /* Try to set the mask directly with a BREAK_AFTER. 
*/ + if (nmasks == 1 && rgm->max_nscalars_per_iter == 1) + { + /* All elements are active unless we're peeling for + alignment. */ + vect_add_break_after (loop_vinfo, gsi, rgm->masks[0], + !LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo)); + continue; + } + + /* Try using permutes. */ + if ((nmasks & 1) == 0) + { + gimple_seq seq = NULL; + rgroup_masks *half_rgm = &(*masks)[nmasks / 2 - 1]; + if (!half_rgm->masks.is_empty () + && vect_maybe_permute_loop_masks (&seq, rgm, half_rgm)) + { + gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT); + continue; + } + } + + if (niters == NULL_TREE) + { + /* Get the mask of elements up to and including the first + iteration for which the exit condition is true. + Include any inactive starting elements at this stage. */ + tree mask_type = vect_mask_type_for_speculation (loop_vinfo); + nim.mask = make_ssa_name (mask_type); + vect_add_break_after (loop_vinfo, gsi, nim.mask, true); + + /* Convert the mask to a scalar count, then convert the + sizetype result to the mask comparison type. */ + gimple_seq seq = NULL; + niters = vect_get_niters_from_mask (&seq, &nim); + niters = gimple_convert (&seq, compare_type, niters); + if (seq) + gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT); + } + vect_set_nonspeculative_masks_directly (loop_vinfo, gsi, rgm, niters); + } +} + /* Helper for vect_set_loop_condition_masked. Generate definitions for all the masks in RGM and return a mask that is nonzero when the loop needs to iterate. 
Add any new preheader statements to PREHEADER_SEQ @@ -939,11 +1175,29 @@ vect_set_loop_condition (struct loop *loop, loop_vec_info loop_vinfo, tree niters, tree step, tree final_iv, bool niters_maybe_zero) { - gcond *cond_stmt; + gcond *cond_stmt = NULL; gcond *orig_cond = get_loop_exit_condition (loop); gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond); + bool masked_p = (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)); + bool speculation_p + = (loop_vinfo && LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)); - if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) + if (speculation_p) + { + /* Set the masks that control statements that cannot be speculatively + executed. */ + vect_set_nonspeculative_masks (loop_vinfo, &loop_cond_gsi); + + /* ...then add the statements themselves. */ + gimple_seq late_seq = LOOP_VINFO_NONSPECULATIVE_SEQ (loop_vinfo); + if (late_seq) + gsi_insert_seq_before (&loop_cond_gsi, late_seq, GSI_SAME_STMT); + + /* Set up the masks that control the speculative statements. */ + if (masked_p) + vect_set_speculative_masks (loop, loop_vinfo); + } + else if (masked_p) cond_stmt = vect_set_loop_condition_masked (loop, loop_vinfo, niters, final_iv, niters_maybe_zero, loop_cond_gsi); @@ -952,11 +1206,14 @@ vect_set_loop_condition (struct loop *loop, loop_vec_info loop_vinfo, final_iv, niters_maybe_zero, loop_cond_gsi); - /* Remove old loop exit test. */ - gsi_remove (&loop_cond_gsi, true); - free_stmt_vec_info (orig_cond); + if (!speculation_p) + { + /* Remove old loop exit test. 
*/ + gsi_remove (&loop_cond_gsi, true); + free_stmt_vec_info (orig_cond); + } - if (dump_enabled_p ()) + if (dump_enabled_p () && cond_stmt) { dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: "); dump_gimple_stmt (MSG_NOTE, TDF_SLIM, cond_stmt, 0); @@ -1644,13 +1901,15 @@ vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo, { struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo); tree var; - tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)); gimple_seq stmts = NULL, new_stmts = NULL; tree iters, iters_name; gimple *dr_stmt = DR_STMT (dr); stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt); tree vectype = STMT_VINFO_VECTYPE (stmt_info); unsigned int target_align = DR_TARGET_ALIGNMENT (dr); + tree niters_type = (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo) + ? size_type_node + : TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo))); if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0) { @@ -1829,6 +2088,12 @@ vect_prepare_for_masked_peels (loop_vec_info loop_vinfo) tree vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p) { + if (!LOOP_VINFO_NITERS (loop_vinfo)) + { + gcc_assert (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)); + return NULL; + } + tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo)); if (TREE_CODE (ni) == INTEGER_CST) return ni; @@ -2421,7 +2686,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, bool check_profitability, bool niters_no_overflow) { edge e, guard_e; - tree type = TREE_TYPE (niters), guard_cond; + tree guard_cond; basic_block guard_bb, guard_to; profile_probability prob_prolog, prob_vector, prob_epilog; int estimated_vf; @@ -2469,6 +2734,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, /* Generate the number of iterations for the prolog loop. We do this here so that we can also get the upper bound on the number of iterations. 
*/ + tree type = TREE_TYPE (niters); tree niters_prolog; int bound_prolog = 0; if (prolog_peeling) diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index e33a83bfa6b..c6269a95815 100644 --- a/gcc/tree-vect-loop.c +++ b/gcc/tree-vect-loop.c @@ -369,7 +369,12 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo) analyze_pattern_stmt = false; } + bool is_gcond = gimple_code (stmt) == GIMPLE_COND; + gcc_assert (!is_gcond + || LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)); + if (gimple_get_lhs (stmt) == NULL_TREE + && !is_gcond /* MASK_STORE has no lhs, but is ok. */ && (!is_gimple_call (stmt) || !gimple_call_internal_p (stmt) @@ -427,27 +432,31 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo) gcc_assert (!STMT_VINFO_DATA_REF (stmt_info)); if (gimple_call_internal_p (stmt, IFN_MASK_STORE)) scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3)); + else if (is_gcond) + scalar_type = TREE_TYPE (gimple_cond_lhs (stmt)); else scalar_type = TREE_TYPE (gimple_get_lhs (stmt)); /* Bool ops don't participate in vectorization factor computation. For comparison use compared types to compute a factor. 
*/ - if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type) - && is_gimple_assign (stmt) - && gimple_assign_rhs_code (stmt) != COND_EXPR) + if (is_gcond + || (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type) + && is_gimple_assign (stmt) + && gimple_assign_rhs_code (stmt) != COND_EXPR)) { if (STMT_VINFO_RELEVANT_P (stmt_info) || STMT_VINFO_LIVE_P (stmt_info)) mask_producers.safe_push (stmt_info); bool_result = true; - if (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) - == tcc_comparison + if (is_gimple_assign (stmt) + && (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) + == tcc_comparison) && !VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (gimple_assign_rhs1 (stmt)))) scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt)); - else + else if (TREE_CODE (scalar_type) == BOOLEAN_TYPE) { if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si)) { @@ -589,13 +598,28 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo) tree mask_type = NULL; stmt = STMT_VINFO_STMT (mask_producers[i]); + bool is_gcond = gimple_code (stmt) == GIMPLE_COND; + bool ops_are_booleans = true; if (is_gimple_assign (stmt) && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison && !VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (gimple_assign_rhs1 (stmt)))) { scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt)); + ops_are_booleans = false; + + } + else if (is_gcond + && TREE_CODE (TREE_TYPE (gimple_cond_lhs (stmt))) + != BOOLEAN_TYPE) + { + scalar_type = TREE_TYPE (gimple_cond_lhs (stmt)); + ops_are_booleans = false; + } + + if (!ops_are_booleans) + { mask_type = get_mask_type_for_scalar_type (scalar_type); if (!mask_type) @@ -1131,6 +1155,7 @@ _loop_vec_info::_loop_vec_info (struct loop *loop_in) slp_unrolling_factor (1), single_scalar_iteration_cost (0), vectorizable (false), + speculative_execution (false), can_fully_mask_p (true), fully_masked_p (false), peeling_for_gaps (false), @@ -1140,7 +1165,10 @@ _loop_vec_info::_loop_vec_info (struct loop *loop_in) has_mask_store (false), scalar_loop (NULL), 
orig_loop_info (NULL), - vect_addr_base_htab (31) + vect_addr_base_htab (31), + exit_test_mask (NULL_TREE), + exit_mask (NULL_TREE), + nonspeculative_seq (NULL) { /* Create/Update stmt_info for all stmts in the loop. */ basic_block *body = get_loop_body (loop); @@ -1252,6 +1280,7 @@ _loop_vec_info::~_loop_vec_info () free (bbs); release_vec_loop_masks (&masks); + release_vec_loop_masks (&nonspeculative_masks); loop->aux = NULL; } @@ -1296,22 +1325,40 @@ vect_verify_full_masking (loop_vec_info loop_vinfo) { struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); unsigned int min_ni_width; + unsigned HOST_WIDE_INT const_vf; - /* Get the maximum number of iterations that is representable - in the counter type. */ - tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo)); - widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1; + /* Get the number of bits needed to hold the number of iterations + as an unsigned value. */ + if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)) + { + /* For speculative loops, we only need to count the number of iterations + before the vector loop. */ + if (LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)) + { + unsigned int factor = vect_get_max_nscalars_per_iter (loop_vinfo); + min_ni_width = wi::min_precision (const_vf * factor, UNSIGNED); + } + else + min_ni_width = POINTER_SIZE; + } + else + { + /* Get the maximum number of iterations that is representable + in the counter type. */ + tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo)); + widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1; - /* Get a more refined estimate for the number of iterations. */ - widest_int max_back_edges; - if (max_loop_iterations (loop, &max_back_edges)) - max_ni = wi::smin (max_ni, max_back_edges + 1); + /* Get a more refined estimate for the number of iterations. 
*/ + widest_int max_back_edges; + if (max_loop_iterations (loop, &max_back_edges)) + max_ni = wi::smin (max_ni, max_back_edges + 1); - /* Account for rgroup masks, in which each bit is replicated N times. */ - max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo); + /* Account for rgroup masks, in which each bit is replicated N times. */ + max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo); - /* Work out how many bits we need to represent the limit. */ - min_ni_width = wi::min_precision (max_ni, UNSIGNED); + /* Work out how many bits we need to represent the limit. */ + min_ni_width = wi::min_precision (max_ni, UNSIGNED); + } /* Find a scalar mode for which WHILE_ULT is supported. */ opt_scalar_int_mode cmp_mode_iter; @@ -1672,7 +1719,8 @@ vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond, if (integer_zerop (*assumptions) || !*number_of_iterations - || chrec_contains_undetermined (*number_of_iterations)) + || (loop->inner + && chrec_contains_undetermined (*number_of_iterations))) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, @@ -1680,6 +1728,15 @@ vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond, "computed.\n"); return false; } + else if (!loop->inner + && chrec_contains_undetermined (*number_of_iterations)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "number of iterations cannot be computed, " + "relying upon speculative execution\n"); + return true; + } if (integer_zerop (*number_of_iterations)) { @@ -1706,6 +1763,21 @@ vect_analyze_loop_form (struct loop *loop) return NULL; loop_vec_info loop_vinfo = new _loop_vec_info (loop); + + if (number_of_iterations + && chrec_contains_undetermined (number_of_iterations)) + { + /* Nested loops are not supported for speculative execution. 
*/ + gcc_assert (!loop->inner); + + LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo) = true; + + /* Since we don't know what the number of iterations is, there seems little + point in having anything other than NULL. */ + number_of_iterations = NULL; + number_of_iterationsm1 = NULL; + } + LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1; LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations; LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations; @@ -2158,6 +2230,25 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal) } } + /* TODO: We can't currently support stores for speculative loops. */ + if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo) + && LOOP_VINFO_DATAREFS (loop_vinfo).length () > 0) + { + vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); + struct data_reference *dr; + unsigned int i; + + FOR_EACH_VEC_ELT (datarefs, i, dr) + if (!DR_IS_READ (dr)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "Stores not supported for speculative " + "loops.\n"); + return false; + } + } + /* Analyze the data references and also adjust the minimal vectorization factor according to the loads and stores. */ @@ -2259,7 +2350,8 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal) /* We don't expect to have to roll back to anything other than an empty set of rgroups. */ - gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ()); + gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty () + && LOOP_VINFO_NONSPECULATIVE_MASKS (loop_vinfo).is_empty ()); /* This is the point where we can re-start analysis with SLP forced off. 
*/ start_over: @@ -2337,6 +2429,19 @@ start_over: return false; } + if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) + && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) + && LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo) + && !LOOP_VINFO_NEEDS_NONSPECULATIVE_MASKS (loop_vinfo) + && !use_capped_vf (loop_vinfo)) + { + LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false; + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "No need to predicate speculative loops without " + "alignment peeling.\n"); + } + /* Decide whether to use a fully-masked loop for this vectorization factor. */ LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) @@ -2352,17 +2457,41 @@ start_over: "not using a fully-masked loop.\n"); } - if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) - && use_capped_vf (loop_vinfo)) + if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "Need to cap the runtime vectorization factor to " - HOST_WIDE_INT_PRINT_DEC " but cannot fully mask" - " the loop.\n", - LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)); - /* Undoing SLP might allow us to use a mask. */ - goto again; + if (LOOP_VINFO_NEEDS_NONSPECULATIVE_MASKS (loop_vinfo)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "Not vectorized: non-speculative operations " + "need a fully-masked loop.\n"); + return false; + } + + if (use_capped_vf (loop_vinfo)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "Need to cap the runtime vectorization factor to " + HOST_WIDE_INT_PRINT_DEC " but cannot fully mask" + " the loop.\n", + LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)); + /* Undoing SLP might allow us to use a mask. 
*/ + goto again; + } + } + + if (LOOP_VINFO_NEEDS_NONSPECULATIVE_MASKS (loop_vinfo)) + { + tree mask_type = vect_mask_type_for_speculation (loop_vinfo); + if (!direct_internal_fn_supported_p (IFN_BREAK_AFTER, mask_type, + OPTIMIZE_FOR_SPEED)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "Not vectorized: BREAK_AFTER not supported.\n"); + return false; + } } /* If epilog loop is required because of data accesses with gaps, @@ -2385,6 +2514,17 @@ start_over: } } + if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) + && LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo) + && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "Not supported: peeling speculative vectorization" + " without a fully-masked loop.\n"); + return false; + } + /* Check the costings of the loop make vectorizing worthwhile. */ res = vect_analyze_loop_costing (loop_vinfo); if (res < 0) @@ -2402,7 +2542,9 @@ start_over: th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo); unsigned HOST_WIDE_INT const_vf; - if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) + if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)) + LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false; + else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) /* The main loop handles all iterations. */ LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false; else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) @@ -2448,7 +2590,8 @@ start_over: enough for both peeled prolog loop and vector loop. This check can be merged along with threshold check of loop versioning, so increase threshold for this case if necessary. */ - if (LOOP_REQUIRES_VERSIONING (loop_vinfo)) + if (LOOP_REQUIRES_VERSIONING (loop_vinfo) + && !LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)) { poly_uint64 niters_th = 0; @@ -2574,6 +2717,7 @@ again: = init_cost (LOOP_VINFO_LOOP (loop_vinfo)); /* Reset accumulated rgroup information. 
*/ release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo)); + release_vec_loop_masks (&LOOP_VINFO_NONSPECULATIVE_MASKS (loop_vinfo)); /* Reset assorted flags. */ LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false; LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false; @@ -6147,11 +6291,19 @@ vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi, SSA_NAME_DEF_STMT (reduc_var) = new_stmt; /* For chained SLP stmt is the first statement in the group and gsi points to the last statement in the group. For non SLP stmt - points to the same location as gsi. In either case tmp_gsi and gsi - should both point to the same insertion point. */ - gcc_assert (scalar_dest_def == gsi_stmt (*gsi)); - vect_finish_replace_stmt (scalar_dest_def, new_stmt); - } + points to the same location as gsi. */ + if (scalar_dest_def == gsi_stmt (*gsi)) + vect_finish_replace_stmt (scalar_dest_def, new_stmt); + else + { + /* In this case we're moving the definition to later in the + block. That doesn't matter because the only uses of the + lhs are in phi statements. */ + gimple_stmt_iterator old_gsi = gsi_for_stmt (scalar_dest_def); + gsi_remove (&old_gsi, true); + vect_finish_stmt_generation (stmt, new_stmt, gsi); + } + } else { reduc_var = make_ssa_name (reduc_var, new_stmt); @@ -7144,7 +7296,13 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi, vec_num = 1; internal_fn cond_fn = get_conditional_internal_fn (code, scalar_type); + + /* In a speculative loop, the update must be predicated on the + nonspeculative masks, so that we don't include speculatively + loaded elements from beyond the end of the original loop. */ vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo); + if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)) + masks = &LOOP_VINFO_NONSPECULATIVE_MASKS (loop_vinfo); if (!vec_stmt) /* transformation not required. 
*/ { @@ -7190,6 +7348,12 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi, bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo); + gimple_stmt_iterator nonspeculative_gsi + = gsi_end (LOOP_VINFO_NONSPECULATIVE_SEQ (loop_vinfo)); + if (masked_loop_p + && masks == &LOOP_VINFO_NONSPECULATIVE_MASKS (loop_vinfo)) + gsi = &nonspeculative_gsi; + if (reduction_type == FOLD_LEFT_REDUCTION) return vectorize_fold_left_reduction (stmt, gsi, vec_stmt, slp_node, reduc_def_stmt, code, @@ -8036,6 +8200,37 @@ vectorizable_live_operation (gimple *stmt, } } + if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)) + { + /* Need to construct the type because on the checking stage we don't + yet have the speculative exit phi. */ + tree mask_type = build_same_sized_truth_vector_type (vectype); + + if (!direct_internal_fn_supported_p (IFN_BREAK_AFTER, mask_type, + OPTIMIZE_FOR_SPEED)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "not vectorized: break after not supported.\n"); + return false; + } + if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype, + OPTIMIZE_FOR_SPEED)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "not vectorized: extract last not supported.\n"); + return false; + } + if (ncopies > 1) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "not vectorized: ncopies is greater than 1.\n"); + return false; + } + } + if (!vec_stmt) { /* No transformation required. 
*/ @@ -8122,19 +8317,37 @@ vectorizable_live_operation (gimple *stmt, gimple_seq stmts = NULL; tree new_tree; - if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) + if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) + || LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)) { + tree mask; + if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)) + { + gcc_assert (ncopies == 1); + tree orig_mask = LOOP_VINFO_EXIT_MASK (loop_vinfo); + tree all_ones = build_minus_one_cst (TREE_TYPE (orig_mask)); + + mask = make_ssa_name (TREE_TYPE (orig_mask)); + gcall *new_stmt = gimple_build_call_internal (IFN_BREAK_AFTER, 2, + all_ones, orig_mask); + gimple_call_set_lhs (new_stmt, mask); + gimple_seq_add_stmt (&stmts, new_stmt); + } + else + { + gcc_assert (ncopies == 1 && !slp_node); + mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo), + 1, vectype, 0); + } + /* Emit: SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK> where VEC_LHS is the vectorized live-out result and MASK is the loop mask for the final iteration. */ - gcc_assert (ncopies == 1 && !slp_node); tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info)); tree scalar_res = make_ssa_name (scalar_type); - tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo), - 1, vectype, 0); gcall *new_stmt = gimple_build_call_internal (IFN_EXTRACT_LAST, 2, mask, vec_lhs); gimple_call_set_lhs (new_stmt, scalar_res); @@ -8226,6 +8439,9 @@ vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt) static bool loop_niters_no_overflow (loop_vec_info loop_vinfo) { + if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)) + return false; + /* Constant case. */ if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) { @@ -8292,6 +8508,14 @@ vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks, rgm->max_nscalars_per_iter = nscalars_per_iter; rgm->mask_type = build_same_sized_truth_vector_type (vectype); } + + /* Ensure that the required nonspeculative masks are a subset of + the speculative ones. 
This has two benefits: it means that we + can test for target support in one go, and that we can AND in + the speculative masks when setting up the nonspeculative ones. */ + if (masks == &LOOP_VINFO_NONSPECULATIVE_MASKS (loop_vinfo)) + vect_record_loop_mask (loop_vinfo, &LOOP_VINFO_MASKS (loop_vinfo), + nvectors, vectype); } /* Given a complete set of masks MASKS, extract mask number INDEX @@ -8343,6 +8567,52 @@ vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks, return mask; } +/* Get the mask to use for loads in LOOP_VINFO, or null if loads don't + need to be masked. The arguments are as for vect_get_loop_mask. */ + +tree +vect_get_load_mask (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi, + unsigned int nvectors, tree vectype, unsigned int index) +{ + /* At present all loads in a speculative loop are speculative. + They need to be masked iff we are using masking to reach + alignment. */ + if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo) + && !LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo)) + return NULL_TREE; + + return vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo), + nvectors, vectype, index); +} + +/* Return the mask type to use when computing which scalar iterations + are active in speculative loop LOOP_VINFO. */ + +tree +vect_mask_type_for_speculation (loop_vec_info loop_vinfo) +{ + gcc_checking_assert (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)); + return build_truth_vector_type (LOOP_VINFO_VECT_FACTOR (loop_vinfo), + current_vector_size); +} + +/* Calculate the scalar number of iterations in NIM from its mask, + adding any new statements to SEQ. Return the number of iterations. 
*/ + +tree +vect_get_niters_from_mask (gimple_seq *seq, vec_niters_and_mask *nim) +{ + if (nim->niters == NULL_TREE) + { + nim->niters = make_temp_ssa_name (sizetype, NULL, "niters"); + gcall *call = gimple_build_call_internal (IFN_MASK_POPCOUNT, + 1, nim->mask); + gimple_call_set_lhs (call, nim->niters); + gimple_seq_add_stmt (seq, call); + } + return nim->niters; +} + /* Scale profiling counters by estimation for LOOP which is vectorized by factor VF. */ @@ -8419,7 +8689,8 @@ vect_transform_loop (loop_vec_info loop_vinfo) checking is pointless, too. */ th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo); if (th >= vect_vf_for_cost (loop_vinfo) - && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) + && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) + && !LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)) { if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, @@ -8483,8 +8754,10 @@ vect_transform_loop (loop_vec_info loop_vinfo) &step_vector, &niters_vector_mult_vf, th, check_profitability, niters_no_overflow); - if (niters_vector == NULL_TREE) + if (niters_vector == NULL_TREE + && !LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)) { + gcc_assert (!LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)); if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) && must_eq (lowest_vf, vf)) @@ -8511,6 +8784,16 @@ vect_transform_loop (loop_vec_info loop_vinfo) /* This will deal with any possible peeling. */ vect_prepare_for_masked_peels (loop_vinfo); + if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)) + { + tree mask_type = vect_mask_type_for_speculation (loop_vinfo); + /* Create a dummy definition of the exit mask. We'll fill in the + real definition later. */ + tree mask = make_temp_ssa_name (mask_type, NULL, "exit_mask"); + SSA_NAME_DEF_STMT (mask) = gimple_build_nop (); + LOOP_VINFO_EXIT_MASK (loop_vinfo) = mask; + } + /* FORNOW: the vectorizer supports only loops which body consist of one basic block (header + empty latch). 
When the vectorizer will support more involved loop forms, the order by which the BBs are @@ -8770,9 +9053,18 @@ vect_transform_loop (loop_vec_info loop_vinfo) } } /* BBs in loop */ + /* Provide the real definition of LOOP_VINFO_EXIT_MASK. */ + if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)) + { + tree imask = LOOP_VINFO_EXIT_TEST_MASK (loop_vinfo); + tree omask = LOOP_VINFO_EXIT_MASK (loop_vinfo); + gphi *new_phi = create_phi_node (omask, single_exit (loop)->dest); + add_phi_arg (new_phi, imask, single_exit (loop), UNKNOWN_LOCATION); + } + /* The vectorization factor is always > 1, so if we use an IV increment of 1. a zero NITERS becomes a nonzero NITERS_VECTOR. */ - if (integer_onep (step_vector)) + if (step_vector && integer_onep (step_vector)) niters_no_overflow = true; vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector, niters_vector_mult_vf, !niters_no_overflow); diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c index c0a87dc9275..36443cff685 100644 --- a/gcc/tree-vect-stmts.c +++ b/gcc/tree-vect-stmts.c @@ -304,7 +304,7 @@ is_simple_and_all_uses_invariant (gimple *stmt, loop_vec_info loop_vinfo) A stmt is considered "relevant for vectorization" if: - it has uses outside the loop. - it has vdefs (it alters memory). - - control stmts in the loop (except for the exit condition). + - control stmts in the loop (including the exit condition). CHECKME: what other side effects would the vectorizer allow? */ @@ -323,8 +323,9 @@ vect_stmt_relevant_p (gimple *stmt, loop_vec_info loop_vinfo, /* cond stmt other than loop exit cond. */ if (is_ctrl_stmt (stmt) - && STMT_VINFO_TYPE (vinfo_for_stmt (stmt)) - != loop_exit_ctrl_vec_info_type) + && (STMT_VINFO_TYPE (vinfo_for_stmt (stmt)) + != loop_exit_ctrl_vec_info_type + || LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo))) *relevant = vect_used_in_scope; /* changing memory. 
*/ @@ -688,6 +689,12 @@ vect_mark_stmts_to_be_vectorized (loop_vec_info loop_vinfo) } } + /* The exit condition is relevant for speculative loops. */ + if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo) + && !vect_stmt_relevant_p (get_loop_exit_condition (loop), + loop_vinfo, &relevant, &live_p)) + gcc_unreachable (); + /* 2. Process_worklist */ while (worklist.length () > 0) { @@ -2137,7 +2144,8 @@ get_group_load_store_type (gimple *stmt, tree vectype, bool slp, bool can_overrun_p = (!masked_p && vls_type == VLS_LOAD && loop_vinfo - && !loop->inner); + && !loop->inner + && !LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)); /* There can only be a gap at the end of the group if the stride is known at compile time. */ @@ -4506,6 +4514,30 @@ vect_create_vectorized_promotion_stmts (vec<tree> *vec_oprnds0, *vec_oprnds0 = vec_tmp; } +/* Pack the masks in MASKS to a single mask and return it. Insert any + new statements before GSI. Leave MASKS with just the returned value + on exit. */ + +static tree +vect_demote_masks (gimple_stmt_iterator *gsi, vec<tree> *masks) +{ + while (masks->length () > 1) + { + unsigned int nresults = masks->length () / 2; + tree dest_type = vect_double_mask_nunits (TREE_TYPE ((*masks)[0])); + for (unsigned int i = 0; i < nresults; ++i) + { + tree dest = make_ssa_name (dest_type); + gimple *stmt = gimple_build_assign (dest, VEC_PACK_TRUNC_EXPR, + (*masks)[i * 2], + (*masks)[i * 2 + 1]); + gsi_insert_before (gsi, stmt, GSI_SAME_STMT); + (*masks)[i] = dest; + } + masks->truncate (nresults); + } + return (*masks)[0]; +} /* Check if STMT performs a conversion operation, that can be vectorized. 
If VEC_STMT is also passed, vectorize the STMT: create a vectorized @@ -6203,6 +6235,7 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, if (loop_vinfo) { + gcc_assert (!LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)); loop = LOOP_VINFO_LOOP (loop_vinfo); vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); } @@ -7335,6 +7368,14 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, return false; } + if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "speculative mask loads not supported\n"); + return false; + } + int mask_index = internal_fn_mask_index (ifn); if (mask_index >= 0) { @@ -7370,12 +7411,24 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, gcc_assert (ncopies >= 1); /* FORNOW. This restriction should be relaxed. */ - if (nested_in_vect_loop && ncopies > 1) + if (ncopies > 1) { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "multiple types in nested loop.\n"); - return false; + if (nested_in_vect_loop) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "multiple types in nested loop.\n"); + return false; + } + + if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "multiple copies not supported for speculative " + "loops.\n"); + return false; + } } /* Invalidate assumptions made by dependence analysis when vectorization @@ -7988,7 +8041,6 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, tree vec_mask = NULL_TREE; prev_stmt_info = NULL; poly_uint64 group_elt = 0; - vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo); for (j = 0; j < ncopies; j++) { /* 1. Create the vector or array pointer update chain. 
*/ @@ -8079,7 +8131,8 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, tree final_mask = NULL_TREE; if (masked_loop_p) - final_mask = vect_get_loop_mask (gsi, masks, ncopies, vectype, j); + final_mask = vect_get_load_mask (loop_vinfo, gsi, ncopies, + vectype, j); if (vec_mask) final_mask = prepare_load_store_mask (mask_vectype, final_mask, vec_mask, gsi); @@ -8126,7 +8179,8 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, tree final_mask = NULL_TREE; if (masked_loop_p && memory_access_type != VMAT_INVARIANT) - final_mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies, + final_mask = vect_get_load_mask (loop_vinfo, gsi, + vec_num * ncopies, vectype, vec_num * j + i); if (vec_mask) final_mask = prepare_load_store_mask (mask_vectype, final_mask, @@ -8162,10 +8216,10 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, break; } - align = DR_TARGET_ALIGNMENT (dr); if (alignment_support_scheme == dr_aligned) { gcc_assert (aligned_access_p (first_dr)); + align = DR_TARGET_ALIGNMENT (first_dr); misalign = 0; } else if (DR_MISALIGNMENT (first_dr) == -1) @@ -8174,7 +8228,10 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, misalign = 0; } else - misalign = DR_MISALIGNMENT (first_dr); + { + align = DR_TARGET_ALIGNMENT (first_dr); + misalign = DR_MISALIGNMENT (first_dr); + } if (dataref_offset == NULL_TREE && TREE_CODE (dataref_ptr) == SSA_NAME) set_ptr_info_alignment (get_ptr_info (dataref_ptr), @@ -8934,12 +8991,11 @@ vectorizable_comparison (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, tree reduc_def, slp_tree slp_node) { - tree lhs, rhs1, rhs2; + tree rhs1, rhs2; stmt_vec_info stmt_info = vinfo_for_stmt (stmt); tree vectype1 = NULL_TREE, vectype2 = NULL_TREE; tree vectype = STMT_VINFO_VECTYPE (stmt_info); tree vec_rhs1 = NULL_TREE, vec_rhs2 = NULL_TREE; - tree new_temp; loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO 
(stmt_info); enum vect_def_type dts[2] = {vect_unknown_def_type, vect_unknown_def_type}; int ndts = 2; @@ -8983,16 +9039,55 @@ vectorizable_comparison (gimple *stmt, gimple_stmt_iterator *gsi, return false; } - if (!is_gimple_assign (stmt)) - return false; + if (is_gimple_assign (stmt)) + { + code = gimple_assign_rhs_code (stmt); + rhs1 = gimple_assign_rhs1 (stmt); + rhs2 = gimple_assign_rhs2 (stmt); + } + else if (gimple_code (stmt) == GIMPLE_COND) + { + gcc_assert (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)); - code = gimple_assign_rhs_code (stmt); + /* TODO: Support more complex loops with more than one gcond stmt. */ + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); + gcc_assert (stmt == get_loop_exit_condition (loop)); - if (TREE_CODE_CLASS (code) != tcc_comparison) + rhs1 = gimple_cond_lhs (stmt); + rhs2 = gimple_cond_rhs (stmt); + + code = gimple_cond_code (stmt); + edge exit_edge = single_exit (loop); + if (exit_edge->flags & EDGE_FALSE_VALUE) + { + /* We want to invert the code and generate a mask such that if any + bit is true the exit condition is met. */ + bool honor_nans = FLOAT_TYPE_P (TREE_TYPE (rhs1)); + code = invert_tree_comparison (code, honor_nans); + if (code == ERROR_MARK) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "Cannot invert condition code. 
Loop cannot " + "be speculatively executed.\n"); + return false; + } + } + + if (optab_handler (cbranch_optab, TYPE_MODE (vectype)) + == CODE_FOR_nothing) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "Target does not support testing a mask.\n"); + return false; + } + } + else return false; - rhs1 = gimple_assign_rhs1 (stmt); - rhs2 = gimple_assign_rhs2 (stmt); + if (TREE_CODE_CLASS (code) != tcc_comparison) + return false; if (!vect_is_simple_use (rhs1, stmt_info->vinfo, &def_stmt, &dts[0], &vectype1)) @@ -9070,6 +9165,17 @@ vectorizable_comparison (gimple *stmt, gimple_stmt_iterator *gsi, STMT_VINFO_TYPE (stmt_info) = comparison_vec_info_type; vect_model_simple_cost (stmt_info, ncopies * (1 + (bitop2 != NOP_EXPR)), dts, ndts, NULL, NULL); + + /* Speculative loops need to AND the comparison result with the + mask of active values. */ + if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) + && LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)) + { + tree final_type = vect_mask_type_for_speculation (loop_vinfo); + vect_record_loop_mask (loop_vinfo, &LOOP_VINFO_MASKS (loop_vinfo), + 1, final_type); + } + if (bitop1 == NOP_EXPR) return expand_vec_cmp_expr_p (vectype, mask_type, code); else @@ -9099,8 +9205,26 @@ vectorizable_comparison (gimple *stmt, gimple_stmt_iterator *gsi, } /* Handle def. */ - lhs = gimple_assign_lhs (stmt); - mask = vect_create_destination_var (lhs, mask_type); + if (is_gimple_assign (stmt)) + { + tree lhs = gimple_assign_lhs (stmt); + mask = vect_create_destination_var (lhs, mask_type); + } + else + mask = NULL_TREE; + + bool masked_speculative_p + = (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo) + && LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo)); + + /* Pick an array of masks to use as the comparison results that feed + a GIMPLE_COND. If all input elements are valid, we can operate + directly on the exit masks array. 
If masking is needed, first + build a temporary array of unmasked results and then apply the + mask to it. + + This is ignored (and cheap) if the statement isn't a GIMPLE_COND. */ + auto_vec<tree, 16> cmp_results; /* Handle cmp expr. */ for (j = 0; j < ncopies; j++) @@ -9144,34 +9268,42 @@ vectorizable_comparison (gimple *stmt, gimple_stmt_iterator *gsi, { vec_rhs2 = vec_oprnds1[i]; - new_temp = make_ssa_name (mask); + tree cmp_res = (mask != NULL_TREE + ? make_ssa_name (mask) + : make_ssa_name (mask_type)); if (bitop1 == NOP_EXPR) { - new_stmt = gimple_build_assign (new_temp, code, + new_stmt = gimple_build_assign (cmp_res, code, vec_rhs1, vec_rhs2); vect_finish_stmt_generation (stmt, new_stmt, gsi); } else { + tree bitop1_res = (bitop2 == NOP_EXPR + ? cmp_res + : make_ssa_name (TREE_TYPE (cmp_res))); if (bitop1 == BIT_NOT_EXPR) - new_stmt = gimple_build_assign (new_temp, bitop1, vec_rhs2); + new_stmt = gimple_build_assign (bitop1_res, bitop1, vec_rhs2); else - new_stmt = gimple_build_assign (new_temp, bitop1, vec_rhs1, + new_stmt = gimple_build_assign (bitop1_res, bitop1, vec_rhs1, vec_rhs2); vect_finish_stmt_generation (stmt, new_stmt, gsi); if (bitop2 != NOP_EXPR) { - tree res = make_ssa_name (mask); if (bitop2 == BIT_NOT_EXPR) - new_stmt = gimple_build_assign (res, bitop2, new_temp); + new_stmt = gimple_build_assign (cmp_res, bitop2, + bitop1_res); else - new_stmt = gimple_build_assign (res, bitop2, vec_rhs1, - new_temp); + new_stmt = gimple_build_assign (cmp_res, bitop2, + vec_rhs1, bitop1_res); vect_finish_stmt_generation (stmt, new_stmt, gsi); } } + if (slp_node) SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt); + + cmp_results.safe_push (cmp_res); } if (slp_node) @@ -9188,6 +9320,42 @@ vectorizable_comparison (gimple *stmt, gimple_stmt_iterator *gsi, vec_oprnds0.release (); vec_oprnds1.release (); + if (gimple_code (stmt) == GIMPLE_COND) + { + gcc_assert (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)); + + struct loop *loop = LOOP_VINFO_LOOP 
(loop_vinfo); + gcond *cond = get_loop_exit_condition (loop); + gcc_assert (cond); + gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (cond); + + tree cmp_res = vect_demote_masks (&loop_cond_gsi, &cmp_results); + mask_type = TREE_TYPE (cmp_res); + if (masked_speculative_p) + { + /* Work out which elements of the unmasked result are valid. */ + mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo), + 1, mask_type, 0); + + /* Get the mask of values that actually matter. */ + tree masked_res = make_ssa_name (mask_type); + gimple *tmp_stmt = gimple_build_assign (masked_res, BIT_AND_EXPR, + cmp_res, mask); + gsi_insert_before (&loop_cond_gsi, tmp_stmt, GSI_SAME_STMT); + cmp_res = masked_res; + } + LOOP_VINFO_EXIT_TEST_MASK (loop_vinfo) = cmp_res; + + /* Get a boolean result that tells us whether to iterate. It's easier + to modify the condition in-place than to generate a new one and + delete the old one. */ + edge exit_edge = single_exit (loop); + tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? NE_EXPR : EQ_EXPR; + tree zero_mask = build_zero_cst (mask_type); + gimple_cond_set_condition (cond, code, cmp_res, zero_mask); + update_stmt (cond); + } + return true; } diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 8073ba05a83..2afafda6b25 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -414,7 +414,8 @@ typedef struct _loop_vec_info : public vec_info { vec_niters_and_mask cap; /* The masks that a fully-masked loop should use to avoid operating - on inactive scalars. */ + on inactive scalars. In a speculative loop, these masks control + the operations that can be executed speculatively. */ vec_loop_masks masks; /* If we are using a loop mask to align memory addresses, this variable @@ -489,6 +490,9 @@ typedef struct _loop_vec_info : public vec_info { /* Is the loop vectorizable? */ bool vectorizable; + /* Is this a speculative loop? 
*/ + bool speculative_execution; + /* Records whether we still have the option of using a fully-masked loop. */ bool can_fully_mask_p; @@ -546,6 +550,22 @@ typedef struct _loop_vec_info : public vec_info { /* A map from X to a precomputed gimple_val containing CAPPED_VECTORIZATION_FACTOR * X. */ hash_map<tree, tree> vf_mult_map; + + /* In a speculative loop, this is the result of the exit comparison. + It is a vector mask with one element for each scalar iteration. */ + tree exit_test_mask; + + /* A value equal to EXIT_TEST_MASK for use outside the loop. */ + tree exit_mask; + + /* In a speculative loop, these masks are used to control operations + that cannot be speculatively executed. */ + vec_loop_masks nonspeculative_masks; + + /* Statements in a speculative loop that depend on nonspeculative masks. + These statements can only be executed after the exit condition has + been evaluated. */ + gimple_seq nonspeculative_seq; } *loop_vec_info; /* Access Functions. */ @@ -599,6 +619,14 @@ typedef struct _loop_vec_info : public vec_info { #define LOOP_VINFO_ORIG_LOOP_INFO(L) (L)->orig_loop_info #define LOOP_VINFO_ADDR_CACHE(L) (L)->vect_addr_base_htab #define LOOP_VINFO_VF_MULT_MAP(L) (L)->vf_mult_map +#define LOOP_VINFO_SPECULATIVE_EXECUTION(L) (L)->speculative_execution +#define LOOP_VINFO_EXIT_TEST_MASK(L) (L)->exit_test_mask +#define LOOP_VINFO_EXIT_MASK(L) (L)->exit_mask +#define LOOP_VINFO_NONSPECULATIVE(L) (L)->nonspeculative +#define LOOP_VINFO_NEEDS_NONSPECULATIVE_MASKS(L) \ + (!(L)->nonspeculative_masks.is_empty ()) +#define LOOP_VINFO_NONSPECULATIVE_MASKS(L) (L)->nonspeculative_masks +#define LOOP_VINFO_NONSPECULATIVE_SEQ(L) (L)->nonspeculative_seq #define LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT(L) \ ((L)->may_misalign_stmts.length () > 0) @@ -1625,6 +1653,10 @@ extern void vect_record_loop_mask (loop_vec_info, vec_loop_masks *, unsigned int, tree); extern tree vect_get_loop_mask (gimple_stmt_iterator *, vec_loop_masks *, unsigned int, tree, unsigned int); 
+extern tree vect_get_load_mask (loop_vec_info, gimple_stmt_iterator *, + unsigned int, tree, unsigned int); +extern tree vect_mask_type_for_speculation (loop_vec_info); +extern tree vect_get_niters_from_mask (gimple_seq *, vec_niters_and_mask *); /* Drive for loop transformation stage. */ extern struct loop *vect_transform_loop (loop_vec_info); |