diff options
author | Richard Sandiford <richard.sandiford@linaro.org> | 2017-10-08 12:29:08 +0100 |
---|---|---|
committer | Richard Sandiford <richard.sandiford@linaro.org> | 2017-11-20 16:01:23 +0000 |
commit | f8d18515fd0e87fe9b68a23e6d73b80064baec97 (patch) | |
tree | bf7c7a9d43942db218b3786c21fea82242787adf | |
parent | f2ba9afa2bfb21956959a5ab48be1ef1ce2e6f2c (diff) | |
download | gcc-f8d18515fd0e87fe9b68a23e6d73b80064baec97.tar.gz |
Allow capped vectorisation factors
[Branch only patch -- not intended for trunk in its current state]
This patch allows the controlling mask for a loop to be clamped to
a compile-time maximum. We can then vectorise code that has a
known dependence distance, even if it is (or might be) smaller
than a vector.
This is almost ready for trunk, but I'd prefer to do some more
checking first.
-rw-r--r-- | gcc/config/aarch64/aarch64-sve.md | 20 | ||||
-rw-r--r-- | gcc/config/aarch64/aarch64.md | 1 | ||||
-rw-r--r-- | gcc/internal-fn.def | 3 | ||||
-rw-r--r-- | gcc/optabs.def | 2 | ||||
-rw-r--r-- | gcc/testsuite/gcc.dg/vect/slp-28.c | 4 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/sve_cap_1.c | 44 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/sve_cap_1_run.c | 37 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/sve_cap_2.c | 53 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/sve_cap_2_run.c | 44 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/sve_cap_3.c | 53 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/sve_cap_3_run.c | 44 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/sve_cap_4.c | 42 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/sve_cap_4_run.c | 37 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/sve_cap_5.c | 24 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/sve_cap_5_run.c | 40 | ||||
-rw-r--r-- | gcc/tree-ssa-loop-ivopts.c | 19 | ||||
-rw-r--r-- | gcc/tree-vect-data-refs.c | 7 | ||||
-rw-r--r-- | gcc/tree-vect-loop-manip.c | 102 | ||||
-rw-r--r-- | gcc/tree-vect-loop.c | 134 | ||||
-rw-r--r-- | gcc/tree-vect-stmts.c | 80 | ||||
-rw-r--r-- | gcc/tree-vectorizer.h | 35 |
21 files changed, 751 insertions, 74 deletions
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index 0ab9c7b0ae8..5d84b7fc595 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -2379,6 +2379,26 @@ "<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %2.<Vetype>" ) +(define_expand "mask_popcount<mode>" + [(set (match_operand:DI 0 "register_operand") + (unspec:DI [(match_dup 2) + (match_operand:PRED_ALL 1 "register_operand")] + UNSPEC_CNTP))] + "TARGET_SVE" + { + operands[2] = force_reg (<MODE>mode, CONSTM1_RTX (<MODE>mode)); + } +) + +(define_insn "*mask_popcount<mode>" + [(set (match_operand:DI 0 "register_operand" "=r") + (unspec:DI [(match_operand:PRED_ALL 1 "register_operand" "Upa") + (match_operand:PRED_ALL 2 "register_operand" "Upa")] + UNSPEC_CNTP))] + "TARGET_SVE" + "cntp\t%0, %1, %2.<Vetype>" +) + ;; Shift an SVE vector left and insert a scalar into element 0. (define_insn "vec_shl_insert_<mode>" [(set (match_operand:SVE_ALL 0 "register_operand" "=w, w") diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index 0a4c90c1a39..581e6a753d2 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -167,6 +167,7 @@ UNSPEC_INSR UNSPEC_CLASTB UNSPEC_FADDA + UNSPEC_CNTP ]) (define_c_enum "unspecv" [ diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def index 3cc62dd68a6..f28519837f2 100644 --- a/gcc/internal-fn.def +++ b/gcc/internal-fn.def @@ -125,6 +125,9 @@ DEF_INTERNAL_OPTAB_FN (STORE_LANES, ECF_CONST, vec_store_lanes, store_lanes) DEF_INTERNAL_OPTAB_FN (MASK_STORE_LANES, 0, vec_mask_store_lanes, mask_store_lanes) +DEF_INTERNAL_OPTAB_FN (MASK_POPCOUNT, ECF_CONST | ECF_NOTHROW, + mask_popcount, unary) + DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while) DEF_INTERNAL_OPTAB_FN (VEC_INTERLEAVE_LO, ECF_CONST | ECF_NOTHROW, diff --git a/gcc/optabs.def b/gcc/optabs.def index 523906aa198..bf67dfca132 100644 --- a/gcc/optabs.def +++ b/gcc/optabs.def @@ -397,4 +397,6 @@ OPTAB_D (mask_scatter_store_optab, "mask_scatter_store$a") OPTAB_DC (vec_duplicate_optab, "vec_duplicate$a", VEC_DUPLICATE) OPTAB_DC (vec_series_optab, "vec_series$a", VEC_SERIES) + +OPTAB_D (mask_popcount_optab, "mask_popcount$a") OPTAB_D (vec_shl_insert_optab, "vec_shl_insert_$a") diff --git a/gcc/testsuite/gcc.dg/vect/slp-28.c b/gcc/testsuite/gcc.dg/vect/slp-28.c index 7778bad4465..4211b94ad7f 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-28.c +++ b/gcc/testsuite/gcc.dg/vect/slp-28.c @@ -88,6 +88,6 @@ int main (void) return 0; } -/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve_cap_1.c b/gcc/testsuite/gcc.target/aarch64/sve_cap_1.c new file mode 100644 index 00000000000..1051fd1f7f0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve_cap_1.c @@ -0,0 +1,44 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -fno-inline -march=armv8-a+sve -fno-vect-cost-model" } */ + +#define LOOP(TYPE) \ + void \ + f_##TYPE##_1 (TYPE *a, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + a[i] += a[i - 1]; \ + } \ + \ + void \ + f_##TYPE##_2 (TYPE *a, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + a[i] += a[i - 2]; \ + } \ + \ + void \ + f_##TYPE##_5 (TYPE *a, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + a[i] += a[i - 5]; \ + } + +LOOP (char) +LOOP (short) +LOOP (float) +LOOP (double) + +/* { dg-final { scan-assembler-times {\tstrb\t} 1 } } */ +/* { dg-final { scan-assembler-times {\tstrh\t} 1 } } */ +/* { dg-final { scan-assembler-times {\tstr\ts[0-9]+} 1 } } */ +/* { dg-final { scan-assembler-times {\tstr\td[0-9]+} 1 } } */ + +/* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.b, p[0-7]/z, \[x[0-9]+, x[0-9]+\]} 4 } } */ +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.h, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 1\]} 4 } } */ +/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 2\]} 4 } } */ +/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 3\]} 4 } } */ + +/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.b, p[0-7], \[x[0-9]+, x[0-9]+\]} 2 } } */ +/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.h, p[0-7], \[x[0-9]+, x[0-9]+, lsl 1\]} 2 } } */ +/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, x[0-9]+, lsl 2\]} 2 } } */ +/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, x[0-9]+, lsl 3\]} 2 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve_cap_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve_cap_1_run.c new file mode 100644 index 00000000000..0f280b04f0b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve_cap_1_run.c @@ -0,0 +1,37 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O2 -ftree-vectorize -fno-inline -march=armv8-a+sve -fno-vect-cost-model" } */ + +#include "sve_cap_1.c" + +#define N 129 + +#define F(X) (((X) % 5) * (X)) + +#define TEST_LOOP(TYPE, M) \ + { \ + TYPE a[N + M]; \ + for (int i = 0; i < N + M; ++i) \ + a[i] = F (i); \ + f_##TYPE##_##M (a + M, N); \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE x = a[i]; \ + TYPE y = F (i + M); \ + if (a[i + M] != (TYPE) (x + y)) \ + __builtin_abort (); \ + } \ + } + +#define TEST_LOOPS(TYPE) \ + TEST_LOOP (TYPE, 1) \ + TEST_LOOP (TYPE, 2) \ + TEST_LOOP (TYPE, 5) + +int +main (void) +{ + TEST_LOOPS (char); + TEST_LOOPS (short); + TEST_LOOPS (float); + TEST_LOOPS (double); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve_cap_2.c b/gcc/testsuite/gcc.target/aarch64/sve_cap_2.c new file mode 100644 index 00000000000..d46a08c2ee1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve_cap_2.c @@ -0,0 +1,53 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -fno-inline -march=armv8-a+sve -fno-vect-cost-model" } */ + +#define LOOP(TYPE) \ + void __attribute__ ((weak)) \ + f_##TYPE##_1 (TYPE *a, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + { \ + a[i * 2] += a[i * 2 - 2]; \ + a[i * 2 + 1] += a[i * 2 - 1]; \ + } \ + } \ + \ + void __attribute__ ((weak)) \ + f_##TYPE##_2 (TYPE *a, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + { \ + a[i * 2] += a[i * 2 - 4]; \ + a[i * 2 + 1] += a[i * 2 - 3]; \ + } \ + } \ + \ + void __attribute__ ((weak)) \ + f_##TYPE##_5 (TYPE *a, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + { \ + a[i * 2] += a[i * 2 - 10]; \ + a[i * 2 + 1] += a[i * 2 - 9]; \ + } \ + } + +LOOP (char) +LOOP (short) +LOOP (float) +LOOP (double) + +/* { dg-final { scan-assembler-times {\tstrb\t} 2 } } */ +/* { dg-final { scan-assembler-times {\tstrh\t} 2 } } */ +/* { dg-final { scan-assembler-times {\tstp\ts[0-9]+} 1 } } */ +/* { dg-final { scan-assembler-times {\tstp\td[0-9]+} 1 } } */ + +/* { dg-final { scan-assembler-times {\tld1b\t} 4 } } */ +/* { dg-final { scan-assembler-times {\tld1h\t} 4 } } */ +/* { dg-final { scan-assembler-times {\tld1w\t} 4 } } */ +/* { dg-final { scan-assembler-times {\tld1d\t} 4 } } */ + +/* { dg-final { scan-assembler-times {\tst1b\t} 2 } } */ +/* { dg-final { scan-assembler-times {\tst1h\t} 2 } } */ +/* { dg-final { scan-assembler-times {\tst1w\t} 2 } } */ +/* { dg-final { scan-assembler-times {\tst1d\t} 2 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve_cap_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve_cap_2_run.c new file mode 100644 index 00000000000..0f8ed957c79 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve_cap_2_run.c @@ -0,0 +1,44 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O2 -ftree-vectorize -fno-inline -march=armv8-a+sve -fno-vect-cost-model" } */ + +#include "sve_cap_2.c" + +#define N 129 + +#define F(X) (((X) % 5) * (X)) +#define G(X) ((X) ^ 39) + +#define TEST_LOOP(TYPE, M) \ + { \ + TYPE a[(N + M) * 2]; \ + for (int i = 0; i < N + M; ++i) \ + { \ + a[i * 2] = F (i); \ + a[i * 2 + 1] = G (i); \ + } \ + f_##TYPE##_##M (a + M * 2, N); \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE x0 = a[i * 2]; \ + TYPE y0 = F (i + M); \ + TYPE x1 = a[i * 2 + 1]; \ + TYPE y1 = G (i + M); \ + if (a[(i + M) * 2] != (TYPE) (x0 + y0) \ + || a[(i + M) * 2 + 1] != (TYPE) (x1 + y1)) \ + __builtin_abort (); \ + } \ + } + +#define TEST_LOOPS(TYPE) \ + TEST_LOOP (TYPE, 1) \ + TEST_LOOP (TYPE, 2) \ + TEST_LOOP (TYPE, 5) + +int +main (void) +{ + TEST_LOOPS (char); + TEST_LOOPS (short); + TEST_LOOPS (float); + TEST_LOOPS (double); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve_cap_3.c b/gcc/testsuite/gcc.target/aarch64/sve_cap_3.c new file mode 100644 index 00000000000..6515465b7f6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve_cap_3.c @@ -0,0 +1,53 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -fno-inline -march=armv8-a+sve -fno-vect-cost-model -msve-vector-bits=scalable" } */ + +#define LOOP(TYPE) \ + void __attribute__ ((weak)) \ + f_##TYPE##_1 (TYPE *a, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + { \ + a[i * 2] += a[i * 2 - 2]; \ + a[i * 2 + 1] -= a[i * 2 - 1]; \ + } \ + } \ + \ + void __attribute__ ((weak)) \ + f_##TYPE##_2 (TYPE *a, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + { \ + a[i * 2] += a[i * 2 - 4]; \ + a[i * 2 + 1] -= a[i * 2 - 3]; \ + } \ + } \ + \ + void __attribute__ ((weak)) \ + f_##TYPE##_5 (TYPE *a, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + { \ + a[i * 2] += a[i * 2 - 10]; \ + a[i * 2 + 1] -= a[i * 2 - 9]; \ + } \ + } + +LOOP (char) +LOOP (short) +LOOP (float) +LOOP (double) + +/* { dg-final { scan-assembler-times {\tstrb\t} 2 } } */ +/* { dg-final { scan-assembler-times {\tstrh\t} 2 } } */ +/* { dg-final { scan-assembler-times {\tstp\ts[0-9]+} 1 } } */ +/* { dg-final { scan-assembler-times {\tstp\td[0-9]+} 1 } } */ + +/* { dg-final { scan-assembler-times {\tld2b\t} 4 } } */ +/* { dg-final { scan-assembler-times {\tld2h\t} 4 } } */ +/* { dg-final { scan-assembler-times {\tld2w\t} 4 } } */ +/* { dg-final { scan-assembler-times {\tld2d\t} 4 } } */ + +/* { dg-final { scan-assembler-times {\tst2b\t} 2 } } */ +/* { dg-final { scan-assembler-times {\tst2h\t} 2 } } */ +/* { dg-final { scan-assembler-times {\tst2w\t} 2 } } */ +/* { dg-final { scan-assembler-times {\tst2d\t} 2 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve_cap_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve_cap_3_run.c new file mode 100644 index 00000000000..fe26162a812 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve_cap_3_run.c @@ -0,0 +1,44 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O2 -ftree-vectorize -fno-inline -march=armv8-a+sve -fno-vect-cost-model" } */ + +#include "sve_cap_3.c" + +#define N 129 + +#define F(X) (((X) % 5) * (X)) +#define G(X) ((X) ^ 39) + +#define TEST_LOOP(TYPE, M) \ + { \ + TYPE a[(N + M) * 2]; \ + for (int i = 0; i < N + M; ++i) \ + { \ + a[i * 2] = F (i); \ + a[i * 2 + 1] = G (i); \ + } \ + f_##TYPE##_##M (a + M * 2, N); \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE x0 = a[i * 2]; \ + TYPE y0 = F (i + M); \ + TYPE x1 = a[i * 2 + 1]; \ + TYPE y1 = G (i + M); \ + if (a[(i + M) * 2] != (TYPE) (y0 + x0) \ + || a[(i + M) * 2 + 1] != (TYPE) (y1 - x1)) \ + __builtin_abort (); \ + } \ + } + +#define TEST_LOOPS(TYPE) \ + TEST_LOOP (TYPE, 1) \ + TEST_LOOP (TYPE, 2) \ + TEST_LOOP (TYPE, 5) + +int +main (void) +{ + TEST_LOOPS (char); + TEST_LOOPS (short); + TEST_LOOPS (float); + TEST_LOOPS (double); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve_cap_4.c b/gcc/testsuite/gcc.target/aarch64/sve_cap_4.c new file mode 100644 index 00000000000..c3bf2f326d3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve_cap_4.c @@ -0,0 +1,42 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -fno-inline -march=armv8-a+sve -fno-vect-cost-model" } */ + +#define LOOP(TYPE) \ + void \ + f_##TYPE##_1 (TYPE *a, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + a[i * 5] += a[i * 5 - 5]; \ + } \ + \ + void \ + f_##TYPE##_2 (TYPE *a, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + a[i * 5] += a[i * 5 - 10]; \ + } \ + \ + void \ + f_##TYPE##_5 (TYPE *a, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + a[i * 5] += a[i * 5 - 25]; \ + } + +LOOP (char) +LOOP (short) +LOOP (float) +LOOP (double) + +/* At the moment we can't use extending loads and truncating stores. + Please add ld and st scan-assemblers below if that changes. */ +/* { dg-final { scan-assembler-times {\tstrb\t} 1 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times {\tstrh\t} 1 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times {\tstr\ts[0-9]+} 1 } } */ +/* { dg-final { scan-assembler-times {\tstr\td[0-9]+} 1 } } */ + +/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+\.s, sxtw 2\]} 4 } } */ +/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+\.d, lsl 3\]} 4 } } */ + +/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+\.s, sxtw 2\]} 2 } } */ +/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+\.d, lsl 3\]} 2 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve_cap_4_run.c b/gcc/testsuite/gcc.target/aarch64/sve_cap_4_run.c new file mode 100644 index 00000000000..f39bc7fc3cb --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve_cap_4_run.c @@ -0,0 +1,37 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O2 -ftree-vectorize -fno-inline -march=armv8-a+sve -fno-vect-cost-model" } */ + +#include "sve_cap_4.c" + +#define N 129 + +#define F(X) (((X) % 5) * (X)) + +#define TEST_LOOP(TYPE, M) \ + { \ + TYPE a[(N + M) * 5]; \ + for (int i = 0; i < N + M; ++i) \ + a[i * 5] = F (i); \ + f_##TYPE##_##M (a + M * 5, N); \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE x = a[i * 5]; \ + TYPE y = F (i + M); \ + if (a[(i + M) * 5] != (TYPE) (x + y)) \ + __builtin_abort (); \ + } \ + } + +#define TEST_LOOPS(TYPE) \ + TEST_LOOP (TYPE, 1) \ + TEST_LOOP (TYPE, 2) \ + TEST_LOOP (TYPE, 5) + +int +main (void) +{ + TEST_LOOPS (char); + TEST_LOOPS (short); + TEST_LOOPS (float); + TEST_LOOPS (double); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve_cap_5.c b/gcc/testsuite/gcc.target/aarch64/sve_cap_5.c new file mode 100644 index 00000000000..4d4987773b6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve_cap_5.c @@ -0,0 +1,24 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -fno-inline -march=armv8-a+sve -fno-vect-cost-model" } */ + +void __attribute__ ((noinline, noclone)) +f (double *x, float *y, int n) +{ + for (int i = 0; i < n; ++i) + { + x[i * 3 + 18] = x[i * 3 + 0] + y[i]; + x[i * 3 + 19] = x[i * 3 + 1] - y[i]; + x[i * 3 + 20] = x[i * 3 + 2]; + } +} + +/* { dg-final { scan-assembler-times {\tld1w\t} 1 } } */ +/* { dg-final { scan-assembler-times {\tld3d\t} 2 } } */ +/* { dg-final { scan-assembler-times {\tst3d\t} 2 } } */ +/* { dg-final { scan-assembler-times {\tpunpklo\t} 1 } } */ +/* { dg-final { scan-assembler-times {\tpunpkhi\t} 1 } } */ +/* { dg-final { scan-assembler-times {\tuunpklo\t} 1 } } */ +/* { dg-final { scan-assembler-times {\tuunpkhi\t} 1 } } */ +/* { dg-final { scan-assembler-times {\tfcvt\t} 2 } } */ +/* { dg-final { scan-assembler-times {\tfadd\t} 2 } } */ +/* { dg-final { scan-assembler-times {\tfsub\t} 2 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve_cap_5_run.c b/gcc/testsuite/gcc.target/aarch64/sve_cap_5_run.c new file mode 100644 index 00000000000..57fd625d20a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve_cap_5_run.c @@ -0,0 +1,40 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O2 -ftree-vectorize -fno-inline -march=armv8-a+sve -fno-vect-cost-model" } */ + +#include "sve_cap_5.c" + +#define COUNT 5 +#define N ((COUNT + 2) * 6) + +int +main (void) +{ + double x[N * 3]; + float y[N]; + for (int i = 0; i < N; ++i) + { + x[i * 3 + 0] = i * 2; + x[i * 3 + 1] = i * 3; + x[i * 3 + 2] = i * 5; + y[i] = i * 4; + } + f (x, y, COUNT * 6); + for (int i = 0; i < N; ++i) + { + if (i >= 6 && i < (COUNT + 1) * 6) + { + if (x[i * 3 + 0] != x[i * 3 - 18] + (i - 6) * 4 + || x[i * 3 + 1] != x[i * 3 - 17] - (i - 6) * 4 + || x[i * 3 + 2] != x[i * 3 - 16]) + __builtin_abort (); + } + else + { + if (x[i * 3 + 0] != i * 2 + || x[i * 3 + 1] != i * 3 + || x[i * 3 + 2] != i * 5) + __builtin_abort (); + } + } + return 0; +} diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c index e0ba500fce2..224732e7059 100644 --- a/gcc/tree-ssa-loop-ivopts.c +++ b/gcc/tree-ssa-loop-ivopts.c @@ -2143,6 +2143,7 @@ constant_multiple_of (tree top, tree bot, widest_int *mul) enum tree_code code; unsigned precision = TYPE_PRECISION (TREE_TYPE (top)); widest_int res, p0, p1; + gassign *assign; STRIP_NOPS (top); STRIP_NOPS (bot); @@ -2189,6 +2190,24 @@ constant_multiple_of (tree top, tree bot, widest_int *mul) *mul = wi::sext (wi::divmod_trunc (p0, p1, SIGNED, &res), precision); return res == 0; + case SSA_NAME: + /* Handle one important special case: TOP is an SSA_NAME defined + to be BOT * CST. This triggers in vector loops with variable + vectorization factors. */ + assign = dyn_cast <gassign *> (SSA_NAME_DEF_STMT (top)); + if (assign && gimple_assign_rhs_code (assign) == MULT_EXPR) + { + tree new_top = gimple_assign_rhs1 (assign); + mby = gimple_assign_rhs2 (assign); + if (TREE_CODE (mby) == INTEGER_CST + && constant_multiple_of (new_top, bot, &res)) + { + *mul = wi::sext (res * wi::to_widest (mby), precision); + return true; + } + } + return false; + default: if (POLY_INT_CST_P (top) && POLY_INT_CST_P (bot) diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c index 373d4be6581..8b2cf1fbafe 100644 --- a/gcc/tree-vect-data-refs.c +++ b/gcc/tree-vect-data-refs.c @@ -53,6 +53,7 @@ along with GCC; see the file COPYING3. If not see #include "tree-cfg.h" #include "tree-hash-traits.h" #include "internal-fn.h" +#include "gimple-fold.h" /* Return true if load- or store-lanes optab OPTAB is implemented for COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */ @@ -4782,7 +4783,6 @@ vect_create_addr_base_for_vector_ref (gimple *stmt, return entry->final_addr; } - /* Function vect_create_data_ref_ptr. Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first @@ -5013,6 +5013,9 @@ vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop, { if (iv_step == NULL_TREE) { + /* The caller must provide an IV_STEP for capped VF. */ + gcc_assert (!use_capped_vf (loop_vinfo)); + /* The step of the aggregate pointer is the type size. */ iv_step = TYPE_SIZE_UNIT (aggr_type); /* One exception to the above is when the scalar step of the load in @@ -5143,7 +5146,7 @@ bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi, mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr)); } - if (!ptr_incr) + if (!ptr_incr || use_capped_vf (STMT_VINFO_LOOP_VINFO (stmt_info))) return new_dataref_ptr; /* Update the vector-pointer's cross-iteration increment. */ diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c index 9d9a93e00d5..57aab1b764f 100644 --- a/gcc/tree-vect-loop-manip.c +++ b/gcc/tree-vect-loop-manip.c @@ -254,15 +254,31 @@ adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def) /* Define one loop mask MASK from loop LOOP. INIT_MASK is the value that the mask should have during the first iteration and NEXT_MASK is the - value that it should have on subsequent iterations. */ + value that it should have on subsequent iterations. CAP_MASK, if + nonnull, is a cap that should be applied to each value of the mask + before the mask is used; add the statement that does to HEADER_SEQ. */ static void -vect_set_loop_mask (struct loop *loop, tree mask, tree init_mask, - tree next_mask) +vect_set_loop_mask (struct loop *loop, gimple_seq *header_seq, tree mask, + tree init_mask, tree next_mask, tree cap_mask) { - gphi *phi = create_phi_node (mask, loop->header); + tree mask_type = TREE_TYPE (mask); + tree uncapped_mask; + if (cap_mask) + uncapped_mask = make_temp_ssa_name (mask_type, NULL, "uncapped_mask"); + else + uncapped_mask = mask; + gphi *phi = create_phi_node (uncapped_mask, loop->header); add_phi_arg (phi, init_mask, loop_preheader_edge (loop), UNKNOWN_LOCATION); add_phi_arg (phi, next_mask, loop_latch_edge (loop), UNKNOWN_LOCATION); + + /* Apply the cap mask, if any. */ + if (cap_mask) + { + gimple *stmt = gimple_build_assign (mask, BIT_AND_EXPR, + uncapped_mask, cap_mask); + gimple_seq_add_stmt (header_seq, stmt); + } } /* Add SEQ to the end of LOOP's preheader block. */ @@ -355,12 +371,14 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm, /* Helper for vect_set_loop_condition_masked. Generate definitions for all the masks in RGM and return a mask that is nonzero when the loop - needs to iterate. Add any new preheader statements to PREHEADER_SEQ. - Use LOOP_COND_GSI to insert code before the exit gcond. + needs to iterate. Add any new preheader statements to PREHEADER_SEQ + and any new header statements to HEADER_SEQ. Use LOOP_COND_GSI to + insert code before the exit gcond. RGM belongs to loop LOOP. The loop originally iterated NITERS times and has been vectorized according to LOOP_VINFO. Each iteration - of the vectorized loop handles VF iterations of the scalar loop. + of the vectorized loop handles CAPPED_VF iterations of the scalar loop, + where CAPPED_VF is bounded by the compile-time vectorization factor. If NITERS_SKIP is nonnull, the first iteration of the vectorized loop starts with NITERS_SKIP dummy iterations of the scalar loop before @@ -374,7 +392,7 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm, does not overflow. However, MIGHT_WRAP_P says whether an induction variable that starts at 0 and has step: - VF * RGM->max_nscalars_per_iter + CAPPED_VF * RGM->max_nscalars_per_iter might overflow before hitting a value above: @@ -386,8 +404,9 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm, static tree vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo, gimple_seq *preheader_seq, + gimple_seq *header_seq, gimple_stmt_iterator loop_cond_gsi, - rgroup_masks *rgm, tree vf, + rgroup_masks *rgm, tree capped_vf, tree niters, tree niters_skip, bool might_wrap_p) { @@ -401,7 +420,7 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo, of the vector loop, and the number that it should skip during the first iteration of the vector loop. */ tree nscalars_total = niters; - tree nscalars_step = vf; + tree nscalars_step = capped_vf; tree nscalars_skip = niters_skip; if (nscalars_per_iter != 1) { @@ -553,8 +572,8 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo, else { /* FIRST_LIMIT is the maximum number of scalars handled by the - first iteration of the vector loop. Test the portion - associated with this mask. */ + first iteration of the vector loop (before any cap mask + is applied). Test the portion associated with this mask. */ start = bias_tree; end = first_limit; } @@ -589,7 +608,18 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo, gcall *call = vect_gen_while (next_mask, test_index, this_test_limit); gsi_insert_before (test_gsi, call, GSI_SAME_STMT); - vect_set_loop_mask (loop, mask, init_mask, next_mask); + /* Get the cap that needs to be ANDed with every mask. */ + tree cap_mask = LOOP_VINFO_CAP (loop_vinfo).mask; + if (use_capped_vf (loop_vinfo) + && (!cap_mask || nscalars_per_iter != 1)) + { + cap_mask = make_temp_ssa_name (mask_type, NULL, "cap_mask"); + call = vect_gen_while (cap_mask, bias_tree, nscalars_step); + gimple_seq_add_stmt (preheader_seq, call); + } + + vect_set_loop_mask (loop, header_seq, mask, init_mask, + next_mask, cap_mask); } return next_mask; } @@ -658,16 +688,26 @@ vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo, else iv_limit += max_vf - 1; } - /* IV_LIMIT is the maximum number of latch iterations, which is also - the maximum in-range IV value. Round this value down to the previous - vector alignment boundary and then add an extra full iteration. */ - poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); - iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf; + if (use_capped_vf (loop_vinfo)) + /* In the worst case the final vector iteration will handle a single + scalar iteration, so we'll have up to MAX_VF - 1 inactive + iterations. Add 1 to this to get the number of loop iterations + instead of the number of latch iterations. */ + iv_limit += max_vf; + else + { + /* IV_LIMIT is the maximum number of latch iterations, which + is also the maximum in-range IV value. Round this value + down to the previous vector alignment boundary and then add + an extra full iteration. */ + poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); + iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf; + } } - /* Get the vectorization factor in tree form. */ - tree vf = build_int_cst (compare_type, - LOOP_VINFO_VECT_FACTOR (loop_vinfo)); + /* Convert the runtime vectorization factor to the appropriate type. */ + tree capped_vf = gimple_convert (&preheader_seq, compare_type, + LOOP_VINFO_CAP (loop_vinfo).niters); /* Iterate over all the rgroups and fill in their masks. We could use the first mask from any rgroup for the loop condition; here we @@ -701,9 +741,10 @@ vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo, /* Set up all masks for this group. */ test_mask = vect_set_loop_masks_directly (loop, loop_vinfo, - &preheader_seq, - loop_cond_gsi, rgm, vf, - niters, niters_skip, + &preheader_seq, &header_seq, + loop_cond_gsi, rgm, + capped_vf, niters, + niters_skip, might_wrap_p); } @@ -2384,15 +2425,16 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, basic_block guard_bb, guard_to; profile_probability prob_prolog, prob_vector, prob_epilog; int estimated_vf; + tree vf = LOOP_VINFO_CAP (loop_vinfo).niters; + poly_uint64 max_vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); int prolog_peeling = 0; if (!vect_use_loop_mask_for_alignment_p (loop_vinfo)) prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); - poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); poly_uint64 bound_epilog = 0; if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)) - bound_epilog += vf - 1; + bound_epilog += max_vf - 1; if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) bound_epilog += 1; bool epilog_peeling = may_ne (bound_epilog, 0U); @@ -2449,7 +2491,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, the end of vector loop and skip to the end of epilog loop. */ bool skip_epilog = (prolog_peeling < 0 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) - || !vf.is_constant ()); + || TREE_CODE (vf) != INTEGER_CST); /* PEELING_FOR_GAPS is special because epilog loop must be executed. */ if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) skip_epilog = false; @@ -2530,9 +2572,11 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, niters = vect_build_loop_niters (loop_vinfo, &new_var_p); /* It's guaranteed that vector loop bound before vectorization is at least VF, so set range information for newly generated var. */ - if (new_var_p) + poly_uint64 const_vf; + if (new_var_p && poly_int_tree_p (vf, &const_vf)) set_range_info (niters, VR_RANGE, - wi::to_wide (build_int_cst (type, vf)), + wi::to_wide (build_int_cstu + (type, constant_lower_bound (const_vf))), wi::to_wide (TYPE_MAX_VALUE (type))); /* Prolog iterates at most bound_prolog times, latch iterates at diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index a39682108f1..e33a83bfa6b 100644 --- a/gcc/tree-vect-loop.c +++ b/gcc/tree-vect-loop.c @@ -1344,6 +1344,85 @@ vect_verify_full_masking (loop_vec_info loop_vinfo) return true; } +/* LOOP_VINFO uses a fully-masked loop and needs to use a capped + vectorization factor. Decide whether the best way of doing that is: + + cap_mask = IFN_WHILE_ULT (0, max_vf) + actual_vf = IFN_MASK_POPCOUNT (cap_mask) + + CAP_MASK can then be used for an rgroup for which nS == 1 and nV == 1 + (see the comment above rgroup_masks for details). + + Return true if this does seem to be the best implementation and + update LOOP_VINFO_CAP accordingly. */ + +static bool +vect_maybe_build_capped_vf_via_while (loop_vec_info loop_vinfo, + gimple_seq *seq) +{ + poly_uint64 nunits = LOOP_VINFO_VECT_FACTOR (loop_vinfo); + if (nunits.is_constant ()) + /* In this case the capped number of iterations is known at compile + time, so a POPCOUNT would be pointless. */ + return false; + + if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ()) + return false; + + rgroup_masks *rgm = &LOOP_VINFO_MASKS (loop_vinfo)[0]; + if (rgm->max_nscalars_per_iter != 1) + /* There's no nS == 1 && nV == 1 mask that would benefit from + having a precomputed cap mask. */ + return false; + + if (!direct_internal_fn_supported_p (IFN_MASK_POPCOUNT, rgm->mask_type, + OPTIMIZE_FOR_SPEED)) + return false; + + tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo); + tree zero_index = build_int_cst (compare_type, 0); + tree limit = build_int_cst (compare_type, + LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)); + + tree cap_mask = make_temp_ssa_name (rgm->mask_type, NULL, "cap_mask"); + gcall *stmt = vect_gen_while (cap_mask, zero_index, limit); + gimple_seq_add_stmt (seq, stmt); + LOOP_VINFO_CAP (loop_vinfo).mask = cap_mask; + + tree vf = make_temp_ssa_name (sizetype, NULL, "vf"); + stmt = gimple_build_call_internal (IFN_MASK_POPCOUNT, 1, cap_mask); + gimple_call_set_lhs (stmt, vf); + gimple_seq_add_stmt (seq, stmt); + LOOP_VINFO_CAP (loop_vinfo).niters = vf; + + return true; +} + +/* Initialize LOOP_VINFO_CAP (LOOP_VINFO). */ + +static void +vect_build_cap (loop_vec_info loop_vinfo) +{ + tree vf = size_int (LOOP_VINFO_VECT_FACTOR (loop_vinfo)); + if (!use_capped_vf (loop_vinfo)) + LOOP_VINFO_CAP (loop_vinfo).niters = vf; + else + { + gimple_seq seq = NULL; + if (!vect_maybe_build_capped_vf_via_while (loop_vinfo, &seq)) + { + tree max_vf = size_int (LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)); + LOOP_VINFO_CAP (loop_vinfo).niters + = gimple_build (&seq, MIN_EXPR, sizetype, vf, max_vf); + } + if (seq) + { + edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo)); + gsi_insert_seq_on_edge_immediate (pe, seq); + } + } +} + /* Calculate the cost of one scalar iteration of the loop. */ static void vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo) @@ -2128,12 +2207,15 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal) /* Analyze data dependences between the data-refs in the loop and adjust the maximum vectorization factor according to the dependences. - FORNOW: fail at the first data dependence that we encounter. */ + + We might be able to cope with max_vf that are smaller than the full + vector width by using a fully-masked loop. Postpone that decision + until we know whether full masking is possible. Of course, it might + not be a win to use vectors in this situation even if it is supported, + but that's a decision for the cost model. */ ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf); - if (!ok - || (max_vf != MAX_VECTORIZATION_FACTOR - && may_lt (max_vf, min_vf))) + if (!ok || max_vf <= 1) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, @@ -2150,14 +2232,6 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal) "can't determine vectorization factor.\n"); return false; } - if (max_vf != MAX_VECTORIZATION_FACTOR - && may_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo))) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "bad data dependence.\n"); - return false; - } /* Compute the scalar iteration cost. */ vect_compute_single_scalar_iteration_cost (loop_vinfo); @@ -2278,6 +2352,19 @@ start_over: "not using a fully-masked loop.\n"); } + if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) + && use_capped_vf (loop_vinfo)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "Need to cap the runtime vectorization factor to " + HOST_WIDE_INT_PRINT_DEC " but cannot fully mask" + " the loop.\n", + LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)); + /* Undoing SLP might allow us to use a mask. */ + goto again; + } + /* If epilog loop is required because of data accesses with gaps, one additional iteration needs to be peeled. Check if there is enough iterations for vectorization. */ @@ -7347,7 +7434,7 @@ vectorizable_induction (gimple *phi, gphi *induction_phi; tree induc_def, vec_dest; tree init_expr, step_expr; - poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); + tree vf = LOOP_VINFO_CAP (loop_vinfo).niters; unsigned i; tree expr; gimple_seq stmts; @@ -7526,12 +7613,9 @@ vectorizable_induction (gimple *phi, /* Generate [VF*S, VF*S, ... ]. */ if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))) - { - expr = build_int_cst (integer_type_node, vf); - expr = fold_convert (TREE_TYPE (step_expr), expr); - } + expr = fold_convert (TREE_TYPE (step_expr), vf); else - expr = build_int_cst (TREE_TYPE (step_expr), vf); + expr = vf; new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr), expr, step_expr); if (! CONSTANT_CLASS_P (new_name)) @@ -7726,12 +7810,9 @@ vectorizable_induction (gimple *phi, vec_step = [VF*S, VF*S, VF*S, VF*S] */ gimple_seq seq = NULL; if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))) - { - expr = build_int_cst (integer_type_node, vf); - expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr); - } + expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), vf); else - expr = build_int_cst (TREE_TYPE (step_expr), vf); + expr = gimple_convert (&seq, TREE_TYPE (step_expr), vf); new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr), expr, step_expr); if (seq) @@ -8315,7 +8396,7 @@ vect_transform_loop (loop_vec_info loop_vinfo) tree step_vector = NULL_TREE; tree niters_vector_mult_vf = NULL_TREE; poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); - unsigned int lowest_vf = constant_lower_bound (vf); + unsigned int lowest_vf; bool grouped_store; bool slp_scheduled = false; gimple *stmt, *pattern_stmt; @@ -8325,6 +8406,9 @@ vect_transform_loop (loop_vec_info loop_vinfo) bool check_profitability = false; unsigned int th; + lowest_vf = constant_lower_bound (vf); + lowest_vf = MIN (lowest_vf, LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)); + if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n"); @@ -8389,6 +8473,8 @@ vect_transform_loop (loop_vec_info loop_vinfo) } } + vect_build_cap (loop_vinfo); + tree niters = vect_build_loop_niters (loop_vinfo); LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters; tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo)); diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c index df7ae274047..c0a87dc9275 100644 --- a/gcc/tree-vect-stmts.c +++ b/gcc/tree-vect-stmts.c @@ -110,6 +110,27 @@ record_stmt_cost (stmt_vector_for_cost *body_cost_vec, int count, count, kind, stmt_info, misalign, where); } +/* Return a tree that represents STEP multiplied by the vectorization + factor. */ + +static tree +vect_mult_by_vf (loop_vec_info loop_vinfo, tree step) +{ + hash_map<tree, tree> *map = &LOOP_VINFO_VF_MULT_MAP (loop_vinfo); + bool existed; + tree &entry = map->get_or_insert (step, &existed); + if (!existed) + { + gimple_seq seq = NULL; + tree vf = LOOP_VINFO_CAP (loop_vinfo).niters; + vf = gimple_convert (&seq, TREE_TYPE (step), vf); + entry = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step), vf, step); + edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo)); + gsi_insert_seq_on_edge_immediate (pe, seq); + } + return entry; +} + /* Return a variable of type ELEM_TYPE[NELEMS]. */ static tree @@ -2812,7 +2833,8 @@ vect_get_gather_scatter_ops (struct loop *loop, gimple *stmt, static void vect_get_strided_load_store_ops (gimple *stmt, loop_vec_info loop_vinfo, gather_scatter_info *gs_info, - tree *dataref_bump, tree *vec_offset) + tree *iv_step, tree *dataref_bump, + tree *vec_offset) { stmt_vec_info stmt_info = vinfo_for_stmt (stmt); struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); @@ -2827,6 +2849,12 @@ vect_get_strided_load_store_ops (gimple *stmt, loop_vec_info loop_vinfo, if (stmts) gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts); + if (use_capped_vf (loop_vinfo)) + *iv_step = vect_mult_by_vf (loop_vinfo, + fold_convert (sizetype, DR_STEP (dr))); + else + *iv_step = *dataref_bump; + /* The offset given in GS_INFO can have pointer type, so use the element type of the vector instead. */ tree offset_type = TREE_TYPE (gs_info->offset); @@ -2851,18 +2879,32 @@ vect_get_strided_load_store_ops (gimple *stmt, loop_vec_info loop_vinfo, being vectorized and MEMORY_ACCESS_TYPE describes the type of vectorization. */ -static tree -vect_get_data_ptr_increment (data_reference *dr, tree aggr_type, - vect_memory_access_type memory_access_type) +static void +vect_get_data_ptr_increment (loop_vec_info loop_vinfo, data_reference *dr, + tree aggr_type, unsigned int group_size, + vect_memory_access_type memory_access_type, + tree *iv_step, tree *bump) { if (memory_access_type == VMAT_INVARIANT) - return size_zero_node; + { + *iv_step = *bump = size_zero_node; + return; + } - tree iv_step = TYPE_SIZE_UNIT (aggr_type); + *bump = TYPE_SIZE_UNIT (aggr_type); tree step = vect_dr_behavior (dr)->step; if (tree_int_cst_sgn (step) == -1) - iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step); - return iv_step; + *bump = fold_build1 (NEGATE_EXPR, TREE_TYPE (*bump), *bump); + + if (loop_vinfo && use_capped_vf (loop_vinfo)) + { + tree elt_type = TREE_TYPE (DR_REF (dr)); + tree bytes_per_iter = size_binop (MULT_EXPR, TYPE_SIZE_UNIT (elt_type), + size_int (group_size)); + *iv_step = vect_mult_by_vf (loop_vinfo, bytes_per_iter); + } + else + *iv_step = *bump; } /* Check and perform vectorization of BUILT_IN_BSWAP{16,32,64}. */ @@ -6717,18 +6759,19 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, || memory_access_type == VMAT_CONTIGUOUS_REVERSE) offset = size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1); - tree bump; + tree bump, iv_step; tree vec_offset = NULL_TREE; if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) { aggr_type = NULL_TREE; + iv_step = NULL_TREE; bump = NULL_TREE; } else if (memory_access_type == VMAT_GATHER_SCATTER) { aggr_type = elem_type; vect_get_strided_load_store_ops (stmt, loop_vinfo, &gs_info, - &bump, &vec_offset); + &iv_step, &bump, &vec_offset); } else { @@ -6736,7 +6779,8 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, aggr_type = build_array_type_nelts (elem_type, vec_num * nunits); else aggr_type = vectype; - bump = vect_get_data_ptr_increment (dr, aggr_type, memory_access_type); + vect_get_data_ptr_increment (loop_vinfo, dr, aggr_type, group_size, + memory_access_type, &iv_step, &bump); } if (mask) @@ -6854,7 +6898,7 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, simd_lane_access_p ? loop : NULL, offset, &dummy, gsi, &ptr_incr, simd_lane_access_p, &inv_p, - NULL_TREE, bump); + NULL_TREE, iv_step); gcc_assert (bb_vinfo || !inv_p); } else @@ -7917,18 +7961,19 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, if (memory_access_type == VMAT_CONTIGUOUS_REVERSE) offset = size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1); - tree bump; + tree bump, iv_step; tree vec_offset = NULL_TREE; if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) { aggr_type = NULL_TREE; + iv_step = NULL_TREE; bump = NULL_TREE; } else if (memory_access_type == VMAT_GATHER_SCATTER) { aggr_type = elem_type; vect_get_strided_load_store_ops (stmt, loop_vinfo, &gs_info, - &bump, &vec_offset); + &iv_step, &bump, &vec_offset); } else { @@ -7936,7 +7981,8 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, aggr_type = build_array_type_nelts (elem_type, vec_num * nunits); else aggr_type = vectype; - bump = vect_get_data_ptr_increment (dr, aggr_type, memory_access_type); + vect_get_data_ptr_increment (loop_vinfo, dr, aggr_type, group_size, + memory_access_type, &iv_step, &bump); } tree vec_mask = NULL_TREE; @@ -7971,7 +8017,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, = vect_create_data_ref_ptr (first_stmt_for_drptr, aggr_type, at_loop, offset, &dummy, gsi, &ptr_incr, simd_lane_access_p, - &inv_p, byte_offset, bump); + &inv_p, byte_offset, iv_step); /* Adjust the pointer by the difference to first_stmt. */ data_reference_p ptrdr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt_for_drptr)); @@ -7993,7 +8039,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, = vect_create_data_ref_ptr (first_stmt, aggr_type, at_loop, offset, &dummy, gsi, &ptr_incr, simd_lane_access_p, &inv_p, - byte_offset, bump); + byte_offset, iv_step); if (mask) vec_mask = vect_get_vec_def_for_operand (mask, stmt, mask_vectype); diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 023594771bb..8073ba05a83 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -357,6 +357,18 @@ struct rgroup_masks { typedef auto_vec<rgroup_masks> vec_loop_masks; +/* Represents a scalar iteration count <= VF as both an integer count and a + vector mask. */ +struct vec_niters_and_mask { + vec_niters_and_mask () : niters (NULL_TREE), mask (NULL_TREE) {} + + /* The number of scalar iterations as a sizetype integer. */ + tree niters; + + /* The mask of scalar iterations, with one element per iteration. */ + tree mask; +}; + /*-----------------------------------------------------------------*/ /* Info on vectorized loops. */ /*-----------------------------------------------------------------*/ @@ -397,6 +409,10 @@ typedef struct _loop_vec_info : public vec_info { if there is no particular limit. */ unsigned HOST_WIDE_INT max_vectorization_factor; + /* The actual runtime vectorization factor, which is the minimum of + VECTORIZATION_FACTOR and MAX_VECTORIZATION_FACTOR. */ + vec_niters_and_mask cap; + /* The masks that a fully-masked loop should use to avoid operating on inactive scalars. */ vec_loop_masks masks; @@ -526,6 +542,10 @@ typedef struct _loop_vec_info : public vec_info { /* A hash table used for caching vector base addresses. */ hash_table<vect_addr_base_hasher> vect_addr_base_htab; + + /* A map from X to a precomputed gimple_val containing + CAPPED_VECTORIZATION_FACTOR * X. */ + hash_map<tree, tree> vf_mult_map; } *loop_vec_info; /* Access Functions. */ @@ -545,6 +565,7 @@ typedef struct _loop_vec_info : public vec_info { #define LOOP_VINFO_FULLY_MASKED_P(L) (L)->fully_masked_p #define LOOP_VINFO_VECT_FACTOR(L) (L)->vectorization_factor #define LOOP_VINFO_MAX_VECT_FACTOR(L) (L)->max_vectorization_factor +#define LOOP_VINFO_CAP(L) (L)->cap #define LOOP_VINFO_MASKS(L) (L)->masks #define LOOP_VINFO_MASK_SKIP_NITERS(L) (L)->mask_skip_niters #define LOOP_VINFO_MASK_COMPARE_TYPE(L) (L)->mask_compare_type @@ -577,6 +598,7 @@ typedef struct _loop_vec_info : public vec_info { #define LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST(L) (L)->single_scalar_iteration_cost #define LOOP_VINFO_ORIG_LOOP_INFO(L) (L)->orig_loop_info #define LOOP_VINFO_ADDR_CACHE(L) (L)->vect_addr_base_htab +#define LOOP_VINFO_VF_MULT_MAP(L) (L)->vf_mult_map #define LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT(L) \ ((L)->may_misalign_stmts.length () > 0) @@ -1351,6 +1373,19 @@ unlimited_cost_model (loop_p loop) return (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED); } +/* Return true if the loop needs to use a vectorization factor that + is capped at run time. */ + +static inline bool +use_capped_vf (loop_vec_info loop_vinfo) +{ + return (loop_vinfo + && (LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) + != MAX_VECTORIZATION_FACTOR) + && may_lt (LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo), + LOOP_VINFO_VECT_FACTOR (loop_vinfo))); +} + /* Return true if the loop described by LOOP_VINFO is fully-masked and if the first iteration should use a partial mask in order to achieve alignment. */ |