summary refs log tree commit diff
diff options
context:
space:
mode:
author    Richard Sandiford <richard.sandiford@linaro.org>  2016-11-30 19:08:56 +0000
committer Richard Sandiford <richard.sandiford@linaro.org>  2017-11-20 11:42:49 +0000
commit    f9d9d07b1ba7536ac4e24d2852e1cc0147fb8acc (patch)
tree      a80761f5dcd0b0b7d60903bcacdaffc9cbab850f
parent    d56832f6d254ebd9d54d010366bfeaf684e32910 (diff)
download  gcc-f9d9d07b1ba7536ac4e24d2852e1cc0147fb8acc.tar.gz
Add support for vectorising live-out values using SVE LASTB
This patch uses the SVE LASTB instruction to optimise cases in which a value
produced by the final scalar iteration of a vectorised loop is live outside
the loop.  Previously this situation would stop us from using a fully-masked
loop.

2017-11-16  Richard Sandiford  <richard.sandiford@linaro.org>
	    Alan Hayward  <alan.hayward@arm.com>
	    David Sherwood  <david.sherwood@arm.com>

gcc/
	* doc/md.texi (extract_last_@var{m}): Document.
	* optabs.def (extract_last_optab): New optab.
	* internal-fn.def (EXTRACT_LAST): New internal function.
	* internal-fn.c (cond_unary_direct): New macro.
	(expand_cond_unary_optab_fn): Likewise.
	(direct_cond_unary_optab_supported_p): Likewise.
	* tree-vect-loop.c (vectorizable_live_operation): Allow
	fully-masked loops using EXTRACT_LAST.
	* config/aarch64/aarch64-sve.md (aarch64_sve_lastb<mode>): Rename
	to...
	(extract_last_<mode>): ...this optab.
	(vec_extract<mode><Vel>): Update accordingly.

gcc/testsuite/
	* gcc.target/aarch64/sve_live_1.c: New test.
	* gcc.target/aarch64/sve_live_1_run.c: Likewise.
-rw-r--r--  gcc/config/aarch64/aarch64-sve.md               |  8
-rw-r--r--  gcc/doc/md.texi                                 |  8
-rw-r--r--  gcc/internal-fn.c                               |  5
-rw-r--r--  gcc/internal-fn.def                             |  4
-rw-r--r--  gcc/optabs.def                                  |  2
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_live_1.c   | 41
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_live_1_run.c | 35
-rw-r--r--  gcc/tree-vect-loop.c                            | 87
8 files changed, 168 insertions, 22 deletions
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 78d78c34afd..946798851f9 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -345,8 +345,7 @@
/* The last element can be extracted with a LASTB and a false
predicate. */
rtx sel = force_reg (<VPRED>mode, CONST0_RTX (<VPRED>mode));
- emit_insn (gen_aarch64_sve_lastb<mode> (operands[0], sel,
- operands[1]));
+ emit_insn (gen_extract_last_<mode> (operands[0], sel, operands[1]));
DONE;
}
if (!CONST_INT_P (operands[2]))
@@ -365,8 +364,7 @@
emit_insn (gen_vec_cmp<v_int_equiv><vpred> (sel, cmp, series, zero));
/* Select the element using LASTB. */
- emit_insn (gen_aarch64_sve_lastb<mode> (operands[0], sel,
- operands[1]));
+ emit_insn (gen_extract_last_<mode> (operands[0], sel, operands[1]));
DONE;
}
}
@@ -431,7 +429,7 @@
;; Extract the last active element of operand 1 into operand 0.
;; If no elements are active, extract the last inactive element instead.
-(define_insn "aarch64_sve_lastb<mode>"
+(define_insn "extract_last_<mode>"
[(set (match_operand:<VEL> 0 "register_operand" "=r, w")
(unspec:<VEL>
[(match_operand:<VPRED> 1 "register_operand" "Upl, Upl")
diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index ce562d1a555..2958728d90f 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -5268,6 +5268,14 @@ of a vector of mode @var{m}. Operand 1 is the vector input and operand 0
is the scalar result. The mode of the scalar result is the same as one
element of @var{m}.
+@cindex @code{extract_last_@var{m}} instruction pattern
+@item @code{extract_last_@var{m}}
+Find the last set bit in mask operand 1 and extract the associated element
+of vector operand 2. Store the result in scalar operand 0. Operand 2
+has vector mode @var{m} while operand 0 has the mode appropriate for one
+element of @var{m}. Operand 1 has the usual mask mode for vectors of mode
+@var{m}; see @code{TARGET_VECTORIZE_GET_MASK_MODE}.
+
@cindex @code{sdot_prod@var{m}} instruction pattern
@item @samp{sdot_prod@var{m}}
@cindex @code{udot_prod@var{m}} instruction pattern
diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c
index 379ee32e6aa..0779c3538aa 100644
--- a/gcc/internal-fn.c
+++ b/gcc/internal-fn.c
@@ -88,6 +88,7 @@ init_internal_fns ()
#define mask_store_lanes_direct { 0, 0, false }
#define unary_direct { 0, 0, true }
#define binary_direct { 0, 0, true }
+#define cond_unary_direct { 1, 1, true }
#define cond_binary_direct { 1, 1, true }
#define while_direct { 0, 2, false }
@@ -2826,6 +2827,9 @@ expand_while_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
#define expand_binary_optab_fn(FN, STMT, OPTAB) \
expand_direct_optab_fn (FN, STMT, OPTAB, 2)
+#define expand_cond_unary_optab_fn(FN, STMT, OPTAB) \
+ expand_direct_optab_fn (FN, STMT, OPTAB, 2)
+
#define expand_cond_binary_optab_fn(FN, STMT, OPTAB) \
expand_direct_optab_fn (FN, STMT, OPTAB, 3)
@@ -2902,6 +2906,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
#define direct_unary_optab_supported_p direct_optab_supported_p
#define direct_binary_optab_supported_p direct_optab_supported_p
+#define direct_cond_unary_optab_supported_p direct_optab_supported_p
#define direct_cond_binary_optab_supported_p direct_optab_supported_p
#define direct_mask_load_optab_supported_p direct_optab_supported_p
#define direct_load_lanes_optab_supported_p multi_vector_optab_supported_p
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 576d83901ea..8d6871feb44 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -142,6 +142,10 @@ DEF_INTERNAL_COND_OPTAB_FN (XOR, ECF_CONST | ECF_NOTHROW, xor, binary)
DEF_INTERNAL_OPTAB_FN (RSQRT, ECF_CONST, rsqrt, unary)
+/* Extract the last active element from a vector. */
+DEF_INTERNAL_OPTAB_FN (EXTRACT_LAST, ECF_CONST | ECF_NOTHROW,
+ extract_last, cond_unary)
+
/* Unary math functions. */
DEF_INTERNAL_FLT_FN (ACOS, ECF_CONST, acos, unary)
DEF_INTERNAL_FLT_FN (ASIN, ECF_CONST, asin, unary)
diff --git a/gcc/optabs.def b/gcc/optabs.def
index 27e98346a88..4660f50345f 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -307,6 +307,8 @@ OPTAB_D (reduc_and_scal_optab, "reduc_and_scal_$a")
OPTAB_D (reduc_ior_scal_optab, "reduc_ior_scal_$a")
OPTAB_D (reduc_xor_scal_optab, "reduc_xor_scal_$a")
+OPTAB_D (extract_last_optab, "extract_last_$a")
+
OPTAB_D (sdot_prod_optab, "sdot_prod$I$a")
OPTAB_D (ssum_widen_optab, "widen_ssum$I$a3")
OPTAB_D (udot_prod_optab, "udot_prod$I$a")
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_live_1.c b/gcc/testsuite/gcc.target/aarch64/sve_live_1.c
new file mode 100644
index 00000000000..407d1277c50
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_live_1.c
@@ -0,0 +1,41 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve --save-temps" } */
+
+#include <stdint.h>
+
+#define EXTRACT_LAST(TYPE) \
+ TYPE __attribute__ ((noinline, noclone)) \
+ test_##TYPE (TYPE *x, int n, TYPE value) \
+ { \
+ TYPE last; \
+ for (int j = 0; j < n; ++j) \
+ { \
+ last = x[j]; \
+ x[j] = last * value; \
+ } \
+ return last; \
+ }
+
+#define TEST_ALL(T) \
+ T (uint8_t) \
+ T (uint16_t) \
+ T (uint32_t) \
+ T (uint64_t) \
+ T (_Float16) \
+ T (float) \
+ T (double)
+
+TEST_ALL (EXTRACT_LAST)
+
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7].b, } 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7].h, } 4 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7].s, } 4 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7].d, } 4 } } */
+
+/* { dg-final { scan-assembler-times {\tlastb\tw[0-9]+, p[0-7], z[0-9]+\.b\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tlastb\tw[0-9]+, p[0-7], z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tlastb\tw[0-9]+, p[0-7], z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tlastb\tx[0-9]+, p[0-7], z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tlastb\th[0-9]+, p[0-7], z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tlastb\ts[0-9]+, p[0-7], z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tlastb\td[0-9]+, p[0-7], z[0-9]+\.d\n} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_live_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve_live_1_run.c
new file mode 100644
index 00000000000..2a1f6df4788
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_live_1_run.c
@@ -0,0 +1,35 @@
+/* { dg-do run { target { aarch64_sve_hw } } } */
+/* { dg-options "-O2 -ftree-vectorize -fno-inline -march=armv8-a+sve" } */
+
+#include "sve_live_1.c"
+
+#define N 107
+#define OP 70
+
+#define TEST_LOOP(TYPE) \
+ { \
+ TYPE a[N]; \
+ for (int i = 0; i < N; ++i) \
+ { \
+ a[i] = i * 2 + (i % 3); \
+ asm volatile ("" ::: "memory"); \
+ } \
+ TYPE expected = a[N - 1]; \
+ TYPE res = test_##TYPE (a, N, OP); \
+ if (res != expected) \
+ __builtin_abort (); \
+ for (int i = 0; i < N; ++i) \
+ { \
+ TYPE old = i * 2 + (i % 3); \
+ if (a[i] != (TYPE) (old * (TYPE) OP)) \
+ __builtin_abort (); \
+ asm volatile ("" ::: "memory"); \
+ } \
+ }
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+ TEST_ALL (TEST_LOOP);
+ return 0;
+}
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index b634646f98c..815dff37dad 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -7645,16 +7645,43 @@ vectorizable_live_operation (gimple *stmt,
if (!vec_stmt)
{
+ /* No transformation required. */
if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
{
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "can't use a fully-masked loop because "
- "a value is live outside the loop.\n");
- LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
+ if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
+ OPTIMIZE_FOR_SPEED))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "can't use a fully-masked loop because "
+ "the target doesn't support extract last "
+ "reduction.\n");
+ LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
+ }
+ else if (slp_node)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "can't use a fully-masked loop because an "
+ "SLP statement is live after the loop.\n");
+ LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
+ }
+ else if (ncopies > 1)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "can't use a fully-masked loop because"
+ " ncopies is greater than 1.\n");
+ LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
+ }
+ else
+ {
+ gcc_assert (ncopies == 1 && !slp_node);
+ vect_record_loop_mask (loop_vinfo,
+ &LOOP_VINFO_MASKS (loop_vinfo),
+ 1, vectype);
+ }
}
-
- /* No transformation required. */
return true;
}
@@ -7671,12 +7698,12 @@ vectorizable_live_operation (gimple *stmt,
: TYPE_SIZE (TREE_TYPE (vectype)));
vec_bitsize = TYPE_SIZE (vectype);
- gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
-
/* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
tree vec_lhs, bitstart;
if (slp_node)
{
+ gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
+
/* Get the correct slp vectorized stmt. */
vec_lhs = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[vec_entry]);
@@ -7688,6 +7715,8 @@ vectorizable_live_operation (gimple *stmt,
{
enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
+ gcc_checking_assert (ncopies == 1
+ || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
/* For multiple copies, get the last copy. */
for (int i = 1; i < ncopies; ++i)
@@ -7698,15 +7727,39 @@ vectorizable_live_operation (gimple *stmt,
bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
}
- /* Create a new vectorized stmt for the uses of STMT and insert outside the
- loop. */
gimple_seq stmts = NULL;
- tree bftype = TREE_TYPE (vectype);
- if (VECTOR_BOOLEAN_TYPE_P (vectype))
- bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
- tree new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
- new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree), &stmts,
- true, NULL_TREE);
+ tree new_tree;
+ if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+ {
+ /* Emit:
+
+ SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
+
+ where VEC_LHS is the vectorized live-out result and MASK is
+ the loop mask for the final iteration. */
+ gcc_assert (ncopies == 1 && !slp_node);
+ tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
+ tree scalar_res = make_ssa_name (scalar_type);
+ tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
+ 1, vectype, 0);
+ gcall *new_stmt = gimple_build_call_internal (IFN_EXTRACT_LAST,
+ 2, mask, vec_lhs);
+ gimple_call_set_lhs (new_stmt, scalar_res);
+ gimple_seq_add_stmt (&stmts, new_stmt);
+
+ /* Convert the extracted vector element to the required scalar type. */
+ new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
+ }
+ else
+ {
+ tree bftype = TREE_TYPE (vectype);
+ if (VECTOR_BOOLEAN_TYPE_P (vectype))
+ bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
+ new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
+ new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
+ &stmts, true, NULL_TREE);
+ }
+
if (stmts)
gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);