author     Richard Sandiford <richard.sandiford@linaro.org>   2017-11-14 19:11:42 +0000
committer  Richard Sandiford <richard.sandiford@linaro.org>   2017-11-20 15:09:14 +0000
commit     2e97eeb6542ec749c8edd5765fcde83115869c08
tree       24bd5b7d26e779ad0f01470cbe709ef9e9c189cc
parent     9c4d5a5738e1a17ae328b81bbd147b6ea9675505
download   gcc-2e97eeb6542ec749c8edd5765fcde83115869c08.tar.gz
Use gather loads for strided accesses
This patch tries to use gather loads for strided accesses,
rather than falling back to VMAT_ELEMENTWISE.
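As a concrete illustration (condensed from the new tests added below; the
function name is only for this example), a strided access of the following
shape can now be vectorized with an SVE gather load such as
"ld1d zN.d, pN/z, [xN, zN.d, lsl 3]" instead of a sequence of scalar
element loads:

    #include <stdint.h>

    /* Each vector of dest elements gathers its src elements from the
       non-contiguous addresses src[0], src[stride], src[2*stride], ...  */
    void
    strided_accumulate (double *restrict dest, double *restrict src,
                        int64_t stride, int64_t n)
    {
      for (int64_t i = 0; i < n; ++i)
        dest[i] += src[i * stride];
    }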
2017-11-16 Richard Sandiford <richard.sandiford@linaro.org>
Alan Hayward <alan.hayward@arm.com>
David Sherwood <david.sherwood@arm.com>
gcc/
* tree-vectorizer.h (vect_create_data_ref_ptr): Take an extra
optional tree argument.
* tree-vect-data-refs.c (vect_check_gather_scatter): Check for
null target hooks.
(vect_create_data_ref_ptr): Take the iv_step as an optional argument,
but continue to use the current value as a fallback.
(bump_vector_ptr): Use operand_equal_p rather than tree_int_cst_compare
to compare the updates.
* tree-vect-stmts.c (vect_use_strided_gather_scatters_p): New function.
(get_load_store_type): Use it when handling a strided access.
(vect_get_strided_load_store_ops): New function.
(vect_get_data_ptr_increment): Likewise.
(vectorizable_load): Handle strided gather loads. Always pass
a step to vect_create_data_ref_ptr and bump_vector_ptr.
gcc/testsuite/
* gcc.target/aarch64/sve_strided_load_1.c: New test.
* gcc.target/aarch64/sve_strided_load_2.c: Likewise.
* gcc.target/aarch64/sve_strided_load_3.c: Likewise.
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_strided_load_1.c |  40
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_strided_load_2.c |  18
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_strided_load_3.c |  32
-rw-r--r--  gcc/tree-vect-data-refs.c                             |  41
-rw-r--r--  gcc/tree-vect-stmts.c                                 | 148
-rw-r--r--  gcc/tree-vectorizer.h                                 |   2
6 files changed, 256 insertions, 25 deletions
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_strided_load_1.c b/gcc/testsuite/gcc.target/aarch64/sve_strided_load_1.c
new file mode 100644
index 00000000000..b940ba9d4de
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_strided_load_1.c
@@ -0,0 +1,40 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve --save-temps" } */
+
+#include <stdint.h>
+
+#ifndef INDEX8
+#define INDEX8 int8_t
+#define INDEX16 int16_t
+#define INDEX32 int32_t
+#define INDEX64 int64_t
+#endif
+
+#define TEST_LOOP(DATA_TYPE, BITS)                           \
+  void __attribute__ ((noinline, noclone))                   \
+  f_##DATA_TYPE##_##BITS (DATA_TYPE *restrict dest,          \
+                          DATA_TYPE *restrict src,           \
+                          INDEX##BITS stride, INDEX##BITS n) \
+  {                                                          \
+    for (INDEX##BITS i = 0; i < n; ++i)                      \
+      dest[i] += src[i * stride];                            \
+  }
+
+#define TEST_TYPE(T, DATA_TYPE) \
+  T (DATA_TYPE, 8)              \
+  T (DATA_TYPE, 16)             \
+  T (DATA_TYPE, 32)             \
+  T (DATA_TYPE, 64)
+
+#define TEST_ALL(T)        \
+  TEST_TYPE (T, int32_t)   \
+  TEST_TYPE (T, uint32_t)  \
+  TEST_TYPE (T, float)     \
+  TEST_TYPE (T, int64_t)   \
+  TEST_TYPE (T, uint64_t)  \
+  TEST_TYPE (T, double)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, sxtw 2\]\n} 9 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 12 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_strided_load_2.c b/gcc/testsuite/gcc.target/aarch64/sve_strided_load_2.c
new file mode 100644
index 00000000000..a834989091d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_strided_load_2.c
@@ -0,0 +1,18 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve --save-temps" } */
+
+#define INDEX8 uint8_t
+#define INDEX16 uint16_t
+#define INDEX32 uint32_t
+#define INDEX64 uint64_t
+
+#include "sve_strided_load_1.c"
+
+/* 8 and 16 bits are signed because the multiplication promotes to int.
+   Using uxtw for all 9 would be OK.  */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, sxtw 2\]\n} 6 } } */
+/* The 32-bit loop needs to honor the defined overflow in uint32_t,
+   so we vectorize the offset calculation.  This means that the
+   64-bit version needs two copies.  */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, uxtw 2\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 15 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_strided_load_3.c b/gcc/testsuite/gcc.target/aarch64/sve_strided_load_3.c
new file mode 100644
index 00000000000..8f0bfdd4bb8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_strided_load_3.c
@@ -0,0 +1,32 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve --save-temps" } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(DATA_TYPE, OTHER_TYPE)                          \
+  void __attribute__ ((noinline, noclone))                        \
+  f_##DATA_TYPE##_##BITS (DATA_TYPE *restrict dest,               \
+                          DATA_TYPE *restrict src,                \
+                          OTHER_TYPE *restrict other,             \
+                          OTHER_TYPE mask,                        \
+                          int stride, int n)                      \
+  {                                                               \
+    for (int i = 0; i < n; ++i)                                   \
+      dest[i] = src[i * stride] + (OTHER_TYPE) (other[i] | mask); \
+  }
+
+#define TEST_ALL(T)        \
+  T (int32_t, int16_t)     \
+  T (uint32_t, int16_t)    \
+  T (float, int16_t)       \
+  T (int64_t, int32_t)     \
+  T (uint64_t, int32_t)    \
+  T (double, int32_t)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.h, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 1\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, sxtw 2\]\n} 6 } } */
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 2\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 6 } } */
diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
index 61413e9daae..5d09e59a4d0 100644
--- a/gcc/tree-vect-data-refs.c
+++ b/gcc/tree-vect-data-refs.c
@@ -3611,9 +3611,15 @@ vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
   else
     {
       if (DR_IS_READ (dr))
-	decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
+	{
+	  if (targetm.vectorize.builtin_gather)
+	    decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
+	}
       else
-	decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
+	{
+	  if (targetm.vectorize.builtin_scatter)
+	    decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
+	}
 
       if (!decl)
 	return false;
@@ -4362,6 +4368,10 @@ vect_create_addr_base_for_vector_ref (gimple *stmt,
 	to the initial address accessed by the data-ref in STMT.  This is
 	similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
 	in bytes.
+   8. IV_STEP (optional, defaults to NULL): the amount that should be added
+	to the IV during each iteration of the loop.  NULL says to move
+	by one copy of AGGR_TYPE up or down, depending on the step of the
+	data reference.
 
    Output:
    1. Declare a new ptr to vector_type, and have it point to the base of the
@@ -4394,7 +4404,8 @@ tree
 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
			  tree offset, tree *initial_address,
			  gimple_stmt_iterator *gsi, gimple **ptr_incr,
-			  bool only_init, bool *inv_p, tree byte_offset)
+			  bool only_init, bool *inv_p, tree byte_offset,
+			  tree iv_step)
 {
   const char *base_name;
   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
@@ -4418,7 +4429,8 @@ vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
   tree step;
   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
 
-  gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
+  gcc_assert (iv_step != NULL_TREE
+	      || TREE_CODE (aggr_type) == ARRAY_TYPE
	      || TREE_CODE (aggr_type) == VECTOR_TYPE);
 
   if (loop_vinfo)
@@ -4559,14 +4571,17 @@ vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
     aptr = aggr_ptr_init;
   else
     {
-      /* The step of the aggregate pointer is the type size.  */
-      tree iv_step = TYPE_SIZE_UNIT (aggr_type);
-      /* One exception to the above is when the scalar step of the load in
-	 LOOP is zero. In this case the step here is also zero.  */
-      if (*inv_p)
-	iv_step = size_zero_node;
-      else if (tree_int_cst_sgn (step) == -1)
-	iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
+      if (iv_step == NULL_TREE)
+	{
+	  /* The step of the aggregate pointer is the type size.  */
+	  iv_step = TYPE_SIZE_UNIT (aggr_type);
+	  /* One exception to the above is when the scalar step of the load in
+	     LOOP is zero. In this case the step here is also zero.  */
+	  if (*inv_p)
+	    iv_step = size_zero_node;
+	  else if (tree_int_cst_sgn (step) == -1)
+	    iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
+	}
 
       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
 
@@ -4699,7 +4714,7 @@ bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
       if (use == dataref_ptr)
	SET_USE (use_p, new_dataref_ptr);
       else
-	gcc_assert (tree_int_cst_compare (use, update) == 0);
+	gcc_assert (operand_equal_p (use, update, 0));
     }
 
   return new_dataref_ptr;
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 80f8a8dc93b..283064c7cd3 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -1847,6 +1847,44 @@ prepare_load_store_mask (tree mask_type, tree loop_mask, tree vec_mask,
   return and_res;
 }
 
+/* Return true if we can use gather/scatter internal functions to
+   vectorize STMT, which is a grouped or strided load or store.
+   When returning true, fill in GS_INFO with the information required
+   to perform the operation.  */
+
+static bool
+vect_use_strided_gather_scatters_p (gimple *stmt, loop_vec_info loop_vinfo,
+				    gather_scatter_info *gs_info)
+{
+  if (!vect_check_gather_scatter (stmt, loop_vinfo, gs_info)
+      || gs_info->decl)
+    return false;
+
+  scalar_mode element_mode = SCALAR_TYPE_MODE (gs_info->element_type);
+  unsigned int element_bits = GET_MODE_BITSIZE (element_mode);
+  tree offset_type = TREE_TYPE (gs_info->offset);
+  unsigned int offset_bits = TYPE_PRECISION (offset_type);
+
+  /* Enforced by vect_check_gather_scatter.  */
+  gcc_assert (element_bits >= offset_bits);
+
+  /* If the elements are wider than the offset, convert the offset to the
+     same width, without changing its sign.  */
+  if (element_bits > offset_bits)
+    {
+      bool unsigned_p = TYPE_UNSIGNED (offset_type);
+      offset_type = build_nonstandard_integer_type (element_bits, unsigned_p);
+      gs_info->offset = fold_convert (offset_type, gs_info->offset);
+    }
+
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location,
+		     "using gather/scatter for strided/grouped access,"
+		     " scale = %d\n", gs_info->scale);
+
+  return true;
+}
+
 /* STMT is a non-strided load or store, meaning that it accesses
    elements with a known constant step.  Return -1 if that step
    is negative, 0 if it is zero, and 1 if it is greater than zero.  */
@@ -2200,7 +2238,11 @@ get_load_store_type (gimple *stmt, tree vectype, bool slp, bool masked_p,
   else if (STMT_VINFO_STRIDED_P (stmt_info))
     {
       gcc_assert (!slp);
-      *memory_access_type = VMAT_ELEMENTWISE;
+      if (loop_vinfo
+	  && vect_use_strided_gather_scatters_p (stmt, loop_vinfo, gs_info))
+	*memory_access_type = VMAT_GATHER_SCATTER;
+      else
+	*memory_access_type = VMAT_ELEMENTWISE;
     }
   else
     {
@@ -2640,6 +2682,71 @@ vect_get_gather_scatter_ops (struct loop *loop, gimple *stmt,
					      offset_vectype);
 }
 
+/* Prepare to implement a grouped or strided load or store using
+   the gather load or scatter store operation described by GS_INFO.
+   STMT is the load or store statement.
+
+   Set *DATAREF_BUMP to the amount that should be added to the base
+   address after each copy of the vectorized statement.  Set *VEC_OFFSET
+   to an invariant offset vector in which element I has the value
+   I * DR_STEP / SCALE.  */
+
+static void
+vect_get_strided_load_store_ops (gimple *stmt, loop_vec_info loop_vinfo,
+				 gather_scatter_info *gs_info,
+				 tree *dataref_bump, tree *vec_offset)
+{
+  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
+  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+  gimple_seq stmts;
+
+  tree bump = size_binop (MULT_EXPR,
+			  fold_convert (sizetype, DR_STEP (dr)),
+			  size_int (TYPE_VECTOR_SUBPARTS (vectype)));
+  *dataref_bump = force_gimple_operand (bump, &stmts, true, NULL_TREE);
+  if (stmts)
+    gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
+
+  /* The offset given in GS_INFO can have pointer type, so use the element
+     type of the vector instead.  */
+  tree offset_type = TREE_TYPE (gs_info->offset);
+  tree offset_vectype = get_vectype_for_scalar_type (offset_type);
+  offset_type = TREE_TYPE (offset_vectype);
+
+  /* Calculate X = DR_STEP / SCALE and convert it to the appropriate type.  */
+  tree step = size_binop (EXACT_DIV_EXPR, DR_STEP (dr),
+			  ssize_int (gs_info->scale));
+  step = fold_convert (offset_type, step);
+  step = force_gimple_operand (step, &stmts, true, NULL_TREE);
+
+  /* Create {0, X, X*2, X*3, ...}.  */
+  *vec_offset = gimple_build (&stmts, VEC_SERIES_EXPR, offset_vectype,
+			      build_zero_cst (offset_type), step);
+  if (stmts)
+    gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
+}
+
+/* Return the amount that should be added to a vector pointer to move
+   to the next or previous copy of AGGR_TYPE.  DR is the data reference
+   being vectorized and MEMORY_ACCESS_TYPE describes the type of
+   vectorization.  */
+
+static tree
+vect_get_data_ptr_increment (data_reference *dr, tree aggr_type,
+			     vect_memory_access_type memory_access_type)
+{
+  if (memory_access_type == VMAT_INVARIANT)
+    return size_zero_node;
+
+  tree iv_step = TYPE_SIZE_UNIT (aggr_type);
+  tree step = vect_dr_behavior (dr)->step;
+  if (tree_int_cst_sgn (step) == -1)
+    iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
+  return iv_step;
+}
+
 /* Check and perform vectorization of BUILT_IN_BSWAP{16,32,64}.  */
 
 static bool
@@ -7417,6 +7524,9 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
       return true;
     }
 
+  if (memory_access_type == VMAT_GATHER_SCATTER)
+    grouped_load = false;
+
   if (grouped_load)
     {
       first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
@@ -7628,13 +7738,29 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
   if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
     offset = size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1);
 
-  if (memory_access_type == VMAT_LOAD_STORE_LANES)
-    aggr_type = build_array_type_nelts (elem_type, vec_num * nunits);
+  tree bump;
+  tree vec_offset = NULL_TREE;
+  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+    {
+      aggr_type = NULL_TREE;
+      bump = NULL_TREE;
+    }
+  else if (memory_access_type == VMAT_GATHER_SCATTER)
+    {
+      aggr_type = elem_type;
+      vect_get_strided_load_store_ops (stmt, loop_vinfo, &gs_info,
+				       &bump, &vec_offset);
+    }
   else
-    aggr_type = vectype;
+    {
+      if (memory_access_type == VMAT_LOAD_STORE_LANES)
+	aggr_type = build_array_type_nelts (elem_type, vec_num * nunits);
+      else
+	aggr_type = vectype;
+      bump = vect_get_data_ptr_increment (dr, aggr_type, memory_access_type);
+    }
 
   tree vec_mask = NULL_TREE;
-  tree vec_offset = NULL_TREE;
   prev_stmt_info = NULL;
   poly_uint64 group_elt = 0;
   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
@@ -7666,7 +7792,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
	    = vect_create_data_ref_ptr (first_stmt_for_drptr, aggr_type,
					at_loop, offset, &dummy, gsi,
					&ptr_incr, simd_lane_access_p,
-					&inv_p, byte_offset);
+					&inv_p, byte_offset, bump);
	  /* Adjust the pointer by the difference to first_stmt.  */
	  data_reference_p ptrdr
	    = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt_for_drptr));
@@ -7688,7 +7814,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
	  = vect_create_data_ref_ptr (first_stmt, aggr_type, at_loop,
				      offset, &dummy, gsi, &ptr_incr,
				      simd_lane_access_p, &inv_p,
-				      byte_offset);
+				      byte_offset, bump);
       if (mask)
	vec_mask = vect_get_vec_def_for_operand (mask, stmt,
						 mask_vectype);
@@ -7697,7 +7823,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
     {
       if (dataref_offset)
	dataref_offset = int_const_binop (PLUS_EXPR, dataref_offset,
-					  TYPE_SIZE_UNIT (aggr_type));
+					  bump);
       else if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
	{
	  gimple *def_stmt;
@@ -7706,8 +7832,8 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
	  vec_offset = vect_get_vec_def_for_stmt_copy (dt, vec_offset);
	}
       else
-	dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt,
-				       TYPE_SIZE_UNIT (aggr_type));
+	dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi,
+				       stmt, bump);
       if (mask)
	{
	  gimple *def_stmt;
@@ -7783,7 +7909,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
 
	  if (i > 0)
	    dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi,
-					   stmt, NULL_TREE);
+					   stmt, bump);
 
	  /* 2. Create the vector-load in the loop.  */
	  switch (alignment_support_scheme)
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 125e178136b..36188246cb8 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -1461,7 +1461,7 @@ extern void vect_record_base_alignments (vec_info *);
 extern tree vect_create_data_ref_ptr (gimple *, tree, struct loop *, tree,
				      tree *, gimple_stmt_iterator *,
				      gimple **, bool, bool *,
-				      tree = NULL_TREE);
+				      tree = NULL_TREE, tree = NULL_TREE);
 extern tree bump_vector_ptr (tree, gimple *, gimple_stmt_iterator *, gimple *,
			     tree);
 extern tree vect_create_destination_var (tree, tree);