Diffstat (limited to 'gcc/tree-vectorizer.h')
-rw-r--r--   gcc/tree-vectorizer.h   138
1 file changed, 136 insertions(+), 2 deletions(-)
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 224f4a67459..abbe6ef6b2d 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -211,6 +211,102 @@ is_a_helper <_bb_vec_info *>::test (vec_info *i)
 }
 
 
+/* In general, we can divide the vector statements in a vectorized loop
+   into related groups ("rgroups") and say that for each rgroup there is
+   some nS such that the rgroup operates on nS values from one scalar
+   iteration followed by nS values from the next.  That is, if VF is the
+   vectorization factor of the loop, the rgroup operates on a sequence:
+
+     (1,1) (1,2) ... (1,nS) (2,1) ... (2,nS) ... (VF,1) ... (VF,nS)
+
+   where (i,j) represents a scalar value with index j in a scalar
+   iteration with index i.
+
+   [ We use the term "rgroup" to emphasise that this grouping isn't
+     necessarily the same as the grouping of statements used elsewhere.
+     For example, if we implement a group of scalar loads using gather
+     loads, we'll use a separate gather load for each scalar load, and
+     thus each gather load will belong to its own rgroup. ]
+
+   In general this sequence will occupy nV vectors concatenated
+   together.  If these vectors have nL lanes each, the total number
+   of scalar values N is given by:
+
+       N = nS * VF = nV * nL
+
+   None of nS, VF, nV and nL are required to be a power of 2.  nS and nV
+   are compile-time constants but VF and nL can be variable (if the
+   target supports variable-length vectors).
+
+   In classical vectorization, each iteration of the vector loop would
+   handle exactly VF iterations of the original scalar loop.  However,
+   in a fully-masked loop, a particular iteration of the vector loop
+   might handle fewer than VF iterations of the scalar loop.  The vector
+   lanes that correspond to iterations of the scalar loop are said to be
+   "active" and the other lanes are said to be "inactive".
+
+   In a fully-masked loop, many rgroups need to be masked to ensure that
+   they have no effect for the inactive lanes.  Each such rgroup needs a
+   sequence of booleans in the same order as above, but with each (i,j)
+   replaced by a boolean that indicates whether iteration i is active.
+   This sequence occupies nV vector masks that again have nL lanes each.
+   Thus the mask sequence as a whole consists of VF independent booleans
+   that are each repeated nS times.
+
+   We make the simplifying assumption that if a sequence of nV masks is
+   suitable for one (nS,nL) pair, we can reuse it for (nS/2,nL/2) by
+   VIEW_CONVERTing it.  This holds for all current targets that support
+   fully-masked loops.  For example, suppose the scalar loop is:
+
+     float *f;
+     double *d;
+     for (int i = 0; i < n; ++i)
+       {
+         f[i * 2 + 0] += 1.0f;
+         f[i * 2 + 1] += 2.0f;
+         d[i] += 3.0;
+       }
+
+   and suppose that vectors have 256 bits.  The vectorized f accesses
+   will belong to one rgroup and the vectorized d access to another:
+
+     f rgroup: nS = 2, nV = 1, nL = 8
+     d rgroup: nS = 1, nV = 1, nL = 4
+               VF = 4
+
+     [ In this simple example the rgroups do correspond to the normal
+       SLP grouping scheme. ]
+
+   If only the first three lanes are active, the masks we need are:
+
+     f rgroup: 1 1 | 1 1 | 1 1 | 0 0
+     d rgroup:  1  |  1  |  1  |  0
+
+   Here we can use a mask calculated for f's rgroup for d's, but not
+   vice versa.
+
+   Thus for each value of nV, it is enough to provide nV masks, with the
+   mask being calculated based on the highest nL (or, equivalently, based
+   on the highest nS) required by any rgroup with that nV.  We therefore
+   represent the entire collection of masks as a two-level table, with
+   the first level being indexed by nV - 1 (since nV == 0 doesn't exist)
+   and the second being indexed by the mask index 0 <= i < nV.  */
+
+/* The masks needed by rgroups with nV vectors, according to the
+   description above.  */
+struct rgroup_masks {
+  /* The largest nS for all rgroups that use these masks.  */
+  unsigned int max_nscalars_per_iter;
+
+  /* The type of mask to use, based on the highest nS recorded above.  */
+  tree mask_type;
+
+  /* A vector of nV masks, in iteration order.  */
+  vec<tree> masks;
+};
+
+typedef auto_vec<rgroup_masks> vec_loop_masks;
+
 /*-----------------------------------------------------------------*/
 /* Info on vectorized loops.                                       */
 /*-----------------------------------------------------------------*/
@@ -251,6 +347,14 @@ typedef struct _loop_vec_info : public vec_info {
      if there is no particular limit.  */
   unsigned HOST_WIDE_INT max_vectorization_factor;
 
+  /* The masks that a fully-masked loop should use to avoid operating
+     on inactive scalars.  */
+  vec_loop_masks masks;
+
+  /* Type of the variables to use in the WHILE_ULT call for fully-masked
+     loops.  */
+  tree mask_compare_type;
+
   /* Unknown DRs according to which loop was peeled.  */
   struct data_reference *unaligned_dr;
 
@@ -305,6 +409,12 @@ typedef struct _loop_vec_info : public vec_info {
   /* Is the loop vectorizable? */
   bool vectorizable;
 
+  /* Records whether we still have the option of using a fully-masked loop.  */
+  bool can_fully_mask_p;
+
+  /* True if we have decided to use a fully-masked loop.  */
+  bool fully_masked_p;
+
   /* When we have grouped data accesses with gaps, we may introduce invalid
      memory accesses.  We peel the last iteration of the loop to prevent
      this.  */
@@ -365,8 +475,12 @@ typedef struct _loop_vec_info : public vec_info {
 #define LOOP_VINFO_COST_MODEL_THRESHOLD(L) (L)->th
 #define LOOP_VINFO_VERSIONING_THRESHOLD(L) (L)->versioning_threshold
 #define LOOP_VINFO_VECTORIZABLE_P(L)       (L)->vectorizable
+#define LOOP_VINFO_CAN_FULLY_MASK_P(L)     (L)->can_fully_mask_p
+#define LOOP_VINFO_FULLY_MASKED_P(L)       (L)->fully_masked_p
 #define LOOP_VINFO_VECT_FACTOR(L)          (L)->vectorization_factor
 #define LOOP_VINFO_MAX_VECT_FACTOR(L)      (L)->max_vectorization_factor
+#define LOOP_VINFO_MASKS(L)                (L)->masks
+#define LOOP_VINFO_MASK_COMPARE_TYPE(L)    (L)->mask_compare_type
 #define LOOP_VINFO_PTR_MASK(L)             (L)->ptr_mask
 #define LOOP_VINFO_LOOP_NEST(L)            (L)->loop_nest
 #define LOOP_VINFO_DATAREFS(L)             (L)->datarefs
@@ -1172,6 +1286,17 @@ vect_nunits_for_cost (tree vec_type)
   return estimated_poly_value (TYPE_VECTOR_SUBPARTS (vec_type));
 }
 
+/* Return the maximum possible vectorization factor for LOOP_VINFO.  */
+
+static inline unsigned HOST_WIDE_INT
+vect_max_vf (loop_vec_info loop_vinfo)
+{
+  unsigned HOST_WIDE_INT vf;
+  if (LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
+    return vf;
+  return MAX_VECTORIZATION_FACTOR;
+}
+
 /* Return the size of the value accessed by unvectorized data reference DR.
    This is only valid once STMT_VINFO_VECTYPE has been calculated for the
    associated gimple statement, since that guarantees that DR accesses
@@ -1194,8 +1319,8 @@ extern source_location vect_location;
 
 /* Simple loop peeling and versioning utilities for vectorizer's purposes -
    in tree-vect-loop-manip.c.  */
-extern void slpeel_make_loop_iterate_ntimes (struct loop *, tree, tree,
-                                             tree, bool);
+extern void vect_set_loop_condition (struct loop *, loop_vec_info,
+                                     tree, tree, tree, bool);
 extern bool slpeel_can_duplicate_loop_p (const struct loop *, const_edge);
 struct loop *slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *,
                                                      struct loop *, edge);
@@ -1211,6 +1336,7 @@ extern poly_uint64 current_vector_size;
 extern tree get_vectype_for_scalar_type (tree);
 extern tree get_mask_type_for_scalar_type (tree);
 extern tree get_same_sized_vectype (tree, tree);
+extern bool vect_get_loop_mask_type (loop_vec_info);
 extern bool vect_is_simple_use (tree, vec_info *, gimple **,
                                 enum vect_def_type *);
 extern bool vect_is_simple_use (tree, vec_info *, gimple **,
@@ -1265,6 +1391,7 @@ extern bool vect_supportable_shift (enum tree_code, tree);
 extern tree vect_gen_perm_mask_any (tree, vec_perm_indices);
 extern tree vect_gen_perm_mask_checked (tree, vec_perm_indices);
 extern void optimize_mask_stores (struct loop*);
+extern gcall *vect_gen_while (tree, tree, tree);
 
 /* In tree-vect-data-refs.c. */
 extern bool vect_can_force_dr_alignment_p (const_tree, unsigned int);
@@ -1318,6 +1445,13 @@ extern loop_vec_info vect_analyze_loop (struct loop *, loop_vec_info);
 extern tree vect_build_loop_niters (loop_vec_info, bool * = NULL);
 extern void vect_gen_vector_loop_niters (loop_vec_info, tree, tree *,
                                          tree *, bool);
+extern tree vect_halve_mask_nunits (tree);
+extern tree vect_double_mask_nunits (tree);
+extern void vect_record_loop_mask (loop_vec_info, vec_loop_masks *,
+                                   unsigned int, tree);
+extern tree vect_get_loop_mask (gimple_stmt_iterator *, vec_loop_masks *,
+                                unsigned int, tree, unsigned int);
+
 /* Drive for loop transformation stage. */
 extern struct loop *vect_transform_loop (loop_vec_info);
 extern loop_vec_info vect_analyze_loop_form (struct loop *);
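
As a sanity check on the comment's model, the identity N = nS * VF = nV * nL and the (nS,nL) -> (nS/2,nL/2) reuse rule can be verified numerically. The sketch below is a standalone illustration, not GCC code: the helper rgroup_mask is hypothetical, and it models only the lane correspondence implied by the comment (the bit-level layout that makes the VIEW_CONVERT legal is target-specific). It rebuilds the f and d masks from the worked example and checks that every second lane of f's mask is exactly d's mask:

  // Standalone model of the rgroup mask sequence described above.
  #include <cstdio>
  #include <vector>

  // The boolean sequence for one rgroup: VF per-iteration "active"
  // flags, each repeated nS times (giving nV * nL lanes in total).
  static std::vector<bool>
  rgroup_mask (unsigned vf, unsigned ns, unsigned active)
  {
    std::vector<bool> mask;
    for (unsigned i = 0; i < vf; ++i)
      for (unsigned j = 0; j < ns; ++j)
        mask.push_back (i < active);
    return mask;
  }

  int
  main ()
  {
    // The f/d example above: VF = 4, only the first three lanes active.
    std::vector<bool> f_mask = rgroup_mask (4, 2, 3); // nS = 2, nL = 8
    std::vector<bool> d_mask = rgroup_mask (4, 1, 3); // nS = 1, nL = 4

    // The (nS,nL) -> (nS/2,nL/2) reuse property: every second lane of
    // the wider mask is the narrower mask, so reinterpreting f's mask
    // for d's rgroup is safe, but not vice versa.
    for (unsigned i = 0; i < d_mask.size (); ++i)
      if (f_mask[i * 2] != d_mask[i])
        return 1;
    std::printf ("f's mask can be reused for d's rgroup\n");
    return 0;
  }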
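The two-level table itself can be modelled in a few lines. The following is a simplified sketch under stated assumptions, not the GCC implementation: model_record_loop_mask plays the role that the declaration of vect_record_loop_mask suggests, growing the outer vector so that entry nV - 1 exists and keeping the maximum nS seen for that nV:

  // Simplified model of the vec_loop_masks two-level table.
  #include <algorithm>
  #include <string>
  #include <vector>

  struct model_rgroup_masks
  {
    unsigned int max_nscalars_per_iter = 0; // largest nS for this nV
    std::string mask_type;                  // stand-in for the tree type
    std::vector<std::string> masks;         // the nV masks, filled in later
  };

  // Outer level indexed by nV - 1, inner level by 0 <= i < nV.
  using model_loop_masks = std::vector<model_rgroup_masks>;

  // Record that some rgroup needs NVECTORS masks and operates on
  // NSCALARS values per scalar iteration.
  static void
  model_record_loop_mask (model_loop_masks &table,
                          unsigned int nvectors, unsigned int nscalars)
  {
    if (table.size () < nvectors)
      table.resize (nvectors);
    model_rgroup_masks &rgm = table[nvectors - 1];
    rgm.max_nscalars_per_iter
      = std::max (rgm.max_nscalars_per_iter, nscalars);
  }

  int
  main ()
  {
    model_loop_masks table;
    model_record_loop_mask (table, 1, 2); // the f rgroup: nV = 1, nS = 2
    model_record_loop_mask (table, 1, 1); // the d rgroup: nV = 1, nS = 1
    // Entry 0 now records nS = 2: the single shared mask is computed
    // for the f rgroup and reused (via VIEW_CONVERT) for d.
    return table[0].max_nscalars_per_iter == 2 ? 0 : 1;
  }

Deferring the choice of mask_type until the largest nS for a given nV is known is what lets one set of masks serve every rgroup with that nV.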
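Finally, vect_gen_while is declared above for generating the WHILE_ULT comparisons that produce these masks. A scalar model of the intended semantics (an assumption based on the names here, not the gimple GCC emits) is that lane i of the result is set iff START + i < END, so the final vector iteration simply gets a partial mask instead of requiring a scalar epilogue loop:

  // Scalar model of a WHILE_ULT-style mask for a fully-masked loop.
  #include <cstdio>

  int
  main ()
  {
    const unsigned nl = 4; // lanes per mask
    unsigned n = 7;        // scalar iterations
    for (unsigned start = 0; start < n; start += nl)
      {
        bool mask[nl];
        for (unsigned i = 0; i < nl; ++i)
          mask[i] = start + i < n; // lane i of WHILE_ULT (start, n)
        std::printf ("start %u: %d %d %d %d\n",
                     start, mask[0], mask[1], mask[2], mask[3]);
      }
    // Prints 1 1 1 1 for the first iteration and 1 1 1 0 for the last:
    // the inactive lane replaces the epilogue that classical
    // vectorization would need for the leftover scalar iterations.
    return 0;
  }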