Diffstat (limited to 'gcc/tree-vectorizer.h')
-rw-r--r--  gcc/tree-vectorizer.h  138
 1 file changed, 136 insertions, 2 deletions
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 224f4a67459..abbe6ef6b2d 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -211,6 +211,102 @@ is_a_helper <_bb_vec_info *>::test (vec_info *i)
}
+/* In general, we can divide the vector statements in a vectorized loop
+ into related groups ("rgroups") and say that for each rgroup there is
+ some nS such that the rgroup operates on nS values from one scalar
+ iteration followed by nS values from the next. That is, if VF is the
+ vectorization factor of the loop, the rgroup operates on a sequence:
+
+ (1,1) (1,2) ... (1,nS) (2,1) ... (2,nS) ... (VF,1) ... (VF,nS)
+
+ where (i,j) represents a scalar value with index j in a scalar
+ iteration with index i.
+
+ [ We use the term "rgroup" to emphasise that this grouping isn't
+ necessarily the same as the grouping of statements used elsewhere.
+ For example, if we implement a group of scalar loads using gather
+ loads, we'll use a separate gather load for each scalar load, and
+ thus each gather load will belong to its own rgroup. ]
+
+ In general this sequence will occupy nV vectors concatenated
+ together. If these vectors have nL lanes each, the total number
+ of scalar values N is given by:
+
+ N = nS * VF = nV * nL
+
+ None of nS, VF, nV and nL are required to be a power of 2. nS and nV
+ are compile-time constants but VF and nL can be variable (if the target
+ supports variable-length vectors).
+
+ In classical vectorization, each iteration of the vector loop would
+ handle exactly VF iterations of the original scalar loop. However,
+ in a fully-masked loop, a particular iteration of the vector loop
+ might handle fewer than VF iterations of the scalar loop. The vector
+ lanes that correspond to iterations of the scalar loop are said to be
+ "active" and the other lanes are said to be "inactive".
+
+ In a fully-masked loop, many rgroups need to be masked to ensure that
+ they have no effect for the inactive lanes. Each such rgroup needs a
+ sequence of booleans in the same order as above, but with each (i,j)
+ replaced by a boolean that indicates whether iteration i is active.
+ This sequence occupies nV vector masks that again have nL lanes each.
+ Thus the mask sequence as a whole consists of VF independent booleans
+ that are each repeated nS times.
+
+ We make the simplifying assumption that if a sequence of nV masks is
+ suitable for one (nS,nL) pair, we can reuse it for (nS/2,nL/2) by
+ VIEW_CONVERTing it. This holds for all current targets that support
+ fully-masked loops. For example, suppose the scalar loop is:
+
+ float *f;
+ double *d;
+ for (int i = 0; i < n; ++i)
+ {
+ f[i * 2 + 0] += 1.0f;
+ f[i * 2 + 1] += 2.0f;
+ d[i] += 3.0;
+ }
+
+ and suppose that vectors have 256 bits. The vectorized f accesses
+ will belong to one rgroup and the vectorized d access to another:
+
+ f rgroup: nS = 2, nV = 1, nL = 8
+ d rgroup: nS = 1, nV = 1, nL = 4
+ VF = 4
+
+ [ In this simple example the rgroups do correspond to the normal
+ SLP grouping scheme. ]
+
+ If only the first three lanes are active, the masks we need are:
+
+ f rgroup: 1 1 | 1 1 | 1 1 | 0 0
+ d rgroup: 1 | 1 | 1 | 0
+
+ Here we can use a mask calculated for f's rgroup for d's, but not
+ vice versa.
+
+ Thus for each value of nV, it is enough to provide nV masks, with the
+ mask being calculated based on the highest nL (or, equivalently, based
+ on the highest nS) required by any rgroup with that nV. We therefore
+ represent the entire collection of masks as a two-level table, with the
+ first level being indexed by nV - 1 (since nV == 0 doesn't exist) and
+ the second being indexed by the mask index 0 <= i < nV. */
+
+/* The masks needed by rgroups with nV vectors, according to the
+ description above. */
+struct rgroup_masks {
+ /* The largest nS for all rgroups that use these masks. */
+ unsigned int max_nscalars_per_iter;
+
+ /* The type of mask to use, based on the highest nS recorded above. */
+ tree mask_type;
+
+ /* A vector of nV masks, in iteration order. */
+ vec<tree> masks;
+};
+
+typedef auto_vec<rgroup_masks> vec_loop_masks;
+
/*-----------------------------------------------------------------*/
/* Info on vectorized loops. */
/*-----------------------------------------------------------------*/
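[Note: the mask layout described in the new comment can be reproduced with a small
standalone model (plain C++, not GCC internals; build_rgroup_mask and the variable
names are illustrative only).  It shows why a mask built for the f rgroup (nS = 2)
also serves the d rgroup (nS = 1), but not the other way round.]

/* Standalone model (not GCC code) of the rgroup mask layout described
   above: lane k of an rgroup's mask covers scalar iteration k / nS, so
   each scalar iteration's boolean appears nS times in a row.  */
#include <cstdio>
#include <vector>

static std::vector<bool>
build_rgroup_mask (unsigned int vf, unsigned int ns, unsigned int active)
{
  std::vector<bool> mask (vf * ns);
  for (unsigned int i = 0; i < vf; ++i)
    for (unsigned int j = 0; j < ns; ++j)
      mask[i * ns + j] = (i < active);
  return mask;
}

int
main ()
{
  /* The f/d example from the comment: VF = 4, first three lanes active.  */
  std::vector<bool> f_mask = build_rgroup_mask (4, 2, 3); /* 1 1 1 1 1 1 0 0 */
  std::vector<bool> d_mask = build_rgroup_mask (4, 1, 3); /* 1 1 1 0 */

  /* The d-rgroup booleans are every second f-rgroup boolean, which is the
     logical content behind reusing f's mask for d; the reverse direction
     would need information the d mask does not carry.  */
  for (unsigned int i = 0; i < d_mask.size (); ++i)
    if (d_mask[i] != f_mask[i * 2])
      printf ("mismatch at lane %u\n", i);
  printf ("f lanes: %zu, d lanes: %zu\n", f_mask.size (), d_mask.size ());
  return 0;
}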
@@ -251,6 +347,14 @@ typedef struct _loop_vec_info : public vec_info {
if there is no particular limit. */
unsigned HOST_WIDE_INT max_vectorization_factor;
+ /* The masks that a fully-masked loop should use to avoid operating
+ on inactive scalars. */
+ vec_loop_masks masks;
+
+ /* Type of the variables to use in the WHILE_ULT call for fully-masked
+ loops. */
+ tree mask_compare_type;
+
/* Unknown DRs according to which loop was peeled. */
struct data_reference *unaligned_dr;
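[Note: WHILE_ULT here refers to GCC's IFN_WHILE_ULT internal function, which as far
as I can tell yields a mask whose lane I is active iff INDEX + I < LIMIT;
mask_compare_type is the scalar type of those two operands.  A rough scalar model of
that behaviour follows (illustrative only, not the real implementation).]

/* Rough scalar model (not GCC code) of a WHILE_ULT-style mask:
   lane I is active iff INDEX + I < LIMIT.  */
#include <cstdint>
#include <cstdio>
#include <vector>

static std::vector<bool>
while_ult_model (uint64_t index, uint64_t limit, unsigned int nlanes)
{
  std::vector<bool> mask (nlanes);
  for (unsigned int i = 0; i < nlanes; ++i)
    mask[i] = (index + i < limit);
  return mask;
}

int
main ()
{
  /* 15 scalar iterations, VF = 4: the final vector iteration starts at
     scalar index 12 with only three active lanes, giving 1 1 1 0 -- the
     d-rgroup mask from the example earlier in this patch.  */
  std::vector<bool> m = while_ult_model (12, 15, 4);
  for (bool b : m)
    printf ("%d ", b ? 1 : 0);
  printf ("\n");
  return 0;
}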
@@ -305,6 +409,12 @@ typedef struct _loop_vec_info : public vec_info {
/* Is the loop vectorizable? */
bool vectorizable;
+ /* Records whether we still have the option of using a fully-masked loop. */
+ bool can_fully_mask_p;
+
+ /* True if we have decided to use a fully-masked loop. */
+ bool fully_masked_p;
+
/* When we have grouped data accesses with gaps, we may introduce invalid
memory accesses. We peel the last iteration of the loop to prevent
this. */
@@ -365,8 +475,12 @@ typedef struct _loop_vec_info : public vec_info {
#define LOOP_VINFO_COST_MODEL_THRESHOLD(L) (L)->th
#define LOOP_VINFO_VERSIONING_THRESHOLD(L) (L)->versioning_threshold
#define LOOP_VINFO_VECTORIZABLE_P(L) (L)->vectorizable
+#define LOOP_VINFO_CAN_FULLY_MASK_P(L) (L)->can_fully_mask_p
+#define LOOP_VINFO_FULLY_MASKED_P(L) (L)->fully_masked_p
#define LOOP_VINFO_VECT_FACTOR(L) (L)->vectorization_factor
#define LOOP_VINFO_MAX_VECT_FACTOR(L) (L)->max_vectorization_factor
+#define LOOP_VINFO_MASKS(L) (L)->masks
+#define LOOP_VINFO_MASK_COMPARE_TYPE(L) (L)->mask_compare_type
#define LOOP_VINFO_PTR_MASK(L) (L)->ptr_mask
#define LOOP_VINFO_LOOP_NEST(L) (L)->loop_nest
#define LOOP_VINFO_DATAREFS(L) (L)->datarefs
@@ -1172,6 +1286,17 @@ vect_nunits_for_cost (tree vec_type)
return estimated_poly_value (TYPE_VECTOR_SUBPARTS (vec_type));
}
+/* Return the maximum possible vectorization factor for LOOP_VINFO. */
+
+static inline unsigned HOST_WIDE_INT
+vect_max_vf (loop_vec_info loop_vinfo)
+{
+ unsigned HOST_WIDE_INT vf;
+ if (LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
+ return vf;
+ return MAX_VECTORIZATION_FACTOR;
+}
+
/* Return the size of the value accessed by unvectorized data reference DR.
This is only valid once STMT_VINFO_VECTYPE has been calculated for the
associated gimple statement, since that guarantees that DR accesses
@@ -1194,8 +1319,8 @@ extern source_location vect_location;
/* Simple loop peeling and versioning utilities for vectorizer's purposes -
in tree-vect-loop-manip.c. */
-extern void slpeel_make_loop_iterate_ntimes (struct loop *, tree, tree,
- tree, bool);
+extern void vect_set_loop_condition (struct loop *, loop_vec_info,
+ tree, tree, tree, bool);
extern bool slpeel_can_duplicate_loop_p (const struct loop *, const_edge);
struct loop *slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *,
struct loop *, edge);
@@ -1211,6 +1336,7 @@ extern poly_uint64 current_vector_size;
extern tree get_vectype_for_scalar_type (tree);
extern tree get_mask_type_for_scalar_type (tree);
extern tree get_same_sized_vectype (tree, tree);
+extern bool vect_get_loop_mask_type (loop_vec_info);
extern bool vect_is_simple_use (tree, vec_info *, gimple **,
enum vect_def_type *);
extern bool vect_is_simple_use (tree, vec_info *, gimple **,
@@ -1265,6 +1391,7 @@ extern bool vect_supportable_shift (enum tree_code, tree);
extern tree vect_gen_perm_mask_any (tree, vec_perm_indices);
extern tree vect_gen_perm_mask_checked (tree, vec_perm_indices);
extern void optimize_mask_stores (struct loop*);
+extern gcall *vect_gen_while (tree, tree, tree);
/* In tree-vect-data-refs.c. */
extern bool vect_can_force_dr_alignment_p (const_tree, unsigned int);
@@ -1318,6 +1445,13 @@ extern loop_vec_info vect_analyze_loop (struct loop *, loop_vec_info);
extern tree vect_build_loop_niters (loop_vec_info, bool * = NULL);
extern void vect_gen_vector_loop_niters (loop_vec_info, tree, tree *,
tree *, bool);
+extern tree vect_halve_mask_nunits (tree);
+extern tree vect_double_mask_nunits (tree);
+extern void vect_record_loop_mask (loop_vec_info, vec_loop_masks *,
+ unsigned int, tree);
+extern tree vect_get_loop_mask (gimple_stmt_iterator *, vec_loop_masks *,
+ unsigned int, tree, unsigned int);
+
/* Drive for loop transformation stage. */
extern struct loop *vect_transform_loop (loop_vec_info);
extern loop_vec_info vect_analyze_loop_form (struct loop *);
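[Note: judging from the prototypes, vect_record_loop_mask presumably records during
analysis that some rgroup will need a given number of mask vectors with a given nS,
vect_get_loop_mask later retrieves mask I for the current copy, and
vect_halve_mask_nunits / vect_double_mask_nunits supply the narrower and wider mask
types used by the halving assumption in the rgroup comment.  The standalone model
below (plain C++, illustrative names, not the GCC implementation) captures only the
recording step and the nV - 1 indexing described in that comment.]

/* Standalone model (not GCC code) of recording rgroup mask requirements
   in a two-level table indexed by nV - 1, keeping the largest nS seen
   for each nV.  */
#include <algorithm>
#include <cstdio>
#include <vector>

struct rgroup_masks_model
{
  unsigned int max_nscalars_per_iter = 0;  /* largest nS for this nV */
  std::vector<int> masks;                  /* stand-in for the vec<tree> of masks */
};

typedef std::vector<rgroup_masks_model> loop_masks_model;

/* Record that some rgroup needs NVECTORS mask vectors covering NSCALARS
   scalar values per iteration.  */
static void
record_loop_mask (loop_masks_model &masks, unsigned int nvectors,
                  unsigned int nscalars)
{
  if (masks.size () < nvectors)
    masks.resize (nvectors);
  rgroup_masks_model &rgm = masks[nvectors - 1];
  rgm.max_nscalars_per_iter = std::max (rgm.max_nscalars_per_iter, nscalars);
}

int
main ()
{
  loop_masks_model masks;
  record_loop_mask (masks, 1, 2);  /* the f rgroup: nV = 1, nS = 2 */
  record_loop_mask (masks, 1, 1);  /* the d rgroup shares slot 0 */
  printf ("slot 0 uses nS = %u\n", masks[0].max_nscalars_per_iter); /* prints 2 */
  return 0;
}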