Diffstat (limited to 'gcc/tree-vectorizer.h')
-rw-r--r--   gcc/tree-vectorizer.h   138
1 file changed, 136 insertions(+), 2 deletions(-)
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 224f4a67459..abbe6ef6b2d 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -211,6 +211,102 @@ is_a_helper <_bb_vec_info *>::test (vec_info *i)
 }
 
 
+/* In general, we can divide the vector statements in a vectorized loop
+   into related groups ("rgroups") and say that for each rgroup there is
+   some nS such that the rgroup operates on nS values from one scalar
+   iteration followed by nS values from the next.  That is, if VF is the
+   vectorization factor of the loop, the rgroup operates on a sequence:
+
+     (1,1) (1,2) ... (1,nS) (2,1) ... (2,nS) ... (VF,1) ... (VF,nS)
+
+   where (i,j) represents a scalar value with index j in a scalar
+   iteration with index i.
+
+   [ We use the term "rgroup" to emphasise that this grouping isn't
+     necessarily the same as the grouping of statements used elsewhere.
+     For example, if we implement a group of scalar loads using gather
+     loads, we'll use a separate gather load for each scalar load, and
+     thus each gather load will belong to its own rgroup. ]
+
+   In general this sequence will occupy nV vectors concatenated
+   together.  If these vectors have nL lanes each, the total number
+   of scalar values N is given by:
+
+       N = nS * VF = nV * nL
+
+   None of nS, VF, nV and nL are required to be a power of 2.  nS and nV
+   are compile-time constants but VF and nL can be variable (if the
+   target supports variable-length vectors).
+
+   In classical vectorization, each iteration of the vector loop would
+   handle exactly VF iterations of the original scalar loop.  However,
+   in a fully-masked loop, a particular iteration of the vector loop
+   might handle fewer than VF iterations of the scalar loop.  The vector
+   lanes that correspond to iterations of the scalar loop are said to be
+   "active" and the other lanes are said to be "inactive".
+
+   In a fully-masked loop, many rgroups need to be masked to ensure that
+   they have no effect for the inactive lanes.  Each such rgroup needs a
+   sequence of booleans in the same order as above, but with each (i,j)
+   replaced by a boolean that indicates whether iteration i is active.
+   This sequence occupies nV vector masks that again have nL lanes each.
+   Thus the mask sequence as a whole consists of VF independent booleans
+   that are each repeated nS times.
+
+   We make the simplifying assumption that if a sequence of nV masks is
+   suitable for one (nS,nL) pair, we can reuse it for (nS/2,nL/2) by
+   VIEW_CONVERTing it.  This holds for all current targets that support
+   fully-masked loops.  For example, suppose the scalar loop is:
+
+     float *f;
+     double *d;
+     for (int i = 0; i < n; ++i)
+       {
+         f[i * 2 + 0] += 1.0f;
+         f[i * 2 + 1] += 2.0f;
+         d[i] += 3.0;
+       }
+
+   and suppose that vectors have 256 bits.  The vectorized f accesses
+   will belong to one rgroup and the vectorized d access to another:
+
+     f rgroup: nS = 2, nV = 1, nL = 8
+     d rgroup: nS = 1, nV = 1, nL = 4
+               VF = 4
+
+     [ In this simple example the rgroups do correspond to the normal
+       SLP grouping scheme. ]
+
+   If only the first three lanes are active, the masks we need are:
+
+     f rgroup: 1 1 | 1 1 | 1 1 | 0 0
+     d rgroup:  1  |  1  |  1  |  0
+
+   Here we can use a mask calculated for f's rgroup for d's, but not
+   vice versa.
+
+   Thus for each value of nV, it is enough to provide nV masks, with the
+   mask being calculated based on the highest nL (or, equivalently, based
+   on the highest nS) required by any rgroup with that nV.  We therefore
+   represent the entire collection of masks as a two-level table, with
+   the first level being indexed by nV - 1 (since nV == 0 doesn't exist)
+   and the second being indexed by the mask index 0 <= i < nV.  */
+
+/* The masks needed by rgroups with nV vectors, according to the
+   description above.  */
+struct rgroup_masks {
+  /* The largest nS for all rgroups that use these masks.  */
+  unsigned int max_nscalars_per_iter;
+
+  /* The type of mask to use, based on the highest nS recorded above.  */
+  tree mask_type;
+
+  /* A vector of nV masks, in iteration order.  */
+  vec<tree> masks;
+};
+
+typedef auto_vec<rgroup_masks> vec_loop_masks;
+
 /*-----------------------------------------------------------------*/
 /* Info on vectorized loops.                                       */
 /*-----------------------------------------------------------------*/
@@ -251,6 +347,14 @@ typedef struct _loop_vec_info : public vec_info {
      if there is no particular limit.  */
   unsigned HOST_WIDE_INT max_vectorization_factor;
 
+  /* The masks that a fully-masked loop should use to avoid operating
+     on inactive scalars.  */
+  vec_loop_masks masks;
+
+  /* Type of the variables to use in the WHILE_ULT call for fully-masked
+     loops.  */
+  tree mask_compare_type;
+
   /* Unknown DRs according to which loop was peeled.  */
   struct data_reference *unaligned_dr;
 
@@ -305,6 +409,12 @@ typedef struct _loop_vec_info : public vec_info {
   /* Is the loop vectorizable? */
   bool vectorizable;
 
+  /* Records whether we still have the option of using a fully-masked loop.  */
+  bool can_fully_mask_p;
+
+  /* True if we have decided to use a fully-masked loop.  */
+  bool fully_masked_p;
+
   /* When we have grouped data accesses with gaps, we may introduce invalid
      memory accesses.  We peel the last iteration of the loop to prevent
      this.  */
@@ -365,8 +475,12 @@ typedef struct _loop_vec_info : public vec_info {
 #define LOOP_VINFO_COST_MODEL_THRESHOLD(L) (L)->th
 #define LOOP_VINFO_VERSIONING_THRESHOLD(L) (L)->versioning_threshold
 #define LOOP_VINFO_VECTORIZABLE_P(L)       (L)->vectorizable
+#define LOOP_VINFO_CAN_FULLY_MASK_P(L)     (L)->can_fully_mask_p
+#define LOOP_VINFO_FULLY_MASKED_P(L)       (L)->fully_masked_p
 #define LOOP_VINFO_VECT_FACTOR(L)          (L)->vectorization_factor
 #define LOOP_VINFO_MAX_VECT_FACTOR(L)      (L)->max_vectorization_factor
+#define LOOP_VINFO_MASKS(L)                (L)->masks
+#define LOOP_VINFO_MASK_COMPARE_TYPE(L)    (L)->mask_compare_type
 #define LOOP_VINFO_PTR_MASK(L)             (L)->ptr_mask
 #define LOOP_VINFO_LOOP_NEST(L)            (L)->loop_nest
 #define LOOP_VINFO_DATAREFS(L)             (L)->datarefs
@@ -1172,6 +1286,17 @@ vect_nunits_for_cost (tree vec_type)
   return estimated_poly_value (TYPE_VECTOR_SUBPARTS (vec_type));
 }
 
+/* Return the maximum possible vectorization factor for LOOP_VINFO.  */
+
+static inline unsigned HOST_WIDE_INT
+vect_max_vf (loop_vec_info loop_vinfo)
+{
+  unsigned HOST_WIDE_INT vf;
+  if (LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
+    return vf;
+  return MAX_VECTORIZATION_FACTOR;
+}
+
 /* Return the size of the value accessed by unvectorized data reference DR.
    This is only valid once STMT_VINFO_VECTYPE has been calculated for the
    associated gimple statement, since that guarantees that DR accesses
@@ -1194,8 +1319,8 @@ extern source_location vect_location;
 
 /* Simple loop peeling and versioning utilities for vectorizer's purposes -
    in tree-vect-loop-manip.c.  */
-extern void slpeel_make_loop_iterate_ntimes (struct loop *, tree, tree,
-                                             tree, bool);
+extern void vect_set_loop_condition (struct loop *, loop_vec_info,
+                                     tree, tree, tree, bool);
 extern bool slpeel_can_duplicate_loop_p (const struct loop *, const_edge);
 struct loop *slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *,
                                                      struct loop *, edge);
@@ -1211,6 +1336,7 @@ extern poly_uint64 current_vector_size;
 extern tree get_vectype_for_scalar_type (tree);
 extern tree get_mask_type_for_scalar_type (tree);
 extern tree get_same_sized_vectype (tree, tree);
+extern bool vect_get_loop_mask_type (loop_vec_info);
 extern bool vect_is_simple_use (tree, vec_info *, gimple **,
                                 enum vect_def_type *);
 extern bool vect_is_simple_use (tree, vec_info *, gimple **,
@@ -1265,6 +1391,7 @@ extern bool vect_supportable_shift (enum tree_code, tree);
 extern tree vect_gen_perm_mask_any (tree, vec_perm_indices);
 extern tree vect_gen_perm_mask_checked (tree, vec_perm_indices);
 extern void optimize_mask_stores (struct loop*);
+extern gcall *vect_gen_while (tree, tree, tree);
 
 /* In tree-vect-data-refs.c. */
 extern bool vect_can_force_dr_alignment_p (const_tree, unsigned int);
@@ -1318,6 +1445,13 @@ extern loop_vec_info vect_analyze_loop (struct loop *, loop_vec_info);
 extern tree vect_build_loop_niters (loop_vec_info, bool * = NULL);
 extern void vect_gen_vector_loop_niters (loop_vec_info, tree, tree *,
                                          tree *, bool);
+extern tree vect_halve_mask_nunits (tree);
+extern tree vect_double_mask_nunits (tree);
+extern void vect_record_loop_mask (loop_vec_info, vec_loop_masks *,
+                                   unsigned int, tree);
+extern tree vect_get_loop_mask (gimple_stmt_iterator *, vec_loop_masks *,
+                                unsigned int, tree, unsigned int);
+
 /* Drive for loop transformation stage. */
 extern struct loop *vect_transform_loop (loop_vec_info);
 extern loop_vec_info vect_analyze_loop_form (struct loop *);
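
As a sanity check on the comment's model, the identity N = nS * VF = nV * nL and the (nS,nL) -> (nS/2,nL/2) reuse rule can be verified numerically. The sketch below is a standalone illustration, not GCC code: the helper rgroup_mask is hypothetical, and it models only the lane correspondence implied by the comment (the bit-level layout that makes the VIEW_CONVERT legal is target-specific). It rebuilds the f and d masks from the worked example and checks that every second lane of f's mask is exactly d's mask:

  // Standalone model of the rgroup mask sequence described above.
  #include <cstdio>
  #include <vector>

  // The boolean sequence for one rgroup: VF per-iteration "active"
  // flags, each repeated nS times (giving nV * nL lanes in total).
  static std::vector<bool>
  rgroup_mask (unsigned vf, unsigned ns, unsigned active)
  {
    std::vector<bool> mask;
    for (unsigned i = 0; i < vf; ++i)
      for (unsigned j = 0; j < ns; ++j)
        mask.push_back (i < active);
    return mask;
  }

  int
  main ()
  {
    // The f/d example above: VF = 4, only the first three lanes active.
    std::vector<bool> f_mask = rgroup_mask (4, 2, 3); // nS = 2, nL = 8
    std::vector<bool> d_mask = rgroup_mask (4, 1, 3); // nS = 1, nL = 4

    // The (nS,nL) -> (nS/2,nL/2) reuse property: every second lane of
    // the wider mask is the narrower mask, so reinterpreting f's mask
    // for d's rgroup is safe, but not vice versa.
    for (unsigned i = 0; i < d_mask.size (); ++i)
      if (f_mask[i * 2] != d_mask[i])
        return 1;
    std::printf ("f's mask can be reused for d's rgroup\n");
    return 0;
  }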
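The two-level table itself can be modelled in a few lines. The following is a simplified sketch under stated assumptions, not the GCC implementation: model_record_loop_mask plays the role that the declaration of vect_record_loop_mask suggests, growing the outer vector so that entry nV - 1 exists and keeping the maximum nS seen for that nV:

  // Simplified model of the vec_loop_masks two-level table.
  #include <algorithm>
  #include <string>
  #include <vector>

  struct model_rgroup_masks
  {
    unsigned int max_nscalars_per_iter = 0; // largest nS for this nV
    std::string mask_type;                  // stand-in for the tree type
    std::vector<std::string> masks;         // the nV masks, filled in later
  };

  // Outer level indexed by nV - 1, inner level by 0 <= i < nV.
  using model_loop_masks = std::vector<model_rgroup_masks>;

  // Record that some rgroup needs NVECTORS masks and operates on
  // NSCALARS values per scalar iteration.
  static void
  model_record_loop_mask (model_loop_masks &table,
                          unsigned int nvectors, unsigned int nscalars)
  {
    if (table.size () < nvectors)
      table.resize (nvectors);
    model_rgroup_masks &rgm = table[nvectors - 1];
    rgm.max_nscalars_per_iter
      = std::max (rgm.max_nscalars_per_iter, nscalars);
  }

  int
  main ()
  {
    model_loop_masks table;
    model_record_loop_mask (table, 1, 2); // the f rgroup: nV = 1, nS = 2
    model_record_loop_mask (table, 1, 1); // the d rgroup: nV = 1, nS = 1
    // Entry 0 now records nS = 2: the single shared mask is computed
    // for the f rgroup and reused (via VIEW_CONVERT) for d.
    return table[0].max_nscalars_per_iter == 2 ? 0 : 1;
  }

Deferring the choice of mask_type until the largest nS for a given nV is known is what lets one set of masks serve every rgroup with that nV.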
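Finally, vect_gen_while is declared above for generating the WHILE_ULT comparisons that produce these masks. A scalar model of the intended semantics (an assumption based on the names here, not the gimple GCC emits) is that lane i of the result is set iff START + i < END, so the final vector iteration simply gets a partial mask instead of requiring a scalar epilogue loop:

  // Scalar model of a WHILE_ULT-style mask for a fully-masked loop.
  #include <cstdio>

  int
  main ()
  {
    const unsigned nl = 4; // lanes per mask
    unsigned n = 7;        // scalar iterations
    for (unsigned start = 0; start < n; start += nl)
      {
        bool mask[nl];
        for (unsigned i = 0; i < nl; ++i)
          mask[i] = start + i < n; // lane i of WHILE_ULT (start, n)
        std::printf ("start %u: %d %d %d %d\n",
                     start, mask[0], mask[1], mask[2], mask[3]);
      }
    // Prints 1 1 1 1 for the first iteration and 1 1 1 0 for the last:
    // the inactive lane replaces the epilogue that classical
    // vectorization would need for the leftover scalar iterations.
    return 0;
  }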