Diffstat (limited to 'gcc/tree-vect-loop.c')
-rw-r--r-- | gcc/tree-vect-loop.c | 2581
1 file changed, 2077 insertions, 504 deletions
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index afb36eab1f7..d784754c6de 100644 --- a/gcc/tree-vect-loop.c +++ b/gcc/tree-vect-loop.c @@ -50,6 +50,7 @@ along with GCC; see the file COPYING3. If not see #include "cgraph.h" #include "tree-cfg.h" #include "tree-if-conv.h" +#include "internal-fn.h" /* Loop Vectorization Pass. @@ -182,11 +183,10 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo) struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); unsigned nbbs = loop->num_nodes; - unsigned int vectorization_factor = 0; + poly_uint64 vectorization_factor = 1; tree scalar_type = NULL_TREE; gphi *phi; tree vectype; - unsigned int nunits; stmt_vec_info stmt_info; unsigned i; HOST_WIDE_INT dummy; @@ -255,14 +255,14 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo) dump_printf (MSG_NOTE, "\n"); } - nunits = TYPE_VECTOR_SUBPARTS (vectype); if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", - nunits); + { + dump_printf_loc (MSG_NOTE, vect_location, "nunits = "); + dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype)); + dump_printf (MSG_NOTE, "\n"); + } - if (!vectorization_factor - || (nunits > vectorization_factor)) - vectorization_factor = nunits; + vect_update_max_nunits (&vectorization_factor, vectype); } } @@ -369,7 +369,12 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo) analyze_pattern_stmt = false; } + bool is_gcond = gimple_code (stmt) == GIMPLE_COND; + gcc_assert (!is_gcond + || LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)); + if (gimple_get_lhs (stmt) == NULL_TREE + && !is_gcond /* MASK_STORE has no lhs, but is ok. */ && (!is_gimple_call (stmt) || !gimple_call_internal_p (stmt) @@ -427,27 +432,31 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo) gcc_assert (!STMT_VINFO_DATA_REF (stmt_info)); if (gimple_call_internal_p (stmt, IFN_MASK_STORE)) scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3)); + else if (is_gcond) + scalar_type = TREE_TYPE (gimple_cond_lhs (stmt)); else scalar_type = TREE_TYPE (gimple_get_lhs (stmt)); /* Bool ops don't participate in vectorization factor computation. For comparison use compared types to compute a factor. 
*/ - if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type) - && is_gimple_assign (stmt) - && gimple_assign_rhs_code (stmt) != COND_EXPR) + if (is_gcond + || (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type) + && is_gimple_assign (stmt) + && gimple_assign_rhs_code (stmt) != COND_EXPR)) { if (STMT_VINFO_RELEVANT_P (stmt_info) || STMT_VINFO_LIVE_P (stmt_info)) mask_producers.safe_push (stmt_info); bool_result = true; - if (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) - == tcc_comparison + if (is_gimple_assign (stmt) + && (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) + == tcc_comparison) && !VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (gimple_assign_rhs1 (stmt)))) scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt)); - else + else if (TREE_CODE (scalar_type) == BOOLEAN_TYPE) { if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si)) { @@ -525,8 +534,8 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo) return false; } - if ((GET_MODE_SIZE (TYPE_MODE (vectype)) - != GET_MODE_SIZE (TYPE_MODE (vf_vectype)))) + if (may_ne (GET_MODE_SIZE (TYPE_MODE (vectype)), + GET_MODE_SIZE (TYPE_MODE (vf_vectype)))) { if (dump_enabled_p ()) { @@ -550,12 +559,14 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo) dump_printf (MSG_NOTE, "\n"); } - nunits = TYPE_VECTOR_SUBPARTS (vf_vectype); if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits); - if (!vectorization_factor - || (nunits > vectorization_factor)) - vectorization_factor = nunits; + { + dump_printf_loc (MSG_NOTE, vect_location, "nunits = "); + dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vf_vectype)); + dump_printf (MSG_NOTE, "\n"); + } + + vect_update_max_nunits (&vectorization_factor, vf_vectype); if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si)) { @@ -567,9 +578,13 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo) /* TODO: Analyze cost. Decide if worth while to vectorize. 
*/ if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n", - vectorization_factor); - if (vectorization_factor <= 1) + { + dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = "); + dump_dec (MSG_NOTE, vectorization_factor); + dump_printf (MSG_NOTE, "\n"); + } + + if (must_le (vectorization_factor, 1U)) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, @@ -583,13 +598,28 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo) tree mask_type = NULL; stmt = STMT_VINFO_STMT (mask_producers[i]); + bool is_gcond = gimple_code (stmt) == GIMPLE_COND; + bool ops_are_booleans = true; if (is_gimple_assign (stmt) && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison && !VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (gimple_assign_rhs1 (stmt)))) { scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt)); + ops_are_booleans = false; + + } + else if (is_gcond + && TREE_CODE (TREE_TYPE (gimple_cond_lhs (stmt))) + != BOOLEAN_TYPE) + { + scalar_type = TREE_TYPE (gimple_cond_lhs (stmt)); + ops_are_booleans = false; + } + + if (!ops_are_booleans) + { mask_type = get_mask_type_for_scalar_type (scalar_type); if (!mask_type) @@ -631,8 +661,8 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo) if (!mask_type) mask_type = vectype; - else if (TYPE_VECTOR_SUBPARTS (mask_type) - != TYPE_VECTOR_SUBPARTS (vectype)) + else if (may_ne (TYPE_VECTOR_SUBPARTS (mask_type), + TYPE_VECTOR_SUBPARTS (vectype))) { if (dump_enabled_p ()) { @@ -956,6 +986,10 @@ vect_fixup_reduc_chain (gimple *stmt) gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp)) && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))); GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt)); + GROUP_FIRST_UID (vinfo_for_stmt (firstp)) + = GROUP_FIRST_UID (vinfo_for_stmt (stmt)); + GROUP_LAST_UID (vinfo_for_stmt (firstp)) + = GROUP_LAST_UID (vinfo_for_stmt (stmt)); do { stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt)); @@ -1110,21 +1144,32 @@ _loop_vec_info::_loop_vec_info (struct loop *loop_in) num_iters_unchanged (NULL_TREE), num_iters_assumptions (NULL_TREE), th (0), + versioning_threshold (0), vectorization_factor (0), max_vectorization_factor (0), + mask_skip_niters (NULL_TREE), + mask_compare_type (NULL_TREE), unaligned_dr (NULL), peeling_for_alignment (0), ptr_mask (0), slp_unrolling_factor (1), single_scalar_iteration_cost (0), vectorizable (false), + speculative_execution (false), + can_fully_mask_p (true), + fully_masked_p (false), + firstfaulting_execution (false), peeling_for_gaps (false), peeling_for_niter (false), operands_swapped (false), no_data_dependencies (false), has_mask_store (false), scalar_loop (NULL), - orig_loop_info (NULL) + orig_loop_info (NULL), + vect_addr_base_htab (31), + exit_test_mask (NULL_TREE), + exit_mask (NULL_TREE), + nonspeculative_seq (NULL) { /* Create/Update stmt_info for all stmts in the loop. */ basic_block *body = get_loop_body (loop); @@ -1159,6 +1204,17 @@ _loop_vec_info::_loop_vec_info (struct loop *loop_in) gcc_assert (nbbs == loop->num_nodes); } +/* Free all levels of MASKS. */ + +void +release_vec_loop_masks (vec_loop_masks *masks) +{ + rgroup_masks *rgm; + unsigned int i; + FOR_EACH_VEC_ELT (*masks, i, rgm) + rgm->masks.release (); + masks->release (); +} /* Free all memory used by the _loop_vec_info, as well as all the stmt_vec_info structs of all the stmts in the loop. 
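As background for the poly_uint64 changes above (must_ne, may_ne, must_le and friends replacing plain comparisons): with scalable vectors a length is no longer a single integer, so a comparison can be indeterminate at compile time, and every query has to say whether it must hold for all runtime lengths or merely may hold for some. A minimal, self-contained C model of that idea follows; it is an illustration only, not GCC's poly-int implementation, and all names in it are invented.

    #include <stdbool.h>
    #include <stdio.h>

    /* Toy model of a poly_uint64: value = c0 + c1 * x for a runtime
       parameter x >= 0 (e.g. the number of extra vector quadwords).  */
    struct poly { unsigned long c0, c1; };

    /* True if A == B for every possible x (cf. the patch's must_eq).  */
    static bool
    model_must_eq (struct poly a, struct poly b)
    {
      return a.c0 == b.c0 && a.c1 == b.c1;
    }

    /* True if A != B for at least one x (cf. the patch's may_ne).  */
    static bool
    model_may_ne (struct poly a, struct poly b)
    {
      return !model_must_eq (a, b);
    }

    /* True if A <= B for every possible x (cf. the patch's must_le).  */
    static bool
    model_must_le (struct poly a, struct poly b)
    {
      return a.c0 <= b.c0 && a.c1 <= b.c1;
    }

    int
    main (void)
    {
      struct poly vf = { 2, 2 };    /* 2 + 2*x lanes: a scalable VF */
      struct poly one = { 1, 0 };
      /* "VF <= 1" now only holds if it holds for every runtime length.  */
      printf ("must_le (vf, 1) = %d\n", model_must_le (vf, one));  /* 0 */
      printf ("may_ne (vf, 1)  = %d\n", model_may_ne (vf, one));   /* 1 */
      return 0;
    }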
*/ @@ -1224,9 +1280,196 @@ _loop_vec_info::~_loop_vec_info () free (bbs); + release_vec_loop_masks (&masks); + release_vec_loop_masks (&nonspeculative_masks); + loop->aux = NULL; } +/* Return true if we can use CMP_TYPE as the comparison type to produce + all masks required to mask LOOP_VINFO. */ + +static bool +can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type) +{ + rgroup_masks *rgm; + unsigned int i; + FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm) + if (rgm->mask_type != NULL_TREE + && !direct_internal_fn_supported_p (IFN_WHILE_ULT, + cmp_type, rgm->mask_type, + OPTIMIZE_FOR_SPEED)) + return false; + return true; +} + +/* Calculate the maximum number of scalars per iteration for every + rgroup in LOOP_VINFO. */ + +static unsigned int +vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo) +{ + unsigned int res = 1; + unsigned int i; + rgroup_masks *rgm; + FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm) + res = MAX (res, rgm->max_nscalars_per_iter); + return res; +} + +/* Each statement in LOOP_VINFO can be masked where necessary. Check + whether we can actually generate the masks required. Return true if so, + storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */ + +static bool +vect_verify_full_masking (loop_vec_info loop_vinfo) +{ + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); + unsigned int min_ni_width; + unsigned HOST_WIDE_INT const_vf; + + /* Get the number of bits needed to hold the number of iterations + as an unsigned value. */ + if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)) + { + /* For speculative loops, we only need to count the number of iterations + before the vector loop. */ + if (LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)) + { + unsigned int factor = vect_get_max_nscalars_per_iter (loop_vinfo); + min_ni_width = wi::min_precision (const_vf * factor, UNSIGNED); + } + else + min_ni_width = POINTER_SIZE; + } + else + { + /* Get the maximum number of iterations that is representable + in the counter type. */ + tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo)); + widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1; + + /* Get a more refined estimate for the number of iterations. */ + widest_int max_back_edges; + if (max_loop_iterations (loop, &max_back_edges)) + max_ni = wi::smin (max_ni, max_back_edges + 1); + + /* Account for rgroup masks, in which each bit is replicated N times. */ + max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo); + + /* Work out how many bits we need to represent the limit. */ + min_ni_width = wi::min_precision (max_ni, UNSIGNED); + } + + /* Find a scalar mode for which WHILE_ULT is supported. */ + opt_scalar_int_mode cmp_mode_iter; + tree cmp_type = NULL_TREE; + FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT) + { + unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ()); + if (cmp_bits >= min_ni_width + && targetm.scalar_mode_supported_p (cmp_mode_iter.require ())) + { + tree this_type = build_nonstandard_integer_type (cmp_bits, true); + if (this_type + && can_produce_all_loop_masks_p (loop_vinfo, this_type)) + { + /* Although we could stop as soon as we find a valid mode, + it's often better to continue until we hit Pmode, since the + operands to the WHILE are more likely to be reusable in + address calculations. 
*/ + cmp_type = this_type; + if (cmp_bits >= GET_MODE_BITSIZE (Pmode)) + break; + } + } + } + + if (!cmp_type) + return false; + + LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type; + return true; +} + +/* LOOP_VINFO uses a fully-masked loop and needs to use a capped + vectorization factor. Decide whether the best way of doing that is: + + cap_mask = IFN_WHILE_ULT (0, max_vf) + actual_vf = IFN_MASK_POPCOUNT (cap_mask) + + CAP_MASK can then be used for an rgroup for which nS == 1 and nV == 1 + (see the comment above rgroup_masks for details). + + Return true if this does seem to be the best implementation and + update LOOP_VINFO_CAP accordingly. */ + +static bool +vect_maybe_build_capped_vf_via_while (loop_vec_info loop_vinfo, + gimple_seq *seq) +{ + poly_uint64 nunits = LOOP_VINFO_VECT_FACTOR (loop_vinfo); + if (nunits.is_constant ()) + /* In this case the capped number of iterations is known at compile + time, so a POPCOUNT would be pointless. */ + return false; + + if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ()) + return false; + + rgroup_masks *rgm = &LOOP_VINFO_MASKS (loop_vinfo)[0]; + if (rgm->max_nscalars_per_iter != 1) + /* There's no nS == 1 && nV == 1 mask that would benefit from + having a precomputed cap mask. */ + return false; + + if (!direct_internal_fn_supported_p (IFN_MASK_POPCOUNT, rgm->mask_type, + OPTIMIZE_FOR_SPEED)) + return false; + + tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo); + tree zero_index = build_int_cst (compare_type, 0); + tree limit = build_int_cst (compare_type, + LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)); + + tree cap_mask = make_temp_ssa_name (rgm->mask_type, NULL, "cap_mask"); + gcall *stmt = vect_gen_while (cap_mask, zero_index, limit); + gimple_seq_add_stmt (seq, stmt); + LOOP_VINFO_CAP (loop_vinfo).mask = cap_mask; + + tree vf = make_temp_ssa_name (sizetype, NULL, "vf"); + stmt = gimple_build_call_internal (IFN_MASK_POPCOUNT, 1, cap_mask); + gimple_call_set_lhs (stmt, vf); + gimple_seq_add_stmt (seq, stmt); + LOOP_VINFO_CAP (loop_vinfo).niters = vf; + + return true; +} + +/* Initialize LOOP_VINFO_CAP (LOOP_VINFO). */ + +static void +vect_build_cap (loop_vec_info loop_vinfo) +{ + tree vf = size_int (LOOP_VINFO_VECT_FACTOR (loop_vinfo)); + if (!use_capped_vf (loop_vinfo)) + LOOP_VINFO_CAP (loop_vinfo).niters = vf; + else + { + gimple_seq seq = NULL; + if (!vect_maybe_build_capped_vf_via_while (loop_vinfo, &seq)) + { + tree max_vf = size_int (LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)); + LOOP_VINFO_CAP (loop_vinfo).niters + = gimple_build (&seq, MIN_EXPR, sizetype, vf, max_vf); + } + if (seq) + { + edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo)); + gsi_insert_seq_on_edge_immediate (pe, seq); + } + } +} /* Calculate the cost of one scalar iteration of the loop. 
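The full-masking and capped-VF code above leans on IFN_WHILE_ULT, whose lane I result is set iff operand 0 plus I is unsigned-less-than operand 1. Below is a small stand-alone C sketch of that semantics and of why a fully-masked loop needs no scalar epilogue; the vectorization factor and trip count are made up for the illustration.

    #include <stdbool.h>
    #include <stdio.h>

    #define VF 4    /* pretend vectorization factor, illustration only */

    /* Scalar model of IFN_WHILE_ULT: lane I of the result is set iff
       BASE + I is (unsigned) less than LIMIT.  */
    static void
    while_ult (unsigned long base, unsigned long limit, bool mask[VF])
    {
      for (int i = 0; i < VF; i++)
        mask[i] = base + i < limit;
    }

    int
    main (void)
    {
      bool mask[VF];
      unsigned long n = 6;
      /* A fully-masked loop needs no scalar epilogue: the final mask
         simply switches off the lanes at and beyond N.  */
      for (unsigned long i = 0; i < n; i += VF)
        {
          while_ult (i, n, mask);
          for (int lane = 0; lane < VF; lane++)
            printf ("%d", mask[lane]);
          printf ("\n");    /* prints 1111 then 1100 */
        }
      return 0;
    }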
*/ static void @@ -1477,7 +1720,8 @@ vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond, if (integer_zerop (*assumptions) || !*number_of_iterations - || chrec_contains_undetermined (*number_of_iterations)) + || (loop->inner + && chrec_contains_undetermined (*number_of_iterations))) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, @@ -1485,6 +1729,15 @@ vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond, "computed.\n"); return false; } + else if (!loop->inner + && chrec_contains_undetermined (*number_of_iterations)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "number of iterations cannot be computed, " + "relying upon speculative execution\n"); + return true; + } if (integer_zerop (*number_of_iterations)) { @@ -1511,6 +1764,21 @@ vect_analyze_loop_form (struct loop *loop) return NULL; loop_vec_info loop_vinfo = new _loop_vec_info (loop); + + if (number_of_iterations + && chrec_contains_undetermined (number_of_iterations)) + { + /* Nested loops are not supported for speculative execution. */ + gcc_assert (!loop->inner); + + LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo) = true; + + /* Since we don't know what the number of iterations there seems little + point in having anything other than NULL. */ + number_of_iterations = NULL; + number_of_iterationsm1 = NULL; + } + LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1; LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations; LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations; @@ -1560,7 +1828,7 @@ vect_update_vf_for_slp (loop_vec_info loop_vinfo) struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); int nbbs = loop->num_nodes; - unsigned int vectorization_factor; + poly_uint64 vectorization_factor; int i; if (dump_enabled_p ()) @@ -1568,7 +1836,7 @@ vect_update_vf_for_slp (loop_vec_info loop_vinfo) "=== vect_update_vf_for_slp ===\n"); vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); - gcc_assert (vectorization_factor != 0); + gcc_assert (must_ne (vectorization_factor, 0U)); /* If all the stmts in the loop can be SLPed, we perform only SLP, and vectorization factor of the loop is the unrolling factor required by @@ -1608,16 +1876,22 @@ vect_update_vf_for_slp (loop_vec_info loop_vinfo) { dump_printf_loc (MSG_NOTE, vect_location, "Loop contains SLP and non-SLP stmts\n"); + /* Both the vectorization factor and unroll factor have the form + current_vector_size * X for some rational X, so they must have + a common multiple. */ vectorization_factor - = least_common_multiple (vectorization_factor, + = force_common_multiple (vectorization_factor, LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo)); } LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor; if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, - "Updating vectorization factor to %d\n", - vectorization_factor); + { + dump_printf_loc (MSG_NOTE, vect_location, + "Updating vectorization factor to "); + dump_dec (MSG_NOTE, vectorization_factor); + dump_printf (MSG_NOTE, ".\n"); + } } /* Function vect_analyze_loop_operations. @@ -1778,6 +2052,101 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo) return true; } +/* Analyze the cost of the loop described by LOOP_VINFO. Decide if it + is worthwhile to vectorize. Return 1 if definitely yes, 0 if + definitely no, or -1 if it's worth retrying. 
*/ + +static int +vect_analyze_loop_costing (loop_vec_info loop_vinfo) +{ + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); + unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo); + + /* Only fully-masked loops can have iteration counts less than the + vectorization factor. */ + if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) + { + HOST_WIDE_INT max_niter; + + if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) + max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo); + else + max_niter = max_stmt_executions_int (loop); + + if (max_niter != -1 + && (unsigned HOST_WIDE_INT) max_niter < assumed_vf) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "not vectorized: iteration count smaller than " + "vectorization factor.\n"); + return 0; + } + } + + int min_profitable_iters, min_profitable_estimate; + vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters, + &min_profitable_estimate); + + if (min_profitable_iters < 0) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "not vectorized: vectorization not profitable.\n"); + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "not vectorized: vector version will never be " + "profitable.\n"); + return -1; + } + + int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND) + * assumed_vf); + + /* Use the cost model only if it is more conservative than user specified + threshold. */ + unsigned int th = (unsigned) MAX (min_scalar_loop_bound, + min_profitable_iters); + + LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th; + + if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) + && LOOP_VINFO_INT_NITERS (loop_vinfo) < th) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "not vectorized: vectorization not profitable.\n"); + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "not vectorized: iteration count smaller than user " + "specified loop bound parameter or minimum profitable " + "iterations (whichever is more conservative).\n"); + return 0; + } + + HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop); + if (estimated_niter == -1) + estimated_niter = likely_max_stmt_executions_int (loop); + if (estimated_niter != -1 + && ((unsigned HOST_WIDE_INT) estimated_niter + < MAX (th, (unsigned) min_profitable_estimate))) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "not vectorized: estimated iteration count too " + "small.\n"); + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "not vectorized: estimated iteration count smaller " + "than specified loop bound parameter or minimum " + "profitable iterations (whichever is more " + "conservative).\n"); + return -1; + } + + return 1; +} + /* Function vect_analyze_loop_2. @@ -1788,8 +2157,9 @@ static bool vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal) { bool ok; - int max_vf = MAX_VECTORIZATION_FACTOR; - int min_vf = 2; + int res; + unsigned int max_vf = MAX_VECTORIZATION_FACTOR; + poly_uint64 min_vf = 2; unsigned int n_stmts = 0; /* The first group of checks is independent of the vector size. */ @@ -1861,6 +2231,25 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal) } } + /* TODO: We can't currently support stores for speculative loops. 
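For context, the speculative-execution path introduced above exists for loops whose trip count cannot be computed because the exit test depends on loaded data. An invented example of that shape (not taken from the patch):

    /* A loop with no computable iteration count: the exit condition reads
       memory, so the vector body runs speculatively and needs
       first-faulting loads to avoid faulting past the terminator.  */
    unsigned long
    count_to_sentinel (const int *a)
    {
      unsigned long i = 0;
      while (a[i] != 0)    /* the exit is a GIMPLE_COND on loaded data */
        i++;
      return i;
    }

A store inside such a loop would execute for lanes beyond the point where the scalar loop would already have exited, which is presumably why stores are rejected on this path for now.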
*/ + if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo) + && LOOP_VINFO_DATAREFS (loop_vinfo).length () > 0) + { + vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); + struct data_reference *dr; + unsigned int i; + + FOR_EACH_VEC_ELT (datarefs, i, dr) + if (!DR_IS_READ (dr)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "Stores not supported for speculative " + "loops.\n"); + return false; + } + } + /* Analyze the data references and also adjust the minimal vectorization factor according to the loads and stores. */ @@ -1910,11 +2299,15 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal) /* Analyze data dependences between the data-refs in the loop and adjust the maximum vectorization factor according to the dependences. - FORNOW: fail at the first data dependence that we encounter. */ + + We might be able to cope with max_vf that are smaller than the full + vector width by using a fully-masked loop. Postpone that decision + until we know whether full masking is possible. Of course, it might + not be a win to use vectors in this situation even if it is supported, + but that's a decision for the cost model. */ ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf); - if (!ok - || max_vf < min_vf) + if (!ok || max_vf <= 1) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, @@ -1931,21 +2324,12 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal) "can't determine vectorization factor.\n"); return false; } - if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "bad data dependence.\n"); - return false; - } /* Compute the scalar iteration cost. */ vect_compute_single_scalar_iteration_cost (loop_vinfo); - int saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); - HOST_WIDE_INT estimated_niter; + poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); unsigned th; - int min_scalar_loop_bound; /* Check the SLP opportunities in the loop, analyze and build SLP trees. */ ok = vect_analyze_slp (loop_vinfo, n_stmts); @@ -1963,32 +2347,31 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal) vect_update_vf_for_slp (loop_vinfo); } + bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo); + + /* We don't expect to have to roll back to anything other than an empty + set of rgroups. */ + gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty () + && LOOP_VINFO_NONSPECULATIVE_MASKS (loop_vinfo).is_empty ()); + /* This is the point where we can re-start analysis with SLP forced off. */ start_over: /* Now the vectorization factor is final. 
*/ - unsigned vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); - gcc_assert (vectorization_factor != 0); + poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); + gcc_assert (must_ne (vectorization_factor, 0U)); if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, - "vectorization_factor = %d, niters = " - HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor, - LOOP_VINFO_INT_NITERS (loop_vinfo)); + { + dump_printf_loc (MSG_NOTE, vect_location, + "vectorization_factor = "); + dump_dec (MSG_NOTE, vectorization_factor); + dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n", + LOOP_VINFO_INT_NITERS (loop_vinfo)); + } HOST_WIDE_INT max_niter = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo)); - if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) - && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor)) - || (max_niter != -1 - && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "not vectorized: iteration count smaller than " - "vectorization factor.\n"); - return false; - } /* Analyze the alignment of the data-refs in the loop. Fail if a data reference is found that cannot be vectorized. */ @@ -2047,16 +2430,93 @@ start_over: return false; } + if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) + && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) + && LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo) + && !LOOP_VINFO_FIRSTFAULTING_EXECUTION (loop_vinfo) + && !LOOP_VINFO_NEEDS_NONSPECULATIVE_MASKS (loop_vinfo) + && !use_capped_vf (loop_vinfo)) + { + LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false; + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "No need to fully mask this speculative loop;" + " it doesn't require first-faulting instructions" + " and everything is naturally aligned.\n"); + } + + /* Decide whether to use a fully-masked loop for this vectorization + factor. */ + LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) + = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) + && vect_verify_full_masking (loop_vinfo)); + if (dump_enabled_p ()) + { + if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) + dump_printf_loc (MSG_NOTE, vect_location, + "using a fully-masked loop.\n"); + else + dump_printf_loc (MSG_NOTE, vect_location, + "not using a fully-masked loop.\n"); + } + + if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) + { + if (LOOP_VINFO_FIRSTFAULTING_EXECUTION (loop_vinfo)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "Not vectorized: first-faulting loads require " + "a fully-masked loop.\n"); + return false; + } + + if (LOOP_VINFO_NEEDS_NONSPECULATIVE_MASKS (loop_vinfo)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "Not vectorized: non-speculative operations " + "need a fully-masked loop.\n"); + return false; + } + + if (use_capped_vf (loop_vinfo)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "Need to cap the runtime vectorization factor to " + HOST_WIDE_INT_PRINT_DEC " but cannot fully mask" + " the loop.\n", + LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)); + /* Undoing SLP might allow us to use a mask. 
*/ + goto again; + } + } + + if (LOOP_VINFO_NEEDS_NONSPECULATIVE_MASKS (loop_vinfo)) + { + tree mask_type = vect_mask_type_for_speculation (loop_vinfo); + if (!direct_internal_fn_supported_p (IFN_BREAK_AFTER, mask_type, + OPTIMIZE_FOR_SPEED)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "Not vectorized: BREAK_AFTER not supported.\n"); + return false; + } + } + /* If epilog loop is required because of data accesses with gaps, one additional iteration needs to be peeled. Check if there is enough iterations for vectorization. */ if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) - && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) + && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) + && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) { - int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); + poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo); - if (wi::to_widest (scalar_niters) < vf) + if (must_lt (wi::to_widest (scalar_niters), vf)) { if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, @@ -2066,89 +2526,57 @@ start_over: } } - /* Analyze cost. Decide if worth while to vectorize. */ - int min_profitable_estimate, min_profitable_iters; - vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters, - &min_profitable_estimate); - - if (min_profitable_iters < 0) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "not vectorized: vectorization not profitable.\n"); - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "not vectorized: vector version will never be " - "profitable.\n"); - goto again; - } - - min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND) - * vectorization_factor); - - /* Use the cost model only if it is more conservative than user specified - threshold. */ - th = (unsigned) MAX (min_scalar_loop_bound, min_profitable_iters); - - LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th; - - if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) - && LOOP_VINFO_INT_NITERS (loop_vinfo) < th) + if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) + && LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo) + && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "not vectorized: vectorization not profitable.\n"); - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, - "not vectorized: iteration count smaller than user " - "specified loop bound parameter or minimum profitable " - "iterations (whichever is more conservative).\n"); - goto again; + "Not supported: peeling speculative vectorization" + " without a fully-masked loop.\n"); + return false; } - estimated_niter - = estimated_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo)); - if (estimated_niter == -1) - estimated_niter = max_niter; - if (estimated_niter != -1 - && ((unsigned HOST_WIDE_INT) estimated_niter - < MAX (th, (unsigned) min_profitable_estimate))) + /* Check the costings of the loop make vectorizing worthwhile. 
*/ + res = vect_analyze_loop_costing (loop_vinfo); + if (res < 0) + goto again; + if (!res) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "not vectorized: estimated iteration count too " - "small.\n"); - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, - "not vectorized: estimated iteration count smaller " - "than specified loop bound parameter or minimum " - "profitable iterations (whichever is more " - "conservative).\n"); - goto again; + "Loop costings not worthwhile.\n"); + return false; } /* Decide whether we need to create an epilogue loop to handle remaining scalar iterations. */ - th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) - / LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - * LOOP_VINFO_VECT_FACTOR (loop_vinfo)); + th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo); - if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) - && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0) - { - if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo) - - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)) - < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))) + unsigned HOST_WIDE_INT const_vf; + if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)) + LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false; + else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) + /* The main loop handles all iterations. */ + LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false; + else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) + && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0) + { + if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) + - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo), + LOOP_VINFO_VECT_FACTOR (loop_vinfo))) LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true; } else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) - || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo)) - < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - /* In case of versioning, check if the maximum number of - iterations is greater than th. If they are identical, - the epilogue is unnecessary. */ + || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf) + || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo)) + < (unsigned) exact_log2 (const_vf)) + /* In case of versioning, check if the maximum number of + iterations is greater than th. If they are identical, + the epilogue is unnecessary. */ && (!LOOP_REQUIRES_VERSIONING (loop_vinfo) - || (unsigned HOST_WIDE_INT) max_niter > th))) + || ((unsigned HOST_WIDE_INT) max_niter + > (th / const_vf) * const_vf)))) LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true; /* If an epilogue loop is required make sure we can create one. */ @@ -2175,33 +2603,35 @@ start_over: can be merged along with threshold check of loop versioning, so increase threshold for this case if necessary. */ if (LOOP_REQUIRES_VERSIONING (loop_vinfo) - && (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) - || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))) + && !LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)) { - unsigned niters_th; + poly_uint64 niters_th = 0; - /* Niters for peeled prolog loop. */ - if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0) + if (!vect_use_loop_mask_for_alignment_p (loop_vinfo)) { - struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo); - tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))); - - niters_th = TYPE_VECTOR_SUBPARTS (vectype) - 1; + /* Niters for peeled prolog loop. 
*/ + if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0) + { + struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo); + tree vectype + = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))); + niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1; + } + else + niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); } - else - niters_th = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); /* Niters for at least one iteration of vectorized loop. */ - niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo); + if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) + niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo); /* One additional iteration because of peeling for gap. */ if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) - niters_th++; - if (LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) < niters_th) - LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = niters_th; + niters_th += 1; + LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th; } - gcc_assert (vectorization_factor - == (unsigned)LOOP_VINFO_VECT_FACTOR (loop_vinfo)); + gcc_assert (must_eq (vectorization_factor, + LOOP_VINFO_VECT_FACTOR (loop_vinfo))); /* Ok to vectorize! */ return true; @@ -2231,7 +2661,7 @@ again: vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo)); unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo); tree vectype = STMT_VINFO_VECTYPE (vinfo); - if (! vect_store_lanes_supported (vectype, size) + if (! vect_store_lanes_supported (vectype, size, false) && ! vect_grouped_store_supported (vectype, size)) return false; FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node) @@ -2241,7 +2671,7 @@ again: bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo); size = STMT_VINFO_GROUP_SIZE (vinfo); vectype = STMT_VINFO_VECTYPE (vinfo); - if (! vect_load_lanes_supported (vectype, size) + if (! vect_load_lanes_supported (vectype, size, false) && ! vect_grouped_load_supported (vectype, single_element_p, size)) return false; @@ -2290,16 +2720,22 @@ again: } } /* Free optimized alias test DDRS. */ + LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0); LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release (); LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release (); /* Reset target cost data. */ destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)); LOOP_VINFO_TARGET_COST_DATA (loop_vinfo) = init_cost (LOOP_VINFO_LOOP (loop_vinfo)); + /* Reset accumulated rgroup information. */ + release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo)); + release_vec_loop_masks (&LOOP_VINFO_NONSPECULATIVE_MASKS (loop_vinfo)); /* Reset assorted flags. */ LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false; LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false; LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0; + LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0; + LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p; goto start_over; } @@ -2314,11 +2750,12 @@ loop_vec_info vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo) { loop_vec_info loop_vinfo; - unsigned int vector_sizes; + auto_vector_sizes vector_sizes; /* Autodetect first vector size we try. */ current_vector_size = 0; - vector_sizes = targetm.vectorize.autovectorize_vector_sizes (); + targetm.vectorize.autovectorize_vector_sizes (&vector_sizes); + unsigned int next_size = 0; if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, @@ -2334,6 +2771,7 @@ vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo) return NULL; } + poly_uint64 autodetected_vector_size = 0; while (1) { /* Check the CFG characteristics of the loop (nesting, entry/exit). 
*/ @@ -2360,21 +2798,54 @@ vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo) delete loop_vinfo; - vector_sizes &= ~current_vector_size; + if (next_size == 0) + autodetected_vector_size = current_vector_size; + + if (next_size < vector_sizes.length () + && must_eq (vector_sizes[next_size], autodetected_vector_size)) + next_size += 1; + if (fatal - || vector_sizes == 0 - || current_vector_size == 0) + || next_size == vector_sizes.length () + || must_eq (current_vector_size, 0U)) return NULL; /* Try the next biggest vector size. */ - current_vector_size = 1 << floor_log2 (vector_sizes); + current_vector_size = vector_sizes[next_size++]; if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, - "***** Re-trying analysis with " - "vector size %d\n", current_vector_size); + { + dump_printf_loc (MSG_NOTE, vect_location, + "***** Re-trying analysis with " + "vector size "); + dump_dec (MSG_NOTE, current_vector_size); + dump_printf (MSG_NOTE, "\n"); + } } } +/* Return true if the target supports in-order reductions for operation + CODE and type TYPE. If the target supports it, store the reduction + operation in *REDUC_CODE. */ + +static bool +fold_left_reduction_code (tree_code code, tree type, tree_code *reduc_code) +{ + switch (code) + { + case PLUS_EXPR: + code = FOLD_LEFT_PLUS_EXPR; + break; + + default: + return false; + } + + if (!target_supports_op_p (type, code, optab_vector)) + return false; + + *reduc_code = code; + return true; +} /* Function reduction_code_for_scalar_code @@ -2407,11 +2878,20 @@ reduction_code_for_scalar_code (enum tree_code code, *reduc_code = REDUC_PLUS_EXPR; return true; - case MULT_EXPR: - case MINUS_EXPR: + case BIT_AND_EXPR: + *reduc_code = REDUC_AND_EXPR; + return true; + case BIT_IOR_EXPR: + *reduc_code = REDUC_IOR_EXPR; + return true; + case BIT_XOR_EXPR: - case BIT_AND_EXPR: + *reduc_code = REDUC_XOR_EXPR; + return true; + + case MULT_EXPR: + case MINUS_EXPR: *reduc_code = ERROR_MARK; return true; @@ -2420,6 +2900,54 @@ reduction_code_for_scalar_code (enum tree_code code, } } +/* If there is a neutral value X such that SLP reduction NODE would not + be affected by the introduction of additional X elements, return that X, + otherwise return null. CODE is the code of the reduction. REDUC_CHAIN + is true if the SLP statements perform a single reduction, false if each + statement performs an independent reduction. */ + +static tree +neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code, + bool reduc_chain) +{ + vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node); + gimple *stmt = stmts[0]; + stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt); + tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo); + tree scalar_type = TREE_TYPE (vector_type); + struct loop *loop = gimple_bb (stmt)->loop_father; + gcc_assert (loop); + + switch (code) + { + case WIDEN_SUM_EXPR: + case DOT_PROD_EXPR: + case SAD_EXPR: + case PLUS_EXPR: + case MINUS_EXPR: + case BIT_IOR_EXPR: + case BIT_XOR_EXPR: + return build_zero_cst (scalar_type); + + case MULT_EXPR: + return build_one_cst (scalar_type); + + case BIT_AND_EXPR: + return build_all_ones_cst (scalar_type); + + case MAX_EXPR: + case MIN_EXPR: + /* For MIN/MAX the initial values are neutral. A reduction chain + has only a single initial value, so that value is neutral for + all statements. */ + if (reduc_chain) + return PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop)); + return NULL_TREE; + + default: + return NULL_TREE; + } +} /* Error reporting helper for vect_is_simple_reduction below. 
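Stepping back to neutral_op_for_slp_reduction above, here is a small illustration of the neutral-element rule it encodes: padding a reduction with 0 for PLUS/IOR/XOR, 1 for MULT or all-ones for AND cannot change the result, which is what lets a group be rounded up to a whole number of vector lanes. The values below are arbitrary.

    #include <stdio.h>

    /* Padding each reduction with its neutral value leaves the result
       unchanged.  */
    int
    main (void)
    {
      int vals[] = { 3, 7, 5 };
      int sum = 0, prod = 1, all = -1;    /* start from each neutral value */
      for (int i = 0; i < 3; i++)
        {
          sum += vals[i];
          prod *= vals[i];
          all &= vals[i];
        }
      printf ("%d %d %d\n", sum, prod, all);    /* 15 105 1 */
      return 0;
    }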
GIMPLE statement STMT is printed with a message MSG. */ @@ -2540,8 +3068,13 @@ vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi, operand. */ lhs = PHI_RESULT (phi); next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt)); + unsigned int first_uid = GROUP_FIRST_UID (vinfo_for_stmt (next_stmt)); + unsigned int last_uid = GROUP_LAST_UID (vinfo_for_stmt (next_stmt)); while (next_stmt) { + stmt_vec_info next_vinfo = vinfo_for_stmt (next_stmt); + first_uid = MIN (first_uid, STMT_VINFO_GROUP_FIRST_UID (next_vinfo)); + last_uid = MAX (last_uid, STMT_VINFO_GROUP_LAST_UID (next_vinfo)); if (gimple_assign_rhs2 (next_stmt) == lhs) { tree op = gimple_assign_rhs1 (next_stmt); @@ -2566,7 +3099,7 @@ vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi, && !is_loop_header_bb_p (gimple_bb (def_stmt))))) { lhs = gimple_assign_lhs (next_stmt); - next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt)); + next_stmt = GROUP_NEXT_ELEMENT (next_vinfo); continue; } @@ -2614,17 +3147,55 @@ vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi, } lhs = gimple_assign_lhs (next_stmt); - next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt)); + next_stmt = GROUP_NEXT_ELEMENT (next_vinfo); } /* Save the chain for further analysis in SLP detection. */ first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt)); LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first); GROUP_SIZE (vinfo_for_stmt (first)) = size; + GROUP_FIRST_UID (vinfo_for_stmt (first)) = first_uid; + GROUP_LAST_UID (vinfo_for_stmt (first)) = last_uid; return true; } +/* Returns true if we need an in-order reduction for operation CODE + on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer + overflow must wrap. */ + +static bool +needs_fold_left_reduction_p (tree type, tree_code code, + bool need_wrapping_integral_overflow) +{ + /* CHECKME: check for !flag_finite_math_only too? */ + if (SCALAR_FLOAT_TYPE_P (type)) + switch (code) + { + case MIN_EXPR: + case MAX_EXPR: + return false; + + default: + return !flag_associative_math; + } + + if (INTEGRAL_TYPE_P (type)) + { + if (!operation_no_trapping_overflow (type, code)) + return true; + if (need_wrapping_integral_overflow + && !TYPE_OVERFLOW_WRAPS (type) + && operation_can_overflow (code)) + return true; + return false; + } + + if (SAT_FIXED_POINT_TYPE_P (type)) + return true; + + return false; +} /* Function vect_is_simple_reduction @@ -2943,58 +3514,18 @@ vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi, return NULL; } - /* Check that it's ok to change the order of the computation. + /* Check whether it's ok to change the order of the computation. Generally, when vectorizing a reduction we change the order of the computation. This may change the behavior of the program in some cases, so we need to check that this is ok. One exception is when vectorizing an outer-loop: the inner-loop is executed sequentially, and therefore vectorizing reductions in the inner-loop during outer-loop vectorization is safe. */ - - if (*v_reduc_type != COND_REDUCTION - && check_reduction) - { - /* CHECKME: check for !flag_finite_math_only too? */ - if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math) - { - /* Changing the order of operations changes the semantics. 
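The checks being replaced here (now centralised in needs_fold_left_reduction_p) guard against reassociation. A tiny C example of why floating-point addition is order-sensitive unless -fassociative-math is given:

    #include <stdio.h>

    /* Reassociating a floating-point sum changes the rounded result,
       so a vector tree reduction is not equivalent to the scalar loop.  */
    int
    main (void)
    {
      float a = -1e20f, b = 1e20f, c = 1.0f;
      float in_order = (a + b) + c;    /* 1.0: a and b cancel exactly */
      float reassoc = a + (b + c);     /* 0.0: b + c rounds back to 1e20 */
      printf ("%g %g\n", in_order, reassoc);
      return 0;
    }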
*/ - if (dump_enabled_p ()) - report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, - "reduction: unsafe fp math optimization: "); - return NULL; - } - else if (INTEGRAL_TYPE_P (type)) - { - if (!operation_no_trapping_overflow (type, code)) - { - /* Changing the order of operations changes the semantics. */ - if (dump_enabled_p ()) - report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, - "reduction: unsafe int math optimization" - " (overflow traps): "); - return NULL; - } - if (need_wrapping_integral_overflow - && !TYPE_OVERFLOW_WRAPS (type) - && operation_can_overflow (code)) - { - /* Changing the order of operations changes the semantics. */ - if (dump_enabled_p ()) - report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, - "reduction: unsafe int math optimization" - " (overflow doesn't wrap): "); - return NULL; - } - } - else if (SAT_FIXED_POINT_TYPE_P (type)) - { - /* Changing the order of operations changes the semantics. */ - if (dump_enabled_p ()) - report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, - "reduction: unsafe fixed-point math optimization: "); - return NULL; - } - } + if (check_reduction + && *v_reduc_type == TREE_CODE_REDUCTION + && needs_fold_left_reduction_p (type, code, + need_wrapping_integral_overflow)) + *v_reduc_type = FOLD_LEFT_REDUCTION; /* Reduction is safe. We're dealing with one of the following: 1) integer arithmetic and no trapv @@ -3258,6 +3789,7 @@ vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi, STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type; STMT_VINFO_REDUC_DEF (reduc_def_info) = def; reduc_def_info = vinfo_for_stmt (def); + STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type; STMT_VINFO_REDUC_DEF (reduc_def_info) = phi; } return def; @@ -3272,11 +3804,11 @@ vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue, stmt_vector_for_cost *epilogue_cost_vec) { int retval = 0; - int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); + int assumed_vf = vect_vf_for_cost (loop_vinfo); if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) { - *peel_iters_epilogue = vf/2; + *peel_iters_epilogue = assumed_vf / 2; if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, "cost model: epilogue peel iters set to vf/2 " @@ -3294,11 +3826,11 @@ vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue, int niters = LOOP_VINFO_INT_NITERS (loop_vinfo); peel_iters_prologue = niters < peel_iters_prologue ? niters : peel_iters_prologue; - *peel_iters_epilogue = (niters - peel_iters_prologue) % vf; + *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf; /* If we need to peel for gaps, but no peeling is required, we have to peel VF iterations. */ if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue) - *peel_iters_epilogue = vf; + *peel_iters_epilogue = assumed_vf; } stmt_info_for_cost *si; @@ -3356,7 +3888,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, unsigned vec_epilogue_cost = 0; int scalar_single_iter_cost = 0; int scalar_outside_cost = 0; - int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); + int assumed_vf = vect_vf_for_cost (loop_vinfo); int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo); @@ -3393,6 +3925,18 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, /* Count LEN - 1 ANDs and LEN comparisons. 
*/ (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt, NULL, 0, vect_prologue); + len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length (); + if (len) + { + /* Count LEN - 1 ANDs and LEN comparisons. */ + unsigned int nstmts = len * 2 - 1; + /* +1 for each bias that needs adding. */ + for (unsigned int i = 0; i < len; ++i) + if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p) + nstmts += 1; + (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt, + NULL, 0, vect_prologue); + } dump_printf (MSG_NOTE, "cost model: Adding cost of checks for loop " "versioning aliasing.\n"); @@ -3425,7 +3969,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo); /* Add additional cost for the peeled instructions in prologue and epilogue - loop. + loop. (For fully-masked loops there will be no peeling.) FORNOW: If we don't know the value of peel_iters for prologue or epilogue at compile-time - we assume it's vf/2 (the worst would be vf-1). @@ -3433,15 +3977,37 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, TODO: Build an expression that represents peel_iters for prologue and epilogue to be used in a run-time test. */ - if (npeel < 0) + if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) + { + peel_iters_prologue = 0; + peel_iters_epilogue = 0; + + if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) + { + /* We need to peel exactly one iteration. */ + peel_iters_epilogue += 1; + stmt_info_for_cost *si; + int j; + FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), + j, si) + { + struct _stmt_vec_info *stmt_info + = si->stmt ? vinfo_for_stmt (si->stmt) : NULL; + (void) add_stmt_cost (target_cost_data, si->count, + si->kind, stmt_info, si->misalign, + vect_epilogue); + } + } + } + else if (npeel < 0) { - peel_iters_prologue = vf/2; + peel_iters_prologue = assumed_vf / 2; dump_printf (MSG_NOTE, "cost model: " "prologue peel iters set to vf/2.\n"); /* If peeling for alignment is unknown, loop bound of main loop becomes unknown. */ - peel_iters_epilogue = vf/2; + peel_iters_epilogue = assumed_vf / 2; dump_printf (MSG_NOTE, "cost model: " "epilogue peel iters set to vf/2 because " "peeling for alignment is unknown.\n"); @@ -3620,23 +4186,25 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations SOC = scalar outside cost for run time cost model check. 
*/ - if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost) + if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost) { - if (vec_outside_cost <= 0) + min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) + * assumed_vf + - vec_inside_cost * peel_iters_prologue + - vec_inside_cost * peel_iters_epilogue); + if (min_profitable_iters <= 0) min_profitable_iters = 0; else - { - min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf - - vec_inside_cost * peel_iters_prologue - - vec_inside_cost * peel_iters_epilogue) - / ((scalar_single_iter_cost * vf) - - vec_inside_cost); - - if ((scalar_single_iter_cost * vf * min_profitable_iters) - <= (((int) vec_inside_cost * min_profitable_iters) - + (((int) vec_outside_cost - scalar_outside_cost) * vf))) - min_profitable_iters++; - } + { + min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf) + - vec_inside_cost); + + if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters) + <= (((int) vec_inside_cost * min_profitable_iters) + + (((int) vec_outside_cost - scalar_outside_cost) + * assumed_vf))) + min_profitable_iters++; + } } /* vector version will never be profitable. */ else @@ -3651,7 +4219,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, "divided by the scalar iteration cost = %d " "is greater or equal to the vectorization factor = %d" ".\n", - vec_inside_cost, scalar_single_iter_cost, vf); + vec_inside_cost, scalar_single_iter_cost, assumed_vf); *ret_min_profitable_niters = -1; *ret_min_profitable_estimate = -1; return; @@ -3661,9 +4229,10 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, " Calculated minimum iters for profitability: %d\n", min_profitable_iters); - /* We want the vectorized loop to execute at least once. */ - if (min_profitable_iters < (vf + peel_iters_prologue)) - min_profitable_iters = vf + peel_iters_prologue; + if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) + && min_profitable_iters < (assumed_vf + peel_iters_prologue)) + /* We want the vectorized loop to execute at least once. */ + min_profitable_iters = assumed_vf + peel_iters_prologue; if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, @@ -3683,10 +4252,11 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, min_profitable_estimate = 0; else { - min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf + min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) + * assumed_vf - vec_inside_cost * peel_iters_prologue - vec_inside_cost * peel_iters_epilogue) - / ((scalar_single_iter_cost * vf) + / ((scalar_single_iter_cost * assumed_vf) - vec_inside_cost); } min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters); @@ -3722,10 +4292,13 @@ have_whole_vector_shift (machine_mode mode) if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing) return false; - unsigned int i, nelt = GET_MODE_NUNITS (mode); - auto_vec_perm_indices sel (nelt); + /* Variable-length vectors should be handled via the optab. 
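A worked example of the break-even formula above, using invented costs purely for illustration: with assumed_vf = 4, a scalar iteration cost of 10, a vector body cost of 12, vector/scalar outside-of-loop costs of 20 and 4 and no peeling, the quotient is 64 / 28 = 2, and the follow-up check (80 <= 88) bumps it to 3, so at least 3 scalar iterations are needed before the vector version wins.

    #include <stdio.h>

    /* Break-even computation mirroring the formula above, with made-up
       costs: assumed_vf = 4, scalar iteration cost 10, vector body cost 12,
       outside-of-loop costs 20 (vector) and 4 (scalar), no peeling.  */
    int
    main (void)
    {
      int assumed_vf = 4, scalar_iter = 10, vec_inside = 12;
      int vec_outside = 20, scalar_outside = 4;
      int min_iters = ((vec_outside - scalar_outside) * assumed_vf)
                      / (scalar_iter * assumed_vf - vec_inside); /* 64/28 = 2 */
      if (scalar_iter * assumed_vf * min_iters
          <= (vec_inside * min_iters
              + (vec_outside - scalar_outside) * assumed_vf))    /* 80 <= 88 */
        min_iters++;
      printf ("minimum profitable niters: %d\n", min_iters);     /* 3 */
      return 0;
    }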
*/ + unsigned int nelt; + if (!GET_MODE_NUNITS (mode).is_constant (&nelt)) + return false; - for (i = nelt/2; i >= 1; i/=2) + auto_vec_perm_indices sel (nelt); + for (unsigned int i = nelt / 2; i >= 1; i /= 2) { sel.truncate (0); calc_vec_perm_mask_for_shift (i, nelt, &sel); @@ -3748,7 +4321,7 @@ static void vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code, int ncopies) { - int prologue_cost = 0, epilogue_cost = 0; + int prologue_cost = 0, epilogue_cost = 0, inside_cost; enum tree_code code; optab optab; tree vectype; @@ -3767,13 +4340,11 @@ vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code, target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info)); /* Condition reductions generate two reductions in the loop. */ - if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION) + vect_reduction_type reduction_type + = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info); + if (reduction_type == COND_REDUCTION) ncopies *= 2; - /* Cost of reduction op inside loop. */ - unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt, - stmt_info, 0, vect_body); - vectype = STMT_VINFO_VECTYPE (stmt_info); mode = TYPE_MODE (vectype); orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info); @@ -3783,14 +4354,31 @@ vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code, code = gimple_assign_rhs_code (orig_stmt); - /* Add in cost for initial definition. - For cond reduction we have four vectors: initial index, step, initial - result of the data reduction, initial value of the index reduction. */ - int prologue_stmts = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) - == COND_REDUCTION ? 4 : 1; - prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts, - scalar_to_vec, stmt_info, 0, - vect_prologue); + if (reduction_type == EXTRACT_LAST_REDUCTION + || reduction_type == FOLD_LEFT_REDUCTION) + { + /* No extra instructions needed in the prologue. */ + prologue_cost = 0; + + /* Count NCOPIES FOLD_EXTRACT_LAST operations. */ + inside_cost = add_stmt_cost (target_cost_data, ncopies, vec_to_scalar, + stmt_info, 0, vect_body); + } + else + { + /* Add in cost for initial definition. + For cond reduction we have four vectors: initial index, step, + initial result of the data reduction, initial value of the index + reduction. */ + int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1; + prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts, + scalar_to_vec, stmt_info, 0, + vect_prologue); + + /* Cost of reduction op inside loop. */ + inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt, + stmt_info, 0, vect_body); + } /* Determine cost of epilogue code. @@ -3801,7 +4389,7 @@ vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code, { if (reduc_code != ERROR_MARK) { - if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION) + if (reduction_type == COND_REDUCTION) { /* An EQ stmt and an COND_EXPR stmt. */ epilogue_cost += add_stmt_cost (target_cost_data, 2, @@ -3826,18 +4414,24 @@ vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code, vect_epilogue); } } - else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION) + else if (reduction_type == COND_REDUCTION) { - unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype); + unsigned estimated_nunits = vect_nunits_for_cost (vectype); /* Extraction of scalar elements. 
*/ - epilogue_cost += add_stmt_cost (target_cost_data, 2 * nunits, + epilogue_cost += add_stmt_cost (target_cost_data, + 2 * estimated_nunits, vec_to_scalar, stmt_info, 0, vect_epilogue); /* Scalar max reductions via COND_EXPR / MAX_EXPR. */ - epilogue_cost += add_stmt_cost (target_cost_data, 2 * nunits - 3, + epilogue_cost += add_stmt_cost (target_cost_data, + 2 * estimated_nunits - 3, scalar_stmt, stmt_info, 0, vect_epilogue); } + else if (reduction_type == EXTRACT_LAST_REDUCTION + || reduction_type == FOLD_LEFT_REDUCTION) + /* No extra instructions need in the epilogue. */ + ; else { int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype)); @@ -3967,16 +4561,16 @@ get_initial_def_for_reduction (gimple *stmt, tree init_val, struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); tree scalar_type = TREE_TYPE (init_val); tree vectype = get_vectype_for_scalar_type (scalar_type); - int nunits; + poly_uint64 nunits; enum tree_code code = gimple_assign_rhs_code (stmt); tree def_for_init; tree init_def; - int i; bool nested_in_vect_loop = false; REAL_VALUE_TYPE real_init_val = dconst0; int int_init_val = 0; gimple *def_stmt = NULL; gimple_seq stmts = NULL; + unsigned HOST_WIDE_INT count; gcc_assert (vectype); nunits = TYPE_VECTOR_SUBPARTS (vectype); @@ -4005,6 +4599,9 @@ get_initial_def_for_reduction (gimple *stmt, tree init_val, return vect_create_destination_var (init_val, vectype); } + vect_reduction_type reduction_type + = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo); + /* In case of a nested reduction do not use an adjustment def as that case is not supported by the epilogue generation correctly if ncopies is not one. */ @@ -4049,12 +4646,22 @@ get_initial_def_for_reduction (gimple *stmt, tree init_val, /* Option1: the first element is '0' or '1' as well. */ init_def = gimple_build_vector_from_val (&stmts, vectype, def_for_init); + else if (!nunits.is_constant (&count)) + { + /* Option2 (variable length): the first element is INIT_VAL. */ + init_def = build_vector_from_val (vectype, def_for_init); + gcall *call = gimple_build_call_internal (IFN_VEC_SHL_INSERT, + 2, init_def, init_val); + init_def = make_ssa_name (vectype); + gimple_call_set_lhs (call, init_def); + gimple_seq_add_stmt (&stmts, call); + } else { - /* Option2: the first element is INIT_VAL. */ - auto_vec<tree, 32> elts (nunits); + /* Option2 (constant length): the first element is INIT_VAL. */ + auto_vec<tree, 32> elts (count); elts.quick_push (init_val); - for (i = 1; i < nunits; ++i) + for (unsigned int i = 1; i < count; ++i) elts.quick_push (def_for_init); init_def = gimple_build_vector (&stmts, vectype, elts); } @@ -4068,7 +4675,8 @@ get_initial_def_for_reduction (gimple *stmt, tree init_val, if (adjustment_def) { *adjustment_def = NULL_TREE; - if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo) != COND_REDUCTION) + if (reduction_type != COND_REDUCTION + && reduction_type != EXTRACT_LAST_REDUCTION) { init_def = vect_get_vec_def_for_operand (init_val, stmt); break; @@ -4089,32 +4697,32 @@ get_initial_def_for_reduction (gimple *stmt, tree init_val, } /* Get at the initial defs for the reduction PHIs in SLP_NODE. - NUMBER_OF_VECTORS is the number of vector defs to create. */ + NUMBER_OF_VECTORS is the number of vector defs to create. + If NEUTRAL_OP is nonnull, introducing extra elements of that + value will not change the result. 
*/ static void get_initial_defs_for_reduction (slp_tree slp_node, vec<tree> *vec_oprnds, unsigned int number_of_vectors, - enum tree_code code, bool reduc_chain) + bool reduc_chain, tree neutral_op) { vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node); gimple *stmt = stmts[0]; stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt); - unsigned nunits; + unsigned HOST_WIDE_INT nunits; unsigned j, number_of_places_left_in_vector; - tree vector_type, scalar_type; + tree vector_type; tree vop; int group_size = stmts.length (); unsigned int vec_num, i; unsigned number_of_copies = 1; vec<tree> voprnds; voprnds.create (number_of_vectors); - tree neutral_op = NULL; struct loop *loop; + auto_vec<tree, 16> permute_results; vector_type = STMT_VINFO_VECTYPE (stmt_vinfo); - scalar_type = TREE_TYPE (vector_type); - nunits = TYPE_VECTOR_SUBPARTS (vector_type); gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def); @@ -4122,45 +4730,7 @@ get_initial_defs_for_reduction (slp_tree slp_node, gcc_assert (loop); edge pe = loop_preheader_edge (loop); - /* op is the reduction operand of the first stmt already. */ - /* For additional copies (see the explanation of NUMBER_OF_COPIES below) - we need either neutral operands or the original operands. See - get_initial_def_for_reduction() for details. */ - switch (code) - { - case WIDEN_SUM_EXPR: - case DOT_PROD_EXPR: - case SAD_EXPR: - case PLUS_EXPR: - case MINUS_EXPR: - case BIT_IOR_EXPR: - case BIT_XOR_EXPR: - neutral_op = build_zero_cst (scalar_type); - break; - - case MULT_EXPR: - neutral_op = build_one_cst (scalar_type); - break; - - case BIT_AND_EXPR: - neutral_op = build_all_ones_cst (scalar_type); - break; - - /* For MIN/MAX we don't have an easy neutral operand but - the initial values can be used fine here. Only for - a reduction chain we have to force a neutral element. */ - case MAX_EXPR: - case MIN_EXPR: - if (! reduc_chain) - neutral_op = NULL; - else - neutral_op = PHI_ARG_DEF_FROM_EDGE (stmt, pe); - break; - - default: - gcc_assert (! reduc_chain); - neutral_op = NULL; - } + gcc_assert (!reduc_chain || neutral_op); /* NUMBER_OF_COPIES is the number of times we need to use the same values in created vectors. It is greater than 1 if unrolling is performed. @@ -4178,6 +4748,9 @@ get_initial_defs_for_reduction (slp_tree slp_node, (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and {s5, s6, s7, s8}. */ + if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits)) + nunits = group_size; + number_of_copies = nunits * number_of_vectors / group_size; number_of_places_left_in_vector = nunits; @@ -4204,7 +4777,40 @@ get_initial_defs_for_reduction (slp_tree slp_node, if (number_of_places_left_in_vector == 0) { gimple_seq ctor_seq = NULL; - tree init = gimple_build_vector (&ctor_seq, vector_type, elts); + tree init; + if (must_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits)) + /* Build the vector directly from ELTS. */ + init = gimple_build_vector (&ctor_seq, vector_type, elts); + else if (neutral_op) + { + /* Build a vector of the neutral value and shift the + other elements into place. 
*/ + init = gimple_build_vector_from_val (&ctor_seq, vector_type, + neutral_op); + int k = nunits; + while (k > 0 && elts[k - 1] == neutral_op) + k -= 1; + while (k > 0) + { + k -= 1; + gcall *call = gimple_build_call_internal + (IFN_VEC_SHL_INSERT, 2, init, elts[k]); + init = make_ssa_name (vector_type); + gimple_call_set_lhs (call, init); + gimple_seq_add_stmt (&ctor_seq, call); + } + } + else + { + /* First time round, duplicate ELTS to fill the + required number of vectors, then cherry pick the + appropriate result for each iteration. */ + if (vec_oprnds->is_empty ()) + duplicate_and_interleave (&ctor_seq, vector_type, elts, + number_of_vectors, + permute_results); + init = permute_results[number_of_vectors - j - 1]; + } if (ctor_seq != NULL) gsi_insert_seq_on_edge_immediate (pe, ctor_seq); voprnds.quick_push (init); @@ -4274,6 +4880,8 @@ get_initial_defs_for_reduction (slp_tree slp_node, DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled. SLP_NODE is an SLP node containing a group of reduction statements. The first one in this group is STMT. + NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is + null if this is not an SLP reduction. This function: 1. Creates the reduction def-use cycles: sets the arguments for @@ -4321,7 +4929,8 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt, vec<gimple *> reduction_phis, bool double_reduc, slp_tree slp_node, - slp_instance slp_node_instance) + slp_instance slp_node_instance, + tree neutral_op) { stmt_vec_info stmt_info = vinfo_for_stmt (stmt); stmt_vec_info prev_phi_info; @@ -4357,6 +4966,7 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt, auto_vec<tree> vec_initial_defs; auto_vec<gimple *> phis; bool slp_reduc = false; + bool direct_slp_reduc; tree new_phi_result; gimple *inner_phi = NULL; tree induction_index = NULL_TREE; @@ -4400,8 +5010,9 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt, unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); vec_initial_defs.reserve (vec_num); get_initial_defs_for_reduction (slp_node_instance->reduc_phis, - &vec_initial_defs, vec_num, code, - GROUP_FIRST_ELEMENT (stmt_info)); + &vec_initial_defs, vec_num, + GROUP_FIRST_ELEMENT (stmt_info), + neutral_op); } else { @@ -4480,8 +5091,7 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt, if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION) { tree indx_before_incr, indx_after_incr; - int nunits_out = TYPE_VECTOR_SUBPARTS (vectype); - int k; + poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype); gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info); gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR); @@ -4497,10 +5107,7 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt, vector size (STEP). */ /* Create a {1,2,3,...} vector. */ - auto_vec<tree, 32> vtemp (nunits_out); - for (k = 0; k < nunits_out; ++k) - vtemp.quick_push (build_int_cst (cr_index_scalar_type, k + 1)); - tree series_vect = build_vector (cr_index_vector_type, vtemp); + tree series_vect = build_index_vector (cr_index_vector_type, 1, 1); /* Create a vector of the step value. */ tree step = build_int_cst (cr_index_scalar_type, nunits_out); @@ -4699,6 +5306,12 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt, b2 = operation (b1) */ slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))); + /* True if we should implement SLP_REDUC using native reduction operations + instead of scalar operations.
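The IFN_VEC_SHL_INSERT sequence builds this initial vector without knowing the element count at compile time: start from a splat of the neutral value, then repeatedly shift the whole vector up one lane while inserting the next start value into lane 0, working backwards through ELTS. A scalar model (plain C with invented names, not part of this patch):

    #include <stdio.h>

    #define NELT 4

    /* Scalar model of one IFN_VEC_SHL_INSERT: shift every lane up by one
       and insert VAL into lane 0.  */
    static void
    vec_shl_insert (int *vec, int val)
    {
      for (unsigned int i = NELT - 1; i > 0; i--)
        vec[i] = vec[i - 1];
      vec[0] = val;
    }

    int
    main (void)
    {
      int neutral = 0;              /* neutral element for PLUS */
      int elts[2] = { 5, 7 };       /* start values for two reductions */
      int vec[NELT] = { neutral, neutral, neutral, neutral };

      /* Insert from the back, as the "while (k > 0)" loop does, so that
         elts[0] ends up in lane 0.  */
      for (int k = 1; k >= 0; k--)
        vec_shl_insert (vec, elts[k]);

      for (unsigned int i = 0; i < NELT; i++)
        printf ("%d ", vec[i]);     /* prints 5 7 0 0 */
      printf ("\n");
      return 0;
    }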
*/ + direct_slp_reduc = (reduc_code != ERROR_MARK + && slp_reduc + && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ()); + /* In case of reduction chain, e.g., # a1 = phi <a3, a0> a2 = operation (a1) a3 = operation (a2), we may end up with more than one vector result. Here we reduce them to one vector. */ - if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))) + if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) || direct_slp_reduc) { tree first_vect = PHI_RESULT (new_phis[0]); gassign *new_vec_stmt = NULL; @@ -4881,8 +5494,11 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt, tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result)); tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index)); unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype)); - unsigned HOST_WIDE_INT v_size - = el_size * TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index)); + poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index)); + /* Enforced by vectorizable_reduction, which ensures we have target + support before allowing a conditional reduction on variable-length + vectors. */ + unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant (); tree idx_val = NULL_TREE, val = NULL_TREE; for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size) { @@ -4990,10 +5606,88 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt, scalar_results.safe_push (new_temp); } + else if (direct_slp_reduc) + { + /* Here we create one vector for each of the GROUP_SIZE results, + with the elements for other SLP statements replaced with the + neutral value. We can then do a normal reduction on each vector. */ + + /* Enforced by vectorizable_reduction. */ + gcc_assert (new_phis.length () == 1); + gcc_assert (pow2p_hwi (group_size)); + + slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis; + vec<gimple *> orig_phis = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node); + gimple_seq seq = NULL; + + /* Build a vector {0, 1, 2, ...}, with the same number of elements + and the same element size as VECTYPE. */ + tree index = build_index_vector (vectype, 0, 1); + tree index_type = TREE_TYPE (index); + tree index_elt_type = TREE_TYPE (index_type); + tree mask_type = build_same_sized_truth_vector_type (index_type); + + /* Create a vector that, for each element, identifies which of + the GROUP_SIZE results should use it. */ + tree index_mask = build_int_cst (index_elt_type, group_size - 1); + index = gimple_build (&seq, BIT_AND_EXPR, index_type, index, + build_vector_from_val (index_type, index_mask)); + + /* Get a neutral vector value. This is simply a splat of the neutral + scalar value if we have one, otherwise the initial scalar value + is itself a neutral value. */ + tree vector_identity = NULL_TREE; + if (neutral_op) + vector_identity = gimple_build_vector_from_val (&seq, vectype, + neutral_op); + for (unsigned int i = 0; i < group_size; ++i) + { + /* If there's no universal neutral value, we can use the + initial scalar value from the original PHI. This is used + for MIN and MAX reduction, for example. */ + if (!neutral_op) + { + tree scalar_value + = PHI_ARG_DEF_FROM_EDGE (orig_phis[i], + loop_preheader_edge (loop)); + vector_identity = gimple_build_vector_from_val (&seq, vectype, + scalar_value); + } + + /* Calculate the equivalent of: + + sel[j] = (index[j] == i); + + which selects the elements of NEW_PHI_RESULT that should + be included in the result.
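Concretely, for a two-statement SLP group the lanes of NEW_PHI_RESULT alternate between the two scalar results, and each result is recovered by replacing the other statement's lanes with the identity value before doing a full-vector reduction. A scalar model (plain C with invented names, not part of this patch):

    #include <stdio.h>

    #define NELT 8
    #define GROUP_SIZE 2

    int
    main (void)
    {
      /* Even lanes belong to SLP result 0, odd lanes to result 1,
         matching index[j] & (GROUP_SIZE - 1).  */
      int vec[NELT] = { 1, 10, 2, 20, 3, 30, 4, 40 };
      int neutral = 0;                  /* neutral element for PLUS */

      for (unsigned int i = 0; i < GROUP_SIZE; i++)
        {
          int sum = 0;
          for (unsigned int j = 0; j < NELT; j++)
            {
              /* sel[j] = (index[j] == i); unselected lanes contribute
                 the neutral value instead.  */
              int lane = (j & (GROUP_SIZE - 1)) == i ? vec[j] : neutral;
              sum += lane;
            }
          printf ("result %u = %d\n", i, sum);   /* 10, then 100 */
        }
      return 0;
    }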
*/ + tree compare_val = build_int_cst (index_elt_type, i); + compare_val = build_vector_from_val (index_type, compare_val); + tree sel = gimple_build (&seq, EQ_EXPR, mask_type, + index, compare_val); + + /* Calculate the equivalent of: + + vec = seq ? new_phi_result : vector_identity; + + VEC is now suitable for a full vector reduction. */ + tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype, + sel, new_phi_result, vector_identity); + + /* Do the reduction and convert it to the appropriate type. */ + tree scalar = gimple_build (&seq, reduc_code, + TREE_TYPE (vectype), vec); + scalar = gimple_convert (&seq, scalar_type, scalar); + scalar_results.safe_push (scalar); + } + gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT); + } else { bool reduce_with_shift = have_whole_vector_shift (mode); int element_bitsize = tree_to_uhwi (bitsize); + /* Enforced by vectorizable_reduction, which disallows SLP reductions + for variable-length vectors and also requires direct target support + for loop reductions. */ int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype)); tree vec_temp; @@ -5481,6 +6175,163 @@ vect_finalize_reduction: } } +/* Return a vector of type VECTYPE that is equal to the vector select + operation "MASK ? VEC : IDENTITY". Insert the select statements + before GSI. */ + +static tree +merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype, + tree vec, tree identity) +{ + tree cond = make_temp_ssa_name (vectype, NULL, "cond"); + gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR, + mask, vec, identity); + gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); + return cond; +} + +/* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT is the + statement that sets the live-out value. REDUC_DEF_STMT is the phi + statement. CODE is the operation performed by STMT and OPS are + its scalar operands. REDUC_INDEX is the index of the operand in + OPS that is set by REDUC_DEF_STMT. REDUC_CODE is the code that + implements in-order reduction and VECTYPE_IN is the type of its + vector input. MASKS specifies the masks that should be used to + control the operation in a fully-masked loop. */ + +static bool +vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi, + gimple **vec_stmt, slp_tree slp_node, + gimple *reduc_def_stmt, + tree_code code, tree_code reduc_code, + tree ops[3], tree vectype_in, + int reduc_index, vec_loop_masks *masks) +{ + stmt_vec_info stmt_info = vinfo_for_stmt (stmt); + loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); + tree vectype_out = STMT_VINFO_VECTYPE (stmt_info); + gimple *new_stmt = NULL; + + int ncopies; + if (slp_node) + ncopies = 1; + else + ncopies = vect_get_num_copies (loop_vinfo, vectype_in); + + gcc_assert (!nested_in_vect_loop_p (loop, stmt)); + gcc_assert (ncopies == 1); + gcc_assert (TREE_CODE_LENGTH (code) == binary_op); + gcc_assert (reduc_index == (code == MINUS_EXPR ? 
0 : 1)); + gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) + == FOLD_LEFT_REDUCTION); + + if (slp_node) + gcc_assert (must_eq (TYPE_VECTOR_SUBPARTS (vectype_out), + TYPE_VECTOR_SUBPARTS (vectype_in))); + + tree op0 = ops[1 - reduc_index]; + + int group_size = 1; + gimple *scalar_dest_def; + auto_vec<tree> vec_oprnds0; + if (slp_node) + { + vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL, slp_node); + group_size = SLP_TREE_SCALAR_STMTS (slp_node).length (); + scalar_dest_def = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]; + } + else + { + tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt); + vec_oprnds0.create (1); + vec_oprnds0.quick_push (loop_vec_def0); + scalar_dest_def = stmt; + } + + tree scalar_dest = gimple_assign_lhs (scalar_dest_def); + tree scalar_type = TREE_TYPE (scalar_dest); + tree reduc_var = gimple_phi_result (reduc_def_stmt); + + int vec_num = vec_oprnds0.length (); + gcc_assert (vec_num == 1 || slp_node); + tree vec_elem_type = TREE_TYPE (vectype_out); + gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type)); + + tree vector_identity = NULL_TREE; + if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) + vector_identity = build_zero_cst (vectype_out); + + int i; + tree def0; + FOR_EACH_VEC_ELT (vec_oprnds0, i, def0) + { + tree mask = NULL_TREE; + if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) + mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i); + + /* Handle MINUS by adding the negative. */ + if (code == MINUS_EXPR) + { + tree negated = make_ssa_name (vectype_out); + new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0); + gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); + def0 = negated; + } + + if (mask) + def0 = merge_with_identity (gsi, mask, vectype_out, def0, + vector_identity); + + /* On the first iteration the input is simply the scalar phi + result, and for subsequent iterations it is the output of + the preceding operation. */ + tree expr = build2 (reduc_code, scalar_type, reduc_var, def0); + + /* For chained SLP reductions the output of the previous reduction + operation serves as the input of the next. For the final statement + the output cannot be a temporary - we reuse the original + scalar destination of the last statement. */ + if (i == vec_num - 1) + reduc_var = scalar_dest; + else + reduc_var = vect_create_destination_var (scalar_dest, NULL); + new_stmt = gimple_build_assign (reduc_var, expr); + + if (i == vec_num - 1) + { + SSA_NAME_DEF_STMT (reduc_var) = new_stmt; + /* For chained SLP stmt is the first statement in the group and + gsi points to the last statement in the group. For non SLP stmt + points to the same location as gsi. */ + if (scalar_dest_def == gsi_stmt (*gsi)) + vect_finish_replace_stmt (scalar_dest_def, new_stmt); + else + { + /* In this case we're moving the definition to later in the + block. That doesn't matter because the only uses of the + lhs are in phi statements. */ + gimple_stmt_iterator old_gsi = gsi_for_stmt (scalar_dest_def); + gsi_remove (&old_gsi, true); + vect_finish_stmt_generation (stmt, new_stmt, gsi); + } + } + else + { + reduc_var = make_ssa_name (reduc_var, new_stmt); + gimple_assign_set_lhs (new_stmt, reduc_var); + vect_finish_stmt_generation (stmt, new_stmt, gsi); + } + + if (slp_node) + SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt); + } + + if (!slp_node) + STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt; + + return true; +} /* Function is_nonwrapping_integer_induction. 
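The reason this path chains the operations strictly in order is that floating-point addition is not associative, so the tree-shaped epilogue used by ordinary reductions would compute a different result. A small demonstration (plain C, not part of this patch):

    #include <stdio.h>

    #define NELT 4

    int
    main (void)
    {
      double vec[NELT] = { 1.0e16, 1.0, -1.0e16, 1.0 };

      /* In-order (fold-left): (((0 + v0) + v1) + v2) + v3, which is the
         association FOLD_LEFT_REDUCTION must preserve.  */
      double in_order = 0.0;
      for (unsigned int i = 0; i < NELT; i++)
        in_order += vec[i];

      /* A reassociated tree reduction: (v0 + v2) + (v1 + v3).  */
      double reassoc = (vec[0] + vec[2]) + (vec[1] + vec[3]);

      /* Prints 1 and 2: reassociation changes the rounding.  */
      printf ("in-order %g, reassociated %g\n", in_order, reassoc);
      return 0;
    }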
@@ -5660,10 +6511,21 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi, return true; } + if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION) + /* Leave the scalar phi in place. Note that checking + STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works + for reductions involving a single statement. */ + return true; + gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info); if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt))) reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt)); + if (STMT_VINFO_VEC_REDUCTION_TYPE (vinfo_for_stmt (reduc_stmt)) + == EXTRACT_LAST_REDUCTION) + /* Leave the scalar phi in place. */ + return true; + gcc_assert (is_gimple_assign (reduc_stmt)); for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k) { @@ -5673,10 +6535,10 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi, if (k == 1 && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR) continue; - tem = get_vectype_for_scalar_type (TREE_TYPE (op)); - if (! vectype_in - || TYPE_VECTOR_SUBPARTS (tem) < TYPE_VECTOR_SUBPARTS (vectype_in)) - vectype_in = tem; + if (!vectype_in + || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in))) + < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op))))) + vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op)); break; } gcc_assert (vectype_in); @@ -5703,9 +6565,10 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi, if (slp_node) /* The size vect_schedule_slp_instance computes is off for us. */ - vec_num = ((LOOP_VINFO_VECT_FACTOR (loop_vinfo) - * SLP_TREE_SCALAR_STMTS (slp_node).length ()) - / TYPE_VECTOR_SUBPARTS (vectype_in)); + vec_num = vect_get_num_vectors + (LOOP_VINFO_VECT_FACTOR (loop_vinfo) + * SLP_TREE_SCALAR_STMTS (slp_node).length (), + vectype_in); else vec_num = 1; @@ -5841,7 +6704,8 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi, /* To properly compute ncopies we are interested in the widest input type in case we're looking at a widening accumulation. */ if (!vectype_in - || TYPE_VECTOR_SUBPARTS (vectype_in) > TYPE_VECTOR_SUBPARTS (tem)) + || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in))) + < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))) vectype_in = tem; } @@ -5880,6 +6744,14 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi, directly used in stmt. */ if (reduc_index == -1) { + if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "in-order reduction chain without SLP.\n"); + return false; + } + if (orig_stmt) reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info); else @@ -5914,16 +6786,6 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi, /* If we have a condition reduction, see if we can simplify it further. */ if (v_reduc_type == COND_REDUCTION) { - if (cond_reduc_dt == vect_induction_def) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, - "condition expression based on " - "integer induction.\n"); - STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) - = INTEGER_INDUC_COND_REDUCTION; - } - /* Loop peeling modifies initial value of reduction PHI, which makes the reduction stmt to be transformed different to the original stmt analyzed.
We need to record reduction code for @@ -5936,6 +6798,24 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi, gcc_assert (cond_reduc_dt == vect_constant_def); STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION; } + else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, + vectype_in, OPTIMIZE_FOR_SPEED)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "optimizing condition reduction with" + " FOLD_EXTRACT_LAST.\n"); + STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION; + } + else if (cond_reduc_dt == vect_induction_def) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "optimizing condition reduction based on " + "integer induction.\n"); + STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) + = INTEGER_INDUC_COND_REDUCTION; + } else if (cond_reduc_dt == vect_constant_def) { enum vect_def_type cond_initial_dt; @@ -5988,6 +6868,7 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi, gcc_assert (ncopies >= 1); vec_mode = TYPE_MODE (vectype_in); + poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out); if (code == COND_EXPR) { @@ -6033,7 +6914,7 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi, if (dump_enabled_p ()) dump_printf (MSG_NOTE, "op not supported by target.\n"); - if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD + if (may_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD) || !vect_worthwhile_without_simd_p (loop_vinfo, code)) return false; @@ -6088,12 +6969,14 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi, (and also the same tree-code) when generating the epilog code and when generating the code inside the loop. */ - if (orig_stmt) + vect_reduction_type reduction_type + = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info); + if (orig_stmt + && (reduction_type == TREE_CODE_REDUCTION + || reduction_type == FOLD_LEFT_REDUCTION)) { /* This is a reduction pattern: get the vectype from the type of the reduction variable, and get the tree-code from orig_stmt. */ - gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) - == TREE_CODE_REDUCTION); orig_code = gimple_assign_rhs_code (orig_stmt); gcc_assert (vectype_out); vec_mode = TYPE_MODE (vectype_out); @@ -6109,13 +6992,12 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi, /* For simple condition reductions, replace with the actual expression we want to base our reduction around. 
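For reference, the scalar pattern behind a condition reduction keeps a value from the latest iteration whose condition held; both the FOLD_EXTRACT_LAST and the integer-induction strategies selected here implement this same semantics. A minimal example (plain C, not part of this patch):

    #include <stdio.h>

    #define N 8

    int
    main (void)
    {
      int a[N] = { 3, 7, 2, 9, 4, 9, 1, 2 };
      int last = -1;

      /* LAST records the most recent iteration whose condition held;
         failing iterations leave it untouched.  */
      for (int i = 0; i < N; i++)
        if (a[i] > 4)
          last = i;

      printf ("last index with a[i] > 4: %d\n", last);   /* prints 5 */
      return 0;
    }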
*/ - if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == CONST_COND_REDUCTION) + if (reduction_type == CONST_COND_REDUCTION) { orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info); gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR); } - else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) - == INTEGER_INDUC_COND_REDUCTION) + else if (reduction_type == INTEGER_INDUC_COND_REDUCTION) orig_code = MAX_EXPR; } @@ -6137,12 +7019,23 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi, epilog_reduc_code = ERROR_MARK; - if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != COND_REDUCTION) + if (reduction_type == TREE_CODE_REDUCTION + || reduction_type == FOLD_LEFT_REDUCTION + || reduction_type == INTEGER_INDUC_COND_REDUCTION + || reduction_type == CONST_COND_REDUCTION) { - if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code)) + bool have_reduc_support; + if (reduction_type == FOLD_LEFT_REDUCTION) + have_reduc_support = fold_left_reduction_code (orig_code, vectype_out, + &epilog_reduc_code); + else + have_reduc_support + = reduction_code_for_scalar_code (orig_code, &epilog_reduc_code); + + if (have_reduc_support) { reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out, - optab_default); + optab_default); if (!reduc_optab) { if (dump_enabled_p ()) @@ -6172,13 +7065,13 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi, } } } - else + else if (reduction_type == COND_REDUCTION) { int scalar_precision = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type)); cr_index_scalar_type = make_unsigned_type (scalar_precision); - cr_index_vector_type = build_vector_type - (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype_out)); + cr_index_vector_type = build_vector_type (cr_index_scalar_type, + nunits_out); optab = optab_for_tree_code (REDUC_MAX_EXPR, cr_index_vector_type, optab_default); @@ -6187,8 +7080,18 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi, epilog_reduc_code = REDUC_MAX_EXPR; } - if ((double_reduc - || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != TREE_CODE_REDUCTION) + if (reduction_type != EXTRACT_LAST_REDUCTION + && epilog_reduc_code == ERROR_MARK + && !nunits_out.is_constant ()) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "missing target support for reduction on" + " variable-length vectors.\n"); + return false; + } + + if ((double_reduc || reduction_type != TREE_CODE_REDUCTION) && ncopies > 1) { if (dump_enabled_p ()) @@ -6198,6 +7101,101 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi, return false; } + /* For SLP reductions, see if there is a neutral value we can use. */ + tree neutral_op = NULL_TREE; + if (slp_node) + neutral_op + = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis, code, + GROUP_FIRST_ELEMENT (stmt_info) != NULL); + + /* For double reductions, and for SLP reductions with a neutral value, + we construct a variable-length initial vector by loading a vector + full of the neutral value and then shift-and-inserting the start + values into the low-numbered elements. 
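In the COND_REDUCTION epilogue, the REDUC_MAX_EXPR chosen above runs on the vector of induction indices rather than on the data; the data lane whose index equals that maximum then supplies the scalar result. A scalar model of the two steps (plain C with invented names, not part of this patch):

    #include <stdio.h>

    #define NELT 4

    int
    main (void)
    {
      /* State after the vector loop: DATA holds candidate values and
         IND the 1-based iteration at which each lane last matched,
         0 if it never matched.  */
      int data[NELT] = { 7, 0, 9, 0 };
      unsigned int ind[NELT] = { 5, 0, 6, 0 };

      /* REDUC_MAX over the index vector finds the latest match...  */
      unsigned int max_ind = 0;
      for (unsigned int i = 0; i < NELT; i++)
        if (ind[i] > max_ind)
          max_ind = ind[i];

      /* ...and the matching data lane gives the reduction result.  */
      int result = 0;
      for (unsigned int i = 0; i < NELT; i++)
        if (ind[i] == max_ind)
          result = data[i];

      printf ("result = %d (iteration %u)\n", result, max_ind);
      return 0;
    }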
*/ + if ((double_reduc || neutral_op) + && !nunits_out.is_constant () + && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT, + vectype_out, OPTIMIZE_FOR_SPEED)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "reduction on variable-length vectors requires" + " target support for a vector-shift-and-insert" + " operation.\n"); + return false; + } + + /* Check extra constraints for variable-length unchained SLP reductions. */ + if (STMT_SLP_TYPE (stmt_info) + && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) + && !nunits_out.is_constant ()) + { + /* We checked above that we could build the initial vector when + there's a neutral element value. Check here for the case in + which each SLP statement has its own initial value and in which + that value needs to be repeated for every instance of the + statement within the initial vector. */ + unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length (); + scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out)); + if (!neutral_op + && !can_duplicate_and_interleave_p (group_size, elt_mode)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "unsupported form of SLP reduction for" + " variable-length vectors: cannot build" + " initial vector.\n"); + return false; + } + /* The epilogue code relies on the number of elements being a multiple + of the group size. The duplicate-and-interleave approach to setting + up the initial vector does too. */ + if (!multiple_p (nunits_out, group_size)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "unsupported form of SLP reduction for" + " variable-length vectors: the vector size" + " is not a multiple of the number of results.\n"); + return false; + } + } + + if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION) + { + /* We can't support in-order reductions of code such as this: + + for (int i = 0; i < n1; ++i) + for (int j = 0; j < n2; ++j) + l += a[j]; + + since GCC effectively transforms the loop when vectorizing: + + for (int i = 0; i < n1 / VF; ++i) + for (int j = 0; j < n2; ++j) + for (int k = 0; k < VF; ++k) + l += a[j]; + + which is a reassociation of the original operation. */ + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "in-order double reduction not supported.\n"); + + return false; + } + + if (reduction_type == FOLD_LEFT_REDUCTION + && slp_node + && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))) + { + /* We cannot use in-order reductions in this case because there is + an implicit reassociation of the operations involved. */ + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "in-order unchained SLP reductions not supported.\n"); + return false; + } + /* In case of widening multiplication by a constant, we update the type of the constant to be the type of the other operand. We check that the constant fits the type in the pattern recognition pass.
*/ @@ -6218,7 +7216,7 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi, } } - if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION) + if (reduction_type == COND_REDUCTION) { widest_int ni; @@ -6304,10 +7302,49 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi, return false; } + if (slp_node) + vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); + else + vec_num = 1; + + internal_fn cond_fn = get_conditional_internal_fn (code, scalar_type); + + /* In a speculative loop, the update must be predicated on the + nonspeculative masks, so that we don't include speculatively + loaded elements from beyond the end of the original loop. */ + vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo); + if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)) + masks = &LOOP_VINFO_NONSPECULATIVE_MASKS (loop_vinfo); + if (!vec_stmt) /* transformation not required. */ { if (first_p) vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies); + if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)) + { + if (reduction_type != FOLD_LEFT_REDUCTION + && (cond_fn == IFN_LAST + || !direct_internal_fn_supported_p (cond_fn, vectype_in, + OPTIMIZE_FOR_SPEED))) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "can't use a fully-masked loop because no" + " conditional operation is available.\n"); + LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false; + } + else if (reduc_index == -1) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "can't use a fully-masked loop for chained" + " reductions.\n"); + LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false; + } + else + vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num, + vectype_in); + } STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type; return true; } @@ -6321,16 +7358,33 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi, if (code == COND_EXPR) gcc_assert (ncopies == 1); + bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo); + + gimple_stmt_iterator nonspeculative_gsi + = gsi_end (LOOP_VINFO_NONSPECULATIVE_SEQ (loop_vinfo)); + if (masked_loop_p + && masks == &LOOP_VINFO_NONSPECULATIVE_MASKS (loop_vinfo)) + gsi = &nonspeculative_gsi; + + if (reduction_type == FOLD_LEFT_REDUCTION) + return vectorize_fold_left_reduction + (stmt, gsi, vec_stmt, slp_node, reduc_def_stmt, code, + epilog_reduc_code, ops, vectype_in, reduc_index, masks); + + if (reduction_type == EXTRACT_LAST_REDUCTION) + { + gcc_assert (!slp_node); + return vectorizable_condition (stmt, gsi, vec_stmt, + NULL, reduc_index, NULL); + } + /* Create the destination vector */ vec_dest = vect_create_destination_var (scalar_dest, vectype_out); prev_stmt_info = NULL; prev_phi_info = NULL; - if (slp_node) - vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); - else + if (!slp_node) { - vec_num = 1; vec_oprnds0.create (1); vec_oprnds1.create (1); if (op_type == ternary_op) @@ -6404,19 +7458,19 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi, gcc_assert (reduc_index != -1 || ! 
single_defuse_cycle); if (single_defuse_cycle && reduc_index == 0) - vec_oprnds0[0] = gimple_assign_lhs (new_stmt); + vec_oprnds0[0] = gimple_get_lhs (new_stmt); else vec_oprnds0[0] = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]); if (single_defuse_cycle && reduc_index == 1) - vec_oprnds1[0] = gimple_assign_lhs (new_stmt); + vec_oprnds1[0] = gimple_get_lhs (new_stmt); else vec_oprnds1[0] = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]); if (op_type == ternary_op) { if (single_defuse_cycle && reduc_index == 2) - vec_oprnds2[0] = gimple_assign_lhs (new_stmt); + vec_oprnds2[0] = gimple_get_lhs (new_stmt); else vec_oprnds2[0] = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]); @@ -6427,13 +7481,33 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi, FOR_EACH_VEC_ELT (vec_oprnds0, i, def0) { tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE }; - if (op_type == ternary_op) - vop[2] = vec_oprnds2[i]; + if (masked_loop_p) + { + /* Make sure that the reduction accumulator is vop[0]. */ + if (reduc_index == 1) + { + gcc_assert (commutative_tree_code (code)); + std::swap (vop[0], vop[1]); + } + tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies, + vectype_in, i * ncopies + j); + gcall *call = gimple_build_call_internal (cond_fn, 3, mask, + vop[0], vop[1]); + new_temp = make_ssa_name (vec_dest, call); + gimple_call_set_lhs (call, new_temp); + gimple_call_set_nothrow (call, true); + new_stmt = call; + } + else + { + if (op_type == ternary_op) + vop[2] = vec_oprnds2[i]; - new_temp = make_ssa_name (vec_dest, new_stmt); - new_stmt = gimple_build_assign (new_temp, code, - vop[0], vop[1], vop[2]); - vect_finish_stmt_generation (stmt, new_stmt, gsi); + new_temp = make_ssa_name (vec_dest, new_stmt); + new_stmt = gimple_build_assign (new_temp, code, + vop[0], vop[1], vop[2]); + } + vect_finish_stmt_generation (stmt, new_stmt, gsi); if (slp_node) { @@ -6458,12 +7532,13 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi, /* Finalize the reduction-phi (set its arguments) and create the epilog reduction code. */ if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node) - vect_defs[0] = gimple_assign_lhs (*vec_stmt); + vect_defs[0] = gimple_get_lhs (*vec_stmt); vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt, epilog_copies, epilog_reduc_code, phis, - double_reduc, slp_node, slp_node_instance); + double_reduc, slp_node, slp_node_instance, + neutral_op); return true; } @@ -6473,7 +7548,7 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi, For a loop where we could vectorize the operation indicated by CODE, return the minimum vectorization factor that makes it worthwhile to use generic vectors. 
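The conditional internal-function call built in the masked-loop path above has simple per-lane semantics: lanes whose mask bit is clear pass the accumulator through unchanged, so masked-off iterations cannot perturb the reduction. A scalar model of an IFN_COND_ADD-style update (plain C, not part of this patch):

    #include <stdio.h>

    #define NELT 4

    int
    main (void)
    {
      int acc[NELT] = { 1, 2, 3, 4 };      /* reduction accumulator */
      int val[NELT] = { 10, 20, 30, 40 };  /* this iteration's values */
      _Bool mask[NELT] = { 1, 1, 0, 0 };   /* only two lanes active */

      /* Model of "cond_fn (mask, acc, val)".  */
      for (unsigned int i = 0; i < NELT; i++)
        acc[i] = mask[i] ? acc[i] + val[i] : acc[i];

      for (unsigned int i = 0; i < NELT; i++)
        printf ("%d ", acc[i]);            /* prints 11 22 3 4 */
      printf ("\n");
      return 0;
    }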
*/ -int +static unsigned int vect_min_worthwhile_factor (enum tree_code code) { switch (code) @@ -6502,9 +7577,10 @@ bool vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code) { loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo); + unsigned HOST_WIDE_INT value; return (loop_vinfo - && (LOOP_VINFO_VECT_FACTOR (loop_vinfo) - >= vect_min_worthwhile_factor (code))); + && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value) + && value >= vect_min_worthwhile_factor (code)); } /* Function vectorizable_induction @@ -6534,7 +7610,7 @@ vectorizable_induction (gimple *phi, gphi *induction_phi; tree induc_def, vec_dest; tree init_expr, step_expr; - int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); + tree vf = LOOP_VINFO_CAP (loop_vinfo).niters; unsigned i; tree expr; gimple_seq stmts; @@ -6557,7 +7633,7 @@ vectorizable_induction (gimple *phi, return false; tree vectype = STMT_VINFO_VECTYPE (stmt_info); - unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype); + poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); if (slp_node) ncopies = 1; @@ -6622,6 +7698,16 @@ vectorizable_induction (gimple *phi, iv_loop = loop; gcc_assert (iv_loop == (gimple_bb (phi))->loop_father); + if (slp_node && !nunits.is_constant ()) + { + /* The current SLP code creates the initial value element-by-element. */ + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "SLP induction not supported for variable-length" + " vectors.\n"); + return false; + } + if (!vec_stmt) /* transformation not required. */ { STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type; @@ -6652,9 +7738,28 @@ vectorizable_induction (gimple *phi, init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (iv_loop)); - /* Convert the step to the desired type. */ + /* Convert the initial value and step to the desired type. */ stmts = NULL; + init_expr = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr); step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr); + + /* If we are using the loop mask to "peel" for alignment then we need + to adjust the start value here. */ + tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo); + if (skip_niters != NULL_TREE) + { + if (FLOAT_TYPE_P (vectype)) + skip_niters = gimple_build (&stmts, FLOAT_EXPR, TREE_TYPE (vectype), + skip_niters); + else + skip_niters = gimple_convert (&stmts, TREE_TYPE (vectype), + skip_niters); + tree skip_step = gimple_build (&stmts, MULT_EXPR, TREE_TYPE (vectype), + skip_niters, step_expr); + init_expr = gimple_build (&stmts, MINUS_EXPR, TREE_TYPE (vectype), + init_expr, skip_step); + } + if (stmts) { new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); @@ -6670,6 +7775,9 @@ vectorizable_induction (gimple *phi, [VF*S, VF*S, VF*S, VF*S] for all. */ if (slp_node) { + /* Enforced above. */ + unsigned int const_nunits = nunits.to_constant (); + /* Convert the init to the desired type. */ stmts = NULL; init_expr = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr); @@ -6681,12 +7789,9 @@ vectorizable_induction (gimple *phi, /* Generate [VF*S, VF*S, ... ]. */ if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))) - { - expr = build_int_cst (integer_type_node, vf); - expr = fold_convert (TREE_TYPE (step_expr), expr); - } + expr = fold_convert (TREE_TYPE (step_expr), vf); else - expr = build_int_cst (TREE_TYPE (step_expr), vf); + expr = vf; new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr), expr, step_expr); if (! CONSTANT_CLASS_P (new_name)) @@ -6698,19 +7803,20 @@ vectorizable_induction (gimple *phi, /* Now generate the IVs. 
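The LOOP_VINFO_MASK_SKIP_NITERS adjustment above amounts to simple arithmetic: starting the IV at INIT - SKIP_NITERS * STEP makes the first unmasked lane see exactly the original initial value. A sketch (plain C with invented names, not part of this patch):

    #include <stdio.h>

    int
    main (void)
    {
      /* Masking "peels" the first SKIP_NITERS lanes of the first vector
         iteration, so pre-bias the IV start value by that many steps.  */
      int init = 100, step = 3, skip_niters = 2;
      int adjusted = init - skip_niters * step;

      for (int lane = 0; lane < 4; lane++)
        printf ("lane %d: iv = %d%s\n", lane, adjusted + lane * step,
                lane < skip_niters ? " (masked out)" : "");
      /* Lane 2, the first active one, sees iv = 100 = init.  */
      return 0;
    }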
*/ unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length (); unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); - unsigned elts = nunits * nvects; - unsigned nivs = least_common_multiple (group_size, nunits) / nunits; + unsigned elts = const_nunits * nvects; + unsigned nivs = least_common_multiple (group_size, + const_nunits) / const_nunits; gcc_assert (elts % group_size == 0); tree elt = init_expr; unsigned ivn; for (ivn = 0; ivn < nivs; ++ivn) { - auto_vec<tree, 32> elts (nunits); + auto_vec<tree, 32> elts (const_nunits); stmts = NULL; - for (unsigned eltn = 0; eltn < nunits; ++eltn) + for (unsigned eltn = 0; eltn < const_nunits; ++eltn) { - if (ivn*nunits + eltn >= group_size - && (ivn*nunits + eltn) % group_size == 0) + if (ivn*const_nunits + eltn >= group_size + && (ivn * const_nunits + eltn) % group_size == 0) elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt), elt, step_expr); elts.quick_push (elt); @@ -6747,7 +7853,7 @@ vectorizable_induction (gimple *phi, if (ivn < nvects) { unsigned vfp - = least_common_multiple (group_size, nunits) / group_size; + = least_common_multiple (group_size, const_nunits) / group_size; /* Generate [VF'*S, VF'*S, ... ]. */ if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))) { @@ -6822,18 +7928,45 @@ vectorizable_induction (gimple *phi, stmts = NULL; new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr); - auto_vec<tree, 32> elts (nunits); - elts.quick_push (new_name); - for (i = 1; i < nunits; i++) + unsigned HOST_WIDE_INT const_nunits; + if (nunits.is_constant (&const_nunits)) { - /* Create: new_name_i = new_name + step_expr */ - new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name), - new_name, step_expr); + auto_vec<tree, 32> elts (const_nunits); elts.quick_push (new_name); + for (i = 1; i < const_nunits; i++) + { + /* Create: new_name_i = new_name + step_expr */ + new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name), + new_name, step_expr); + elts.quick_push (new_name); + } + /* Create a vector from [new_name_0, new_name_1, ..., + new_name_nunits-1] */ + vec_init = gimple_build_vector (&stmts, vectype, elts); } - /* Create a vector from [new_name_0, new_name_1, ..., - new_name_nunits-1] */ - vec_init = gimple_build_vector (&stmts, vectype, elts); + else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr))) + /* Build the initial value directly from a VEC_SERIES_EXPR. */ + vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype, + new_name, step_expr); + else + { + /* Build: + [base, base, base, ...] + + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */ + gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))); + gcc_assert (flag_associative_math); + tree index = build_index_vector (vectype, 0, 1); + tree base_vec = gimple_build_vector_from_val (&stmts, vectype, + new_name); + tree step_vec = gimple_build_vector_from_val (&stmts, vectype, + step_expr); + vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index); + vec_init = gimple_build (&stmts, MULT_EXPR, vectype, + vec_init, step_vec); + vec_init = gimple_build (&stmts, PLUS_EXPR, vectype, + vec_init, base_vec); + } + if (stmts) { new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); @@ -6841,39 +7974,58 @@ vectorizable_induction (gimple *phi, } } - /* Create the vector that holds the step of the induction. */ - if (nested_in_vect_loop) - /* iv_loop is nested in the loop to be vectorized. 
Generate: - vec_step = [S, S, S, S] */ - new_name = step_expr; - else + if (LOOP_VINFO_FIRSTFAULTING_EXECUTION (loop_vinfo)) { - /* iv_loop is the loop to be vectorized. Generate: - vec_step = [VF*S, VF*S, VF*S, VF*S] */ + bool insert_after; + standard_iv_increment_position (loop, &si, &insert_after); + gcc_assert (!insert_after); + + t = LOOP_VINFO_NONFAULTING (loop_vinfo).niters; + tree step_type = TREE_TYPE (step_expr); gimple_seq seq = NULL; - if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))) - { - expr = build_int_cst (integer_type_node, vf); - expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr); - } + + /* Create scalar version of step. */ + t = gimple_convert (&seq, step_type, t); + + /* Create vector version of step. */ + new_vec = gimple_build_vector_from_val (&seq, vectype, t); + gsi_insert_seq_before (&si, seq, GSI_SAME_STMT); + vec_step = vect_init_vector (phi, new_vec, vectype, &si); + } + else + { + si = gsi_after_labels (bb); + + /* Create the vector that holds the step of the induction. */ + if (nested_in_vect_loop) + /* iv_loop is nested in the loop to be vectorized. Generate: + vec_step = [S, S, S, S] */ + new_name = step_expr; else - expr = build_int_cst (TREE_TYPE (step_expr), vf); - new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr), - expr, step_expr); - if (seq) { - new_bb = gsi_insert_seq_on_edge_immediate (pe, seq); - gcc_assert (!new_bb); + /* iv_loop is the loop to be vectorized. Generate: + vec_step = [VF*S, VF*S, VF*S, VF*S] */ + gimple_seq seq = NULL; + if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))) + expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), vf); + else + expr = gimple_convert (&seq, TREE_TYPE (step_expr), vf); + new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr), + expr, step_expr); + if (seq) + { + new_bb = gsi_insert_seq_on_edge_immediate (pe, seq); + gcc_assert (!new_bb); + } } - } - - t = unshare_expr (new_name); - gcc_assert (CONSTANT_CLASS_P (new_name) - || TREE_CODE (new_name) == SSA_NAME); - new_vec = build_vector_from_val (vectype, t); - vec_step = vect_init_vector (phi, new_vec, vectype, NULL); + t = unshare_expr (new_name); + gcc_assert (CONSTANT_CLASS_P (new_name) + || TREE_CODE (new_name) == SSA_NAME); + new_vec = build_vector_from_val (vectype, t); + vec_step = vect_init_vector (phi, new_vec, vectype, NULL); + } /* Create the following def-use cycle: loop prolog: @@ -6919,6 +8071,9 @@ vectorizable_induction (gimple *phi, /* FORNOW. This restriction should be relaxed. */ gcc_assert (!nested_in_vect_loop); + if (LOOP_VINFO_FIRSTFAULTING_EXECUTION (loop_vinfo)) + si = gsi_after_labels (bb); + /* Create the vector that holds the step of the induction. 
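For the variable-length floating-point case above, the initial vector is [base, base, ...] + (vectype) [0, 1, 2, ...] * [step, step, ...], i.e. element I holds base + I * step. A scalar model (plain C, not part of this patch):

    #include <stdio.h>

    #define NELT 4

    int
    main (void)
    {
      double base = 0.5, step = 0.25;
      double vec_init[NELT];

      /* Element I of the IV's initial vector is base + I * step.  */
      for (unsigned int i = 0; i < NELT; i++)
        vec_init[i] = base + (double) i * step;

      for (unsigned int i = 0; i < NELT; i++)
        printf ("%g ", vec_init[i]);   /* prints 0.5 0.75 1 1.25 */
      printf ("\n");
      return 0;
    }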
*/ if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))) { @@ -7024,10 +8179,12 @@ vectorizable_live_operation (gimple *stmt, imm_use_iterator imm_iter; tree lhs, lhs_type, bitsize, vec_bitsize; tree vectype = STMT_VINFO_VECTYPE (stmt_info); - int nunits = TYPE_VECTOR_SUBPARTS (vectype); + poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); int ncopies; gimple *use_stmt; auto_vec<tree> vec_oprnds; + int vec_entry = 0; + poly_uint64 vec_index = 0; gcc_assert (STMT_VINFO_LIVE_P (stmt_info)); @@ -7056,9 +8213,102 @@ vectorizable_live_operation (gimple *stmt, else ncopies = vect_get_num_copies (loop_vinfo, vectype); + if (slp_node) + { + gcc_assert (slp_index >= 0); + + int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length (); + int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); + + /* Get the last occurrence of the scalar index from the concatenation of + all the slp vectors. Calculate which slp vector it is and the index + within. */ + poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index; + + /* Calculate which vector contains the result, and which lane of + that vector we need. */ + if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "Cannot determine which vector holds the" + " final result.\n"); + return false; + } + } + + if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)) + { + /* Need to construct the type because on the checking stage we don't + yet have the speculative exit phi. */ + tree mask_type = build_same_sized_truth_vector_type (vectype); + + if (!direct_internal_fn_supported_p (IFN_BREAK_AFTER, mask_type, + OPTIMIZE_FOR_SPEED)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "not vectorized: break after not supported.\n"); + return false; + } + if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype, + OPTIMIZE_FOR_SPEED)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "not vectorized: extract last not supported.\n"); + return false; + } + if (ncopies > 1) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "not vectorized: ncopies is greater than 1.\n"); + return false; + } + } + if (!vec_stmt) - /* No transformation required. */ - return true; + { + /* No transformation required. */ + if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)) + { + if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype, + OPTIMIZE_FOR_SPEED)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "can't use a fully-masked loop because " + "the target doesn't support extract last " + "reduction.\n"); + LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false; + } + else if (slp_node) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "can't use a fully-masked loop because an " + "SLP statement is live after the loop.\n"); + LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false; + } + else if (ncopies > 1) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "can't use a fully-masked loop because" + " ncopies is greater than 1.\n"); + LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false; + } + else + { + gcc_assert (ncopies == 1 && !slp_node); + vect_record_loop_mask (loop_vinfo, + &LOOP_VINFO_MASKS (loop_vinfo), + 1, vectype); + } + } + return true; + } /* If stmt has a related stmt, then use that for getting the lhs. 
*/ if (is_pattern_stmt_p (stmt_info)) @@ -7077,17 +8327,7 @@ vectorizable_live_operation (gimple *stmt, tree vec_lhs, bitstart; if (slp_node) { - gcc_assert (slp_index >= 0); - - int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length (); - int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); - - /* Get the last occurrence of the scalar index from the concatenation of - all the slp vectors. Calculate which slp vector it is and the index - within. */ - int pos = (num_vec * nunits) - num_scalar + slp_index; - int vec_entry = pos / nunits; - int vec_index = pos % nunits; + gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)); /* Get the correct slp vectorized stmt. */ vec_lhs = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[vec_entry]); @@ -7100,6 +8340,8 @@ vectorizable_live_operation (gimple *stmt, { enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info); vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt); + gcc_checking_assert (ncopies == 1 + || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)); /* For multiple copies, get the last copy. */ for (int i = 1; i < ncopies; ++i) @@ -7110,15 +8352,57 @@ vectorizable_live_operation (gimple *stmt, bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize); } - /* Create a new vectorized stmt for the uses of STMT and insert outside the - loop. */ gimple_seq stmts = NULL; - tree bftype = TREE_TYPE (vectype); - if (VECTOR_BOOLEAN_TYPE_P (vectype)) - bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1); - tree new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart); - new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree), &stmts, - true, NULL_TREE); + tree new_tree; + if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) + || LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)) + { + tree mask; + if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)) + { + gcc_assert (ncopies == 1); + tree orig_mask = LOOP_VINFO_EXIT_MASK (loop_vinfo); + tree all_ones = build_minus_one_cst (TREE_TYPE (orig_mask)); + + mask = make_ssa_name (TREE_TYPE (orig_mask)); + gcall *new_stmt = gimple_build_call_internal (IFN_BREAK_AFTER, 2, + all_ones, orig_mask); + gimple_call_set_lhs (new_stmt, mask); + gimple_seq_add_stmt (&stmts, new_stmt); + } + else + { + gcc_assert (ncopies == 1 && !slp_node); + mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo), + 1, vectype, 0); + } + + /* Emit: + + SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK> + + where VEC_LHS is the vectorized live-out result and MASK is + the loop mask for the final iteration. */ + tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info)); + tree scalar_res = make_ssa_name (scalar_type); + gcall *new_stmt = gimple_build_call_internal (IFN_EXTRACT_LAST, + 2, mask, vec_lhs); + gimple_call_set_lhs (new_stmt, scalar_res); + gimple_seq_add_stmt (&stmts, new_stmt); + + /* Convert the extracted vector element to the required scalar type. 
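The IFN_EXTRACT_LAST call emitted here returns the vector element from the last lane whose mask bit is set, i.e. the live-out value of the final active scalar iteration. A scalar model (plain C, not part of this patch):

    #include <stdio.h>

    #define NELT 4

    int
    main (void)
    {
      int vec[NELT] = { 10, 20, 30, 40 };
      _Bool mask[NELT] = { 1, 1, 1, 0 };   /* final, partial iteration */

      /* Model of the EXTRACT_LAST operation: the value from the last
         active lane.  */
      int scalar_res = 0;
      for (unsigned int i = 0; i < NELT; i++)
        if (mask[i])
          scalar_res = vec[i];

      printf ("live-out value = %d\n", scalar_res);   /* prints 30 */
      return 0;
    }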
*/ + new_tree = gimple_convert (&stmts, lhs_type, scalar_res); + } + else + { + tree bftype = TREE_TYPE (vectype); + if (VECTOR_BOOLEAN_TYPE_P (vectype)) + bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1); + new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart); + new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree), + &stmts, true, NULL_TREE); + } + if (stmts) gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts); @@ -7192,6 +8476,9 @@ vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt) static bool loop_niters_no_overflow (loop_vec_info loop_vinfo) { + if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)) + return false; + /* Constant case. */ if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) { @@ -7218,6 +8505,151 @@ loop_niters_no_overflow (loop_vec_info loop_vinfo) return false; } +/* Return a mask type with half the number of elements as TYPE. */ + +tree +vect_halve_mask_nunits (tree type) +{ + poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2); + return build_truth_vector_type (nunits, current_vector_size); +} + +/* Return a mask type with twice as many elements as TYPE. */ + +tree +vect_double_mask_nunits (tree type) +{ + poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2; + return build_truth_vector_type (nunits, current_vector_size); +} + +/* Record that a fully-masked version of LOOP_VINFO would need MASKS to + contain a sequence of NVECTORS masks that each control a vector of type + VECTYPE. */ + +void +vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks, + unsigned int nvectors, tree vectype) +{ + gcc_assert (nvectors != 0); + if (masks->length () < nvectors) + masks->safe_grow_cleared (nvectors); + rgroup_masks *rgm = &(*masks)[nvectors - 1]; + /* The number of scalars per iteration and the number of vectors are + both compile-time constants. */ + unsigned int nscalars_per_iter + = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype), + LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant (); + if (rgm->max_nscalars_per_iter < nscalars_per_iter) + { + rgm->max_nscalars_per_iter = nscalars_per_iter; + rgm->mask_type = build_same_sized_truth_vector_type (vectype); + } + + /* Ensure that the required nonspeculative masks are a subset of + the speculative ones. This has two benefits: it means that we + can test for target support in one go, and that we can AND in + the speculative masks when setting up the nonspeculative ones. */ + if (masks == &LOOP_VINFO_NONSPECULATIVE_MASKS (loop_vinfo)) + vect_record_loop_mask (loop_vinfo, &LOOP_VINFO_MASKS (loop_vinfo), + nvectors, vectype); +} + +/* Given a complete set of masks MASKS, extract mask number INDEX + for an rgroup that operates on NVECTORS vectors of type VECTYPE, + where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI. + + See the comment above vec_loop_masks for more details about the mask + arrangement. */ + +tree +vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks, + unsigned int nvectors, tree vectype, unsigned int index) +{ + rgroup_masks *rgm = &(*masks)[nvectors - 1]; + tree mask_type = rgm->mask_type; + + /* Populate the rgroup's mask array, if this is the first time we've + used it. */ + if (rgm->masks.is_empty ()) + { + rgm->masks.safe_grow_cleared (nvectors); + for (unsigned int i = 0; i < nvectors; ++i) + { + tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask"); + /* Provide a dummy definition until the real one is available. 
*/ + SSA_NAME_DEF_STMT (mask) = gimple_build_nop (); + rgm->masks[i] = mask; + } + } + + tree mask = rgm->masks[index]; + if (may_ne (TYPE_VECTOR_SUBPARTS (mask_type), + TYPE_VECTOR_SUBPARTS (vectype))) + { + /* A loop mask for data type X can be reused for data type Y + if X has N times more elements than Y and if Y's elements + are N times bigger than X's. In this case each sequence + of N elements in the loop mask will be all-zero or all-one. + We can then view-convert the mask so that each sequence of + N elements is replaced by a single element. */ + gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type), + TYPE_VECTOR_SUBPARTS (vectype))); + gimple_seq seq = NULL; + mask_type = build_same_sized_truth_vector_type (vectype); + mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask); + if (seq) + gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT); + } + return mask; +} + +/* Get the mask to use for loads in LOOP_VINFO, or null if loads don't + need to be masked. The arguments are as for vect_get_loop_mask. */ + +tree +vect_get_load_mask (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi, + unsigned int nvectors, tree vectype, unsigned int index) +{ + /* At present all loads in a speculative loop are speculative. + They need to be masked iff we are using masking to reach + alignment. */ + if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo) + && !LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo)) + return NULL_TREE; + + return vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo), + nvectors, vectype, index); +} + +/* Return the mask type to use when computing which scalar iterations + are active in speculative loop LOOP_VINFO. */ + +tree +vect_mask_type_for_speculation (loop_vec_info loop_vinfo) +{ + gcc_checking_assert (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)); + return build_truth_vector_type (LOOP_VINFO_VECT_FACTOR (loop_vinfo), + current_vector_size); +} + +/* Calculate the scalar number of iterations in NIM from its mask, + adding any new statements to SEQ. Return the number of iterations. */ + +tree +vect_get_niters_from_mask (gimple_seq *seq, vec_niters_and_mask *nim) +{ + if (nim->niters == NULL_TREE) + { + nim->niters = make_temp_ssa_name (sizetype, NULL, "niters"); + gcall *call = gimple_build_call_internal (IFN_MASK_POPCOUNT, + 1, nim->mask); + gimple_call_set_lhs (call, nim->niters); + gimple_seq_add_stmt (seq, call); + } + return nim->niters; +} + /* Scale profiling counters by estimation for LOOP which is vectorized by factor VF. */ @@ -7267,8 +8699,11 @@ vect_transform_loop (loop_vec_info loop_vinfo) basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); int nbbs = loop->num_nodes; int i; - tree niters_vector = NULL; - int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); + tree niters_vector = NULL_TREE; + tree step_vector = NULL_TREE; + tree niters_vector_mult_vf = NULL_TREE; + poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); + unsigned int lowest_vf; bool grouped_store; bool slp_scheduled = false; gimple *stmt, *pattern_stmt; @@ -7276,7 +8711,10 @@ vect_transform_loop (loop_vec_info loop_vinfo) gimple_stmt_iterator pattern_def_si = gsi_none (); bool transform_pattern_stmt = false; bool check_profitability = false; - int th; + unsigned int th; + + lowest_vf = constant_lower_bound (vf); + lowest_vf = MIN (lowest_vf, LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)); if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n"); @@ -7284,11 +8722,12 @@ vect_transform_loop (loop_vec_info loop_vinfo) /* Use the more conservative vectorization threshold.
If the number of iterations is constant assume the cost check has been performed by our caller. If the threshold makes all loops profitable that - run at least the vectorization factor number of times checking - is pointless, too. */ + run at least the (estimated) vectorization factor number of times + checking is pointless, too. */ th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo); - if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) + if (th >= vect_vf_for_cost (loop_vinfo) + && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) + && !LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)) { if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, @@ -7312,7 +8751,17 @@ vect_transform_loop (loop_vec_info loop_vinfo) if (LOOP_REQUIRES_VERSIONING (loop_vinfo)) { - vect_loop_versioning (loop_vinfo, th, check_profitability); + poly_uint64 versioning_threshold + = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo); + if (check_profitability + && ordered_p (poly_uint64 (th), versioning_threshold)) + { + versioning_threshold = ordered_max (poly_uint64 (th), + versioning_threshold); + check_profitability = false; + } + vect_loop_versioning (loop_vinfo, th, check_profitability, + versioning_threshold); check_profitability = false; } @@ -7332,21 +8781,32 @@ vect_transform_loop (loop_vec_info loop_vinfo) } } + vect_build_cap (loop_vinfo); + tree niters = vect_build_loop_niters (loop_vinfo); LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters; tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo)); bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo); - epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector, th, + epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector, + &step_vector, &niters_vector_mult_vf, th, check_profitability, niters_no_overflow); - if (niters_vector == NULL_TREE) + + if (niters_vector == NULL_TREE + && !LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)) { - if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) - niters_vector - = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)), - LOOP_VINFO_INT_NITERS (loop_vinfo) / vf); + gcc_assert (!LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)); + if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) + && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) + && must_eq (lowest_vf, vf)) + { + niters_vector + = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)), + LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf); + step_vector = build_one_cst (TREE_TYPE (niters)); + } else vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector, - niters_no_overflow); + &step_vector, niters_no_overflow); } /* 1) Make sure the loop header has exactly two entries @@ -7356,6 +8816,48 @@ vect_transform_loop (loop_vec_info loop_vinfo) split_edge (loop_preheader_edge (loop)); + if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) + && vect_use_loop_mask_for_alignment_p (loop_vinfo)) + /* This will deal with any possible peeling. */ + vect_prepare_for_masked_peels (loop_vinfo); + + if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo)) + { + tree mask_type = vect_mask_type_for_speculation (loop_vinfo); + if (LOOP_VINFO_FIRSTFAULTING_EXECUTION (loop_vinfo)) + { + /* Update the IVs. */ + gimple *tmp_stmt; + gcc_assert (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)); + + gimple_stmt_iterator iv_gsi; + bool insert_after; + standard_iv_increment_position (loop, &iv_gsi, &insert_after); + + /* Read the non-faulting mask. 
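+ It has a set bit for each element that was accessed without
+ faulting, so the popcount below gives the number of scalar
+ iterations that completed on this pass.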
*/
+ tree nfmask = make_temp_ssa_name (mask_type, NULL, "nfmask");
+ tmp_stmt = gimple_build_call_internal (IFN_READ_NF, 0);
+ gimple_call_set_lhs (tmp_stmt, nfmask);
+ gsi_insert_before (&iv_gsi, tmp_stmt, GSI_SAME_STMT);
+
+ /* Find the number of steps the loop iterated on the current pass. */
+ tree loop_iter = make_temp_ssa_name (sizetype, NULL, "loop_iter");
+ tmp_stmt = gimple_build_call_internal (IFN_MASK_POPCOUNT, 1,
+ nfmask);
+ gimple_call_set_lhs (tmp_stmt, loop_iter);
+ gsi_insert_before (&iv_gsi, tmp_stmt, GSI_SAME_STMT);
+
+ LOOP_VINFO_NONFAULTING (loop_vinfo).mask = nfmask;
+ LOOP_VINFO_NONFAULTING (loop_vinfo).niters = loop_iter;
+ }
+
+ /* Create a dummy definition of the exit mask. We'll fill in the
+ real definition later. */
+ tree mask = make_temp_ssa_name (mask_type, NULL, "exit_mask");
+ SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
+ LOOP_VINFO_EXIT_MASK (loop_vinfo) = mask;
+ }
+
 /* FORNOW: the vectorizer supports only loops which body consist of
 one basic block (header + empty latch). When the vectorizer will
 support more involved loop forms, the order by which the BBs are
@@ -7388,8 +8890,8 @@
 continue;
 if (STMT_VINFO_VECTYPE (stmt_info)
- && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
- != (unsigned HOST_WIDE_INT) vf)
+ && may_ne (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)),
+ vf)
 && dump_enabled_p ())
 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
@@ -7521,11 +9023,10 @@
 if (STMT_VINFO_VECTYPE (stmt_info))
 {
- unsigned int nunits
- = (unsigned int)
- TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
+ poly_uint64 nunits
+ = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
 if (!STMT_SLP_TYPE (stmt_info)
- && nunits != (unsigned int) vf
+ && may_ne (nunits, vf)
 && dump_enabled_p ())
 /* For SLP VF is set according to unrolling factor, and not
 to vector size, hence for SLP this print is not valid. */
@@ -7595,12 +9096,49 @@
 gsi_next (&si);
 }
 } /* stmts in BB */
+
+ /* Stub out scalar statements that must not survive vectorization.
+ Doing this here helps with grouped statements, or statements that
+ are involved in patterns. */
+ for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
+ !gsi_end_p (gsi); gsi_next (&gsi))
+ {
+ gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
+ if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
+ {
+ tree lhs = gimple_get_lhs (call);
+ if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
+ {
+ tree zero = build_zero_cst (TREE_TYPE (lhs));
+ gimple *new_stmt = gimple_build_assign (lhs, zero);
+ gsi_replace (&gsi, new_stmt, true);
+ }
+ }
+ }
 } /* BBs in loop */
- slpeel_make_loop_iterate_ntimes (loop, niters_vector);
+ /* Provide the real definition of LOOP_VINFO_EXIT_MASK. */
+ if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo))
+ {
+ tree imask = LOOP_VINFO_EXIT_TEST_MASK (loop_vinfo);
+ tree omask = LOOP_VINFO_EXIT_MASK (loop_vinfo);
+ gphi *new_phi = create_phi_node (omask, single_exit (loop)->dest);
+ add_phi_arg (new_phi, imask, single_exit (loop), UNKNOWN_LOCATION);
+ }
+
+ /* The vectorization factor is always > 1, so if we use an IV increment
+ of 1, a zero NITERS becomes a nonzero NITERS_VECTOR. 
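+ The vector iteration count therefore never wraps, which is why
+ NITERS can be treated as not overflowing below.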
*/ + if (step_vector && integer_onep (step_vector)) + niters_no_overflow = true; + vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector, + niters_vector_mult_vf, !niters_no_overflow); - scale_profile_for_vect_loop (loop, vf); + unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo); + scale_profile_for_vect_loop (loop, assumed_vf); + /* True if the final iteration might not handle a full vector's + worth of scalar iterations. */ + bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo); /* The minimum number of iterations performed by the epilogue. This is 1 when peeling for gaps because we always need a final scalar iteration. */ @@ -7608,18 +9146,41 @@ vect_transform_loop (loop_vec_info loop_vinfo) /* +1 to convert latch counts to loop iteration counts, -min_epilogue_iters to remove iterations that cannot be performed by the vector code. */ - int bias = 1 - min_epilogue_iters; + int bias_for_lowest = 1 - min_epilogue_iters; + int bias_for_assumed = bias_for_lowest; + int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); + if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) + { + /* When the amount of peeling is known at compile time, the first + iteration will have exactly alignment_npeels active elements. + In the worst case it will have at least one. */ + int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1); + bias_for_lowest += lowest_vf - min_first_active; + bias_for_assumed += assumed_vf - min_first_active; + } /* In these calculations the "- 1" converts loop iteration counts back to latch counts. */ if (loop->any_upper_bound) loop->nb_iterations_upper_bound - = wi::udiv_floor (loop->nb_iterations_upper_bound + bias, vf) - 1; + = (final_iter_may_be_partial + ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest, + lowest_vf) - 1 + : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest, + lowest_vf) - 1); if (loop->any_likely_upper_bound) loop->nb_iterations_likely_upper_bound - = wi::udiv_floor (loop->nb_iterations_likely_upper_bound + bias, vf) - 1; + = (final_iter_may_be_partial + ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound + + bias_for_lowest, lowest_vf) - 1 + : wi::udiv_floor (loop->nb_iterations_likely_upper_bound + + bias_for_lowest, lowest_vf) - 1); if (loop->any_estimate) loop->nb_iterations_estimate - = wi::udiv_floor (loop->nb_iterations_estimate + bias, vf) - 1; + = (final_iter_may_be_partial + ? 
wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
+ assumed_vf) - 1
+ : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
+ assumed_vf) - 1);
 if (dump_enabled_p ())
 {
@@ -7633,9 +9194,12 @@
 dump_printf (MSG_NOTE, "\n");
 }
 else
- dump_printf_loc (MSG_NOTE, vect_location,
- "LOOP EPILOGUE VECTORIZED (VS=%d)\n",
- current_vector_size);
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "LOOP EPILOGUE VECTORIZED (VS=");
+ dump_dec (MSG_NOTE, current_vector_size);
+ dump_printf (MSG_NOTE, ")\n");
+ }
 }
 /* Free SLP instances here because otherwise stmt reference counting
@@ -7652,30 +9216,39 @@
 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
 epilogue = NULL;
+ if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
+ epilogue = NULL;
+
 if (epilogue)
 {
- unsigned int vector_sizes
- = targetm.vectorize.autovectorize_vector_sizes ();
- vector_sizes &= current_vector_size - 1;
+ auto_vector_sizes vector_sizes;
+ targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
+ unsigned int next_size = 0;
- if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
- epilogue = NULL;
- else if (!vector_sizes)
- epilogue = NULL;
- else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
- && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
- {
- int smallest_vec_size = 1 << ctz_hwi (vector_sizes);
- int ratio = current_vector_size / smallest_vec_size;
- int eiters = LOOP_VINFO_INT_NITERS (loop_vinfo)
- - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
- eiters = eiters % vf;
-
- epilogue->nb_iterations_upper_bound = eiters - 1;
+ if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+ && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
+ && must_eq (vf, lowest_vf))
+ {
+ unsigned int eiters
+ = (LOOP_VINFO_INT_NITERS (loop_vinfo)
+ - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
+ eiters = eiters % lowest_vf;
+ epilogue->nb_iterations_upper_bound = eiters - 1;
+
+ unsigned int ratio;
+ while (next_size < vector_sizes.length ()
+ && !(constant_multiple_p (current_vector_size,
+ vector_sizes[next_size], &ratio)
+ && eiters >= lowest_vf / ratio))
+ next_size += 1;
+ }
+ else
+ while (next_size < vector_sizes.length ()
+ && may_lt (current_vector_size, vector_sizes[next_size]))
+ next_size += 1;
- if (eiters < vf / ratio)
- epilogue = NULL;
- }
+ if (next_size == vector_sizes.length ())
+ epilogue = NULL;
 }
 if (epilogue)
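To make the latch-count arithmetic above easier to check, here is a minimal standalone C sketch of the same computation. It is illustrative only and not part of the patch: the function name and sample values are invented, and GCC's wi::udiv_ceil/wi::udiv_floor wide-int divisions are modelled with plain integer arithmetic.

#include <stdio.h>

/* Rescale a scalar latch count to a vector latch count, mirroring
   vect_transform_loop.  The +1 turns the latch count into an iteration
   count, -min_epilogue_iters drops iterations left to a scalar epilogue,
   and the result is divided by the vectorization factor VF.  */
static unsigned long
vector_latch_bound (unsigned long scalar_latch_bound,
                    int min_epilogue_iters, unsigned long vf,
                    int final_iter_may_be_partial)
{
  unsigned long iters = scalar_latch_bound + 1 - min_epilogue_iters;
  /* A fully-masked loop may end with a partial vector iteration, so
     round the division up; otherwise round it down.  */
  unsigned long vector_iters = (final_iter_may_be_partial
                                ? (iters + vf - 1) / vf
                                : iters / vf);
  return vector_iters - 1;  /* Convert back to a latch count.  */
}

int
main (void)
{
  /* 1000 latch iterations (1001 scalar iterations), VF 4, fully masked,
     no epilogue: ceil (1001 / 4) - 1 = 250.  */
  printf ("%lu\n", vector_latch_bound (1000, 0, 4, 1));
  /* Same loop when one scalar iteration is peeled for gaps:
     floor ((1001 - 1) / 4) - 1 = 249.  */
  printf ("%lu\n", vector_latch_bound (1000, 1, 4, 0));
  return 0;
}

In the patch itself the upper and likely-upper bounds are divided by lowest_vf (the compile-time lower bound on the runtime VF) while the estimate is divided by assumed_vf (the value the cost model used), each with a correspondingly adjusted bias; the rounding choice is the same in all three cases.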