Diffstat (limited to 'gcc/tree-vect-loop.c')
-rw-r--r--   gcc/tree-vect-loop.c | 526
1 file changed, 276 insertions(+), 250 deletions(-)
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 91a3610a1a0..d784754c6de 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -52,9 +52,6 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-if-conv.h"
 #include "internal-fn.h"
 
-/* For lang_hooks.types.type_for_mode.  */
-#include "langhooks.h"
-
 /* Loop Vectorization Pass.
 
    This pass tries to vectorize loops.
@@ -989,8 +986,6 @@ vect_fixup_reduc_chain (gimple *stmt)
   gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
               && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
   GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
-  GROUP_NUM_STMTS (vinfo_for_stmt (firstp))
-    = GROUP_NUM_STMTS (vinfo_for_stmt (stmt));
   GROUP_FIRST_UID (vinfo_for_stmt (firstp))
     = GROUP_FIRST_UID (vinfo_for_stmt (stmt));
   GROUP_LAST_UID (vinfo_for_stmt (firstp))
@@ -1172,7 +1167,6 @@ _loop_vec_info::_loop_vec_info (struct loop *loop_in)
     scalar_loop (NULL),
     orig_loop_info (NULL),
     vect_addr_base_htab (31),
-    gather_scatter_htab (31),
     exit_test_mask (NULL_TREE),
     exit_mask (NULL_TREE),
     nonspeculative_seq (NULL)
@@ -1293,7 +1287,7 @@ _loop_vec_info::~_loop_vec_info ()
 }
 
 /* Return true if we can use CMP_TYPE as the comparison type to produce
-   all masks required to fully-mask LOOP_VINFO.  */
+   all masks required to mask LOOP_VINFO.  */
 
 static bool
 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
@@ -1372,10 +1366,11 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
   tree cmp_type = NULL_TREE;
   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
     {
-      scalar_int_mode cmp_mode = cmp_mode_iter.require ();
-      if (GET_MODE_BITSIZE (cmp_mode) >= min_ni_width)
+      unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
+      if (cmp_bits >= min_ni_width
+          && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
        {
-         tree this_type = lang_hooks.types.type_for_mode (cmp_mode, true);
+         tree this_type = build_nonstandard_integer_type (cmp_bits, true);
          if (this_type
              && can_produce_all_loop_masks_p (loop_vinfo, this_type))
            {
@@ -1384,7 +1379,7 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
                 operands to the WHILE are more likely to be reusable in
                 address calculations.  */
              cmp_type = this_type;
-             if (GET_MODE_SIZE (cmp_mode) >= GET_MODE_SIZE (Pmode))
+             if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
                break;
            }
        }
@@ -2057,11 +2052,9 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo)
   return true;
 }
 
-/* Function vect_analyze_loop_costing.
-
-   Analyze cost of loop.  Decide if it is worth while to vectorize.
-   Return 1 if definitely yes, 0 if definitely no, or -1 if it's
-   worth retrying.  */
+/* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
+   is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
+   definitely no, or -1 if it's worth retrying.  */
 
 static int
 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
@@ -2461,10 +2454,10 @@ start_over:
     {
       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
        dump_printf_loc (MSG_NOTE, vect_location,
-                        "Using a fully-masked loop.\n");
+                        "using a fully-masked loop.\n");
       else
        dump_printf_loc (MSG_NOTE, vect_location,
-                        "Not using a fully-masked loop.\n");
+                        "not using a fully-masked loop.\n");
     }
 
   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
@@ -2830,24 +2823,24 @@ vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
     }
 }
 
-/* Return true if the target supports strict math reductions for operation
-   CODE and type TYPE.  If the target supports it, store the reduction operation
-   in REDUC_CODE.  */
+/* Return true if the target supports in-order reductions for operation
+   CODE and type TYPE.  If the target supports it, store the reduction
+   operation in *REDUC_CODE.  */
+
 static bool
-strict_reduction_code (tree_code code, tree type,
-                      tree_code *reduc_code)
+fold_left_reduction_code (tree_code code, tree type, tree_code *reduc_code)
 {
   switch (code)
     {
     case PLUS_EXPR:
-      code = STRICT_REDUC_PLUS_EXPR;
+      code = FOLD_LEFT_PLUS_EXPR;
       break;
 
     default:
       return false;
     }
 
-  if (!strict_reduction_support (code, type))
+  if (!target_supports_op_p (type, code, optab_vector))
     return false;
 
   *reduc_code = code;
@@ -2922,7 +2915,7 @@ neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
   tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
   tree scalar_type = TREE_TYPE (vector_type);
-  struct loop *loop = (gimple_bb (stmt))->loop_father;
+  struct loop *loop = gimple_bb (stmt)->loop_father;
   gcc_assert (loop);
 
   switch (code)
@@ -3161,17 +3154,19 @@ vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
   GROUP_SIZE (vinfo_for_stmt (first)) = size;
-  GROUP_NUM_STMTS (vinfo_for_stmt (first)) = size;
   GROUP_FIRST_UID (vinfo_for_stmt (first)) = first_uid;
   GROUP_LAST_UID (vinfo_for_stmt (first)) = last_uid;
 
   return true;
 }
 
-/* Returns TRUE if we need to perform a strict math reduction for TYPE.  */
+/* Returns true if we need an in-order reduction for operation CODE
+   on type TYPE.  NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
+   overflow must wrap.  */
+
 static bool
-needs_strict_reduction (tree type, tree_code code,
-                       bool need_wrapping_integral_overflow)
+needs_fold_left_reduction_p (tree type, tree_code code,
+                            bool need_wrapping_integral_overflow)
 {
   /* CHECKME: check for !flag_finite_math_only too?  */
   if (SCALAR_FLOAT_TYPE_P (type))
@@ -3184,7 +3179,8 @@ needs_strict_reduction (tree type, tree_code code,
       default:
        return !flag_associative_math;
       }
-  else if (INTEGRAL_TYPE_P (type))
+
+  if (INTEGRAL_TYPE_P (type))
     {
       if (!operation_no_trapping_overflow (type, code))
        return true;
@@ -3194,10 +3190,11 @@ needs_strict_reduction (tree type, tree_code code,
        return true;
       return false;
     }
-  else if (SAT_FIXED_POINT_TYPE_P (type))
+
+  if (SAT_FIXED_POINT_TYPE_P (type))
     return true;
-  else
-    return false;
+
+  return false;
 }
 
 /* Function vect_is_simple_reduction
@@ -3242,9 +3239,6 @@ needs_strict_reduction (tree type, tree_code code,
      if (a[i] < val)
        ret_val = a[i];
 
-   Record in DOUBLE_REDUC whether this is a double reduction.
-   Record in STRICT_REDUC whether the reduction must be performed in order, i.e.
-   cannot be reassociated.
 */
 
 static gimple *
@@ -3529,9 +3523,9 @@ vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
      outer-loop vectorization is safe.  */
   if (check_reduction
       && *v_reduc_type == TREE_CODE_REDUCTION
-      && needs_strict_reduction (type, code,
-                                need_wrapping_integral_overflow))
-    *v_reduc_type = STRICT_FP_REDUCTION;
+      && needs_fold_left_reduction_p (type, code,
+                                     need_wrapping_integral_overflow))
+    *v_reduc_type = FOLD_LEFT_REDUCTION;
 
   /* Reduction is safe.  We're dealing with one of the following:
      1) integer arithmetic and no trapv
@@ -4327,7 +4321,7 @@ static void
 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
                           int ncopies)
 {
-  int prologue_cost = 0, epilogue_cost = 0;
+  int prologue_cost = 0, epilogue_cost = 0, inside_cost;
   enum tree_code code;
   optab optab;
   tree vectype;
@@ -4346,13 +4340,11 @@ vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
     target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
 
   /* Condition reductions generate two reductions in the loop.  */
-  if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
+  vect_reduction_type reduction_type
+    = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
+  if (reduction_type == COND_REDUCTION)
     ncopies *= 2;
 
-  /* Cost of reduction op inside loop.  */
-  unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
-                                       stmt_info, 0, vect_body);
-
   vectype = STMT_VINFO_VECTYPE (stmt_info);
   mode = TYPE_MODE (vectype);
   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
@@ -4362,14 +4354,31 @@ vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
 
   code = gimple_assign_rhs_code (orig_stmt);
 
-  /* Add in cost for initial definition.
-     For cond reduction we have four vectors: initial index, step, initial
-     result of the data reduction, initial value of the index reduction.  */
-  int prologue_stmts = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
-                      == COND_REDUCTION ? 4 : 1;
-  prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
-                                 scalar_to_vec, stmt_info, 0,
-                                 vect_prologue);
+  if (reduction_type == EXTRACT_LAST_REDUCTION
+      || reduction_type == FOLD_LEFT_REDUCTION)
+    {
+      /* No extra instructions needed in the prologue.  */
+      prologue_cost = 0;
+
+      /* Count NCOPIES FOLD_EXTRACT_LAST operations.  */
+      inside_cost = add_stmt_cost (target_cost_data, ncopies, vec_to_scalar,
+                                  stmt_info, 0, vect_body);
+    }
+  else
+    {
+      /* Add in cost for initial definition.
+        For cond reduction we have four vectors: initial index, step,
+        initial result of the data reduction, initial value of the index
+        reduction.  */
+      int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
+      prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
+                                     scalar_to_vec, stmt_info, 0,
+                                     vect_prologue);
+
+      /* Cost of reduction op inside loop.  */
+      inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
+                                  stmt_info, 0, vect_body);
+    }
 
   /* Determine cost of epilogue code.
@@ -4380,10 +4389,7 @@ vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
     {
       if (reduc_code != ERROR_MARK)
        {
-         if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == STRICT_FP_REDUCTION)
-           inside_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
-                                         stmt_info, 0, vect_body);
-         else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
+         if (reduction_type == COND_REDUCTION)
            {
              /* An EQ stmt and an COND_EXPR stmt.  */
              epilogue_cost += add_stmt_cost (target_cost_data, 2,
@@ -4408,7 +4414,7 @@ vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
                                              vect_epilogue);
            }
        }
-      else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
+      else if (reduction_type == COND_REDUCTION)
        {
          unsigned estimated_nunits = vect_nunits_for_cost (vectype);
          /* Extraction of scalar elements.  */
@@ -4422,10 +4428,12 @@ vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
                                          scalar_stmt, stmt_info, 0,
                                          vect_epilogue);
        }
-      else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
-              != COND_REDUCTION_CLASTB)
+      else if (reduction_type == EXTRACT_LAST_REDUCTION
+              || reduction_type == FOLD_LEFT_REDUCTION)
+       /* No extra instructions needed in the epilogue.  */
+       ;
+      else
        {
-         /* Enforced by vectorizable_reduction.  */
          int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
          tree bitsize =
            TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
@@ -4591,6 +4599,9 @@ get_initial_def_for_reduction (gimple *stmt, tree init_val,
       return vect_create_destination_var (init_val, vectype);
     }
 
+  vect_reduction_type reduction_type
+    = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
+
   /* In case of a nested reduction do not use an adjustment def as
      that case is not supported by the epilogue generation correctly
      if ncopies is not one.  */
@@ -4664,8 +4675,8 @@ get_initial_def_for_reduction (gimple *stmt, tree init_val,
        if (adjustment_def)
          {
            *adjustment_def = NULL_TREE;
-           if (! REDUCTION_IS_FULL_COND_REDUCTION_P
-                 (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo)))
+           if (reduction_type != COND_REDUCTION
+               && reduction_type != EXTRACT_LAST_REDUCTION)
              {
                init_def = vect_get_vec_def_for_operand (init_val, stmt);
                break;
@@ -4772,6 +4783,8 @@ get_initial_defs_for_reduction (slp_tree slp_node,
            init = gimple_build_vector (&ctor_seq, vector_type, elts);
          else if (neutral_op)
            {
+             /* Build a vector of the neutral value and shift the
+                other elements into place.  */
              init = gimple_build_vector_from_val (&ctor_seq, vector_type,
                                                   neutral_op);
              int k = nunits;
@@ -4789,6 +4802,9 @@ get_initial_defs_for_reduction (slp_tree slp_node,
            }
          else
            {
+             /* First time round, duplicate ELTS to fill the
+                required number of vectors, then cherry pick the
+                appropriate result for each iteration.  */
              if (vec_oprnds->is_empty ())
                duplicate_and_interleave (&ctor_seq, vector_type, elts,
                                          number_of_vectors,
@@ -5403,7 +5419,7 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
 
       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
-        with the vector (COND_REDUC_RES) of found indexes, choosing values
+        with the vector (INDUCTION_INDEX) of found indexes, choosing values
         from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
         otherwise.  Only one value should match, resulting in a vector
         (VEC_COND) with one data value and the rest zeros.
@@ -5592,6 +5608,10 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
     }
   else if (direct_slp_reduc)
     {
+      /* Here we create one vector for each of the GROUP_SIZE results,
+        with the elements for other SLP statements replaced with the
+        neutral value.  We can then do a normal reduction on each vector.  */
+
       /* Enforced by vectorizable_reduction.  */
       gcc_assert (new_phis.length () == 1);
       gcc_assert (pow2p_hwi (group_size));
@@ -5599,6 +5619,9 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
       slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
       vec<gimple *> orig_phis = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
       gimple_seq seq = NULL;
+
+      /* Build a vector {0, 1, 2, ...}, with the same number of elements
+        and the same element size as VECTYPE.  */
       tree index = build_index_vector (vectype, 0, 1);
       tree index_type = TREE_TYPE (index);
       tree index_elt_type = TREE_TYPE (index_type);
@@ -5611,8 +5634,8 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
                                build_vector_from_val (index_type, index_mask));
 
       /* Get a neutral vector value.  This is simply a splat of the neutral
-        scalar value if we have one, otherwise the initial vector is itself
-        a neutral value.  */
+        scalar value if we have one, otherwise the initial scalar value
+        is itself a neutral value.  */
       tree vector_identity = NULL_TREE;
       if (neutral_op)
        vector_identity = gimple_build_vector_from_val (&seq, vectype,
@@ -5633,7 +5656,7 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
 
          /* Calculate the equivalent of:
 
-            sel = (index == i);
+            sel[j] = (index[j] == i);
 
             which selects the elements of NEW_PHI_RESULT that should
             be included in the result.  */
@@ -6167,30 +6190,30 @@ merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
   return cond;
 }
 
-/* Perform in-order reductions for strict FP math, as opposed to the
-   tree-based method used for fast math.  For SLP this only works for
-   chained reductions, as non chained reductions would require changing
-   the order.  */
+/* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT is the
+   statement that sets the live-out value.  REDUC_DEF_STMT is the phi
+   statement.  CODE is the operation performed by STMT and OPS are
+   its scalar operands.  REDUC_INDEX is the index of the operand in
+   OPS that is set by REDUC_DEF_STMT.  REDUC_CODE is the code that
+   implements in-order reduction and VECTYPE_IN is the type of its
+   vector input.  MASKS specifies the masks that should be used to
+   control the operation in a fully-masked loop.  */
 
 static bool
-vectorized_strict_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
-                            gimple **vec_stmt, slp_tree slp_node,
-                            gimple *reduc_def_stmt,
-                            tree_code code, tree_code reduc_code,
-                            int op_type, tree ops[3], tree vectype_in,
-                            int reduc_index, vec_loop_masks *masks)
+vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
+                              gimple **vec_stmt, slp_tree slp_node,
+                              gimple *reduc_def_stmt,
+                              tree_code code, tree_code reduc_code,
+                              tree ops[3], tree vectype_in,
+                              int reduc_index, vec_loop_masks *masks)
 {
-  int i;
-  int ncopies;
   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
-  tree def0, op0;
-  tree expr = NULL_TREE;
   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
   gimple *new_stmt = NULL;
-  auto_vec<tree> vec_oprnds0;
+  int ncopies;
 
   if (slp_node)
     ncopies = 1;
   else
@@ -6198,19 +6221,20 @@ vectorized_strict_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
 
   gcc_assert (!nested_in_vect_loop_p (loop, stmt));
   gcc_assert (ncopies == 1);
-  gcc_assert (op_type == binary_op);
+  gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
   gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
   gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
-             == STRICT_FP_REDUCTION);
+             == FOLD_LEFT_REDUCTION);
 
   if (slp_node)
     gcc_assert (must_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
                         TYPE_VECTOR_SUBPARTS (vectype_in)));
 
-  op0 = ops[1 - reduc_index];
+  tree op0 = ops[1 - reduc_index];
 
   int group_size = 1;
   gimple *scalar_dest_def;
+  auto_vec<tree> vec_oprnds0;
   if (slp_node)
     {
      vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL, slp_node);
@@ -6238,11 +6262,15 @@ vectorized_strict_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
     vector_identity = build_zero_cst (vectype_out);
 
+  int i;
+  tree def0;
   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
     {
      tree mask = NULL_TREE;
      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
        mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
+
+     /* Handle MINUS by adding the negative.  */
      if (code == MINUS_EXPR)
        {
          tree negated = make_ssa_name (vectype_out);
@@ -6255,25 +6283,27 @@ vectorized_strict_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
        def0 = merge_with_identity (gsi, mask, vectype_out, def0,
                                    vector_identity);
 
-      /* On first iteration the input is simply the scalar phi result, and for
-        subsequent iterations it is the output of the preceding operation.  */
-
-      expr = build2 (reduc_code, scalar_type, reduc_var, def0);
+      /* On the first iteration the input is simply the scalar phi
+        result, and for subsequent iterations it is the output of
+        the preceding operation.  */
+      tree expr = build2 (reduc_code, scalar_type, reduc_var, def0);
 
       /* For chained SLP reductions the output of the previous reduction
-         operation serves as the input of the next. For the final statement
-         the output cannot be a temporary - we reuse the original
-         scalar destination of the last statement. */
+        operation serves as the input of the next.  For the final statement
+        the output cannot be a temporary - we reuse the original
+        scalar destination of the last statement.  */
       if (i == vec_num - 1)
-        reduc_var = scalar_dest;
+       reduc_var = scalar_dest;
       else
-        reduc_var = vect_create_destination_var (scalar_dest, NULL);
-
+       reduc_var = vect_create_destination_var (scalar_dest, NULL);
       new_stmt = gimple_build_assign (reduc_var, expr);
 
       if (i == vec_num - 1)
-        {
-          SSA_NAME_DEF_STMT (reduc_var) = new_stmt;
+       {
+         SSA_NAME_DEF_STMT (reduc_var) = new_stmt;
+         /* For chained SLP stmt is the first statement in the group and
+            gsi points to the last statement in the group.  For non SLP stmt
+            points to the same location as gsi.  */
          if (scalar_dest_def == gsi_stmt (*gsi))
            vect_finish_replace_stmt (scalar_dest_def, new_stmt);
          else
@@ -6287,14 +6317,14 @@ vectorized_strict_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
            }
        }
       else
-        {
-          reduc_var = make_ssa_name (reduc_var, new_stmt);
-          gimple_assign_set_lhs (new_stmt, reduc_var);
-          vect_finish_stmt_generation (stmt, new_stmt, gsi);
-        }
+       {
+         reduc_var = make_ssa_name (reduc_var, new_stmt);
+         gimple_assign_set_lhs (new_stmt, reduc_var);
+         vect_finish_stmt_generation (stmt, new_stmt, gsi);
+       }
 
       if (slp_node)
-        SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
+       SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
     }
 
   if (!slp_node)
@@ -6481,7 +6511,10 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
          return true;
        }
 
-      if (STMT_VINFO_REDUC_TYPE (stmt_info) == STRICT_FP_REDUCTION)
+      if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
+       /* Leave the scalar phi in place.  Note that checking
+          STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
+          for reductions involving a single statement.  */
        return true;
 
       gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
@@ -6489,11 +6522,11 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
        reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
 
       if (STMT_VINFO_VEC_REDUCTION_TYPE (vinfo_for_stmt (reduc_stmt))
-         == COND_REDUCTION_CLASTB)
+         == EXTRACT_LAST_REDUCTION)
+       /* Leave the scalar phi in place.  */
        return true;
 
       gcc_assert (is_gimple_assign (reduc_stmt));
-
       for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
        {
          tree op = gimple_op (reduc_stmt, k);
@@ -6711,11 +6744,11 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
      directly used in stmt.  */
   if (reduc_index == -1)
     {
-      if (STMT_VINFO_REDUC_TYPE (stmt_info) == STRICT_FP_REDUCTION)
+      if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
       {
        if (dump_enabled_p ())
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                          "In-order reduction chain without SLP.\n");
+                          "in-order reduction chain without SLP.\n");
        return false;
       }
@@ -6765,19 +6798,20 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
          gcc_assert (cond_reduc_dt == vect_constant_def);
          STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
        }
-      else if (direct_internal_fn_supported_p (IFN_CLASTB, vectype_in,
-                                              OPTIMIZE_FOR_SPEED))
+      else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
+                                              vectype_in, OPTIMIZE_FOR_SPEED))
        {
         if (dump_enabled_p ())
           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                           "Optimizing condition reduction with CLASTB.\n");
-        STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = COND_REDUCTION_CLASTB;
+                           "optimizing condition reduction with"
+                           " FOLD_EXTRACT_LAST.\n");
+        STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
        }
      else if (cond_reduc_dt == vect_induction_def)
        {
         if (dump_enabled_p ())
           dump_printf_loc (MSG_NOTE, vect_location,
-                           "Optimizing condition reduction based on "
+                           "optimizing condition reduction based on "
                            "integer induction.\n");
         STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
           = INTEGER_INDUC_COND_REDUCTION;
@@ -6935,9 +6969,11 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
      (and also the same tree-code) when generating the epilog code and
      when generating the code inside the loop.  */
 
+  vect_reduction_type reduction_type
+    = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
   if (orig_stmt
-      && (!REDUCTION_IS_COND_REDUCTION_P
-           (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info))))
+      && (reduction_type == TREE_CODE_REDUCTION
+         || reduction_type == FOLD_LEFT_REDUCTION))
     {
       /* This is a reduction pattern: get the vectype from the type of the
         reduction variable, and get the tree-code from orig_stmt.  */
@@ -6956,13 +6992,12 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
       /* For simple condition reductions, replace with the actual expression
         we want to base our reduction around.  */
-      if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == CONST_COND_REDUCTION)
+      if (reduction_type == CONST_COND_REDUCTION)
        {
         orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
         gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
        }
-      else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
-              == INTEGER_INDUC_COND_REDUCTION)
+      else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
        orig_code = MAX_EXPR;
     }
@@ -6984,17 +7019,15 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
 
   epilog_reduc_code = ERROR_MARK;
-  vect_reduction_type reduction_type
-    = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
   if (reduction_type == TREE_CODE_REDUCTION
-      || reduction_type == STRICT_FP_REDUCTION
+      || reduction_type == FOLD_LEFT_REDUCTION
       || reduction_type == INTEGER_INDUC_COND_REDUCTION
      || reduction_type == CONST_COND_REDUCTION)
    {
     bool have_reduc_support;
-      if (reduction_type == STRICT_FP_REDUCTION)
-       have_reduc_support = strict_reduction_code (orig_code, vectype_out,
-                                                   &epilog_reduc_code);
+      if (reduction_type == FOLD_LEFT_REDUCTION)
+       have_reduc_support = fold_left_reduction_code (orig_code, vectype_out,
+                                                      &epilog_reduc_code);
      else
        have_reduc_support
         = reduction_code_for_scalar_code (orig_code, &epilog_reduc_code);
@@ -7047,7 +7080,7 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
         epilog_reduc_code = REDUC_MAX_EXPR;
       }
 
-  if (reduction_type != COND_REDUCTION_CLASTB
+  if (reduction_type != EXTRACT_LAST_REDUCTION
      && epilog_reduc_code == ERROR_MARK
      && !nunits_out.is_constant ())
    {
@@ -7058,7 +7091,7 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
      return false;
    }
 
-  if ((double_reduc || REDUCTION_IS_COND_REDUCTION_P (reduction_type))
+  if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
      && ncopies > 1)
    {
     if (dump_enabled_p ())
@@ -7071,9 +7104,9 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
   /* For SLP reductions, see if there is a neutral value we can use.  */
   tree neutral_op = NULL_TREE;
   if (slp_node)
-    neutral_op = neutral_op_for_slp_reduction
-      (slp_node_instance->reduc_phis, code,
-       GROUP_FIRST_ELEMENT (stmt_info) != NULL);
+    neutral_op
+      = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis, code,
+                                     GROUP_FIRST_ELEMENT (stmt_info) != NULL);
 
   /* For double reductions, and for SLP reductions with a neutral value,
      we construct a variable-length initial vector by loading a vector
@@ -7086,7 +7119,7 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
    {
     if (dump_enabled_p ())
       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                       "Reduction on variable-length vectors requires"
+                       "reduction on variable-length vectors requires"
                        " target support for a vector-shift-and-insert"
                        " operation.\n");
     return false;
@@ -7109,8 +7142,8 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
    {
     if (dump_enabled_p ())
       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                       "Unsupported form of SLP reduction for"
-                       " variable-width vectors: cannot build"
+                       "unsupported form of SLP reduction for"
+                       " variable-length vectors: cannot build"
                        " initial vector.\n");
     return false;
    }
@@ -7121,58 +7154,45 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
    {
     if (dump_enabled_p ())
       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                       "Unsupported form of SLP reduction for"
-                       " variable-width vectors: the vector size"
+                       "unsupported form of SLP reduction for"
+                       " variable-length vectors: the vector size"
                        " is not a multiple of the number of results.\n");
     return false;
    }
  }
 
-  if (double_reduc && reduction_type == STRICT_FP_REDUCTION)
+  if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
    {
-      /* We can't support strict math reductions of code such as this:
+      /* We can't support in-order reductions of code such as this:
+
          for (int i = 0; i < n1; ++i)
           for (int j = 0; j < n2; ++j)
             l += a[j];
 
-        since gcc effectively transforms the loop when vectorizing:
+        since GCC effectively transforms the loop when vectorizing:
 
          for (int i = 0; i < n1 / VF; ++i)
           for (int j = 0; j < n2; ++j)
            for (int k = 0; k < VF; ++k)
             l += a[j];
 
-        The strict code could implement the second loop above exactly. The
-        problem is that the second loop is already wrong because it's a
-        reassociation of the first.
-        */
+        which is a reassociation of the original operation.  */
     if (dump_enabled_p ())
       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                       "double reduction not supported for strict math\n");
+                       "in-order double reduction not supported.\n");
    return false;
   }
 
-  /* TODO SVE: This restriction should be relaxed once we can support
-     widening, narrowing operations.  */
-  if (reduction_type == STRICT_FP_REDUCTION && ncopies > 1)
-    {
-      if (dump_enabled_p ())
-       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "strict reduction with ncopies > 1.\n");
-      return false;
-    }
-
-  if (reduction_type == STRICT_FP_REDUCTION
+  if (reduction_type == FOLD_LEFT_REDUCTION
      && slp_node
      && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
-      /* We cannot support strict math reductions in this case because there is
+      /* We cannot use in-order reductions in this case because there is
        an implicit reassociation of the operations involved.  */
    if (dump_enabled_p ())
-      dump_printf_loc
-       (MSG_MISSED_OPTIMIZATION, vect_location,
-        "non chained SLP reduction not supported for strict math.\n");
+      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                      "in-order unchained SLP reductions not supported.\n");
   return false;
  }
@@ -7282,6 +7302,11 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
      return false;
    }
 
+  if (slp_node)
+    vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
+  else
+    vec_num = 1;
+
  internal_fn cond_fn = get_conditional_internal_fn (code, scalar_type);
 
  /* In a speculative loop, the update must be predicated on the
@@ -7291,25 +7316,20 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
  if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo))
    masks = &LOOP_VINFO_NONSPECULATIVE_MASKS (loop_vinfo);
 
-  if (slp_node)
-    vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
-  else
-    vec_num = 1;
-
  if (!vec_stmt) /* transformation not required.  */
   {
    if (first_p)
      vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies);
    if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
      {
-       if (reduction_type != STRICT_FP_REDUCTION
+       if (reduction_type != FOLD_LEFT_REDUCTION
           && (cond_fn == IFN_LAST
              || !direct_internal_fn_supported_p (cond_fn, vectype_in,
                                                  OPTIMIZE_FOR_SPEED)))
         {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "Can't use a fully-masked loop because no"
+                            "can't use a fully-masked loop because no"
                             " conditional operation is available.\n");
          LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
         }
@@ -7317,7 +7337,7 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
        {
         if (dump_enabled_p ())
           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                           "Can't use a fully-masked loop for chained"
+                           "can't use a fully-masked loop for chained"
                            " reductions.\n");
         LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
        }
@@ -7346,15 +7366,15 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
      && masks == &LOOP_VINFO_NONSPECULATIVE_MASKS (loop_vinfo))
    gsi = &nonspeculative_gsi;
 
-  if (reduction_type == STRICT_FP_REDUCTION)
-    return vectorized_strict_reduction
+  if (reduction_type == FOLD_LEFT_REDUCTION)
+    return vectorize_fold_left_reduction
      (stmt, gsi, vec_stmt, slp_node, reduc_def_stmt, code,
-       epilog_reduc_code, op_type, ops, vectype_in, reduc_index, masks);
+       epilog_reduc_code, ops, vectype_in, reduc_index, masks);
 
-  if (reduction_type == COND_REDUCTION_CLASTB)
+  if (reduction_type == EXTRACT_LAST_REDUCTION)
    {
    gcc_assert (!slp_node);
-      return vectorizable_condition (stmt, gsi, vec_stmt,
+      return vectorizable_condition (stmt, gsi, vec_stmt, NULL,
                                     reduc_index, NULL);
    }
@@ -8217,42 +8237,6 @@ vectorizable_live_operation (gimple *stmt,
       }
    }
 
-  /* Check if required operations can be supported.  */
-
-  if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
-                                      OPTIMIZE_FOR_SPEED))
-    {
-      if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo))
-       {
-         if (dump_enabled_p ())
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "Not vectorized: "
-                            "Extract last reduction not supported.\n");
-         return false;
-       }
-
-      if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
-       {
-         if (dump_enabled_p ())
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "Can't use a fully-masked loop because "
-                            "the target doesn't support extract last "
-                            "reduction.\n");
-         LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
-         /* Don't return - we can still vectorize without masking.  */
-       }
-    }
-
-  if (slp_node && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
-    {
-      if (dump_enabled_p ())
-       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "Can't use a fully-masked loop; "
-                        "SLP statement is live after the loop.\n");
-      LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
-      /* Don't return - we can still vectorize without masking.  */
-    }
-
  if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo))
   {
    /* Need to construct the type because on the checking stage we don't
@@ -8264,30 +8248,23 @@ vectorizable_live_operation (gimple *stmt,
       {
        if (dump_enabled_p ())
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                          "Not vectorized: Break after not supported.\n");
+                          "not vectorized: break after not supported.\n");
        return false;
       }
-    }
-
-  if (ncopies > 1)
-    {
-      if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo))
+      if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
+                                          OPTIMIZE_FOR_SPEED))
       {
        if (dump_enabled_p ())
         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                          "Not vectorized: "
-                          "Multiple ncopies not supported.\n");
+                          "not vectorized: extract last not supported.\n");
        return false;
       }
-
-      if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
+      if (ncopies > 1)
       {
        if (dump_enabled_p ())
         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                          "Can't use a fully-masked loop because"
-                          " ncopies is greater than 1.\n");
-        LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
-        /* Don't return - we can still vectorize without masking.  */
+                          "not vectorized: ncopies is greater than 1.\n");
+        return false;
      }
    }
@@ -8296,9 +8273,39 @@ vectorizable_live_operation (gimple *stmt,
      /* No transformation required.  */
      if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
       {
-         gcc_assert (ncopies == 1 && !slp_node);
-         vect_record_loop_mask (loop_vinfo, &LOOP_VINFO_MASKS (loop_vinfo),
-                                1, vectype);
+         if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
+                                              OPTIMIZE_FOR_SPEED))
+           {
+             if (dump_enabled_p ())
+               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                "can't use a fully-masked loop because "
+                                "the target doesn't support extract last "
+                                "reduction.\n");
+             LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
+           }
+         else if (slp_node)
+           {
+             if (dump_enabled_p ())
+               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                "can't use a fully-masked loop because an "
+                                "SLP statement is live after the loop.\n");
+             LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
+           }
+         else if (ncopies > 1)
+           {
+             if (dump_enabled_p ())
+               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                "can't use a fully-masked loop because"
+                                " ncopies is greater than 1.\n");
+             LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
+           }
+         else
+           {
+             gcc_assert (ncopies == 1 && !slp_node);
+             vect_record_loop_mask (loop_vinfo,
+                                    &LOOP_VINFO_MASKS (loop_vinfo),
+                                    1, vectype);
+           }
       }
      return true;
    }
@@ -8350,19 +8357,16 @@ vectorizable_live_operation (gimple *stmt,
  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
      || LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo))
    {
-      tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
-      tree scalar_res = make_ssa_name (scalar_type);
     tree mask;
-      gimple *new_stmt;
-
     if (LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo))
       {
+         gcc_assert (ncopies == 1);
        tree orig_mask = LOOP_VINFO_EXIT_MASK (loop_vinfo);
        tree all_ones = build_minus_one_cst (TREE_TYPE (orig_mask));
        mask = make_ssa_name (TREE_TYPE (orig_mask));
-         new_stmt = gimple_build_call_internal (IFN_BREAK_AFTER, 2,
-                                                all_ones, orig_mask);
+         gcall *new_stmt = gimple_build_call_internal (IFN_BREAK_AFTER, 2,
+                                                       all_ones, orig_mask);
        gimple_call_set_lhs (new_stmt, mask);
        gimple_seq_add_stmt (&stmts, new_stmt);
       }
     else
       {
        mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
                                   1, vectype, 0);
       }
 
-      new_stmt = gimple_build_call_internal (IFN_EXTRACT_LAST, 2, vec_lhs,
-                                            mask);
+      /* Emit:
+
+          SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
+
+        where VEC_LHS is the vectorized live-out result and MASK is
+        the loop mask for the final iteration.  */
+      tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
+      tree scalar_res = make_ssa_name (scalar_type);
+      gcall *new_stmt = gimple_build_call_internal (IFN_EXTRACT_LAST,
+                                                   2, mask, vec_lhs);
     gimple_call_set_lhs (new_stmt, scalar_res);
     gimple_seq_add_stmt (&stmts, new_stmt);
 
+      /* Convert the extracted vector element to the required scalar type.  */
     new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
   }
  else
@@ -8778,24 +8791,17 @@ vect_transform_loop (loop_vec_info loop_vinfo)
                               &step_vector, &niters_vector_mult_vf, th,
                               check_profitability, niters_no_overflow);
 
-  bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
  if (niters_vector == NULL_TREE
      && !LOOP_VINFO_SPECULATIVE_EXECUTION (loop_vinfo))
   {
    gcc_assert (!LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
    if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-         && must_eq (lowest_vf, vf)
-         && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
-       {
-         wide_int niters_vector_val
-           = (final_iter_may_be_partial
-              ? wi::udiv_ceil (wi::to_wide (LOOP_VINFO_NITERS (loop_vinfo)),
-                               lowest_vf)
-              : wi::udiv_floor (wi::to_wide (LOOP_VINFO_NITERS (loop_vinfo)),
-                                lowest_vf));
+         && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+         && must_eq (lowest_vf, vf))
+       {
        niters_vector
-           = wide_int_to_tree (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
-                               niters_vector_val);
+           = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
+                            LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
        step_vector = build_one_cst (TREE_TYPE (niters));
      }
    else
@@ -9064,13 +9070,11 @@ vect_transform_loop (loop_vec_info loop_vinfo)
          {
           if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
             {
-                 /* Remove all the stores once we've vectorized the
-                    whole group.  */
+                 /* Interleaving.  If IS_STORE is TRUE, the vectorization of the
+                    interleaving chain was completed - free all the stores in
+                    the chain.  */
              gsi_next (&si);
-                 gimple *first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
-                 if (GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))
-                     == GROUP_NUM_STMTS (vinfo_for_stmt (first_stmt)))
-                   vect_remove_stores (first_stmt);
+                 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
             }
           else
             {
@@ -9092,6 +9096,25 @@ vect_transform_loop (loop_vec_info loop_vinfo)
             gsi_next (&si);
           }
        }               /* stmts in BB */
+
+      /* Stub out scalar statements that must not survive vectorization.
+        Doing this here helps with grouped statements, or statements that
+        are involved in patterns.  */
+      for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
+          !gsi_end_p (gsi); gsi_next (&gsi))
+       {
+         gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
+         if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
+           {
+             tree lhs = gimple_get_lhs (call);
+             if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
+               {
+                 tree zero = build_zero_cst (TREE_TYPE (lhs));
+                 gimple *new_stmt = gimple_build_assign (lhs, zero);
+                 gsi_replace (&gsi, new_stmt, true);
+               }
+           }
+       }
    }               /* BBs in loop */
 
  /* Provide the real definition of LOOP_VINFO_EXIT_MASK.  */
@@ -9113,6 +9136,9 @@ vect_transform_loop (loop_vec_info loop_vinfo)
  unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
  scale_profile_for_vect_loop (loop, assumed_vf);
 
+  /* True if the final iteration might not handle a full vector's
+     worth of scalar iterations.  */
+  bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
  /* The minimum number of iterations performed by the epilogue.  This
     is 1 when peeling for gaps because we always need a final scalar
     iteration.  */
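
Editor's note on the FOLD_LEFT_REDUCTION rename above: an in-order (fold-left) reduction must preserve the scalar evaluation order of the accumulation, while the usual tree reduction keeps one partial sum per vector lane and combines the lanes after the loop, which reassociates the operation. The short C sketch below is illustrative only -- it is not part of the patch, the helper names are made up, and a vectorization factor of 4 is assumed. It shows why needs_fold_left_reduction_p rejects reassociation for floating-point types unless -fassociative-math is in effect.

  #include <stdio.h>

  /* What the scalar loop computes: (((s + a[0]) + a[1]) + a[2]) + ...
     A FOLD_LEFT_PLUS_EXPR reduction (e.g. SVE FADDA) keeps exactly
     this order, folding in one element at a time.  */
  static double
  in_order_sum (const double *a, int n)
  {
    double s = 0.0;
    for (int i = 0; i < n; ++i)
      s += a[i];
    return s;
  }

  /* What a tree reduction with VF == 4 computes: four independent
     lane sums that are only combined after the loop.  This
     reassociates the additions.  */
  static double
  tree_reduction_sum (const double *a, int n)
  {
    double lane[4] = { 0.0, 0.0, 0.0, 0.0 };
    for (int i = 0; i < n; i += 4)
      for (int k = 0; k < 4; ++k)
        lane[k] += a[i + k];
    return (lane[0] + lane[1]) + (lane[2] + lane[3]);
  }

  int
  main (void)
  {
    /* Values chosen so that reassociation changes the rounded result:
       adding 1.0 to 1e16 is absorbed by rounding, but adding 1.0 to a
       small lane sum is not.  */
    double a[8] = { 1e16, 1.0, 1.0, 1.0, -1e16, 1.0, 1.0, 1.0 };
    printf ("in-order sum:       %.17g\n", in_order_sum (a, 8));       /* 3 */
    printf ("tree-reduction sum: %.17g\n", tree_reduction_sum (a, 8)); /* 6 */
    return 0;
  }

The two results differ (3 versus 6) precisely because of the implicit reassociation. That is the behaviour the FOLD_LEFT_REDUCTION path exists to avoid, and it is also why the patch rejects in-order double reductions and unchained SLP reductions above: both inherently reassociate the scalar operations.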