2013-06-26 Basile Starynkevitch <basile@starynkevitch.net>

{{merged with trunk [4.9] svn rev. 196654-200426}} MELT branch merged with trunk rev. 200426 using svnmerge.py [gcc/] 2013-06-26 Basile Starynkevitch <basile@starynkevitch.net> {{merge with trunk [4.9] svn rev. 196654-200426}} * melt-runtime.c (melt_val2passflag): TODO_ggc_collect & TODO_do_not_ggc_collect are conditionalized. * melt/generated/warmelt-first+03.cc: Manually remove calls to MELT_TRACE_EXIT_LOCATION macro. * melt/generated/warmelt-base+03.cc: Ditto. git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/branches/melt-branch@200430 138bc75d-0d04-0410-961f-82ee72b054a4
author: bstarynk <bstarynk@138bc75d-0d04-0410-961f-82ee72b054a4> 2013-06-26 18:39:06 +0000
committer: bstarynk <bstarynk@138bc75d-0d04-0410-961f-82ee72b054a4> 2013-06-26 18:39:06 +0000
commit: 21543d4cd558cada630271a0cf3075ad7ce94cbf (patch)
tree: 08bdb3f3e0a9d0f71e72bb56d9ddb7b916e7dfeb /gcc/tree-vect-slp.c
parent: ed0bc1ffb674fe93d0df68654b5bb76869f0bc8c (diff)
download: gcc-21543d4cd558cada630271a0cf3075ad7ce94cbf.tar.gz
1 files changed, 661 insertions, 727 deletions
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index e184326c3b5..ea8c202d187 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -67,17 +67,18 @@ static void
 vect_free_slp_tree (slp_tree node)
 {
   int i;
-  slp_void_p child;
+  slp_tree child;
 
   if (!node)
     return;
 
   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
-    vect_free_slp_tree ((slp_tree) child);
+    vect_free_slp_tree (child);
 
   SLP_TREE_CHILDREN (node).release ();
   SLP_TREE_SCALAR_STMTS (node).release ();
   SLP_TREE_VEC_STMTS (node).release ();
+  SLP_TREE_LOAD_PERMUTATION (node).release ();
 
   free (node);
 }
@@ -89,7 +90,6 @@ void
 vect_free_slp_instance (slp_instance instance)
 {
   vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
-  SLP_INSTANCE_LOAD_PERMUTATION (instance).release ();
   SLP_INSTANCE_LOADS (instance).release ();
   SLP_INSTANCE_BODY_COST_VEC (instance).release ();
   free (instance);
@@ -120,6 +120,7 @@ vect_create_new_slp_node (vec<gimple> scalar_stmts)
   SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
   SLP_TREE_VEC_STMTS (node).create (0);
   SLP_TREE_CHILDREN (node).create (nops);
+  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
 
   return node;
 }
@@ -140,8 +141,7 @@ vect_create_oprnd_info (int nops, int group_size)
       oprnd_info = XNEW (struct _slp_oprnd_info);
       oprnd_info->def_stmts.create (group_size);
       oprnd_info->first_dt = vect_uninitialized_def;
-      oprnd_info->first_def_type = NULL_TREE;
-      oprnd_info->first_const_oprnd = NULL_TREE;
+      oprnd_info->first_op_type = NULL_TREE;
       oprnd_info->first_pattern = false;
       oprnds_info.quick_push (oprnd_info);
     }
@@ -168,31 +168,48 @@ vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
 }
 
 
+/* Find the place of the data-ref in STMT in the interleaving chain that starts
+   from FIRST_STMT.  Return -1 if the data-ref is not a part of the chain.  */
+
+static int
+vect_get_place_in_interleaving_chain (gimple stmt, gimple first_stmt)
+{
+  gimple next_stmt = first_stmt;
+  int result = 0;
+
+  if (first_stmt != GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
+    return -1;
+
+  do
+    {
+      if (next_stmt == stmt)
+	return result;
+      result++;
+      next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
+    }
+  while (next_stmt);
+
+  return -1;
+}
+
+
 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
    they are of a valid type and that they match the defs of the first stmt of
    the SLP group (stored in OPRNDS_INFO).  */
 
 static bool
 vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
-                             slp_tree slp_node, gimple stmt,
-			     int ncopies_for_cost, bool first,
-                             vec<slp_oprnd_info> *oprnds_info,
-			     stmt_vector_for_cost *prologue_cost_vec,
-			     stmt_vector_for_cost *body_cost_vec)
+                             gimple stmt, bool first,
+                             vec<slp_oprnd_info> *oprnds_info)
 {
   tree oprnd;
   unsigned int i, number_of_oprnds;
-  tree def, def_op0 = NULL_TREE;
+  tree def;
   gimple def_stmt;
   enum vect_def_type dt = vect_uninitialized_def;
-  enum vect_def_type dt_op0 = vect_uninitialized_def;
-  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
-  tree lhs = gimple_get_lhs (stmt);
   struct loop *loop = NULL;
-  enum tree_code rhs_code;
-  bool different_types = false;
   bool pattern = false;
-  slp_oprnd_info oprnd_info, oprnd0_info, oprnd1_info;
+  slp_oprnd_info oprnd_info;
   int op_idx = 1;
   tree compare_rhs = NULL_TREE;
 
@@ -304,38 +321,7 @@ vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
 	{
 	  oprnd_info->first_dt = dt;
 	  oprnd_info->first_pattern = pattern;
-	  if (def)
-	    {
-	      oprnd_info->first_def_type = TREE_TYPE (def);
-	      oprnd_info->first_const_oprnd = NULL_TREE;
-	    }
-	  else
-            {
-              oprnd_info->first_def_type = NULL_TREE;
-              oprnd_info->first_const_oprnd = oprnd;
-            }
-
-	  if (i == 0)
-	    {
-	      def_op0 = def;
-	      dt_op0 = dt;
-	      /* Analyze costs (for the first stmt of the group only).  */
-	      if (REFERENCE_CLASS_P (lhs))
-		/* Store.  */
-                vect_model_store_cost (stmt_info, ncopies_for_cost, false,
-				       dt, slp_node, prologue_cost_vec,
-				       body_cost_vec);
-	      else
-		{
-		  enum vect_def_type dts[2];
-		  dts[0] = dt;
-		  dts[1] = vect_uninitialized_def;
-		  /* Not memory operation (we don't call this function for
-		     loads).  */
-		  vect_model_simple_cost (stmt_info, ncopies_for_cost, dts,
-					  prologue_cost_vec, body_cost_vec);
-		}
-	    }
+	  oprnd_info->first_op_type = TREE_TYPE (oprnd);
 	}
       else
 	{
@@ -346,64 +332,19 @@ vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
 	     vect_internal_def.  */
 	  if (((oprnd_info->first_dt != dt
                 && !(oprnd_info->first_dt == vect_reduction_def
-                     && dt == vect_internal_def))
-               || (oprnd_info->first_def_type != NULL_TREE
-		   && def
-		   && !types_compatible_p (oprnd_info->first_def_type,
-					   TREE_TYPE (def))))
-	       || (!def
-		   && !types_compatible_p (TREE_TYPE (oprnd_info->first_const_oprnd),
-					   TREE_TYPE (oprnd)))
-	       || different_types)
+                     && dt == vect_internal_def)
+		&& !((oprnd_info->first_dt == vect_external_def
+		      || oprnd_info->first_dt == vect_constant_def)
+		     && (dt == vect_external_def
+			 || dt == vect_constant_def)))
+               || !types_compatible_p (oprnd_info->first_op_type,
+				       TREE_TYPE (oprnd))))
 	    {
-	      if (number_of_oprnds != 2)
-		{
-		  if (dump_enabled_p ())
-		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-				     "Build SLP failed: different types ");
-
-		  return false;
-                }
-
-	      /* Try to swap operands in case of binary operation.  */
-              if (i == 0)
-                different_types = true;
-              else
-		{
-		  oprnd0_info = (*oprnds_info)[0];
-		  if (is_gimple_assign (stmt)
-		      && (rhs_code = gimple_assign_rhs_code (stmt))
- 		      && TREE_CODE_CLASS (rhs_code) == tcc_binary
-		      && commutative_tree_code (rhs_code)
-		      && oprnd0_info->first_dt == dt
-		      && oprnd_info->first_dt == dt_op0
-		      && def_op0 && def
-		      && !(oprnd0_info->first_def_type
-			   && !types_compatible_p (oprnd0_info->first_def_type,
-			                           TREE_TYPE (def)))
-                      && !(oprnd_info->first_def_type
-                           && !types_compatible_p (oprnd_info->first_def_type,
-                                                   TREE_TYPE (def_op0))))
-                    {
-                      if (dump_enabled_p ())
-	                {
-			  dump_printf_loc (MSG_NOTE, vect_location,
-					   "Swapping operands of ");
- 		          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
-			}
-
- 		      swap_tree_operands (stmt, gimple_assign_rhs1_ptr (stmt),
- 	                                  gimple_assign_rhs2_ptr (stmt));
-		    }
-                  else
-                    {
-         	      if (dump_enabled_p ())
-			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-					 "Build SLP failed: different types ");
+	      if (dump_enabled_p ())
+		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+				 "Build SLP failed: different types ");
 
-		      return false;
-		    }
-		}
+	      return false;
 	    }
 	}
 
@@ -416,18 +357,7 @@ vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
 	  break;
 
 	case vect_internal_def:
-          if (different_types)
-            {
-	      oprnd0_info = (*oprnds_info)[0];
-	      oprnd1_info = (*oprnds_info)[0];
-              if (i == 0)
-                oprnd1_info->def_stmts.quick_push (def_stmt);
-              else
-                oprnd0_info->def_stmts.quick_push (def_stmt);
-            }
-	  else
- 	    oprnd_info->def_stmts.quick_push (def_stmt);
-
+	  oprnd_info->def_stmts.quick_push (def_stmt);
 	  break;
 
 	default:
@@ -447,60 +377,40 @@ vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
 }
 
 
-/* Recursively build an SLP tree starting from NODE.
-   Fail (and return FALSE) if def-stmts are not isomorphic, require data
-   permutation or are of unsupported types of operation.  Otherwise, return
-   TRUE.  */
+/* Verify if the scalar stmts STMTS are isomorphic, require data
+   permutation or are of unsupported types of operation.  Return
+   true if they are, otherwise return false and indicate in *MATCHES
+   which stmts are not isomorphic to the first one.  If MATCHES[0]
+   is false then this indicates the comparison could not be
+   carried out or the stmts will never be vectorized by SLP.  */
 
 static bool
-vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
-                     slp_tree *node, unsigned int group_size, int *outside_cost,
-                     int ncopies_for_cost, unsigned int *max_nunits,
-                     vec<int> *load_permutation,
-                     vec<slp_tree> *loads,
-                     unsigned int vectorization_factor, bool *loads_permuted,
-		     stmt_vector_for_cost *prologue_cost_vec,
-		     stmt_vector_for_cost *body_cost_vec)
+vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
+		       vec<gimple> stmts, unsigned int group_size,
+		       unsigned nops, unsigned int *max_nunits,
+		       unsigned int vectorization_factor, bool *matches)
 {
   unsigned int i;
-  vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (*node);
   gimple stmt = stmts[0];
   enum tree_code first_stmt_code = ERROR_MARK, rhs_code = ERROR_MARK;
   enum tree_code first_cond_code = ERROR_MARK;
   tree lhs;
-  bool stop_recursion = false, need_same_oprnds = false;
+  bool need_same_oprnds = false;
   tree vectype, scalar_type, first_op1 = NULL_TREE;
-  unsigned int ncopies;
   optab optab;
   int icode;
   enum machine_mode optab_op2_mode;
   enum machine_mode vec_mode;
   struct data_reference *first_dr;
   HOST_WIDE_INT dummy;
-  bool permutation = false;
-  unsigned int load_place;
   gimple first_load = NULL, prev_first_load = NULL, old_first_load = NULL;
-  vec<slp_oprnd_info> oprnds_info;
-  unsigned int nops;
-  slp_oprnd_info oprnd_info;
   tree cond;
 
-  if (is_gimple_call (stmt))
-    nops = gimple_call_num_args (stmt);
-  else if (is_gimple_assign (stmt))
-    {
-      nops = gimple_num_ops (stmt) - 1;
-      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
-	nops++;
-    }
-  else
-    return false;
-
-  oprnds_info = vect_create_oprnd_info (nops, group_size);
-
   /* For every stmt in NODE find its def stmt/s.  */
   FOR_EACH_VEC_ELT (stmts, i, stmt)
     {
+      matches[i] = false;
+
       if (dump_enabled_p ())
 	{
 	  dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for ");
@@ -516,8 +426,8 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
 			       "Build SLP failed: unvectorizable statement ");
               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
             }
-
-	  vect_free_oprnd_info (oprnds_info);
+	  /* Fatal mismatch.  */
+	  matches[0] = false;
           return false;
         }
 
@@ -531,8 +441,8 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
 			       "GIMPLE_CALL ");
 	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
 	    }
-
-	  vect_free_oprnd_info (oprnds_info);
+	  /* Fatal mismatch.  */
+	  matches[0] = false;
 	  return false;
 	}
 
@@ -548,8 +458,8 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
 			       "comparison ");
               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
             }
-
-	  vect_free_oprnd_info (oprnds_info);
+	  /* Fatal mismatch.  */
+	  matches[0] = false;
           return false;
         }
 
@@ -564,8 +474,8 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 				 scalar_type);
             }
-
-	  vect_free_oprnd_info (oprnds_info);
+	  /* Fatal mismatch.  */
+	  matches[0] = false;
           return false;
         }
 
@@ -577,8 +487,6 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
             vectorization_factor = *max_nunits;
         }
 
-      ncopies = vectorization_factor / TYPE_VECTOR_SUBPARTS (vectype);
-
       if (is_gimple_call (stmt))
 	{
 	  rhs_code = CALL_EXPR;
@@ -594,8 +502,8 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
 				   "Build SLP failed: unsupported call type ");
 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
 		}
-
-	      vect_free_oprnd_info (oprnds_info);
+	      /* Fatal mismatch.  */
+	      matches[0] = false;
 	      return false;
 	    }
 	}
@@ -631,7 +539,8 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
 		      if (dump_enabled_p ())
 			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 					 "Build SLP failed: no optab.");
-	  	      vect_free_oprnd_info (oprnds_info);
+		      /* Fatal mismatch.  */
+		      matches[0] = false;
 		      return false;
 		    }
 		  icode = (int) optab_handler (optab, vec_mode);
@@ -641,7 +550,8 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
 			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 					 "Build SLP failed: "
 					 "op not supported by target.");
-	  	      vect_free_oprnd_info (oprnds_info);
+		      /* Fatal mismatch.  */
+		      matches[0] = false;
 		      return false;
 		    }
 		  optab_op2_mode = insn_data[icode].operand[2].mode;
@@ -667,6 +577,7 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
 		  || rhs_code != IMAGPART_EXPR)
               && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
                    && (first_stmt_code == ARRAY_REF
+                       || first_stmt_code == BIT_FIELD_REF
                        || first_stmt_code == INDIRECT_REF
                        || first_stmt_code == COMPONENT_REF
                        || first_stmt_code == MEM_REF)))
@@ -678,9 +589,8 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
 				   "in stmt ");
 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
 		}
-
-	      vect_free_oprnd_info (oprnds_info);
-	      return false;
+	      /* Mismatch.  */
+	      continue;
 	    }
 
 	  if (need_same_oprnds
@@ -693,9 +603,8 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
 				   "arguments in ");
 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
 		}
-
-	      vect_free_oprnd_info (oprnds_info);
-	      return false;
+	      /* Mismatch.  */
+	      continue;
 	    }
 
 	  if (rhs_code == CALL_EXPR)
@@ -714,9 +623,8 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
 		      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 					stmt, 0);
 		    }
-
-		  vect_free_oprnd_info (oprnds_info);
-		  return false;
+		  /* Mismatch.  */
+		  continue;
 		}
 	    }
 	}
@@ -727,24 +635,24 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
 	  if (REFERENCE_CLASS_P (lhs))
 	    {
 	      /* Store.  */
-	      if (!vect_get_and_check_slp_defs (loop_vinfo, bb_vinfo, *node,
-						stmt, ncopies_for_cost,
-						(i == 0), &oprnds_info,
-						prologue_cost_vec,
-						body_cost_vec))
-		{
-	  	  vect_free_oprnd_info (oprnds_info);
- 		  return false;
-		}
+	      ;
 	    }
 	  else
 	    {
 	      /* Load.  */
-              /* FORNOW: Check that there is no gap between the loads.  */
-              if ((GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
-                   && GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
-                  || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != stmt
-                      && GROUP_GAP (vinfo_for_stmt (stmt)) != 1))
+	      unsigned unrolling_factor
+		= least_common_multiple
+		    (*max_nunits, group_size) / group_size;
+              /* FORNOW: Check that there is no gap between the loads
+		 and no gap between the groups when we need to load
+		 multiple groups at once.
+		 ???  We should enhance this to only disallow gaps
+		 inside vectors.  */
+              if ((unrolling_factor > 1
+		   && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
+		   && GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
+		  || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != stmt
+		      && GROUP_GAP (vinfo_for_stmt (stmt)) != 1))
                 {
                   if (dump_enabled_p ())
                     {
@@ -754,15 +662,20 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                       dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 					stmt, 0);
                     }
-
-	  	  vect_free_oprnd_info (oprnds_info);
+		  /* Fatal mismatch.  */
+		  matches[0] = false;
                   return false;
                 }
 
               /* Check that the size of interleaved loads group is not
                  greater than the SLP group size.  */
+	      unsigned ncopies
+		= vectorization_factor / TYPE_VECTOR_SUBPARTS (vectype);
               if (loop_vinfo
-                  && GROUP_SIZE (vinfo_for_stmt (stmt)) > ncopies * group_size)
+		  && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
+                  && ((GROUP_SIZE (vinfo_for_stmt (stmt))
+		       - GROUP_GAP (vinfo_for_stmt (stmt)))
+		      > ncopies * group_size))
                 {
                   if (dump_enabled_p ())
                     {
@@ -773,8 +686,8 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                       dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 					stmt, 0);
                     }
-
-	  	  vect_free_oprnd_info (oprnds_info);
+		  /* Fatal mismatch.  */
+		  matches[0] = false;
                   return false;
                 }
 
@@ -783,11 +696,8 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
               if (prev_first_load)
                 {
                   /* Check that there are no loads from different interleaving
-                     chains in the same node.  The only exception is complex
-                     numbers.  */
-                  if (prev_first_load != first_load
-                      && rhs_code != REALPART_EXPR
-                      && rhs_code != IMAGPART_EXPR)
+                     chains in the same node.  */
+                  if (prev_first_load != first_load)
                     {
                       if (dump_enabled_p ())
                         {
@@ -798,9 +708,8 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 					    stmt, 0);
                         }
-
-	  	      vect_free_oprnd_info (oprnds_info);
-                      return false;
+		      /* Mismatch.  */
+		      continue;
                     }
                 }
               else
@@ -823,30 +732,11 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 					    stmt, 0);
                         }
-
-	  	      vect_free_oprnd_info (oprnds_info);
+		      /* Fatal mismatch.  */
+		      matches[0] = false;
                       return false;
                     }
-
-                  /* Analyze costs (for the first stmt in the group).  */
-                  vect_model_load_cost (vinfo_for_stmt (stmt),
-                                        ncopies_for_cost, false, *node,
-					prologue_cost_vec, body_cost_vec);
                 }
-
-              /* Store the place of this load in the interleaving chain.  In
-                 case that permutation is needed we later decide if a specific
-                 permutation is supported.  */
-              load_place = vect_get_place_in_interleaving_chain (stmt,
-                                                                 first_load);
-              if (load_place != i)
-                permutation = true;
-
-              load_permutation->safe_push (load_place);
-
-              /* We stop the tree when we reach a group of loads.  */
-              stop_recursion = true;
-             continue;
            }
         } /* Grouped access.  */
       else
@@ -862,7 +752,8 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
 		}
 
 	      /* FORNOW: Not grouped loads are not supported.  */
-	      vect_free_oprnd_info (oprnds_info);
+	      /* Fatal mismatch.  */
+	      matches[0] = false;
 	      return false;
 	    }
 
@@ -879,8 +770,8 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
 		  dump_printf (MSG_MISSED_OPTIMIZATION, " unsupported ");
 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
 		}
-
-	      vect_free_oprnd_info (oprnds_info);
+	      /* Fatal mismatch.  */
+	      matches[0] = false;
 	      return false;
 	    }
 
@@ -900,73 +791,161 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                       dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 					stmt, 0);
                     }
-
-		  vect_free_oprnd_info (oprnds_info);
-                  return false;
+		  /* Mismatch.  */
+		  continue;
 		}
             }
-
-	  /* Find the def-stmts.  */
-	  if (!vect_get_and_check_slp_defs (loop_vinfo, bb_vinfo, *node, stmt,
-					    ncopies_for_cost, (i == 0),
-					    &oprnds_info, prologue_cost_vec,
-					    body_cost_vec))
-	    {
-	      vect_free_oprnd_info (oprnds_info);
-	      return false;
-	    }
 	}
+
+      matches[i] = true;
+    }
+
+  for (i = 0; i < group_size; ++i)
+    if (!matches[i])
+      return false;
+
+  return true;
+}
+
+/* Recursively build an SLP tree starting from NODE.
+   Fail (and return a value not equal to zero) if def-stmts are not
+   isomorphic, require data permutation or are of unsupported types of
+   operation.  Otherwise, return 0.
+   The value returned is the depth in the SLP tree where a mismatch
+   was found.  */
+
+static bool
+vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
+                     slp_tree *node, unsigned int group_size,
+                     unsigned int *max_nunits,
+                     vec<slp_tree> *loads,
+                     unsigned int vectorization_factor,
+		     bool *matches, unsigned *npermutes)
+{
+  unsigned nops, i, this_npermutes = 0;
+  gimple stmt;
+
+  if (!matches)
+    matches = XALLOCAVEC (bool, group_size);
+  if (!npermutes)
+    npermutes = &this_npermutes;
+
+  matches[0] = false;
+
+  stmt = SLP_TREE_SCALAR_STMTS (*node)[0];
+  if (is_gimple_call (stmt))
+    nops = gimple_call_num_args (stmt);
+  else if (is_gimple_assign (stmt))
+    {
+      nops = gimple_num_ops (stmt) - 1;
+      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
+	nops++;
     }
+  else
+    return false;
+
+  if (!vect_build_slp_tree_1 (loop_vinfo, bb_vinfo,
+			      SLP_TREE_SCALAR_STMTS (*node), group_size, nops,
+			      max_nunits, vectorization_factor, matches))
+    return false;
 
-  /* Grouped loads were reached - stop the recursion.  */
-  if (stop_recursion)
+  /* If the SLP node is a load, terminate the recursion.  */
+  if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
+      && DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
     {
       loads->safe_push (*node);
-      if (permutation)
-        {
-	  gimple first_stmt = stmts[0];
-          *loads_permuted = true;
-	  (void) record_stmt_cost (body_cost_vec, group_size, vec_perm, 
-				   vinfo_for_stmt (first_stmt), 0, vect_body);
-        }
-      else
-        {
-          /* We don't check here complex numbers chains, so we set
-             LOADS_PERMUTED for further check in
-             vect_supported_load_permutation_p.  */
-          if (rhs_code == REALPART_EXPR || rhs_code == IMAGPART_EXPR)
-            *loads_permuted = true;
-        }
-
-      vect_free_oprnd_info (oprnds_info);
       return true;
     }
 
+  /* Get at the operands, verifying they are compatible.  */
+  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
+  slp_oprnd_info oprnd_info;
+  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (*node), i, stmt)
+    {
+      if (!vect_get_and_check_slp_defs (loop_vinfo, bb_vinfo,
+					stmt, (i == 0), &oprnds_info))
+	{
+	  vect_free_oprnd_info (oprnds_info);
+	  return false;
+	}
+    }
+
+  stmt = SLP_TREE_SCALAR_STMTS (*node)[0];
+
   /* Create SLP_TREE nodes for the definition node/s.  */
   FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
     {
       slp_tree child;
+      unsigned old_nloads = loads->length ();
+      unsigned old_max_nunits = *max_nunits;
 
       if (oprnd_info->first_dt != vect_internal_def)
         continue;
 
       child = vect_create_new_slp_node (oprnd_info->def_stmts);
-      if (!child
-          || !vect_build_slp_tree (loop_vinfo, bb_vinfo, &child, group_size,
-				   outside_cost, ncopies_for_cost,
-				   max_nunits, load_permutation, loads,
-				   vectorization_factor, loads_permuted,
-				   prologue_cost_vec, body_cost_vec))
-        {
-	  if (child)
-	    oprnd_info->def_stmts = vNULL;
-	  vect_free_slp_tree (child);
+      if (!child)
+	{
 	  vect_free_oprnd_info (oprnds_info);
-   	  return false;
+	  return false;
+	}
+
+      bool *matches = XALLOCAVEC (bool, group_size);
+      if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &child,
+			       group_size, max_nunits, loads,
+			       vectorization_factor, matches, npermutes))
+	{
+	  oprnd_info->def_stmts = vNULL;
+	  SLP_TREE_CHILDREN (*node).quick_push (child);
+	  continue;
 	}
 
-      oprnd_info->def_stmts.create (0);
-      SLP_TREE_CHILDREN (*node).quick_push (child);
+      /* If the SLP build for operand zero failed and operand zero
+	 and one can be commutated try that for the scalar stmts
+	 that failed the match.  */
+      if (i == 0
+	  /* A first scalar stmt mismatch signals a fatal mismatch.  */
+	  && matches[0]
+	  /* ???  For COND_EXPRs we can swap the comparison operands
+	     as well as the arms under some constraints.  */
+	  && nops == 2
+	  && oprnds_info[1]->first_dt == vect_internal_def
+	  && is_gimple_assign (stmt)
+	  && commutative_tree_code (gimple_assign_rhs_code (stmt))
+	  /* Do so only if the number of not successful permutes was nor more
+	     than a cut-ff as re-trying the recursive match on
+	     possibly each level of the tree would expose exponential
+	     behavior.  */
+	  && *npermutes < 4)
+	{
+	  /* Roll back.  */
+	  *max_nunits = old_max_nunits;
+	  loads->truncate (old_nloads);
+	  /* Swap mismatched definition stmts.  */
+	  for (unsigned j = 0; j < group_size; ++j)
+	    if (!matches[j])
+	      {
+		gimple tem = oprnds_info[0]->def_stmts[j];
+		oprnds_info[0]->def_stmts[j] = oprnds_info[1]->def_stmts[j];
+		oprnds_info[1]->def_stmts[j] = tem;
+	      }
+	  /* And try again ... */
+	  if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &child,
+				   group_size, max_nunits, loads,
+				   vectorization_factor,
+				   matches, npermutes))
+	    {
+	      oprnd_info->def_stmts = vNULL;
+	      SLP_TREE_CHILDREN (*node).quick_push (child);
+	      continue;
+	    }
+
+	  ++*npermutes;
+	}
+
+      oprnd_info->def_stmts = vNULL;
+      vect_free_slp_tree (child);
+      vect_free_oprnd_info (oprnds_info);
+      return false;
     }
 
   vect_free_oprnd_info (oprnds_info);
@@ -980,7 +959,7 @@ vect_print_slp_tree (int dump_kind, slp_tree node)
 {
   int i;
   gimple stmt;
-  slp_void_p child;
+  slp_tree child;
 
   if (!node)
     return;
@@ -994,7 +973,7 @@ vect_print_slp_tree (int dump_kind, slp_tree node)
   dump_printf (dump_kind, "\n");
 
   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
-    vect_print_slp_tree (dump_kind, (slp_tree) child);
+    vect_print_slp_tree (dump_kind, child);
 }
 
 
@@ -1008,7 +987,7 @@ vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
 {
   int i;
   gimple stmt;
-  slp_void_p child;
+  slp_tree child;
 
   if (!node)
     return;
@@ -1018,7 +997,7 @@ vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
       STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark;
 
   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
-    vect_mark_slp_stmts ((slp_tree) child, mark, j);
+    vect_mark_slp_stmts (child, mark, j);
 }
 
 
@@ -1030,7 +1009,7 @@ vect_mark_slp_stmts_relevant (slp_tree node)
   int i;
   gimple stmt;
   stmt_vec_info stmt_info;
-  slp_void_p child;
+  slp_tree child;
 
   if (!node)
     return;
@@ -1044,69 +1023,7 @@ vect_mark_slp_stmts_relevant (slp_tree node)
     }
 
   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
-    vect_mark_slp_stmts_relevant ((slp_tree) child);
-}
-
-
-/* Check if the permutation required by the SLP INSTANCE is supported.
-   Reorganize the SLP nodes stored in SLP_INSTANCE_LOADS if needed.  */
-
-static bool
-vect_supported_slp_permutation_p (slp_instance instance)
-{
-  slp_tree node = SLP_INSTANCE_LOADS (instance)[0];
-  gimple stmt = SLP_TREE_SCALAR_STMTS (node)[0];
-  gimple first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
-  vec<slp_tree> sorted_loads = vNULL;
-  int index;
-  slp_tree *tmp_loads = NULL;
-  int group_size = SLP_INSTANCE_GROUP_SIZE (instance), i, j;
-  slp_tree load;
-
-  /* FORNOW: The only supported loads permutation is loads from the same
-     location in all the loads in the node, when the data-refs in
-     nodes of LOADS constitute an interleaving chain.
-     Sort the nodes according to the order of accesses in the chain.  */
-  tmp_loads = (slp_tree *) xmalloc (sizeof (slp_tree) * group_size);
-  for (i = 0, j = 0;
-       SLP_INSTANCE_LOAD_PERMUTATION (instance).iterate (i, &index)
-       && SLP_INSTANCE_LOADS (instance).iterate (j, &load);
-       i += group_size, j++)
-    {
-      gimple scalar_stmt = SLP_TREE_SCALAR_STMTS (load)[0];
-      /* Check that the loads are all in the same interleaving chain.  */
-      if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (scalar_stmt)) != first_load)
-        {
-          if (dump_enabled_p ())
-            {
-              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-			       "Build SLP failed: unsupported data "
-			       "permutation ");
-              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
-				scalar_stmt, 0);
-            }
-
-          free (tmp_loads);
-          return false;
-        }
-
-      tmp_loads[index] = load;
-    }
-
-  sorted_loads.create (group_size);
-  for (i = 0; i < group_size; i++)
-     sorted_loads.safe_push (tmp_loads[i]);
-
-  SLP_INSTANCE_LOADS (instance).release ();
-  SLP_INSTANCE_LOADS (instance) = sorted_loads;
-  free (tmp_loads);
-
-  if (!vect_transform_slp_perm_load (stmt, vNULL, NULL,
-                                     SLP_INSTANCE_UNROLLING_FACTOR (instance),
-                                     instance, true))
-    return false;
-
-  return true;
+    vect_mark_slp_stmts_relevant (child);
 }
 
 
@@ -1114,63 +1031,51 @@ vect_supported_slp_permutation_p (slp_instance instance)
 
 static void
 vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
-                          vec<int> permutation)
+                          vec<unsigned> permutation)
 {
   gimple stmt;
   vec<gimple> tmp_stmts;
-  unsigned int index, i;
-  slp_void_p child;
-
-  if (!node)
-    return;
+  unsigned int i;
+  slp_tree child;
 
   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
-    vect_slp_rearrange_stmts ((slp_tree) child, group_size, permutation);
+    vect_slp_rearrange_stmts (child, group_size, permutation);
 
   gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
   tmp_stmts.create (group_size);
-
-  for (i = 0; i < group_size; i++)
-    tmp_stmts.safe_push (NULL);
+  tmp_stmts.quick_grow_cleared (group_size);
 
   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
-    {
-      index = permutation[i];
-      tmp_stmts[index] = stmt;
-    }
+    tmp_stmts[permutation[i]] = stmt;
 
   SLP_TREE_SCALAR_STMTS (node).release ();
   SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
 }
 
 
-/* Check if the required load permutation is supported.
-   LOAD_PERMUTATION contains a list of indices of the loads.
-   In SLP this permutation is relative to the order of grouped stores that are
-   the base of the SLP instance.  */
+/* Check if the required load permutations in the SLP instance
+   SLP_INSTN are supported.  */
 
 static bool
-vect_supported_load_permutation_p (slp_instance slp_instn, int group_size,
-                                   vec<int> load_permutation)
+vect_supported_load_permutation_p (slp_instance slp_instn)
 {
-  int i = 0, j, prev = -1, next, k, number_of_groups;
-  bool supported, bad_permutation = false;
+  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
+  unsigned int i, j, k, next;
   sbitmap load_index;
-  slp_tree node, other_complex_node;
-  gimple stmt, first = NULL, other_node_first, load, next_load, first_load;
-  unsigned complex_numbers = 0;
+  slp_tree node;
+  gimple stmt, load, next_load, first_load;
   struct data_reference *dr;
-  bb_vec_info bb_vinfo;
-
-  /* FORNOW: permutations are only supported in SLP.  */
-  if (!slp_instn)
-    return false;
 
   if (dump_enabled_p ())
     {
       dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
-      FOR_EACH_VEC_ELT (load_permutation, i, next)
-        dump_printf (MSG_NOTE, "%d ", next);
+      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
+	if (node->load_permutation.exists ())
+	  FOR_EACH_VEC_ELT (node->load_permutation, j, next)
+	    dump_printf (MSG_NOTE, "%d ", next);
+	else
+	  for (i = 0; i < group_size; ++i)
+	    dump_printf (MSG_NOTE, "%d ", i);
     }
 
   /* In case of reduction every load permutation is allowed, since the order
@@ -1181,266 +1086,161 @@ vect_supported_load_permutation_p (slp_instance slp_instn, int group_size,
      permutation).  */
 
   /* Check that all the load nodes are of the same size.  */
+  /* ???  Can't we assert this? */
   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
-    {
-      if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
-        return false;
-
-      stmt = SLP_TREE_SCALAR_STMTS (node)[0];
-      if (is_gimple_assign (stmt) 
-          && (gimple_assign_rhs_code (stmt) == REALPART_EXPR
-              || gimple_assign_rhs_code (stmt) == IMAGPART_EXPR))
-        complex_numbers++;
-    }
-
-  /* Complex operands can be swapped as following:
-      real_c = real_b + real_a;
-      imag_c = imag_a + imag_b;
-     i.e., we have {real_b, imag_a} and {real_a, imag_b} instead of 
-     {real_a, imag_a} and {real_b, imag_b}.  We check here that if interleaving
-     chains are mixed, they match the above pattern.  */
-  if (complex_numbers)
-    {
-      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
-        {
-	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, stmt)
-            {
-              if (j == 0)
-                first = stmt;
-              else
-                {
-                  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != first)
-                    {
-                      if (complex_numbers != 2)
-                        return false;
-
-                      if (i == 0)
-                        k = 1;
-                      else
-                        k = 0;
- 
-                      other_complex_node = SLP_INSTANCE_LOADS (slp_instn)[k];
-                      other_node_first = 
-                                SLP_TREE_SCALAR_STMTS (other_complex_node)[0];
-
-                      if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
-                          != other_node_first)
-                       return false;
-                    }
-                }
-            }
-        }
-    }
+    if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
+      return false;
 
-  /* We checked that this case ok, so there is no need to proceed with 
-     permutation tests.  */
-  if (complex_numbers == 2
-      && SLP_INSTANCE_LOADS (slp_instn).length () == 2)
-    {
-      SLP_INSTANCE_LOADS (slp_instn).release ();
-      SLP_INSTANCE_LOAD_PERMUTATION (slp_instn).release ();
-      return true;
-    }
-                   
   node = SLP_INSTANCE_TREE (slp_instn);
   stmt = SLP_TREE_SCALAR_STMTS (node)[0];
-  /* LOAD_PERMUTATION is a list of indices of all the loads of the SLP
-     instance, not all the loads belong to the same node or interleaving
-     group.  Hence, we need to divide them into groups according to
-     GROUP_SIZE.  */
-  number_of_groups = load_permutation.length () / group_size;
 
   /* Reduction (there are no data-refs in the root).
      In reduction chain the order of the loads is important.  */
   if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
       && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
     {
-      int first_group_load_index;
-
-      /* Compare all the permutation sequences to the first one.  */
-      for (i = 1; i < number_of_groups; i++)
-        {
-          k = 0;
-          for (j = i * group_size; j < i * group_size + group_size; j++)
-            {
-              next = load_permutation[j];
-              first_group_load_index = load_permutation[k];
-
-              if (next != first_group_load_index)
-                {
-                  bad_permutation = true;
-                  break;
-                }
-
-              k++;
-            }
-
-          if (bad_permutation)
-            break;
-        }
-
-      if (!bad_permutation)
-        {
-          /* Check that the loads in the first sequence are different and there
-             are no gaps between them.  */
-          load_index = sbitmap_alloc (group_size);
-          bitmap_clear (load_index);
-          for (k = 0; k < group_size; k++)
-            {
-              first_group_load_index = load_permutation[k];
-              if (bitmap_bit_p (load_index, first_group_load_index))
-                {
-                  bad_permutation = true;
-                  break;
-                }
+      slp_tree load;
+      unsigned int lidx;
 
-              bitmap_set_bit (load_index, first_group_load_index);
-            }
-
-          if (!bad_permutation)
-            for (k = 0; k < group_size; k++)
-              if (!bitmap_bit_p (load_index, k))
-                {
-                  bad_permutation = true;
-                  break;
-                }
-
-          sbitmap_free (load_index);
-        }
+      /* Compare all the permutation sequences to the first one.  We know
+         that at least one load is permuted.  */
+      node = SLP_INSTANCE_LOADS (slp_instn)[0];
+      if (!node->load_permutation.exists ())
+	return false;
+      for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
+	{
+	  if (!load->load_permutation.exists ())
+	    return false;
+	  FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
+	    if (lidx != node->load_permutation[j])
+	      return false;
+	}
 
-      if (!bad_permutation)
-        {
-          /* This permutation is valid for reduction.  Since the order of the
-             statements in the nodes is not important unless they are memory
-             accesses, we can rearrange the statements in all the nodes 
-             according to the order of the loads.  */
-          vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
-                                    load_permutation);
-          SLP_INSTANCE_LOAD_PERMUTATION (slp_instn).release ();
-          return true;
-        }
+      /* Check that the loads in the first sequence are different and there
+	 are no gaps between them.  */
+      load_index = sbitmap_alloc (group_size);
+      bitmap_clear (load_index);
+      FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
+	{
+	  if (bitmap_bit_p (load_index, lidx))
+	    {
+	      sbitmap_free (load_index);
+	      return false;
+	    }
+	  bitmap_set_bit (load_index, lidx);
+	}
+      for (i = 0; i < group_size; i++)
+	if (!bitmap_bit_p (load_index, i))
+	  {
+	    sbitmap_free (load_index);
+	    return false;
+	  }
+      sbitmap_free (load_index);
+
+      /* This permutation is valid for reduction.  Since the order of the
+	 statements in the nodes is not important unless they are memory
+	 accesses, we can rearrange the statements in all the nodes
+	 according to the order of the loads.  */
+      vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
+				node->load_permutation);
+
+      /* We are done, no actual permutations need to be generated.  */
+      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
+	SLP_TREE_LOAD_PERMUTATION (node).release ();
+      return true;
     }
 
   /* In basic block vectorization we allow any subchain of an interleaving
      chain.
      FORNOW: not supported in loop SLP because of realignment compications.  */
-  bb_vinfo = STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt));
-  bad_permutation = false;
-  /* Check that for every node in the instance the loads form a subchain.  */
-  if (bb_vinfo)
+  if (STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt)))
     {
+      /* Check that for every node in the instance the loads
+	 form a subchain.  */
       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
         {
           next_load = NULL;
-          first_load = NULL;
           FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
             {
-              if (!first_load)
-                first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (load));
-              else if (first_load
-                         != GROUP_FIRST_ELEMENT (vinfo_for_stmt (load)))
-                {
-                  bad_permutation = true;
-	          break;
-	        }
-
               if (j != 0 && next_load != load)
-                {
-                  bad_permutation = true;
-                  break;
-                }
-
+		return false;
               next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
             }
-
-          if (bad_permutation)
-            break;
         }
 
       /* Check that the alignment of the first load in every subchain, i.e.,
-         the first statement in every load node, is supported.  */
-      if (!bad_permutation)
-        {
-          FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
-            {
-              first_load = SLP_TREE_SCALAR_STMTS (node)[0];
-              if (first_load
-                    != GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_load)))
-                {
-                  dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_load));
-                  if (vect_supportable_dr_alignment (dr, false)
- 	               == dr_unaligned_unsupported)
-                    {
-   		      if (dump_enabled_p ())
-		        {
-  	                  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
-					   vect_location, 
-					   "unsupported unaligned load ");
-                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
-					    first_load, 0);
-                        }
-  		      bad_permutation = true;
-                      break;
-                    }
-	        }
-            }
+         the first statement in every load node, is supported.
+	 ???  This belongs in alignment checking.  */
+      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
+	{
+	  first_load = SLP_TREE_SCALAR_STMTS (node)[0];
+	  if (first_load != GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_load)))
+	    {
+	      dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_load));
+	      if (vect_supportable_dr_alignment (dr, false)
+		  == dr_unaligned_unsupported)
+		{
+		  if (dump_enabled_p ())
+		    {
+		      dump_printf_loc (MSG_MISSED_OPTIMIZATION,
+				       vect_location,
+				       "unsupported unaligned load ");
+		      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
+					first_load, 0);
+		    }
+		  return false;
+		}
+	    }
+	}
 
-          if (!bad_permutation)
-            {
-              SLP_INSTANCE_LOAD_PERMUTATION (slp_instn).release ();
-              return true;
-    	    }
-        }
+      /* We are done, no actual permutations need to be generated.  */
+      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
+	SLP_TREE_LOAD_PERMUTATION (node).release ();
+      return true;
     }
 
   /* FORNOW: the only supported permutation is 0..01..1.. of length equal to
      GROUP_SIZE and where each sequence of same drs is of GROUP_SIZE length as
      well (unless it's reduction).  */
-  if (load_permutation.length ()
-      != (unsigned int) (group_size * group_size))
+  if (SLP_INSTANCE_LOADS (slp_instn).length () != group_size)
     return false;
+  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
+    if (!node->load_permutation.exists ())
+      return false;
 
-  supported = true;
   load_index = sbitmap_alloc (group_size);
   bitmap_clear (load_index);
-  for (j = 0; j < group_size; j++)
-    {
-      for (i = j * group_size, k = 0;
-           load_permutation.iterate (i, &next) && k < group_size;
-           i++, k++)
-       {
-         if (i != j * group_size && next != prev)
-          {
-            supported = false;
-            break;
-          }
-
-         prev = next;
-       }
-
-      if (bitmap_bit_p (load_index, prev))
-        {
-          supported = false;
-          break;
-        }
-
-      bitmap_set_bit (load_index, prev);
+  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
+    {
+      unsigned int lidx = node->load_permutation[0];
+      if (bitmap_bit_p (load_index, lidx))
+	{
+	  sbitmap_free (load_index);
+	  return false;
+	}
+      bitmap_set_bit (load_index, lidx);
+      FOR_EACH_VEC_ELT (node->load_permutation, j, k)
+	if (k != lidx)
+	  {
+	    sbitmap_free (load_index);
+	    return false;
+	  }
     }
- 
-  for (j = 0; j < group_size; j++)
-    if (!bitmap_bit_p (load_index, j))
+  for (i = 0; i < group_size; i++)
+    if (!bitmap_bit_p (load_index, i))
       {
 	sbitmap_free (load_index);
 	return false;
       }
-
   sbitmap_free (load_index);
 
-  if (supported && i == group_size * group_size
-      && vect_supported_slp_permutation_p (slp_instn))
-    return true;
-
-  return false;
+  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
+    if (node->load_permutation.exists ()
+	&& !vect_transform_slp_perm_load
+	      (node, vNULL, NULL,
+	       SLP_INSTANCE_UNROLLING_FACTOR (slp_instn), slp_instn, true))
+      return false;
+  return true;
 }
 
 
@@ -1482,6 +1282,122 @@ vect_find_last_store_in_slp_instance (slp_instance instance)
   return last_store;
 }
 
+/* Compute the cost for the SLP node NODE in the SLP instance INSTANCE.  */
+
+static void
+vect_analyze_slp_cost_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
+			 slp_instance instance, slp_tree node,
+			 stmt_vector_for_cost *prologue_cost_vec,
+			 unsigned ncopies_for_cost)
+{
+  stmt_vector_for_cost *body_cost_vec = &SLP_INSTANCE_BODY_COST_VEC (instance);
+
+  unsigned i;
+  slp_tree child;
+  gimple stmt, s;
+  stmt_vec_info stmt_info;
+  tree lhs;
+  unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);
+
+  /* Recurse down the SLP tree.  */
+  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
+    vect_analyze_slp_cost_1 (loop_vinfo, bb_vinfo,
+			     instance, child, prologue_cost_vec,
+			     ncopies_for_cost);
+
+  /* Look at the first scalar stmt to determine the cost.  */
+  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
+  stmt_info = vinfo_for_stmt (stmt);
+  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
+    {
+      if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
+	vect_model_store_cost (stmt_info, ncopies_for_cost, false,
+			       vect_uninitialized_def,
+			       node, prologue_cost_vec, body_cost_vec);
+      else
+	{
+	  int i;
+	  gcc_checking_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
+	  vect_model_load_cost (stmt_info, ncopies_for_cost, false,
+				node, prologue_cost_vec, body_cost_vec);
+	  /* If the load is permuted record the cost for the permutation.
+	     ???  Loads from multiple chains are let through here only
+	     for a single special case involving complex numbers where
+	     in the end no permutation is necessary.  */
+	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, s)
+	    if ((STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo_for_stmt (s))
+		 == STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info))
+		&& vect_get_place_in_interleaving_chain
+		     (s, STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info)) != i)
+	      {
+		record_stmt_cost (body_cost_vec, group_size, vec_perm,
+				  stmt_info, 0, vect_body);
+		break;
+	      }
+	}
+    }
+  else
+    record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
+		      stmt_info, 0, vect_body);
+
+  /* Scan operands and account for prologue cost of constants/externals.
+     ???  This over-estimates cost for multiple uses and should be
+     re-engineered.  */
+  lhs = gimple_get_lhs (stmt);
+  for (i = 0; i < gimple_num_ops (stmt); ++i)
+    {
+      tree def, op = gimple_op (stmt, i);
+      gimple def_stmt;
+      enum vect_def_type dt;
+      if (!op || op == lhs)
+	continue;
+      if (vect_is_simple_use (op, NULL, loop_vinfo, bb_vinfo,
+			      &def_stmt, &def, &dt)
+	  && (dt == vect_constant_def || dt == vect_external_def))
+	record_stmt_cost (prologue_cost_vec, 1, vector_stmt,
+			  stmt_info, 0, vect_prologue);
+    }
+}
+
+/* Compute the cost for the SLP instance INSTANCE.  */
+
+static void
+vect_analyze_slp_cost (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
+		       slp_instance instance, unsigned nunits)
+{
+  stmt_vector_for_cost body_cost_vec, prologue_cost_vec;
+  unsigned ncopies_for_cost;
+  stmt_info_for_cost *si;
+  unsigned i;
+
+  /* Calculate the number of vector stmts to create based on the unrolling
+     factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is
+     GROUP_SIZE / NUNITS otherwise.  */
+  unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);
+  ncopies_for_cost = least_common_multiple (nunits, group_size) / nunits;
+
+  prologue_cost_vec.create (10);
+  body_cost_vec.create (10);
+  SLP_INSTANCE_BODY_COST_VEC (instance) = body_cost_vec;
+  vect_analyze_slp_cost_1 (loop_vinfo, bb_vinfo,
+			   instance, SLP_INSTANCE_TREE (instance),
+			   &prologue_cost_vec, ncopies_for_cost);
+
+  /* Record the prologue costs, which were delayed until we were
+     sure that SLP was successful.  Unlike the body costs, we know
+     the final values now regardless of the loop vectorization factor.  */
+  void *data = (loop_vinfo ? LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
+		: BB_VINFO_TARGET_COST_DATA (bb_vinfo));
+  FOR_EACH_VEC_ELT (prologue_cost_vec, i, si)
+    {
+      struct _stmt_vec_info *stmt_info
+	= si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
+      (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
+			    si->misalign, vect_prologue);
+    }
+
+  prologue_cost_vec.release ();
+}
 
 /* Analyze an SLP instance starting from a group of grouped stores.  Call
    vect_build_slp_tree to build a tree of packed stmts if possible.
@@ -1498,15 +1414,11 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
   tree vectype, scalar_type = NULL_TREE;
   gimple next;
   unsigned int vectorization_factor = 0;
-  int outside_cost = 0, ncopies_for_cost, i;
+  int i;
   unsigned int max_nunits = 0;
-  vec<int> load_permutation;
   vec<slp_tree> loads;
   struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
-  bool loads_permuted = false;
   vec<gimple> scalar_stmts;
-  stmt_vector_for_cost body_cost_vec, prologue_cost_vec;
-  stmt_info_for_cost *si;
 
   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
     {
@@ -1587,26 +1499,13 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
 
   node = vect_create_new_slp_node (scalar_stmts);
 
-  /* Calculate the number of vector stmts to create based on the unrolling
-     factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is
-     GROUP_SIZE / NUNITS otherwise.  */
-  ncopies_for_cost = unrolling_factor * group_size / nunits;
-
-  load_permutation.create (group_size * group_size);
   loads.create (group_size);
-  prologue_cost_vec.create (10);
-  body_cost_vec.create (10);
 
   /* Build the tree for the SLP instance.  */
   if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &node, group_size,
-                           &outside_cost, ncopies_for_cost,
-			   &max_nunits, &load_permutation, &loads,
-			   vectorization_factor, &loads_permuted,
-			   &prologue_cost_vec, &body_cost_vec))
+			   &max_nunits, &loads,
+			   vectorization_factor, NULL, NULL))
     {
-      void *data = (loop_vinfo ? LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
-		    : BB_VINFO_TARGET_COST_DATA (bb_vinfo));
-
       /* Calculate the unrolling factor based on the smallest type.  */
       if (max_nunits > nunits)
         unrolling_factor = least_common_multiple (max_nunits, group_size)
@@ -1619,9 +1518,6 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
 			     "Build SLP failed: unrolling required in basic"
 			     " block SLP");
 	  vect_free_slp_tree (node);
-	  body_cost_vec.release ();
-	  prologue_cost_vec.release ();
-	  load_permutation.release ();
 	  loads.release ();
           return false;
         }
@@ -1631,15 +1527,43 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
       SLP_INSTANCE_TREE (new_instance) = node;
       SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
       SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
-      SLP_INSTANCE_BODY_COST_VEC (new_instance) = body_cost_vec;
+      SLP_INSTANCE_BODY_COST_VEC (new_instance) = vNULL;
       SLP_INSTANCE_LOADS (new_instance) = loads;
       SLP_INSTANCE_FIRST_LOAD_STMT (new_instance) = NULL;
-      SLP_INSTANCE_LOAD_PERMUTATION (new_instance) = load_permutation;
+
+      /* Compute the load permutation.  */
+      slp_tree load_node;
+      bool loads_permuted = false;
+      FOR_EACH_VEC_ELT (loads, i, load_node)
+	{
+	  vec<unsigned> load_permutation;
+	  int j;
+	  gimple load, first_stmt;
+	  bool this_load_permuted = false;
+	  load_permutation.create (group_size);
+	  first_stmt = GROUP_FIRST_ELEMENT
+	      (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
+	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
+	    {
+	      int load_place
+		= vect_get_place_in_interleaving_chain (load, first_stmt);
+	      gcc_assert (load_place != -1);
+	      if (load_place != j)
+		this_load_permuted = true;
+	      load_permutation.safe_push (load_place);
+	    }
+	  if (!this_load_permuted)
+	    {
+	      load_permutation.release ();
+	      continue;
+	    }
+	  SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
+	  loads_permuted = true;
+	}
 
       if (loads_permuted)
         {
-          if (!vect_supported_load_permutation_p (new_instance, group_size,
-                                                  load_permutation))
+          if (!vect_supported_load_permutation_p (new_instance))
             {
               if (dump_enabled_p ())
                 {
@@ -1648,30 +1572,17 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
 				   "permutation ");
                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                 }
-
               vect_free_slp_instance (new_instance);
-	      prologue_cost_vec.release ();
               return false;
             }
 
           SLP_INSTANCE_FIRST_LOAD_STMT (new_instance)
-             = vect_find_first_load_in_slp_instance (new_instance);
+	    = vect_find_first_load_in_slp_instance (new_instance);
         }
-      else
-        SLP_INSTANCE_LOAD_PERMUTATION (new_instance).release ();
 
-      /* Record the prologue costs, which were delayed until we were
-	 sure that SLP was successful.  Unlike the body costs, we know
-	 the final values now regardless of the loop vectorization factor.  */
-      FOR_EACH_VEC_ELT (prologue_cost_vec, i, si)
-	{
-	  struct _stmt_vec_info *stmt_info
-	    = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
-	  (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
-				si->misalign, vect_prologue);
-	}
-
-      prologue_cost_vec.release ();
+      /* Compute the costs of this SLP instance.  */
+      vect_analyze_slp_cost (loop_vinfo, bb_vinfo,
+			     new_instance, TYPE_VECTOR_SUBPARTS (vectype));
 
       if (loop_vinfo)
         LOOP_VINFO_SLP_INSTANCES (loop_vinfo).safe_push (new_instance);
@@ -1683,16 +1594,10 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
 
       return true;
     }
-  else
-    {
-      body_cost_vec.release ();
-      prologue_cost_vec.release ();
-    }
 
   /* Failed to SLP.  */
   /* Free the allocated memory.  */
   vect_free_slp_tree (node);
-  load_permutation.release ();
   loads.release ();
 
   return false;
@@ -1793,7 +1698,7 @@ vect_make_slp_decision (loop_vec_info loop_vinfo)
   LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
 
   if (decided_to_slp && dump_enabled_p ())
-    dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
+    dump_printf_loc (MSG_NOTE, vect_location,
 		     "Decided to SLP %d instances. Unrolling factor %d",
 		     decided_to_slp, unrolling_factor);
 
@@ -1813,7 +1718,7 @@ vect_detect_hybrid_slp_stmts (slp_tree node)
   imm_use_iterator imm_iter;
   gimple use_stmt;
   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
-  slp_void_p child;
+  slp_tree child;
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
   struct loop *loop = NULL;
   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_vinfo);
@@ -1844,7 +1749,7 @@ vect_detect_hybrid_slp_stmts (slp_tree node)
 	  vect_mark_slp_stmts (node, hybrid, i);
 
   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
-    vect_detect_hybrid_slp_stmts ((slp_tree) child);
+    vect_detect_hybrid_slp_stmts (child);
 }
 
 
@@ -1942,13 +1847,13 @@ vect_slp_analyze_node_operations (bb_vec_info bb_vinfo, slp_tree node)
   bool dummy;
   int i;
   gimple stmt;
-  slp_void_p child;
+  slp_tree child;
 
   if (!node)
     return true;
 
   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
-    if (!vect_slp_analyze_node_operations (bb_vinfo, (slp_tree) child))
+    if (!vect_slp_analyze_node_operations (bb_vinfo, child))
       return false;
 
   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
@@ -1993,6 +1898,71 @@ vect_slp_analyze_operations (bb_vec_info bb_vinfo)
   return true;
 }
 
+
+/* Compute the scalar cost of the SLP node NODE and its children
+   and return it.  Do not account defs that are marked in LIFE and
+   update LIFE according to uses of NODE.  */
+
+static unsigned
+vect_bb_slp_scalar_cost (basic_block bb,
+			 slp_tree node, vec<bool, va_stack> life)
+{
+  unsigned scalar_cost = 0;
+  unsigned i;
+  gimple stmt;
+  slp_tree child;
+
+  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
+    {
+      unsigned stmt_cost;
+      ssa_op_iter op_iter;
+      def_operand_p def_p;
+      stmt_vec_info stmt_info;
+
+      if (life[i])
+	continue;
+
+      /* If there is a non-vectorized use of the defs then the scalar
+         stmt is kept live in which case we do not account it or any
+	 required defs in the SLP children in the scalar cost.  This
+	 way we make the vectorization more costly when compared to
+	 the scalar cost.  */
+      FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
+	{
+	  imm_use_iterator use_iter;
+	  gimple use_stmt;
+	  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
+	    if (gimple_code (use_stmt) == GIMPLE_PHI
+		|| gimple_bb (use_stmt) != bb
+		|| !STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (use_stmt)))
+	      {
+		life[i] = true;
+		BREAK_FROM_IMM_USE_STMT (use_iter);
+	      }
+	}
+      if (life[i])
+	continue;
+
+      stmt_info = vinfo_for_stmt (stmt);
+      if (STMT_VINFO_DATA_REF (stmt_info))
+        {
+          if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
+            stmt_cost = vect_get_stmt_cost (scalar_load);
+          else
+            stmt_cost = vect_get_stmt_cost (scalar_store);
+        }
+      else
+        stmt_cost = vect_get_stmt_cost (scalar_stmt);
+
+      scalar_cost += stmt_cost;
+    }
+
+  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
+    scalar_cost += vect_bb_slp_scalar_cost (bb, child, life);
+
+  return scalar_cost;
+}
+
 /* Check if vectorization of the basic block is profitable.  */
 
 static bool
@@ -2003,10 +1973,6 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
   int i, j;
   unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
   unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
-  unsigned int stmt_cost;
-  gimple stmt;
-  gimple_stmt_iterator si;
-  basic_block bb = BB_VINFO_BB (bb_vinfo);
   void *target_cost_data = BB_VINFO_TARGET_COST_DATA (bb_vinfo);
   stmt_vec_info stmt_info = NULL;
   stmt_vector_for_cost body_cost_vec;
@@ -2026,26 +1992,15 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
     }
 
   /* Calculate scalar cost.  */
-  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
+  FOR_EACH_VEC_ELT (slp_instances, i, instance)
     {
-      stmt = gsi_stmt (si);
-      stmt_info = vinfo_for_stmt (stmt);
-
-      if (!stmt_info || !STMT_VINFO_VECTORIZABLE (stmt_info)
-          || !PURE_SLP_STMT (stmt_info))
-        continue;
-
-      if (STMT_VINFO_DATA_REF (stmt_info))
-        {
-          if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
-            stmt_cost = vect_get_stmt_cost (scalar_load);
-          else
-            stmt_cost = vect_get_stmt_cost (scalar_store);
-        }
-      else
-        stmt_cost = vect_get_stmt_cost (scalar_stmt);
-
-      scalar_cost += stmt_cost;
+      vec<bool, va_stack> life;
+      vec_stack_alloc (bool, life, SLP_INSTANCE_GROUP_SIZE (instance));
+      life.quick_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
+      scalar_cost += vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
+					      SLP_INSTANCE_TREE (instance),
+					      life);
+      life.release ();
     }
 
   /* Complete the target-specific cost calculation.  */
@@ -2078,12 +2033,10 @@ static bb_vec_info
 vect_slp_analyze_bb_1 (basic_block bb)
 {
   bb_vec_info bb_vinfo;
-  vec<ddr_p> ddrs;
   vec<slp_instance> slp_instances;
   slp_instance instance;
   int i;
   int min_vf = 2;
-  int max_vf = MAX_VECTORIZATION_FACTOR;
 
   bb_vinfo = new_bb_vec_info (bb);
   if (!bb_vinfo)
@@ -2100,8 +2053,7 @@ vect_slp_analyze_bb_1 (basic_block bb)
       return NULL;
     }
 
-  ddrs = BB_VINFO_DDRS (bb_vinfo);
-  if (!ddrs.length ())
+  if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
     {
       if (dump_enabled_p ())
         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -2112,10 +2064,20 @@ vect_slp_analyze_bb_1 (basic_block bb)
       return NULL;
     }
 
+  if (!vect_analyze_data_ref_accesses (NULL, bb_vinfo))
+    {
+     if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			"not vectorized: unhandled data access in "
+			"basic block.\n");
+
+      destroy_bb_vec_info (bb_vinfo);
+      return NULL;
+    }
+
   vect_pattern_recog (NULL, bb_vinfo);
 
-  if (!vect_analyze_data_ref_dependences (NULL, bb_vinfo, &max_vf)
-       || min_vf > max_vf)
+  if (!vect_slp_analyze_data_ref_dependences (bb_vinfo))
      {
        if (dump_enabled_p ())
 	 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -2137,17 +2099,6 @@ vect_slp_analyze_bb_1 (basic_block bb)
       return NULL;
     }
 
-  if (!vect_analyze_data_ref_accesses (NULL, bb_vinfo))
-    {
-     if (dump_enabled_p ())
-       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-			"not vectorized: unhandled data access in "
-			"basic block.\n");
-
-      destroy_bb_vec_info (bb_vinfo);
-      return NULL;
-    }
-
   /* Check the SLP opportunities in the basic block, analyze and build SLP
      trees.  */
   if (!vect_analyze_slp (NULL, bb_vinfo))
@@ -2468,7 +2419,7 @@ vect_get_constant_vectors (tree op, slp_tree slp_node,
 		       the lhs, so make sure the scalar is the right type if
 		       we are dealing with vectors of
 		       long long/long/short/char.  */
-		    if (op_num == 1 && constant_p)
+		    if (op_num == 1 && TREE_CODE (op) == INTEGER_CST)
 		      op = fold_convert (TREE_TYPE (vector_type), op);
 		    break;
 
@@ -2501,7 +2452,7 @@ vect_get_constant_vectors (tree op, slp_tree slp_node,
           number_of_places_left_in_vector--;
 	  if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
 	    {
-	      if (constant_p)
+	      if (CONSTANT_CLASS_P (op))
 		{
 		  op = fold_unary (VIEW_CONVERT_EXPR,
 				   TREE_TYPE (vector_type), op);
@@ -2522,6 +2473,8 @@ vect_get_constant_vectors (tree op, slp_tree slp_node,
 		}
 	    }
 	  elts[number_of_places_left_in_vector] = op;
+	  if (!CONSTANT_CLASS_P (op))
+	    constant_p = false;
 
           if (number_of_places_left_in_vector == 0)
             {
@@ -2640,7 +2593,7 @@ vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
       vectorized_defs = false;
       if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
         {
-          child = (slp_tree) SLP_TREE_CHILDREN (slp_node)[child_index];
+          child = SLP_TREE_CHILDREN (slp_node)[child_index];
 
 	  /* We have to check both pattern and original def, if available.  */
 	  gimple first_def = SLP_TREE_SCALAR_STMTS (child)[0];
@@ -2841,16 +2794,18 @@ vect_get_mask_element (gimple stmt, int first_mask_element, int m,
 
 /* Generate vector permute statements from a list of loads in DR_CHAIN.
    If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
-   permute statements for SLP_NODE_INSTANCE.  */
+   permute statements for the SLP node NODE of the SLP instance
+   SLP_NODE_INSTANCE.  */
+
 bool
-vect_transform_slp_perm_load (gimple stmt, vec<tree> dr_chain,
+vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
                               gimple_stmt_iterator *gsi, int vf,
                               slp_instance slp_node_instance, bool analyze_only)
 {
+  gimple stmt = SLP_TREE_SCALAR_STMTS (node)[0];
   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
   tree mask_element_type = NULL_TREE, mask_type;
   int i, j, k, nunits, vec_index = 0, scalar_index;
-  slp_tree node;
   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
   gimple next_scalar_stmt;
   int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
@@ -2897,6 +2852,9 @@ vect_transform_slp_perm_load (gimple stmt, vec<tree> dr_chain,
      relatively to SLP_NODE_INSTANCE unrolling factor.  */
   ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
 
+  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
+    return false;
+
   /* Generate permutation masks for every NODE. Number of masks for each NODE
      is equal to GROUP_SIZE.
      E.g., we have a group of three nodes with three loads from the same
@@ -2915,7 +2873,6 @@ vect_transform_slp_perm_load (gimple stmt, vec<tree> dr_chain,
      we need the second and the third vectors: {b1,c1,a2,b2} and
      {c2,a3,b3,c3}.  */
 
-  FOR_EACH_VEC_ELT  (SLP_INSTANCE_LOADS (slp_node_instance), i, node)
     {
       scalar_index = 0;
       index = 0;
@@ -2931,6 +2888,7 @@ vect_transform_slp_perm_load (gimple stmt, vec<tree> dr_chain,
         {
           for (k = 0; k < group_size; k++)
             {
+	      i = SLP_TREE_LOAD_PERMUTATION (node)[k];
               first_mask_element = i + j * group_size;
               if (!vect_get_mask_element (stmt, first_mask_element, 0,
 					  nunits, only_one_vec, index,
@@ -2943,9 +2901,7 @@ vect_transform_slp_perm_load (gimple stmt, vec<tree> dr_chain,
 
               if (index == nunits)
                 {
-		  tree mask_vec, *mask_elts;
-		  int l;
-
+		  index = 0;
 		  if (!can_vec_perm_p (mode, false, mask))
 		    {
 		      if (dump_enabled_p ())
@@ -2961,15 +2917,17 @@ vect_transform_slp_perm_load (gimple stmt, vec<tree> dr_chain,
 		      return false;
 		    }
 
-		  mask_elts = XALLOCAVEC (tree, nunits);
-		  for (l = 0; l < nunits; ++l)
-		    mask_elts[l] = build_int_cst (mask_element_type, mask[l]);
-		  mask_vec = build_vector (mask_type, mask_elts);
-		  index = 0;
-
                   if (!analyze_only)
                     {
-                      if (need_next_vector)
+		      int l;
+		      tree mask_vec, *mask_elts;
+		      mask_elts = XALLOCAVEC (tree, nunits);
+		      for (l = 0; l < nunits; ++l)
+			mask_elts[l] = build_int_cst (mask_element_type,
+						      mask[l]);
+		      mask_vec = build_vector (mask_type, mask_elts);
+
+		      if (need_next_vector)
                         {
                           first_vec_index = second_vec_index;
                           second_vec_index = vec_index;
@@ -3006,15 +2964,13 @@ vect_schedule_slp_instance (slp_tree node, slp_instance instance,
   unsigned int vec_stmts_size, nunits, group_size;
   tree vectype;
   int i;
-  slp_tree loads_node;
-  slp_void_p child;
+  slp_tree child;
 
   if (!node)
     return false;
 
   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
-    vect_schedule_slp_instance ((slp_tree) child, instance,
-                                vectorization_factor);
+    vect_schedule_slp_instance (child, instance, vectorization_factor);
 
   stmt = SLP_TREE_SCALAR_STMTS (node)[0];
   stmt_info = vinfo_for_stmt (stmt);
@@ -3031,20 +2987,6 @@ vect_schedule_slp_instance (slp_tree node, slp_instance instance,
      size.  */
   vec_stmts_size = (vectorization_factor * group_size) / nunits;
 
-  /* In case of load permutation we have to allocate vectorized statements for
-     all the nodes that participate in that permutation.  */
-  if (SLP_INSTANCE_LOAD_PERMUTATION (instance).exists ())
-    {
-      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, loads_node)
-        {
-          if (!SLP_TREE_VEC_STMTS (loads_node).exists ())
-            {
-              SLP_TREE_VEC_STMTS (loads_node).create (vec_stmts_size);
-              SLP_TREE_NUMBER_OF_VEC_STMTS (loads_node) = vec_stmts_size;
-            }
-        }
-    }
-
   if (!SLP_TREE_VEC_STMTS (node).exists ())
     {
       SLP_TREE_VEC_STMTS (node).create (vec_stmts_size);
@@ -3062,7 +3004,7 @@ vect_schedule_slp_instance (slp_tree node, slp_instance instance,
   if (SLP_INSTANCE_FIRST_LOAD_STMT (instance)
       && STMT_VINFO_GROUPED_ACCESS (stmt_info)
       && !REFERENCE_CLASS_P (gimple_get_lhs (stmt))
-      && SLP_INSTANCE_LOAD_PERMUTATION (instance).exists ())
+      && SLP_TREE_LOAD_PERMUTATION (node).exists ())
     si = gsi_for_stmt (SLP_INSTANCE_FIRST_LOAD_STMT (instance));
   else if (is_pattern_stmt_p (stmt_info))
     si = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
@@ -3104,7 +3046,7 @@ vect_remove_slp_scalar_calls (slp_tree node)
   gimple stmt, new_stmt;
   gimple_stmt_iterator gsi;
   int i;
-  slp_void_p child;
+  slp_tree child;
   tree lhs;
   stmt_vec_info stmt_info;
 
@@ -3112,7 +3054,7 @@ vect_remove_slp_scalar_calls (slp_tree node)
     return;
 
   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
-    vect_remove_slp_scalar_calls ((slp_tree) child);
+    vect_remove_slp_scalar_calls (child);
 
   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
     {
@@ -3141,8 +3083,7 @@ vect_schedule_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
 {
   vec<slp_instance> slp_instances;
   slp_instance instance;
-  slp_tree loads_node;
-  unsigned int i, j, vf;
+  unsigned int i, vf;
   bool is_store = false;
 
   if (loop_vinfo)
@@ -3161,14 +3102,6 @@ vect_schedule_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
       /* Schedule the tree of INSTANCE.  */
       is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
                                              instance, vf);
-
-      /* Clear STMT_VINFO_VEC_STMT of all loads.  With shared loads
-         between SLP instances we fail to properly initialize the
-	 vectorized SLP stmts and confuse different load permutations.  */
-      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, loads_node)
-	STMT_VINFO_VEC_STMT
-	  (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (loads_node)[0])) = NULL;
-
       if (dump_enabled_p ())
 	dump_printf_loc (MSG_NOTE, vect_location,
                          "vectorizing stmts using SLP.");
@@ -3249,7 +3182,8 @@ vect_slp_transform_bb (basic_block bb)
     }
 
   if (dump_enabled_p ())
-    dump_printf (MSG_OPTIMIZED_LOCATIONS, "BASIC BLOCK VECTORIZED\n");
+    dump_printf_loc (MSG_NOTE, vect_location,
+		     "BASIC BLOCK VECTORIZED\n");
 
   destroy_bb_vec_info (bb_vinfo);
 }
author	bstarynk <bstarynk@138bc75d-0d04-0410-961f-82ee72b054a4>	2013-06-26 18:39:06 +0000
committer	bstarynk <bstarynk@138bc75d-0d04-0410-961f-82ee72b054a4>	2013-06-26 18:39:06 +0000
commit	21543d4cd558cada630271a0cf3075ad7ce94cbf (patch)
tree	08bdb3f3e0a9d0f71e72bb56d9ddb7b916e7dfeb /gcc/tree-vect-slp.c
parent	ed0bc1ffb674fe93d0df68654b5bb76869f0bc8c (diff)
download	gcc-21543d4cd558cada630271a0cf3075ad7ce94cbf.tar.gz