1 files changed, 284 insertions, 80 deletions
diff --git a/gcc/tree-predcom.c b/gcc/tree-predcom.c
index 2cf47930dbe..73ebd951dba 100644
--- a/gcc/tree-predcom.c
+++ b/gcc/tree-predcom.c
@@ -234,7 +234,7 @@ along with GCC; see the file COPYING3.  If not see
 /* The maximum number of iterations between the considered memory
    references.  */
 
-#define MAX_DISTANCE (target_avail_regs < 16 ? 4 : 8)
+#define MAX_DISTANCE (target_avail_regs < 16 ? 3 : 7)
 
 /* Data references (or phi nodes that carry data reference values across
    loop iterations).  */
@@ -379,6 +379,71 @@ static bitmap looparound_phis;
 
 static hash_map<tree, name_expansion *> *name_expansions;
 
+/* True if we're running the early predcom pass and should only handle
+   cases that aid vectorization.  Specifically this means that:
+
+   - only CT_INVARIANT and CT_STORE_LOAD chains are used
+   - the maximum distance for a CT_STORE_LOAD chain is 1 iteration,
+     and at that distance the store must come after the load
+   - there's no unrolling or detection of looparound phis.
+
+   The idea is handle inductions that go via memory, such as:
+
+     for (int i = 1; i < n; ++i)
+       x[i] = x[i - 1] + 1;
+
+   As it stands this loop could never be vectorized, because a loop body
+   that contains a read of x[j] followed by a write to x[j + 1] would
+   have its vectorization factor limited to 1.  Transforming it to:
+
+     int tmp = x[0];
+     for (int i = 0; i < n; ++i)
+       {
+         tmp += 1;
+         x[i] = tmp:
+       }
+
+   exposes the fact that the stored value is a simple vectorizable
+   induction with start value x[0] and step 1.
+
+   [ Commoning is not always useful even in this situation.  For example,
+     carrying over the value of x[i] won't help us to vectorize:
+
+       for (int i = 1; i < n; ++i)
+	 {
+	   y[i] = x[i - 1];
+	   x[i] += i;
+	 }
+
+     There's no real need to restrict things further though, because we're
+     unable to vectorize these load/store combinations in their current
+     form whatever happens. ]
+
+   We require the store to come after the load when the distance is 1
+   to avoid cases like:
+
+     for (int i = 1; i < n; ++i)
+       {
+	 x[i] = ...;
+	 ... = x[i - 1];
+       }
+
+   These accesses effectively have a distance somewhere between 1 and 2,
+   since after commoning the value stored in the previous iteration would
+   still be live at the next store.  This means that the combination
+   isn't useful for exposing simple inductions.
+
+   Also, unlike the motivating case above, this combination does not
+   prevent vectorization.  If a write to x[j + 1] comes before a read
+   of x[j], the vectorized write completes for all vector elements
+   before the read starts for any vector elements.  */
+
+static bool only_simple_p;
+
+/* The maximum loop carry distance for this execution of the pass.  */
+
+static int max_distance;
+
 /* Dumps data reference REF to FILE.  */
 
 extern void dump_dref (FILE *, dref);
@@ -1024,6 +1089,17 @@ order_drefs (const void *a, const void *b)
   return (*da)->pos - (*db)->pos;
 }
 
+/* Compares two drefs A and B by their position.  Callback for qsort.  */
+
+static int
+order_drefs_by_pos (const void *a, const void *b)
+{
+  const dref *const da = (const dref *) a;
+  const dref *const db = (const dref *) b;
+
+  return (*da)->pos - (*db)->pos;
+}
+
 /* Returns root of the CHAIN.  */
 
 static inline dref
@@ -1056,7 +1132,13 @@ add_ref_to_chain (chain_p chain, dref ref)
 
   gcc_assert (wi::les_p (root->offset, ref->offset));
   widest_int dist = ref->offset - root->offset;
-  if (wi::leu_p (MAX_DISTANCE, dist))
+  /* When running before vectorization, only allow the maximum distance
+     if the consumer comes before the producer.  See the comment above
+     ONLY_SIMPLE_P for details.  */
+  if (wi::ltu_p (max_distance, dist)
+      || (only_simple_p
+	  && wi::eq_p (max_distance, dist)
+	  && root->pos < ref->pos))
     {
       free (ref);
       return;
@@ -1296,7 +1378,9 @@ add_looparound_copies (struct loop *loop, chain_p chain)
   dref ref, root = get_chain_root (chain);
   gphi *phi;
 
-  if (chain->type == CT_STORE_STORE)
+  /* There's no point doing this when running before vectorization,
+     since we won't unroll the loop or combine chains.  */
+  if (only_simple_p || chain->type == CT_STORE_STORE)
     return;
 
   FOR_EACH_VEC_ELT (chain->refs, i, ref)
@@ -1335,14 +1419,21 @@ determine_roots_comp (struct loop *loop,
 
   /* Trivial component.  */
   if (comp->refs.length () <= 1)
-    return;
+    {
+      if (comp->refs.length () == 1)
+	{
+	  free (comp->refs[0]);
+	  comp->refs.truncate (0);
+	}
+      return;
+    }
 
   comp->refs.qsort (order_drefs);
   FOR_EACH_VEC_ELT (comp->refs, i, a)
     {
       if (!chain
 	  || (!comp->eliminate_store_p && DR_IS_WRITE (a->ref))
-	  || wi::leu_p (MAX_DISTANCE, a->offset - last_ofs))
+	  || wi::ltu_p (max_distance, a->offset - last_ofs))
 	{
 	  if (nontrivial_chain_p (chain))
 	    {
@@ -1352,6 +1443,15 @@ determine_roots_comp (struct loop *loop,
 	  else
 	    release_chain (chain);
 
+	  /* Only create CT_STORE_LOAD and CT_INVARIANT chains when
+	     running before vectorization.  */
+	  if (only_simple_p && !DR_IS_WRITE (a->ref))
+	    {
+	      free (a);
+	      chain = NULL;
+	      continue;
+	    }
+
 	  if (DR_IS_READ (a->ref))
 	    type = CT_LOAD;
 	  else
@@ -2116,6 +2216,10 @@ determine_unroll_factor (vec<chain_p> chains)
   unsigned factor = 1, af, nfactor, i;
   unsigned max = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
 
+  /* Do not unroll when running before vectorization.  */
+  if (only_simple_p)
+    return 1;
+
   FOR_EACH_VEC_ELT (chains, i, chain)
     {
       if (chain->type == CT_INVARIANT)
@@ -2524,11 +2628,10 @@ remove_name_from_operation (gimple *stmt, tree op)
 }
 
 /* Reassociates the expression in that NAME1 and NAME2 are used so that they
-   are combined in a single statement, and returns this statement.  Note the
-   statement is inserted before INSERT_BEFORE if it's not NULL.  */
+   are combined in a single statement, and returns this statement.  */
 
 static gimple *
-reassociate_to_the_same_stmt (tree name1, tree name2, gimple *insert_before)
+reassociate_to_the_same_stmt (tree name1, tree name2)
 {
   gimple *stmt1, *stmt2, *root1, *root2, *s1, *s2;
   gassign *new_stmt, *tmp_stmt;
@@ -2585,12 +2688,6 @@ reassociate_to_the_same_stmt (tree name1, tree name2, gimple *insert_before)
   var = create_tmp_reg (type, "predreastmp");
   new_name = make_ssa_name (var);
   new_stmt = gimple_build_assign (new_name, code, name1, name2);
-  if (insert_before && stmt_dominates_stmt_p (insert_before, s1))
-    bsi = gsi_for_stmt (insert_before);
-  else
-    bsi = gsi_for_stmt (s1);
-
-  gsi_insert_before (&bsi, new_stmt, GSI_SAME_STMT);
 
   var = create_tmp_reg (type, "predreastmp");
   tmp_name = make_ssa_name (var);
@@ -2607,6 +2704,7 @@ reassociate_to_the_same_stmt (tree name1, tree name2, gimple *insert_before)
   s1 = gsi_stmt (bsi);
   update_stmt (s1);
 
+  gsi_insert_before (&bsi, new_stmt, GSI_SAME_STMT);
   gsi_insert_before (&bsi, tmp_stmt, GSI_SAME_STMT);
 
   return new_stmt;
@@ -2615,11 +2713,10 @@ reassociate_to_the_same_stmt (tree name1, tree name2, gimple *insert_before)
 /* Returns the statement that combines references R1 and R2.  In case R1
    and R2 are not used in the same statement, but they are used with an
    associative and commutative operation in the same expression, reassociate
-   the expression so that they are used in the same statement.  The combined
-   statement is inserted before INSERT_BEFORE if it's not NULL.  */
+   the expression so that they are used in the same statement.  */
 
 static gimple *
-stmt_combining_refs (dref r1, dref r2, gimple *insert_before)
+stmt_combining_refs (dref r1, dref r2)
 {
   gimple *stmt1, *stmt2;
   tree name1 = name_for_ref (r1);
@@ -2630,7 +2727,7 @@ stmt_combining_refs (dref r1, dref r2, gimple *insert_before)
   if (stmt1 == stmt2)
     return stmt1;
 
-  return reassociate_to_the_same_stmt (name1, name2, insert_before);
+  return reassociate_to_the_same_stmt (name1, name2);
 }
 
 /* Tries to combine chains CH1 and CH2 together.  If this succeeds, the
@@ -2643,8 +2740,7 @@ combine_chains (chain_p ch1, chain_p ch2)
   enum tree_code op = ERROR_MARK;
   bool swap = false;
   chain_p new_chain;
-  int i, j, num;
-  gimple *root_stmt;
+  unsigned i;
   tree rslt_type = NULL_TREE;
 
   if (ch1 == ch2)
@@ -2665,9 +2761,6 @@ combine_chains (chain_p ch1, chain_p ch2)
 	return NULL;
     }
 
-  ch1->combined = true;
-  ch2->combined = true;
-
   if (swap)
     std::swap (ch1, ch2);
 
@@ -2679,69 +2772,65 @@ combine_chains (chain_p ch1, chain_p ch2)
   new_chain->rslt_type = rslt_type;
   new_chain->length = ch1->length;
 
-  gimple *insert = NULL;
-  num = ch1->refs.length ();
-  i = (new_chain->length == 0) ? num - 1 : 0;
-  j = (new_chain->length == 0) ? -1 : 1;
-  /* For ZERO length chain, process refs in reverse order so that dominant
-     position is ready when it comes to the root ref.
-     For non-ZERO length chain, process refs in order.  See PR79663.  */
-  for (; num > 0; num--, i += j)
-    {
-      r1 = ch1->refs[i];
-      r2 = ch2->refs[i];
+  for (i = 0; (ch1->refs.iterate (i, &r1)
+	       && ch2->refs.iterate (i, &r2)); i++)
+    {
       nw = XCNEW (struct dref_d);
+      nw->stmt = stmt_combining_refs (r1, r2);
       nw->distance = r1->distance;
 
-      /* For ZERO length chain, insert combined stmt of root ref at dominant
-	 position.  */
-      nw->stmt = stmt_combining_refs (r1, r2, i == 0 ? insert : NULL);
-      /* For ZERO length chain, record dominant position where combined stmt
-	 of root ref should be inserted.  In this case, though all root refs
-	 dominate following ones, it's possible that combined stmt doesn't.
-	 See PR70754.  */
-      if (new_chain->length == 0
-	  && (insert == NULL || stmt_dominates_stmt_p (nw->stmt, insert)))
-	insert = nw->stmt;
-
       new_chain->refs.safe_push (nw);
     }
-  if (new_chain->length == 0)
-    {
-      /* Restore the order for ZERO length chain's refs.  */
-      num = new_chain->refs.length () >> 1;
-      for (i = 0, j = new_chain->refs.length () - 1; i < num; i++, j--)
-	std::swap (new_chain->refs[i], new_chain->refs[j]);
 
-      /* For ZERO length chain, has_max_use_after must be true since root
-	 combined stmt must dominates others.  */
-      new_chain->has_max_use_after = true;
-      return new_chain;
-    }
+  ch1->combined = true;
+  ch2->combined = true;
+  return new_chain;
+}
 
-  new_chain->has_max_use_after = false;
-  root_stmt = get_chain_root (new_chain)->stmt;
-  for (i = 1; new_chain->refs.iterate (i, &nw); i++)
-    {
-      if (nw->distance == new_chain->length
-	  && !stmt_dominates_stmt_p (nw->stmt, root_stmt))
-	{
-	  new_chain->has_max_use_after = true;
-	  break;
-	}
-    }
+/* Recursively update position information of all offspring chains to ROOT
+   chain's position information.  */
 
-  return new_chain;
+static void
+update_pos_for_combined_chains (chain_p root)
+{
+  chain_p ch1 = root->ch1, ch2 = root->ch2;
+  dref ref, ref1, ref2;
+  for (unsigned j = 0; (root->refs.iterate (j, &ref)
+			&& ch1->refs.iterate (j, &ref1)
+			&& ch2->refs.iterate (j, &ref2)); ++j)
+    ref1->pos = ref2->pos = ref->pos;
+
+  if (ch1->type == CT_COMBINATION)
+    update_pos_for_combined_chains (ch1);
+  if (ch2->type == CT_COMBINATION)
+    update_pos_for_combined_chains (ch2);
 }
 
-/* Try to combine the CHAINS.  */
+/* Returns true if statement S1 dominates statement S2.  */
+
+static bool
+pcom_stmt_dominates_stmt_p (gimple *s1, gimple *s2)
+{
+  basic_block bb1 = gimple_bb (s1), bb2 = gimple_bb (s2);
+
+  if (!bb1 || s1 == s2)
+    return true;
+
+  if (bb1 == bb2)
+    return gimple_uid (s1) < gimple_uid (s2);
+
+  return dominated_by_p (CDI_DOMINATORS, bb2, bb1);
+}
+
+/* Try to combine the CHAINS in LOOP.  */
 
 static void
-try_combine_chains (vec<chain_p> *chains)
+try_combine_chains (struct loop *loop, vec<chain_p> *chains)
 {
   unsigned i, j;
   chain_p ch1, ch2, cch;
   auto_vec<chain_p> worklist;
+  bool combined_p = false;
 
   FOR_EACH_VEC_ELT (*chains, i, ch1)
     if (chain_can_be_combined_p (ch1))
@@ -2763,6 +2852,78 @@ try_combine_chains (vec<chain_p> *chains)
 	    {
 	      worklist.safe_push (cch);
 	      chains->safe_push (cch);
+	      combined_p = true;
+	      break;
+	    }
+	}
+    }
+  if (!combined_p)
+    return;
+
+  /* Setup UID for all statements in dominance order.  */
+  basic_block *bbs = get_loop_body (loop);
+  renumber_gimple_stmt_uids_in_blocks (bbs, loop->num_nodes);
+  free (bbs);
+
+  /* Re-association in combined chains may generate statements different to
+     order of references of the original chain.  We need to keep references
+     of combined chain in dominance order so that all uses will be inserted
+     after definitions.  Note:
+       A) This is necessary for all combined chains.
+       B) This is only necessary for ZERO distance references because other
+	  references inherit value from loop carried PHIs.
+
+     We first update position information for all combined chains.  */
+  dref ref;
+  for (i = 0; chains->iterate (i, &ch1); ++i)
+    {
+      if (ch1->type != CT_COMBINATION || ch1->combined)
+	continue;
+
+      for (j = 0; ch1->refs.iterate (j, &ref); ++j)
+	ref->pos = gimple_uid (ref->stmt);
+
+      update_pos_for_combined_chains (ch1);
+    }
+  /* Then sort references according to newly updated position information.  */
+  for (i = 0; chains->iterate (i, &ch1); ++i)
+    {
+      if (ch1->type != CT_COMBINATION && !ch1->combined)
+	continue;
+
+      /* Find the first reference with non-ZERO distance.  */
+      if (ch1->length == 0)
+	j = ch1->refs.length();
+      else
+	{
+	  for (j = 0; ch1->refs.iterate (j, &ref); ++j)
+	    if (ref->distance != 0)
+	      break;
+	}
+
+      /* Sort all ZERO distance references by position.  */
+      qsort (&ch1->refs[0], j, sizeof (ch1->refs[0]), order_drefs_by_pos);
+
+      if (ch1->combined)
+	continue;
+
+      /* For ZERO length chain, has_max_use_after must be true since root
+	 combined stmt must dominates others.  */
+      if (ch1->length == 0)
+	{
+	  ch1->has_max_use_after = true;
+	  continue;
+	}
+      /* Check if there is use at max distance after root for combined chains
+	 and set flag accordingly.  */
+      ch1->has_max_use_after = false;
+      gimple *root_stmt = get_chain_root (ch1)->stmt;
+      for (j = 1; ch1->refs.iterate (j, &ref); ++j)
+	{
+	  if (ref->distance == ch1->length
+	      && !pcom_stmt_dominates_stmt_p (ref->stmt, root_stmt))
+	    {
+	      ch1->has_max_use_after = true;
 	      break;
 	    }
 	}
@@ -3099,8 +3260,11 @@ tree_predictive_commoning_loop (struct loop *loop)
   prepare_initializers (loop, chains);
   loop_closed_ssa = prepare_finalizers (loop, chains);
 
-  /* Try to combine the chains that are always worked with together.  */
-  try_combine_chains (&chains);
+  /* During the main pass, try to combine the chains that are always
+     worked with together.  For the early pass it should be better
+     to leave this to the vectorizer.  */
+  if (!only_simple_p)
+    try_combine_chains (loop, &chains);
 
   insert_init_seqs (loop, chains);
 
@@ -3162,14 +3326,18 @@ end: ;
   return (unroll ? 1 : 0) | (loop_closed_ssa ? 2 : 0);
 }
 
-/* Runs predictive commoning.  */
+/* Runs predictive commoning.  EARLY_P is true if we are running before
+   vectorization.  */
 
 unsigned
-tree_predictive_commoning (void)
+tree_predictive_commoning (bool early_p)
 {
   struct loop *loop;
   unsigned ret = 0, changed = 0;
 
+  only_simple_p = early_p;
+  max_distance = early_p ? 1 : MAX_DISTANCE;
+
   initialize_original_copy_tables ();
   FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST)
     if (optimize_loop_for_speed_p (loop))
@@ -3191,19 +3359,51 @@ tree_predictive_commoning (void)
   return ret;
 }
 
-/* Predictive commoning Pass.  */
+/* Predictive commoning pass.  EARLY_P is true if we are running before
+   vectorization.  */
 
 static unsigned
-run_tree_predictive_commoning (struct function *fun)
+run_tree_predictive_commoning (struct function *fun, bool early_p)
 {
   if (number_of_loops (fun) <= 1)
     return 0;
 
-  return tree_predictive_commoning ();
+  return tree_predictive_commoning (early_p);
 }
 
 namespace {
 
+const pass_data pass_data_early_predcom =
+{
+  GIMPLE_PASS, /* type */
+  "epcom", /* name */
+  OPTGROUP_LOOP, /* optinfo_flags */
+  TV_PREDCOM, /* tv_id */
+  PROP_cfg, /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  TODO_update_ssa_only_virtuals, /* todo_flags_finish */
+};
+
+class pass_early_predcom : public gimple_opt_pass
+{
+public:
+  pass_early_predcom (gcc::context *ctxt)
+    : gimple_opt_pass (pass_data_early_predcom, ctxt)
+  {}
+
+  /* opt_pass methods: */
+  virtual bool gate (function *)
+  {
+    return flag_predictive_commoning && flag_tree_loop_vectorize;
+  }
+  virtual unsigned int execute (function *fun)
+  {
+    return run_tree_predictive_commoning (fun, true);
+  }
+}; // class pass_early_predcom
+
 const pass_data pass_data_predcom =
 {
   GIMPLE_PASS, /* type */
@@ -3228,7 +3428,7 @@ public:
   virtual bool gate (function *) { return flag_predictive_commoning != 0; }
   virtual unsigned int execute (function *fun)
     {
-      return run_tree_predictive_commoning (fun);
+      return run_tree_predictive_commoning (fun, false);
     }
 
 }; // class pass_predcom
@@ -3236,9 +3436,13 @@ public:
 } // anon namespace
 
 gimple_opt_pass *
+make_pass_early_predcom (gcc::context *ctxt)
+{
+  return new pass_early_predcom (ctxt);
+}
+
+gimple_opt_pass *
 make_pass_predcom (gcc::context *ctxt)
 {
   return new pass_predcom (ctxt);
 }
-
-