author      bstarynk <bstarynk@138bc75d-0d04-0410-961f-82ee72b054a4>   2009-04-02 15:23:55 +0000
committer   bstarynk <bstarynk@138bc75d-0d04-0410-961f-82ee72b054a4>   2009-04-02 15:23:55 +0000
commit      a217eb614007708e37e64dde0e08c53672cb4ce8 (patch)
tree        a62894572a4c16deff12aa8a4292fec0e424af08 /gcc/tree-vect-slp.c
parent      61087bee65377ecf20720addf51ff00fbefded1b (diff)
download    gcc-a217eb614007708e37e64dde0e08c53672cb4ce8.tar.gz
2009-04-02 Basile Starynkevitch <basile@starynkevitch.net>
MELT branch merged with trunk r145451
git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/branches/melt-branch@145454 138bc75d-0d04-0410-961f-82ee72b054a4
Diffstat (limited to 'gcc/tree-vect-slp.c')
-rw-r--r--   gcc/tree-vect-slp.c   1710
1 file changed, 1710 insertions, 0 deletions
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c new file mode 100644 index 00000000000..60478e9e5e8 --- /dev/null +++ b/gcc/tree-vect-slp.c @@ -0,0 +1,1710 @@ +/* SLP - Basic Block Vectorization + Copyright (C) 2007, 2008, 2009 Free Software Foundation, Inc. + Foundation, Inc. + Contributed by Dorit Naishlos <dorit@il.ibm.com> + and Ira Rosen <irar@il.ibm.com> + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +<http://www.gnu.org/licenses/>. */ + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "tm.h" +#include "ggc.h" +#include "tree.h" +#include "target.h" +#include "basic-block.h" +#include "diagnostic.h" +#include "tree-flow.h" +#include "tree-dump.h" +#include "cfgloop.h" +#include "cfglayout.h" +#include "expr.h" +#include "recog.h" +#include "optabs.h" +#include "tree-vectorizer.h" + +/* Recursively free the memory allocated for the SLP tree rooted at NODE. */ + +static void +vect_free_slp_tree (slp_tree node) +{ + if (!node) + return; + + if (SLP_TREE_LEFT (node)) + vect_free_slp_tree (SLP_TREE_LEFT (node)); + + if (SLP_TREE_RIGHT (node)) + vect_free_slp_tree (SLP_TREE_RIGHT (node)); + + VEC_free (gimple, heap, SLP_TREE_SCALAR_STMTS (node)); + + if (SLP_TREE_VEC_STMTS (node)) + VEC_free (gimple, heap, SLP_TREE_VEC_STMTS (node)); + + free (node); +} + + +/* Free the memory allocated for the SLP instance. */ + +void +vect_free_slp_instance (slp_instance instance) +{ + vect_free_slp_tree (SLP_INSTANCE_TREE (instance)); + VEC_free (int, heap, SLP_INSTANCE_LOAD_PERMUTATION (instance)); + VEC_free (slp_tree, heap, SLP_INSTANCE_LOADS (instance)); +} + + +/* Get the defs for the rhs of STMT (collect them in DEF_STMTS0/1), check that + they are of a legal type and that they match the defs of the first stmt of + the SLP group (stored in FIRST_STMT_...). 
*/ + +static bool +vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, slp_tree slp_node, + gimple stmt, VEC (gimple, heap) **def_stmts0, + VEC (gimple, heap) **def_stmts1, + enum vect_def_type *first_stmt_dt0, + enum vect_def_type *first_stmt_dt1, + tree *first_stmt_def0_type, + tree *first_stmt_def1_type, + tree *first_stmt_const_oprnd, + int ncopies_for_cost, + bool *pattern0, bool *pattern1) +{ + tree oprnd; + unsigned int i, number_of_oprnds; + tree def; + gimple def_stmt; + enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type}; + stmt_vec_info stmt_info = + vinfo_for_stmt (VEC_index (gimple, SLP_TREE_SCALAR_STMTS (slp_node), 0)); + enum gimple_rhs_class rhs_class; + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); + + rhs_class = get_gimple_rhs_class (gimple_assign_rhs_code (stmt)); + number_of_oprnds = gimple_num_ops (stmt) - 1; /* RHS only */ + + for (i = 0; i < number_of_oprnds; i++) + { + oprnd = gimple_op (stmt, i + 1); + + if (!vect_is_simple_use (oprnd, loop_vinfo, &def_stmt, &def, &dt[i]) + || (!def_stmt && dt[i] != vect_constant_def)) + { + if (vect_print_dump_info (REPORT_SLP)) + { + fprintf (vect_dump, "Build SLP failed: can't find def for "); + print_generic_expr (vect_dump, oprnd, TDF_SLIM); + } + + return false; + } + + /* Check if DEF_STMT is a part of a pattern and get the def stmt from + the pattern. Check that all the stmts of the node are in the + pattern. */ + if (def_stmt && gimple_bb (def_stmt) + && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)) + && vinfo_for_stmt (def_stmt) + && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt))) + { + if (!*first_stmt_dt0) + *pattern0 = true; + else + { + if (i == 1 && !*first_stmt_dt1) + *pattern1 = true; + else if ((i == 0 && !*pattern0) || (i == 1 && !*pattern1)) + { + if (vect_print_dump_info (REPORT_DETAILS)) + { + fprintf (vect_dump, "Build SLP failed: some of the stmts" + " are in a pattern, and others are not "); + print_generic_expr (vect_dump, oprnd, TDF_SLIM); + } + + return false; + } + } + + def_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt)); + dt[i] = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt)); + + if (*dt == vect_unknown_def_type) + { + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "Unsupported pattern."); + return false; + } + + switch (gimple_code (def_stmt)) + { + case GIMPLE_PHI: + def = gimple_phi_result (def_stmt); + break; + + case GIMPLE_ASSIGN: + def = gimple_assign_lhs (def_stmt); + break; + + default: + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "unsupported defining stmt: "); + return false; + } + } + + if (!*first_stmt_dt0) + { + /* op0 of the first stmt of the group - store its info. */ + *first_stmt_dt0 = dt[i]; + if (def) + *first_stmt_def0_type = TREE_TYPE (def); + else + *first_stmt_const_oprnd = oprnd; + + /* Analyze costs (for the first stmt of the group only). */ + if (rhs_class != GIMPLE_SINGLE_RHS) + /* Not memory operation (we don't call this functions for loads). */ + vect_model_simple_cost (stmt_info, ncopies_for_cost, dt, slp_node); + else + /* Store. */ + vect_model_store_cost (stmt_info, ncopies_for_cost, dt[0], slp_node); + } + + else + { + if (!*first_stmt_dt1 && i == 1) + { + /* op1 of the first stmt of the group - store its info. */ + *first_stmt_dt1 = dt[i]; + if (def) + *first_stmt_def1_type = TREE_TYPE (def); + else + { + /* We assume that the stmt contains only one constant + operand. We fail otherwise, to be on the safe side. 
*/ + if (*first_stmt_const_oprnd) + { + if (vect_print_dump_info (REPORT_SLP)) + fprintf (vect_dump, "Build SLP failed: two constant " + "oprnds in stmt"); + return false; + } + *first_stmt_const_oprnd = oprnd; + } + } + else + { + /* Not first stmt of the group, check that the def-stmt/s match + the def-stmt/s of the first stmt. */ + if ((i == 0 + && (*first_stmt_dt0 != dt[i] + || (*first_stmt_def0_type && def + && *first_stmt_def0_type != TREE_TYPE (def)))) + || (i == 1 + && (*first_stmt_dt1 != dt[i] + || (*first_stmt_def1_type && def + && *first_stmt_def1_type != TREE_TYPE (def)))) + || (!def + && TREE_TYPE (*first_stmt_const_oprnd) + != TREE_TYPE (oprnd))) + { + if (vect_print_dump_info (REPORT_SLP)) + fprintf (vect_dump, "Build SLP failed: different types "); + + return false; + } + } + } + + /* Check the types of the definitions. */ + switch (dt[i]) + { + case vect_constant_def: + case vect_invariant_def: + break; + + case vect_loop_def: + if (i == 0) + VEC_safe_push (gimple, heap, *def_stmts0, def_stmt); + else + VEC_safe_push (gimple, heap, *def_stmts1, def_stmt); + break; + + default: + /* FORNOW: Not supported. */ + if (vect_print_dump_info (REPORT_SLP)) + { + fprintf (vect_dump, "Build SLP failed: illegal type of def "); + print_generic_expr (vect_dump, def, TDF_SLIM); + } + + return false; + } + } + + return true; +} + + +/* Recursively build an SLP tree starting from NODE. + Fail (and return FALSE) if def-stmts are not isomorphic, require data + permutation or are of unsupported types of operation. Otherwise, return + TRUE. */ + +static bool +vect_build_slp_tree (loop_vec_info loop_vinfo, slp_tree *node, + unsigned int group_size, + int *inside_cost, int *outside_cost, + int ncopies_for_cost, unsigned int *max_nunits, + VEC (int, heap) **load_permutation, + VEC (slp_tree, heap) **loads) +{ + VEC (gimple, heap) *def_stmts0 = VEC_alloc (gimple, heap, group_size); + VEC (gimple, heap) *def_stmts1 = VEC_alloc (gimple, heap, group_size); + unsigned int i; + VEC (gimple, heap) *stmts = SLP_TREE_SCALAR_STMTS (*node); + gimple stmt = VEC_index (gimple, stmts, 0); + enum vect_def_type first_stmt_dt0 = 0, first_stmt_dt1 = 0; + enum tree_code first_stmt_code = 0, rhs_code; + tree first_stmt_def1_type = NULL_TREE, first_stmt_def0_type = NULL_TREE; + tree lhs; + bool stop_recursion = false, need_same_oprnds = false; + tree vectype, scalar_type, first_op1 = NULL_TREE; + unsigned int vectorization_factor = 0, ncopies; + optab optab; + int icode; + enum machine_mode optab_op2_mode; + enum machine_mode vec_mode; + tree first_stmt_const_oprnd = NULL_TREE; + struct data_reference *first_dr; + bool pattern0 = false, pattern1 = false; + HOST_WIDE_INT dummy; + bool permutation = false; + unsigned int load_place; + gimple first_load; + + /* For every stmt in NODE find its def stmt/s. 
*/ + for (i = 0; VEC_iterate (gimple, stmts, i, stmt); i++) + { + if (vect_print_dump_info (REPORT_SLP)) + { + fprintf (vect_dump, "Build SLP for "); + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); + } + + lhs = gimple_get_lhs (stmt); + if (lhs == NULL_TREE) + { + if (vect_print_dump_info (REPORT_SLP)) + { + fprintf (vect_dump, + "Build SLP failed: not GIMPLE_ASSIGN nor GIMPLE_CALL"); + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); + } + + return false; + } + + scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy); + vectype = get_vectype_for_scalar_type (scalar_type); + if (!vectype) + { + if (vect_print_dump_info (REPORT_SLP)) + { + fprintf (vect_dump, "Build SLP failed: unsupported data-type "); + print_generic_expr (vect_dump, scalar_type, TDF_SLIM); + } + return false; + } + + gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo)); + vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); + ncopies = vectorization_factor / TYPE_VECTOR_SUBPARTS (vectype); + if (ncopies > 1 && vect_print_dump_info (REPORT_SLP)) + fprintf (vect_dump, "SLP with multiple types "); + + /* In case of multiple types we need to detect the smallest type. */ + if (*max_nunits < TYPE_VECTOR_SUBPARTS (vectype)) + *max_nunits = TYPE_VECTOR_SUBPARTS (vectype); + + if (is_gimple_call (stmt)) + rhs_code = CALL_EXPR; + else + rhs_code = gimple_assign_rhs_code (stmt); + + /* Check the operation. */ + if (i == 0) + { + first_stmt_code = rhs_code; + + /* Shift arguments should be equal in all the packed stmts for a + vector shift with scalar shift operand. */ + if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR + || rhs_code == LROTATE_EXPR + || rhs_code == RROTATE_EXPR) + { + vec_mode = TYPE_MODE (vectype); + + /* First see if we have a vector/vector shift. */ + optab = optab_for_tree_code (rhs_code, vectype, + optab_vector); + + if (!optab + || (optab->handlers[(int) vec_mode].insn_code + == CODE_FOR_nothing)) + { + /* No vector/vector shift, try for a vector/scalar shift. */ + optab = optab_for_tree_code (rhs_code, vectype, + optab_scalar); + + if (!optab) + { + if (vect_print_dump_info (REPORT_SLP)) + fprintf (vect_dump, "Build SLP failed: no optab."); + return false; + } + icode = (int) optab->handlers[(int) vec_mode].insn_code; + if (icode == CODE_FOR_nothing) + { + if (vect_print_dump_info (REPORT_SLP)) + fprintf (vect_dump, "Build SLP failed: " + "op not supported by target."); + return false; + } + optab_op2_mode = insn_data[icode].operand[2].mode; + if (!VECTOR_MODE_P (optab_op2_mode)) + { + need_same_oprnds = true; + first_op1 = gimple_assign_rhs2 (stmt); + } + } + } + } + else + { + if (first_stmt_code != rhs_code + && (first_stmt_code != IMAGPART_EXPR + || rhs_code != REALPART_EXPR) + && (first_stmt_code != REALPART_EXPR + || rhs_code != IMAGPART_EXPR)) + { + if (vect_print_dump_info (REPORT_SLP)) + { + fprintf (vect_dump, + "Build SLP failed: different operation in stmt "); + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); + } + + return false; + } + + if (need_same_oprnds + && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0)) + { + if (vect_print_dump_info (REPORT_SLP)) + { + fprintf (vect_dump, + "Build SLP failed: different shift arguments in "); + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); + } + + return false; + } + } + + /* Strided store or load. */ + if (STMT_VINFO_STRIDED_ACCESS (vinfo_for_stmt (stmt))) + { + if (REFERENCE_CLASS_P (lhs)) + { + /* Store. 
*/ + if (!vect_get_and_check_slp_defs (loop_vinfo, *node, stmt, + &def_stmts0, &def_stmts1, + &first_stmt_dt0, + &first_stmt_dt1, + &first_stmt_def0_type, + &first_stmt_def1_type, + &first_stmt_const_oprnd, + ncopies_for_cost, + &pattern0, &pattern1)) + return false; + } + else + { + /* Load. */ + /* FORNOW: Check that there is no gap between the loads. */ + if ((DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) == stmt + && DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 0) + || (DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) != stmt + && DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 1)) + { + if (vect_print_dump_info (REPORT_SLP)) + { + fprintf (vect_dump, "Build SLP failed: strided " + "loads have gaps "); + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); + } + + return false; + } + + /* Check that the size of interleaved loads group is not + greater than the SLP group size. */ + if (DR_GROUP_SIZE (vinfo_for_stmt (stmt)) + > ncopies * group_size) + { + if (vect_print_dump_info (REPORT_SLP)) + { + fprintf (vect_dump, "Build SLP failed: the number of " + "interleaved loads is greater than" + " the SLP group size "); + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); + } + + return false; + } + + first_load = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)); + + if (first_load == stmt) + { + first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)); + if (vect_supportable_dr_alignment (first_dr) + == dr_unaligned_unsupported) + { + if (vect_print_dump_info (REPORT_SLP)) + { + fprintf (vect_dump, "Build SLP failed: unsupported " + "unaligned load "); + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); + } + + return false; + } + + /* Analyze costs (for the first stmt in the group). */ + vect_model_load_cost (vinfo_for_stmt (stmt), + ncopies_for_cost, *node); + } + + /* Store the place of this load in the interleaving chain. In + case that permutation is needed we later decide if a specific + permutation is supported. */ + load_place = vect_get_place_in_interleaving_chain (stmt, + first_load); + if (load_place != i) + permutation = true; + + VEC_safe_push (int, heap, *load_permutation, load_place); + + /* We stop the tree when we reach a group of loads. */ + stop_recursion = true; + continue; + } + } /* Strided access. */ + else + { + if (TREE_CODE_CLASS (rhs_code) == tcc_reference) + { + /* Not strided load. */ + if (vect_print_dump_info (REPORT_SLP)) + { + fprintf (vect_dump, "Build SLP failed: not strided load "); + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); + } + + /* FORNOW: Not strided loads are not supported. */ + return false; + } + + /* Not memory operation. */ + if (TREE_CODE_CLASS (rhs_code) != tcc_binary + && TREE_CODE_CLASS (rhs_code) != tcc_unary) + { + if (vect_print_dump_info (REPORT_SLP)) + { + fprintf (vect_dump, "Build SLP failed: operation"); + fprintf (vect_dump, " unsupported "); + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); + } + + return false; + } + + /* Find the def-stmts. */ + if (!vect_get_and_check_slp_defs (loop_vinfo, *node, stmt, + &def_stmts0, &def_stmts1, + &first_stmt_dt0, &first_stmt_dt1, + &first_stmt_def0_type, + &first_stmt_def1_type, + &first_stmt_const_oprnd, + ncopies_for_cost, + &pattern0, &pattern1)) + return false; + } + } + + /* Add the costs of the node to the overall instance costs. */ + *inside_cost += SLP_TREE_INSIDE_OF_LOOP_COST (*node); + *outside_cost += SLP_TREE_OUTSIDE_OF_LOOP_COST (*node); + + /* Strided loads were reached - stop the recursion. 
*/ + if (stop_recursion) + { + if (permutation) + { + VEC_safe_push (slp_tree, heap, *loads, *node); + *inside_cost += TARG_VEC_PERMUTE_COST * group_size; + } + + return true; + } + + /* Create SLP_TREE nodes for the definition node/s. */ + if (first_stmt_dt0 == vect_loop_def) + { + slp_tree left_node = XNEW (struct _slp_tree); + SLP_TREE_SCALAR_STMTS (left_node) = def_stmts0; + SLP_TREE_VEC_STMTS (left_node) = NULL; + SLP_TREE_LEFT (left_node) = NULL; + SLP_TREE_RIGHT (left_node) = NULL; + SLP_TREE_OUTSIDE_OF_LOOP_COST (left_node) = 0; + SLP_TREE_INSIDE_OF_LOOP_COST (left_node) = 0; + if (!vect_build_slp_tree (loop_vinfo, &left_node, group_size, + inside_cost, outside_cost, ncopies_for_cost, + max_nunits, load_permutation, loads)) + return false; + + SLP_TREE_LEFT (*node) = left_node; + } + + if (first_stmt_dt1 == vect_loop_def) + { + slp_tree right_node = XNEW (struct _slp_tree); + SLP_TREE_SCALAR_STMTS (right_node) = def_stmts1; + SLP_TREE_VEC_STMTS (right_node) = NULL; + SLP_TREE_LEFT (right_node) = NULL; + SLP_TREE_RIGHT (right_node) = NULL; + SLP_TREE_OUTSIDE_OF_LOOP_COST (right_node) = 0; + SLP_TREE_INSIDE_OF_LOOP_COST (right_node) = 0; + if (!vect_build_slp_tree (loop_vinfo, &right_node, group_size, + inside_cost, outside_cost, ncopies_for_cost, + max_nunits, load_permutation, loads)) + return false; + + SLP_TREE_RIGHT (*node) = right_node; + } + + return true; +} + + +static void +vect_print_slp_tree (slp_tree node) +{ + int i; + gimple stmt; + + if (!node) + return; + + fprintf (vect_dump, "node "); + for (i = 0; VEC_iterate (gimple, SLP_TREE_SCALAR_STMTS (node), i, stmt); i++) + { + fprintf (vect_dump, "\n\tstmt %d ", i); + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); + } + fprintf (vect_dump, "\n"); + + vect_print_slp_tree (SLP_TREE_LEFT (node)); + vect_print_slp_tree (SLP_TREE_RIGHT (node)); +} + + +/* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID). + If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index + J). Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the + stmts in NODE are to be marked. */ + +static void +vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j) +{ + int i; + gimple stmt; + + if (!node) + return; + + for (i = 0; VEC_iterate (gimple, SLP_TREE_SCALAR_STMTS (node), i, stmt); i++) + if (j < 0 || i == j) + STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark; + + vect_mark_slp_stmts (SLP_TREE_LEFT (node), mark, j); + vect_mark_slp_stmts (SLP_TREE_RIGHT (node), mark, j); +} + + +/* Check if the permutation required by the SLP INSTANCE is supported. + Reorganize the SLP nodes stored in SLP_INSTANCE_LOADS if needed. */ + +static bool +vect_supported_slp_permutation_p (slp_instance instance) +{ + slp_tree node = VEC_index (slp_tree, SLP_INSTANCE_LOADS (instance), 0); + gimple stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (node), 0); + gimple first_load = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)); + VEC (slp_tree, heap) *sorted_loads = NULL; + int index; + slp_tree *tmp_loads = NULL; + int group_size = SLP_INSTANCE_GROUP_SIZE (instance), i, j; + slp_tree load; + + /* FORNOW: The only supported loads permutation is loads from the same + location in all the loads in the node, when the data-refs in + nodes of LOADS constitute an interleaving chain. + Sort the nodes according to the order of accesses in the chain. 
*/ + tmp_loads = (slp_tree *) xmalloc (sizeof (slp_tree) * group_size); + for (i = 0, j = 0; + VEC_iterate (int, SLP_INSTANCE_LOAD_PERMUTATION (instance), i, index) + && VEC_iterate (slp_tree, SLP_INSTANCE_LOADS (instance), j, load); + i += group_size, j++) + { + gimple scalar_stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (load), 0); + /* Check that the loads are all in the same interleaving chain. */ + if (DR_GROUP_FIRST_DR (vinfo_for_stmt (scalar_stmt)) != first_load) + { + if (vect_print_dump_info (REPORT_DETAILS)) + { + fprintf (vect_dump, "Build SLP failed: unsupported data " + "permutation "); + print_gimple_stmt (vect_dump, scalar_stmt, 0, TDF_SLIM); + } + + free (tmp_loads); + return false; + } + + tmp_loads[index] = load; + } + + sorted_loads = VEC_alloc (slp_tree, heap, group_size); + for (i = 0; i < group_size; i++) + VEC_safe_push (slp_tree, heap, sorted_loads, tmp_loads[i]); + + VEC_free (slp_tree, heap, SLP_INSTANCE_LOADS (instance)); + SLP_INSTANCE_LOADS (instance) = sorted_loads; + free (tmp_loads); + + if (!vect_transform_slp_perm_load (stmt, NULL, NULL, + SLP_INSTANCE_UNROLLING_FACTOR (instance), + instance, true)) + return false; + + return true; +} + + +/* Check if the required load permutation is supported. + LOAD_PERMUTATION contains a list of indices of the loads. + In SLP this permutation is relative to the order of strided stores that are + the base of the SLP instance. */ + +static bool +vect_supported_load_permutation_p (slp_instance slp_instn, int group_size, + VEC (int, heap) *load_permutation) +{ + int i = 0, j, prev = -1, next, k; + bool supported; + + /* FORNOW: permutations are only supported for loop-aware SLP. */ + if (!slp_instn) + return false; + + if (vect_print_dump_info (REPORT_SLP)) + { + fprintf (vect_dump, "Load permutation "); + for (i = 0; VEC_iterate (int, load_permutation, i, next); i++) + fprintf (vect_dump, "%d ", next); + } + + /* FORNOW: the only supported permutation is 0..01..1.. of length equal to + GROUP_SIZE and where each sequence of same drs is of GROUP_SIZE length as + well. */ + if (VEC_length (int, load_permutation) + != (unsigned int) (group_size * group_size)) + return false; + + supported = true; + for (j = 0; j < group_size; j++) + { + for (i = j * group_size, k = 0; + VEC_iterate (int, load_permutation, i, next) && k < group_size; + i++, k++) + { + if (i != j * group_size && next != prev) + { + supported = false; + break; + } + + prev = next; + } + } + + if (supported && i == group_size * group_size + && vect_supported_slp_permutation_p (slp_instn)) + return true; + + return false; +} + + +/* Find the first load in the loop that belongs to INSTANCE. + When loads are in several SLP nodes, there can be a case in which the first + load does not appear in the first SLP node to be transformed, causing + incorrect order of statements. Since we generate all the loads together, + they must be inserted before the first load of the SLP instance and not + before the first load of the first node of the instance. */ +static gimple +vect_find_first_load_in_slp_instance (slp_instance instance) +{ + int i, j; + slp_tree load_node; + gimple first_load = NULL, load; + + for (i = 0; + VEC_iterate (slp_tree, SLP_INSTANCE_LOADS (instance), i, load_node); + i++) + for (j = 0; + VEC_iterate (gimple, SLP_TREE_SCALAR_STMTS (load_node), j, load); + j++) + first_load = get_earlier_stmt (load, first_load); + + return first_load; +} + + +/* Analyze an SLP instance starting from a group of strided stores. 
Call + vect_build_slp_tree to build a tree of packed stmts if possible. + Return FALSE if it's impossible to SLP any stmt in the loop. */ + +static bool +vect_analyze_slp_instance (loop_vec_info loop_vinfo, gimple stmt) +{ + slp_instance new_instance; + slp_tree node = XNEW (struct _slp_tree); + unsigned int group_size = DR_GROUP_SIZE (vinfo_for_stmt (stmt)); + unsigned int unrolling_factor = 1, nunits; + tree vectype, scalar_type; + gimple next; + unsigned int vectorization_factor = 0, ncopies; + bool slp_impossible = false; + int inside_cost = 0, outside_cost = 0, ncopies_for_cost; + unsigned int max_nunits = 0; + VEC (int, heap) *load_permutation; + VEC (slp_tree, heap) *loads; + + scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF ( + vinfo_for_stmt (stmt)))); + vectype = get_vectype_for_scalar_type (scalar_type); + if (!vectype) + { + if (vect_print_dump_info (REPORT_SLP)) + { + fprintf (vect_dump, "Build SLP failed: unsupported data-type "); + print_generic_expr (vect_dump, scalar_type, TDF_SLIM); + } + return false; + } + + nunits = TYPE_VECTOR_SUBPARTS (vectype); + vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); + ncopies = vectorization_factor / nunits; + + /* Create a node (a root of the SLP tree) for the packed strided stores. */ + SLP_TREE_SCALAR_STMTS (node) = VEC_alloc (gimple, heap, group_size); + next = stmt; + /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */ + while (next) + { + VEC_safe_push (gimple, heap, SLP_TREE_SCALAR_STMTS (node), next); + next = DR_GROUP_NEXT_DR (vinfo_for_stmt (next)); + } + + SLP_TREE_VEC_STMTS (node) = NULL; + SLP_TREE_NUMBER_OF_VEC_STMTS (node) = 0; + SLP_TREE_LEFT (node) = NULL; + SLP_TREE_RIGHT (node) = NULL; + SLP_TREE_OUTSIDE_OF_LOOP_COST (node) = 0; + SLP_TREE_INSIDE_OF_LOOP_COST (node) = 0; + + /* Calculate the unrolling factor. */ + unrolling_factor = least_common_multiple (nunits, group_size) / group_size; + + /* Calculate the number of vector stmts to create based on the unrolling + factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is + GROUP_SIZE / NUNITS otherwise. */ + ncopies_for_cost = unrolling_factor * group_size / nunits; + + load_permutation = VEC_alloc (int, heap, group_size * group_size); + loads = VEC_alloc (slp_tree, heap, group_size); + + /* Build the tree for the SLP instance. */ + if (vect_build_slp_tree (loop_vinfo, &node, group_size, &inside_cost, + &outside_cost, ncopies_for_cost, &max_nunits, + &load_permutation, &loads)) + { + /* Create a new SLP instance. */ + new_instance = XNEW (struct _slp_instance); + SLP_INSTANCE_TREE (new_instance) = node; + SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size; + /* Calculate the unrolling factor based on the smallest type in the + loop. 
*/ + if (max_nunits > nunits) + unrolling_factor = least_common_multiple (max_nunits, group_size) + / group_size; + + SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor; + SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (new_instance) = outside_cost; + SLP_INSTANCE_INSIDE_OF_LOOP_COST (new_instance) = inside_cost; + SLP_INSTANCE_LOADS (new_instance) = loads; + SLP_INSTANCE_FIRST_LOAD_STMT (new_instance) = NULL; + SLP_INSTANCE_LOAD_PERMUTATION (new_instance) = load_permutation; + if (VEC_length (slp_tree, loads)) + { + if (!vect_supported_load_permutation_p (new_instance, group_size, + load_permutation)) + { + if (vect_print_dump_info (REPORT_SLP)) + { + fprintf (vect_dump, "Build SLP failed: unsupported load " + "permutation "); + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); + } + + vect_free_slp_instance (new_instance); + return false; + } + + SLP_INSTANCE_FIRST_LOAD_STMT (new_instance) + = vect_find_first_load_in_slp_instance (new_instance); + } + else + VEC_free (int, heap, SLP_INSTANCE_LOAD_PERMUTATION (new_instance)); + + VEC_safe_push (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo), + new_instance); + if (vect_print_dump_info (REPORT_SLP)) + vect_print_slp_tree (node); + + return true; + } + + /* Failed to SLP. */ + /* Free the allocated memory. */ + vect_free_slp_tree (node); + VEC_free (int, heap, load_permutation); + VEC_free (slp_tree, heap, loads); + + if (slp_impossible) + return false; + + /* SLP failed for this instance, but it is still possible to SLP other stmts + in the loop. */ + return true; +} + + +/* Check if there are stmts in the loop can be vectorized using SLP. Build SLP + trees of packed scalar stmts if SLP is possible. */ + +bool +vect_analyze_slp (loop_vec_info loop_vinfo) +{ + unsigned int i; + VEC (gimple, heap) *strided_stores = LOOP_VINFO_STRIDED_STORES (loop_vinfo); + gimple store; + + if (vect_print_dump_info (REPORT_SLP)) + fprintf (vect_dump, "=== vect_analyze_slp ==="); + + for (i = 0; VEC_iterate (gimple, strided_stores, i, store); i++) + if (!vect_analyze_slp_instance (loop_vinfo, store)) + { + /* SLP failed. No instance can be SLPed in the loop. */ + if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS)) + fprintf (vect_dump, "SLP failed."); + + return false; + } + + return true; +} + + +/* For each possible SLP instance decide whether to SLP it and calculate overall + unrolling factor needed to SLP the loop. */ + +void +vect_make_slp_decision (loop_vec_info loop_vinfo) +{ + unsigned int i, unrolling_factor = 1; + VEC (slp_instance, heap) *slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo); + slp_instance instance; + int decided_to_slp = 0; + + if (vect_print_dump_info (REPORT_SLP)) + fprintf (vect_dump, "=== vect_make_slp_decision ==="); + + for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++) + { + /* FORNOW: SLP if you can. */ + if (unrolling_factor < SLP_INSTANCE_UNROLLING_FACTOR (instance)) + unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (instance); + + /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we + call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and + loop-based vectorization. Such stmts will be marked as HYBRID. */ + vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1); + decided_to_slp++; + } + + LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor; + + if (decided_to_slp && vect_print_dump_info (REPORT_SLP)) + fprintf (vect_dump, "Decided to SLP %d instances. 
Unrolling factor %d", + decided_to_slp, unrolling_factor); +} + + +/* Find stmts that must be both vectorized and SLPed (since they feed stmts that + can't be SLPed) in the tree rooted at NODE. Mark such stmts as HYBRID. */ + +static void +vect_detect_hybrid_slp_stmts (slp_tree node) +{ + int i; + gimple stmt; + imm_use_iterator imm_iter; + gimple use_stmt; + + if (!node) + return; + + for (i = 0; VEC_iterate (gimple, SLP_TREE_SCALAR_STMTS (node), i, stmt); i++) + if (PURE_SLP_STMT (vinfo_for_stmt (stmt)) + && TREE_CODE (gimple_op (stmt, 0)) == SSA_NAME) + FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, gimple_op (stmt, 0)) + if (vinfo_for_stmt (use_stmt) + && !STMT_SLP_TYPE (vinfo_for_stmt (use_stmt)) + && STMT_VINFO_RELEVANT (vinfo_for_stmt (use_stmt))) + vect_mark_slp_stmts (node, hybrid, i); + + vect_detect_hybrid_slp_stmts (SLP_TREE_LEFT (node)); + vect_detect_hybrid_slp_stmts (SLP_TREE_RIGHT (node)); +} + + +/* Find stmts that must be both vectorized and SLPed. */ + +void +vect_detect_hybrid_slp (loop_vec_info loop_vinfo) +{ + unsigned int i; + VEC (slp_instance, heap) *slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo); + slp_instance instance; + + if (vect_print_dump_info (REPORT_SLP)) + fprintf (vect_dump, "=== vect_detect_hybrid_slp ==="); + + for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++) + vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance)); +} + +/* SLP costs are calculated according to SLP instance unrolling factor (i.e., + the number of created vector stmts depends on the unrolling factor). However, + the actual number of vector stmts for every SLP node depends on VF which is + set later in vect_analyze_operations(). Hence, SLP costs should be updated. + In this function we assume that the inside costs calculated in + vect_model_xxx_cost are linear in ncopies. */ + +void +vect_update_slp_costs_according_to_vf (loop_vec_info loop_vinfo) +{ + unsigned int i, vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); + VEC (slp_instance, heap) *slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo); + slp_instance instance; + + if (vect_print_dump_info (REPORT_SLP)) + fprintf (vect_dump, "=== vect_update_slp_costs_according_to_vf ==="); + + for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++) + /* We assume that costs are linear in ncopies. */ + SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance) *= vf + / SLP_INSTANCE_UNROLLING_FACTOR (instance); +} + +/* For constant and loop invariant defs of SLP_NODE this function returns + (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts. + OP_NUM determines if we gather defs for operand 0 or operand 1 of the scalar + stmts. NUMBER_OF_VECTORS is the number of vector defs to create. 
*/ + +static void +vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds, + unsigned int op_num, unsigned int number_of_vectors) +{ + VEC (gimple, heap) *stmts = SLP_TREE_SCALAR_STMTS (slp_node); + gimple stmt = VEC_index (gimple, stmts, 0); + stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt); + tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo); + int nunits; + tree vec_cst; + tree t = NULL_TREE; + int j, number_of_places_left_in_vector; + tree vector_type; + tree op, vop; + int group_size = VEC_length (gimple, stmts); + unsigned int vec_num, i; + int number_of_copies = 1; + VEC (tree, heap) *voprnds = VEC_alloc (tree, heap, number_of_vectors); + bool constant_p, is_store; + + if (STMT_VINFO_DATA_REF (stmt_vinfo)) + { + is_store = true; + op = gimple_assign_rhs1 (stmt); + } + else + { + is_store = false; + op = gimple_op (stmt, op_num + 1); + } + + if (CONSTANT_CLASS_P (op)) + { + vector_type = vectype; + constant_p = true; + } + else + { + vector_type = get_vectype_for_scalar_type (TREE_TYPE (op)); + gcc_assert (vector_type); + constant_p = false; + } + + nunits = TYPE_VECTOR_SUBPARTS (vector_type); + + /* NUMBER_OF_COPIES is the number of times we need to use the same values in + created vectors. It is greater than 1 if unrolling is performed. + + For example, we have two scalar operands, s1 and s2 (e.g., group of + strided accesses of size two), while NUNITS is four (i.e., four scalars + of this type can be packed in a vector). The output vector will contain + two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES + will be 2). + + If GROUP_SIZE > NUNITS, the scalars will be split into several vectors + containing the operands. + + For example, NUNITS is four as before, and the group size is 8 + (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and + {s5, s6, s7, s8}. */ + + number_of_copies = least_common_multiple (nunits, group_size) / group_size; + + number_of_places_left_in_vector = nunits; + for (j = 0; j < number_of_copies; j++) + { + for (i = group_size - 1; VEC_iterate (gimple, stmts, i, stmt); i--) + { + if (is_store) + op = gimple_assign_rhs1 (stmt); + else + op = gimple_op (stmt, op_num + 1); + + /* Create 'vect_ = {op0,op1,...,opn}'. */ + t = tree_cons (NULL_TREE, op, t); + + number_of_places_left_in_vector--; + + if (number_of_places_left_in_vector == 0) + { + number_of_places_left_in_vector = nunits; + + if (constant_p) + vec_cst = build_vector (vector_type, t); + else + vec_cst = build_constructor_from_list (vector_type, t); + VEC_quick_push (tree, voprnds, + vect_init_vector (stmt, vec_cst, vector_type, NULL)); + t = NULL_TREE; + } + } + } + + /* Since the vectors are created in the reverse order, we should invert + them. */ + vec_num = VEC_length (tree, voprnds); + for (j = vec_num - 1; j >= 0; j--) + { + vop = VEC_index (tree, voprnds, j); + VEC_quick_push (tree, *vec_oprnds, vop); + } + + VEC_free (tree, heap, voprnds); + + /* In case that VF is greater than the unrolling factor needed for the SLP + group of stmts, NUMBER_OF_VECTORS to be created is greater than + NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have + to replicate the vectors. */ + while (number_of_vectors > VEC_length (tree, *vec_oprnds)) + { + for (i = 0; VEC_iterate (tree, *vec_oprnds, i, vop) && i < vec_num; i++) + VEC_quick_push (tree, *vec_oprnds, vop); + } +} + + +/* Get vectorized definitions from SLP_NODE that contains corresponding + vectorized def-stmts. 
*/ + +static void +vect_get_slp_vect_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds) +{ + tree vec_oprnd; + gimple vec_def_stmt; + unsigned int i; + + gcc_assert (SLP_TREE_VEC_STMTS (slp_node)); + + for (i = 0; + VEC_iterate (gimple, SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt); + i++) + { + gcc_assert (vec_def_stmt); + vec_oprnd = gimple_get_lhs (vec_def_stmt); + VEC_quick_push (tree, *vec_oprnds, vec_oprnd); + } +} + + +/* Get vectorized definitions for SLP_NODE. + If the scalar definitions are loop invariants or constants, collect them and + call vect_get_constant_vectors() to create vector stmts. + Otherwise, the def-stmts must be already vectorized and the vectorized stmts + must be stored in the LEFT/RIGHT node of SLP_NODE, and we call + vect_get_slp_vect_defs() to retrieve them. + If VEC_OPRNDS1 is NULL, don't get vector defs for the second operand (from + the right node. This is used when the second operand must remain scalar. */ + +void +vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0, + VEC (tree,heap) **vec_oprnds1) +{ + gimple first_stmt; + enum tree_code code; + int number_of_vects; + HOST_WIDE_INT lhs_size_unit, rhs_size_unit; + + first_stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (slp_node), 0); + /* The number of vector defs is determined by the number of vector statements + in the node from which we get those statements. */ + if (SLP_TREE_LEFT (slp_node)) + number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_LEFT (slp_node)); + else + { + number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); + /* Number of vector stmts was calculated according to LHS in + vect_schedule_slp_instance(), fix it by replacing LHS with RHS, if + necessary. See vect_get_smallest_scalar_type() for details. */ + vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit, + &rhs_size_unit); + if (rhs_size_unit != lhs_size_unit) + { + number_of_vects *= rhs_size_unit; + number_of_vects /= lhs_size_unit; + } + } + + /* Allocate memory for vectorized defs. */ + *vec_oprnds0 = VEC_alloc (tree, heap, number_of_vects); + + /* SLP_NODE corresponds either to a group of stores or to a group of + unary/binary operations. We don't call this function for loads. */ + if (SLP_TREE_LEFT (slp_node)) + /* The defs are already vectorized. */ + vect_get_slp_vect_defs (SLP_TREE_LEFT (slp_node), vec_oprnds0); + else + /* Build vectors from scalar defs. */ + vect_get_constant_vectors (slp_node, vec_oprnds0, 0, number_of_vects); + + if (STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt))) + /* Since we don't call this function with loads, this is a group of + stores. */ + return; + + code = gimple_assign_rhs_code (first_stmt); + if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS || !vec_oprnds1) + return; + + /* The number of vector defs is determined by the number of vector statements + in the node from which we get those statements. */ + if (SLP_TREE_RIGHT (slp_node)) + number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_RIGHT (slp_node)); + else + number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); + + *vec_oprnds1 = VEC_alloc (tree, heap, number_of_vects); + + if (SLP_TREE_RIGHT (slp_node)) + /* The defs are already vectorized. */ + vect_get_slp_vect_defs (SLP_TREE_RIGHT (slp_node), vec_oprnds1); + else + /* Build vectors from scalar defs. 
*/ + vect_get_constant_vectors (slp_node, vec_oprnds1, 1, number_of_vects); +} + +/* Create NCOPIES permutation statements using the mask MASK_BYTES (by + building a vector of type MASK_TYPE from it) and two input vectors placed in + DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX for the first copy and + shifting by STRIDE elements of DR_CHAIN for every copy. + (STRIDE is the number of vectorized stmts for NODE divided by the number of + copies). + VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE, where + the created stmts must be inserted. */ + +static inline void +vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt, + int *mask_array, int mask_nunits, + tree mask_element_type, tree mask_type, + int first_vec_indx, int second_vec_indx, + gimple_stmt_iterator *gsi, slp_tree node, + tree builtin_decl, tree vectype, + VEC(tree,heap) *dr_chain, + int ncopies, int vect_stmts_counter) +{ + tree t = NULL_TREE, mask_vec, mask, perm_dest; + gimple perm_stmt = NULL; + stmt_vec_info next_stmt_info; + int i, group_size, stride, dr_chain_size; + tree first_vec, second_vec, data_ref; + tree sym; + ssa_op_iter iter; + VEC (tree, heap) *params = NULL; + + /* Create a vector mask. */ + for (i = mask_nunits - 1; i >= 0; --i) + t = tree_cons (NULL_TREE, build_int_cst (mask_element_type, mask_array[i]), + t); + mask_vec = build_vector (mask_type, t); + mask = vect_init_vector (stmt, mask_vec, mask_type, NULL); + + group_size = VEC_length (gimple, SLP_TREE_SCALAR_STMTS (node)); + stride = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies; + dr_chain_size = VEC_length (tree, dr_chain); + + /* Initialize the vect stmts of NODE to properly insert the generated + stmts later. */ + for (i = VEC_length (gimple, SLP_TREE_VEC_STMTS (node)); + i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++) + VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (node), NULL); + + perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype); + for (i = 0; i < ncopies; i++) + { + first_vec = VEC_index (tree, dr_chain, first_vec_indx); + second_vec = VEC_index (tree, dr_chain, second_vec_indx); + + /* Build argument list for the vectorized call. */ + VEC_free (tree, heap, params); + params = VEC_alloc (tree, heap, 3); + VEC_quick_push (tree, params, first_vec); + VEC_quick_push (tree, params, second_vec); + VEC_quick_push (tree, params, mask); + + /* Generate the permute statement. */ + perm_stmt = gimple_build_call_vec (builtin_decl, params); + data_ref = make_ssa_name (perm_dest, perm_stmt); + gimple_call_set_lhs (perm_stmt, data_ref); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + FOR_EACH_SSA_TREE_OPERAND (sym, perm_stmt, iter, SSA_OP_ALL_VIRTUALS) + { + if (TREE_CODE (sym) == SSA_NAME) + sym = SSA_NAME_VAR (sym); + mark_sym_for_renaming (sym); + } + + /* Store the vector statement in NODE. */ + VEC_replace (gimple, SLP_TREE_VEC_STMTS (node), + stride * i + vect_stmts_counter, perm_stmt); + + first_vec_indx += stride; + second_vec_indx += stride; + } + + /* Mark the scalar stmt as vectorized. */ + next_stmt_info = vinfo_for_stmt (next_scalar_stmt); + STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt; +} + + +/* Given FIRST_MASK_ELEMENT - the mask element in element representation, + return in CURRENT_MASK_ELEMENT its equivalent in target specific + representation. Check that the mask is valid and return FALSE if not. + Return TRUE in NEED_NEXT_VECTOR if the permutation requires to move to + the next vector, i.e., the current first vector is not needed. 
*/ + +static bool +vect_get_mask_element (gimple stmt, int first_mask_element, int m, + int mask_nunits, bool only_one_vec, int index, + int *mask, int *current_mask_element, + bool *need_next_vector) +{ + int i; + static int number_of_mask_fixes = 1; + static bool mask_fixed = false; + static bool needs_first_vector = false; + + /* Convert to target specific representation. */ + *current_mask_element = first_mask_element + m; + /* Adjust the value in case it's a mask for second and third vectors. */ + *current_mask_element -= mask_nunits * (number_of_mask_fixes - 1); + + if (*current_mask_element < mask_nunits) + needs_first_vector = true; + + /* We have only one input vector to permute but the mask accesses values in + the next vector as well. */ + if (only_one_vec && *current_mask_element >= mask_nunits) + { + if (vect_print_dump_info (REPORT_DETAILS)) + { + fprintf (vect_dump, "permutation requires at least two vectors "); + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); + } + + return false; + } + + /* The mask requires the next vector. */ + if (*current_mask_element >= mask_nunits * 2) + { + if (needs_first_vector || mask_fixed) + { + /* We either need the first vector too or have already moved to the + next vector. In both cases, this permutation needs three + vectors. */ + if (vect_print_dump_info (REPORT_DETAILS)) + { + fprintf (vect_dump, "permutation requires at " + "least three vectors "); + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); + } + + return false; + } + + /* We move to the next vector, dropping the first one and working with + the second and the third - we need to adjust the values of the mask + accordingly. */ + *current_mask_element -= mask_nunits * number_of_mask_fixes; + + for (i = 0; i < index; i++) + mask[i] -= mask_nunits * number_of_mask_fixes; + + (number_of_mask_fixes)++; + mask_fixed = true; + } + + *need_next_vector = mask_fixed; + + /* This was the last element of this mask. Start a new one. */ + if (index == mask_nunits - 1) + { + number_of_mask_fixes = 1; + mask_fixed = false; + needs_first_vector = false; + } + + return true; +} + + +/* Generate vector permute statements from a list of loads in DR_CHAIN. + If ANALYZE_ONLY is TRUE, only check that it is possible to create valid + permute statements for SLP_NODE_INSTANCE. 
*/ +bool +vect_transform_slp_perm_load (gimple stmt, VEC (tree, heap) *dr_chain, + gimple_stmt_iterator *gsi, int vf, + slp_instance slp_node_instance, bool analyze_only) +{ + stmt_vec_info stmt_info = vinfo_for_stmt (stmt); + tree mask_element_type = NULL_TREE, mask_type; + int i, j, k, m, scale, mask_nunits, nunits, vec_index = 0, scalar_index; + slp_tree node; + tree vectype = STMT_VINFO_VECTYPE (stmt_info), builtin_decl; + gimple next_scalar_stmt; + int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance); + int first_mask_element; + int index, unroll_factor, *mask, current_mask_element, ncopies; + bool only_one_vec = false, need_next_vector = false; + int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter; + + if (!targetm.vectorize.builtin_vec_perm) + { + if (vect_print_dump_info (REPORT_DETAILS)) + { + fprintf (vect_dump, "no builtin for vect permute for "); + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); + } + + return false; + } + + builtin_decl = targetm.vectorize.builtin_vec_perm (vectype, + &mask_element_type); + if (!builtin_decl || !mask_element_type) + { + if (vect_print_dump_info (REPORT_DETAILS)) + { + fprintf (vect_dump, "no builtin for vect permute for "); + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); + } + + return false; + } + + mask_type = get_vectype_for_scalar_type (mask_element_type); + mask_nunits = TYPE_VECTOR_SUBPARTS (mask_type); + mask = (int *) xmalloc (sizeof (int) * mask_nunits); + nunits = TYPE_VECTOR_SUBPARTS (vectype); + scale = mask_nunits / nunits; + unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance); + + /* The number of vector stmts to generate based only on SLP_NODE_INSTANCE + unrolling factor. */ + orig_vec_stmts_num = group_size * + SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance) / nunits; + if (orig_vec_stmts_num == 1) + only_one_vec = true; + + /* Number of copies is determined by the final vectorization factor + relatively to SLP_NODE_INSTANCE unrolling factor. */ + ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance); + + /* Generate permutation masks for every NODE. Number of masks for each NODE + is equal to GROUP_SIZE. + E.g., we have a group of three nodes with three loads from the same + location in each node, and the vector size is 4. I.e., we have a + a0b0c0a1b1c1... sequence and we need to create the following vectors: + for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3 + for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3 + ... + + The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9} (in target + scpecific type, e.g., in bytes for Altivec. + The last mask is illegal since we assume two operands for permute + operation, and the mask element values can't be outside that range. Hence, + the last mask must be converted into {2,5,5,5}. + For the first two permutations we need the first and the second input + vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation + we need the second and the third vectors: {b1,c1,a2,b2} and + {c2,a3,b3,c3}. 
*/ + + for (i = 0; + VEC_iterate (slp_tree, SLP_INSTANCE_LOADS (slp_node_instance), + i, node); + i++) + { + scalar_index = 0; + index = 0; + vect_stmts_counter = 0; + vec_index = 0; + first_vec_index = vec_index++; + if (only_one_vec) + second_vec_index = first_vec_index; + else + second_vec_index = vec_index++; + + for (j = 0; j < unroll_factor; j++) + { + for (k = 0; k < group_size; k++) + { + first_mask_element = (i + j * group_size) * scale; + for (m = 0; m < scale; m++) + { + if (!vect_get_mask_element (stmt, first_mask_element, m, + mask_nunits, only_one_vec, index, mask, + ¤t_mask_element, &need_next_vector)) + return false; + + mask[index++] = current_mask_element; + } + + if (index == mask_nunits) + { + index = 0; + if (!analyze_only) + { + if (need_next_vector) + { + first_vec_index = second_vec_index; + second_vec_index = vec_index; + } + + next_scalar_stmt = VEC_index (gimple, + SLP_TREE_SCALAR_STMTS (node), scalar_index++); + + vect_create_mask_and_perm (stmt, next_scalar_stmt, + mask, mask_nunits, mask_element_type, mask_type, + first_vec_index, second_vec_index, gsi, node, + builtin_decl, vectype, dr_chain, ncopies, + vect_stmts_counter++); + } + } + } + } + } + + free (mask); + return true; +} + + + +/* Vectorize SLP instance tree in postorder. */ + +static bool +vect_schedule_slp_instance (slp_tree node, slp_instance instance, + unsigned int vectorization_factor) +{ + gimple stmt; + bool strided_store, is_store; + gimple_stmt_iterator si; + stmt_vec_info stmt_info; + unsigned int vec_stmts_size, nunits, group_size; + tree vectype; + int i; + slp_tree loads_node; + + if (!node) + return false; + + vect_schedule_slp_instance (SLP_TREE_LEFT (node), instance, + vectorization_factor); + vect_schedule_slp_instance (SLP_TREE_RIGHT (node), instance, + vectorization_factor); + + stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (node), 0); + stmt_info = vinfo_for_stmt (stmt); + + /* VECTYPE is the type of the destination. */ + vectype = get_vectype_for_scalar_type (TREE_TYPE (gimple_assign_lhs (stmt))); + nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (vectype); + group_size = SLP_INSTANCE_GROUP_SIZE (instance); + + /* For each SLP instance calculate number of vector stmts to be created + for the scalar stmts in each node of the SLP tree. Number of vector + elements in one vector iteration is the number of scalar elements in + one scalar iteration (GROUP_SIZE) multiplied by VF divided by vector + size. */ + vec_stmts_size = (vectorization_factor * group_size) / nunits; + + /* In case of load permutation we have to allocate vectorized statements for + all the nodes that participate in that permutation. */ + if (SLP_INSTANCE_LOAD_PERMUTATION (instance)) + { + for (i = 0; + VEC_iterate (slp_tree, SLP_INSTANCE_LOADS (instance), i, loads_node); + i++) + { + if (!SLP_TREE_VEC_STMTS (loads_node)) + { + SLP_TREE_VEC_STMTS (loads_node) = VEC_alloc (gimple, heap, + vec_stmts_size); + SLP_TREE_NUMBER_OF_VEC_STMTS (loads_node) = vec_stmts_size; + } + } + } + + if (!SLP_TREE_VEC_STMTS (node)) + { + SLP_TREE_VEC_STMTS (node) = VEC_alloc (gimple, heap, vec_stmts_size); + SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size; + } + + if (vect_print_dump_info (REPORT_DETAILS)) + { + fprintf (vect_dump, "------>vectorizing SLP node starting from: "); + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); + } + + /* Loads should be inserted before the first load. 
*/ + if (SLP_INSTANCE_FIRST_LOAD_STMT (instance) + && STMT_VINFO_STRIDED_ACCESS (stmt_info) + && !REFERENCE_CLASS_P (gimple_get_lhs (stmt))) + si = gsi_for_stmt (SLP_INSTANCE_FIRST_LOAD_STMT (instance)); + else + si = gsi_for_stmt (stmt); + + is_store = vect_transform_stmt (stmt, &si, &strided_store, node, instance); + if (is_store) + { + if (DR_GROUP_FIRST_DR (stmt_info)) + /* If IS_STORE is TRUE, the vectorization of the + interleaving chain was completed - free all the stores in + the chain. */ + vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info)); + else + /* FORNOW: SLP originates only from strided stores. */ + gcc_unreachable (); + + return true; + } + + /* FORNOW: SLP originates only from strided stores. */ + return false; +} + + +bool +vect_schedule_slp (loop_vec_info loop_vinfo) +{ + VEC (slp_instance, heap) *slp_instances = + LOOP_VINFO_SLP_INSTANCES (loop_vinfo); + slp_instance instance; + unsigned int i; + bool is_store = false; + + for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++) + { + /* Schedule the tree of INSTANCE. */ + is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance), + instance, LOOP_VINFO_VECT_FACTOR (loop_vinfo)); + + if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS) + || vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS)) + fprintf (vect_dump, "vectorizing stmts using SLP."); + } + + return is_store; +} |
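
As a rough, illustrative sketch (not part of the commit): the new pass starts from groups of strided (interleaved) stores, as in the loop below, where four isomorphic stores to a[4*i] .. a[4*i+3] form a store group of size four that vect_analyze_slp_instance can turn into an SLP tree. The function and parameter names are hypothetical, and the claimed outcome assumes the target provides a four-element integer vector type.

/* Hypothetical example, not from the GCC sources: a loop whose body is
   a group of four interleaved stores with isomorphic right-hand sides.
   vect_analyze_slp_instance would start from this strided-store group,
   vect_build_slp_tree would recurse into the b[] and c[] load groups
   (each load sits at its own position in the interleaving chain, so no
   permutation is needed), and with a 4-element vector type the body can
   become one vector add plus one vector store per iteration, with no
   extra unrolling (unrolling factor 1, since group size equals nunits).  */

void
sum4 (int *restrict a, int *restrict b, int *restrict c, int n)
{
  int i;

  for (i = 0; i < n; i++)
    {
      a[4 * i + 0] = b[4 * i + 0] + c[4 * i + 0];
      a[4 * i + 1] = b[4 * i + 1] + c[4 * i + 1];
      a[4 * i + 2] = b[4 * i + 2] + c[4 * i + 2];
      a[4 * i + 3] = b[4 * i + 3] + c[4 * i + 3];
    }
}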