From b5aeb3bb3e9d1a0ce78fe2d7de9f510a7413605d Mon Sep 17 00:00:00 2001
From: Ira Rosen
Date: Mon, 19 Apr 2010 09:10:45 +0000
Subject: re PR tree-optimization/37027 (SLP loop vectorization missing support for reductions)

	PR tree-optimization/37027
	* tree-vectorizer.h (struct _loop_vec_info): Add new field reductions
	and macro to access it.
	(vectorizable_reduction): Add argument.
	(vect_get_slp_defs): Likewise.
	* tree-vect-loop.c (vect_analyze_scalar_cycles_1): Collect reduction
	statements for possible use in SLP.
	(new_loop_vec_info): Initialize LOOP_VINFO_REDUCTIONS.
	(destroy_loop_vec_info): Free LOOP_VINFO_REDUCTIONS.
	(vect_create_epilog_for_reduction): Handle SLP. Modify documentation,
	add new argument.
	(vectorizable_reduction): Likewise.
	* tree-vect-stmts.c (vect_get_vec_defs): Update call to
	vect_get_slp_defs.
	(vectorizable_type_demotion, vectorizable_type_promotion,
	vectorizable_store): Likewise.
	(vect_analyze_stmt): Update call to vectorizable_reduction.
	(vect_transform_stmt): Likewise.
	* tree-vect-slp.c (vect_get_and_check_slp_defs): Handle reduction.
	(vect_build_slp_tree): Fix indentation. Check that there are no loads
	from different interleaving chains in same node.
	(vect_slp_rearrange_stmts): New function.
	(vect_supported_load_permutation_p): Allow load permutations for
	reductions. Call vect_slp_rearrange_stmts() to rearrange statements
	inside SLP nodes if necessary.
	(vect_analyze_slp_instance): Handle reductions.
	(vect_analyze_slp): Try to build SLP instances originating from groups
	of reductions.
	(vect_detect_hybrid_slp_stmts): Skip reduction statements.
	(vect_get_constant_vectors): Create initial vectors for reductions
	according to reduction code. Add new argument.
	(vect_get_slp_defs): Add new argument, pass it to
	vect_get_constant_vectors.
	(vect_schedule_slp_instance): Remove SLP tree root statements.

From-SVN: r158506
---
 gcc/tree-vect-slp.c | 399 ++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 325 insertions(+), 74 deletions(-)
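
As an illustration of what the change enables (a hypothetical kernel, not
taken from the patch or its testcases; all names are made up), the SLP
vectorizer can now build an instance whose root is a group of reduction
statements rather than a group of strided stores:

    /* Two independent accumulations over interleaved data form a
       reduction group; each iteration contributes one element to each
       accumulator, so the two statements can be vectorized together.  */
    int
    sum_pairs (const int *a, int n)
    {
      int sum0 = 0, sum1 = 0, i;

      for (i = 0; i < n; i++)
        {
          sum0 += a[2 * i];
          sum1 += a[2 * i + 1];
        }

      return sum0 + sum1;
    }

The two accumulations become the SLP tree root, their initial values are
taken from the loop-preheader PHI arguments, and additional vector copies
use the operation's neutral element (0 for PLUS_EXPR, 1 for MULT_EXPR), as
set up in vect_get_constant_vectors below.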
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index afc4f311078..99a865fee20 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -273,6 +273,7 @@ vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
 	  break;
 
 	case vect_internal_def:
+	case vect_reduction_def:
 	  if (i == 0)
 	    VEC_safe_push (gimple, heap, *def_stmts0, def_stmt);
 	  else
@@ -332,7 +333,7 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
   HOST_WIDE_INT dummy;
   bool permutation = false;
   unsigned int load_place;
-  gimple first_load;
+  gimple first_load, prev_first_load = NULL;
 
   /* For every stmt in NODE find its def stmt/s.  */
   for (i = 0; VEC_iterate (gimple, stmts, i, stmt); i++)
@@ -485,42 +486,62 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
 		   &pattern0, &pattern1))
 	    return false;
 	}
-      else
-	{
-	  /* Load.  */
-	  /* FORNOW: Check that there is no gap between the loads.  */
-	  if ((DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) == stmt
-	       && DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
-	      || (DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) != stmt
-		  && DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 1))
-	    {
-	      if (vect_print_dump_info (REPORT_SLP))
-		{
-		  fprintf (vect_dump, "Build SLP failed: strided "
-			   "loads have gaps ");
-		  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
-		}
+      else
+	{
+	  /* Load.  */
+	  /* FORNOW: Check that there is no gap between the loads.  */
+	  if ((DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) == stmt
+	       && DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
+	      || (DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) != stmt
+		  && DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 1))
+	    {
+	      if (vect_print_dump_info (REPORT_SLP))
+		{
+		  fprintf (vect_dump, "Build SLP failed: strided "
+			   "loads have gaps ");
+		  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+		}
 
-	      return false;
-	    }
-
-	  /* Check that the size of interleaved loads group is not
-	     greater than the SLP group size.  */
-	  if (DR_GROUP_SIZE (vinfo_for_stmt (stmt))
-	      > ncopies * group_size)
-	    {
-	      if (vect_print_dump_info (REPORT_SLP))
-		{
-		  fprintf (vect_dump, "Build SLP failed: the number of "
-			   "interleaved loads is greater than"
-			   " the SLP group size ");
-		  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
-		}
+	      return false;
+	    }
 
-	      return false;
-	    }
+	  /* Check that the size of interleaved loads group is not
+	     greater than the SLP group size.  */
+	  if (DR_GROUP_SIZE (vinfo_for_stmt (stmt)) > ncopies * group_size)
+	    {
+	      if (vect_print_dump_info (REPORT_SLP))
+		{
+		  fprintf (vect_dump, "Build SLP failed: the number of "
+			   "interleaved loads is greater than"
+			   " the SLP group size ");
+		  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+		}
 
-	  first_load = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt));
+	      return false;
+	    }
+
+	  first_load = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt));
+	  if (prev_first_load)
+	    {
+	      /* Check that there are no loads from different interleaving
+		 chains in the same node.  The only exception is complex
+		 numbers.  */
+	      if (prev_first_load != first_load
+		  && rhs_code != REALPART_EXPR
+		  && rhs_code != IMAGPART_EXPR)
+		{
+		  if (vect_print_dump_info (REPORT_SLP))
+		    {
+		      fprintf (vect_dump, "Build SLP failed: different "
+			       "interleaving chains in one node ");
+		      print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+		    }
+
+		  return false;
+		}
+	    }
+	  else
+	    prev_first_load = first_load;
 
 	  if (first_load == stmt)
 	    {
@@ -787,6 +808,39 @@ vect_supported_slp_permutation_p (slp_instance instance)
 }
 
 
+/* Rearrange the statements of NODE according to PERMUTATION.  */
+
+static void
+vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
+                          VEC (int, heap) *permutation)
+{
+  gimple stmt;
+  VEC (gimple, heap) *tmp_stmts;
+  unsigned int index, i;
+
+  if (!node)
+    return;
+
+  vect_slp_rearrange_stmts (SLP_TREE_LEFT (node), group_size, permutation);
+  vect_slp_rearrange_stmts (SLP_TREE_RIGHT (node), group_size, permutation);
+
+  gcc_assert (group_size == VEC_length (gimple, SLP_TREE_SCALAR_STMTS (node)));
+  tmp_stmts = VEC_alloc (gimple, heap, group_size);
+
+  for (i = 0; i < group_size; i++)
+    VEC_safe_push (gimple, heap, tmp_stmts, NULL);
+
+  for (i = 0; VEC_iterate (gimple, SLP_TREE_SCALAR_STMTS (node), i, stmt); i++)
+    {
+      index = VEC_index (int, permutation, i);
+      VEC_replace (gimple, tmp_stmts, index, stmt);
+    }
+
+  VEC_free (gimple, heap, SLP_TREE_SCALAR_STMTS (node));
+  SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
+}
+
+
 /* Check if the required load permutation is supported.
    LOAD_PERMUTATION contains a list of indices of the loads.
    In SLP this permutation is relative to the order of strided stores that are
@@ -796,9 +850,11 @@ static bool
 vect_supported_load_permutation_p (slp_instance slp_instn, int group_size,
                                    VEC (int, heap) *load_permutation)
 {
-  int i = 0, j, prev = -1, next, k;
-  bool supported;
+  int i = 0, j, prev = -1, next, k, number_of_groups;
+  bool supported, bad_permutation = false;
   sbitmap load_index;
+  slp_tree node;
+  gimple stmt;
 
   /* FORNOW: permutations are only supported in SLP.  */
   if (!slp_instn)
     return false;
 
@@ -811,9 +867,72 @@ vect_supported_load_permutation_p (slp_instance slp_instn, int group_size,
           fprintf (vect_dump, "%d ", next);
     }
 
+  /* In case of reduction every load permutation is allowed, since the order
+     of the reduction statements is not important (as opposed to the case of
+     strided stores).  The only condition we need to check is that all the
+     load nodes are of the same size and have the same permutation (and then
+     rearrange all the nodes of the SLP instance according to this
+     permutation).  */
+
+  /* Check that all the load nodes are of the same size.  */
+  for (i = 0;
+       VEC_iterate (slp_tree, SLP_INSTANCE_LOADS (slp_instn), i, node);
+       i++)
+    if (VEC_length (gimple, SLP_TREE_SCALAR_STMTS (node))
+        != (unsigned) group_size)
+      return false;
+
+  node = SLP_INSTANCE_TREE (slp_instn);
+  stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (node), 0);
+  /* LOAD_PERMUTATION is a list of indices of all the loads of the SLP
+     instance, not all the loads belong to the same node or interleaving
+     group.  Hence, we need to divide them into groups according to
+     GROUP_SIZE.  */
+  number_of_groups = VEC_length (int, load_permutation) / group_size;
+
+  /* Reduction (there are no data-refs in the root).  */
+  if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
+    {
+      int first_group_load_index;
+
+      /* Compare all the permutation sequences to the first one.  */
+      for (i = 1; i < number_of_groups; i++)
+        {
+          k = 0;
+          for (j = i * group_size; j < i * group_size + group_size; j++)
+            {
+              next = VEC_index (int, load_permutation, j);
+              first_group_load_index = VEC_index (int, load_permutation, k);
+
+              if (next != first_group_load_index)
+                {
+                  bad_permutation = true;
+                  break;
+                }
+
+              k++;
+            }
+
+          if (bad_permutation)
+            break;
+        }
+
+      if (!bad_permutation)
+        {
+          /* This permutation is valid for reduction.  Since the order of the
+             statements in the nodes is not important unless they are memory
+             accesses, we can rearrange the statements in all the nodes
+             according to the order of the loads.  */
+          vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
+                                    load_permutation);
+          VEC_free (int, heap, SLP_INSTANCE_LOAD_PERMUTATION (slp_instn));
+          return true;
+        }
+    }
+
   /* FORNOW: the only supported permutation is 0..01..1.. of length equal to
      GROUP_SIZE and where each sequence of same drs is of GROUP_SIZE length as
-     well.  */
+     well (unless it's reduction).  */
   if (VEC_length (int, load_permutation)
       != (unsigned int) (group_size * group_size))
     return false;
 
@@ -896,17 +1015,28 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
   slp_tree node = XNEW (struct _slp_tree);
   unsigned int group_size = DR_GROUP_SIZE (vinfo_for_stmt (stmt));
   unsigned int unrolling_factor = 1, nunits;
-  tree vectype, scalar_type;
+  tree vectype, scalar_type = NULL_TREE;
   gimple next;
   unsigned int vectorization_factor = 0;
-  int inside_cost = 0, outside_cost = 0, ncopies_for_cost;
+  int inside_cost = 0, outside_cost = 0, ncopies_for_cost, i;
   unsigned int max_nunits = 0;
   VEC (int, heap) *load_permutation;
   VEC (slp_tree, heap) *loads;
+  struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
+
+  if (dr)
+    {
+      scalar_type = TREE_TYPE (DR_REF (dr));
+      vectype = get_vectype_for_scalar_type (scalar_type);
+      group_size = DR_GROUP_SIZE (vinfo_for_stmt (stmt));
+    }
+  else
+    {
+      gcc_assert (loop_vinfo);
+      vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
+      group_size = VEC_length (gimple, LOOP_VINFO_REDUCTIONS (loop_vinfo));
+    }
 
-  scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (
-      vinfo_for_stmt (stmt))));
-  vectype = get_vectype_for_scalar_type (scalar_type);
   if (!vectype)
     {
       if (vect_print_dump_info (REPORT_SLP))
@@ -914,6 +1044,7 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
           fprintf (vect_dump, "Build SLP failed: unsupported data-type ");
           print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
         }
+
       return false;
     }
 
@@ -938,11 +1069,29 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
   /* Create a node (a root of the SLP tree) for the packed strided stores.  */
   SLP_TREE_SCALAR_STMTS (node) = VEC_alloc (gimple, heap, group_size);
   next = stmt;
-  /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS.  */
-  while (next)
+  if (dr)
     {
-      VEC_safe_push (gimple, heap, SLP_TREE_SCALAR_STMTS (node), next);
-      next = DR_GROUP_NEXT_DR (vinfo_for_stmt (next));
+      /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS.  */
+      while (next)
+        {
+          VEC_safe_push (gimple, heap, SLP_TREE_SCALAR_STMTS (node), next);
+          next = DR_GROUP_NEXT_DR (vinfo_for_stmt (next));
+        }
+    }
+  else
+    {
+      /* Collect reduction statements.  */
+      for (i = 0; VEC_iterate (gimple, LOOP_VINFO_REDUCTIONS (loop_vinfo), i,
+                               next);
+           i++)
+        {
+          VEC_safe_push (gimple, heap, SLP_TREE_SCALAR_STMTS (node), next);
+          if (vect_print_dump_info (REPORT_DETAILS))
+            {
+              fprintf (vect_dump, "pushing reduction into node: ");
+              print_gimple_stmt (vect_dump, next, 0, TDF_SLIM);
+            }
+        }
     }
 
   SLP_TREE_VEC_STMTS (node) = NULL;
@@ -1035,7 +1184,7 @@ bool
 vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
 {
   unsigned int i;
-  VEC (gimple, heap) *strided_stores;
+  VEC (gimple, heap) *strided_stores, *reductions = NULL;
   gimple store;
   bool ok = false;
 
@@ -1043,10 +1192,14 @@ vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
     fprintf (vect_dump, "=== vect_analyze_slp ===");
 
   if (loop_vinfo)
-    strided_stores = LOOP_VINFO_STRIDED_STORES (loop_vinfo);
+    {
+      strided_stores = LOOP_VINFO_STRIDED_STORES (loop_vinfo);
+      reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
+    }
   else
     strided_stores = BB_VINFO_STRIDED_STORES (bb_vinfo);
 
+  /* Find SLP sequences starting from groups of strided stores.  */
   for (i = 0; VEC_iterate (gimple, strided_stores, i, store); i++)
     if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, store))
       ok = true;
 
@@ -1059,6 +1212,12 @@ vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
       return false;
     }
 
+  /* Find SLP sequences starting from groups of reductions.  */
+  if (loop_vinfo && VEC_length (gimple, LOOP_VINFO_REDUCTIONS (loop_vinfo))
+      && vect_analyze_slp_instance (loop_vinfo, bb_vinfo,
+                                    VEC_index (gimple, reductions, 0)))
+    ok = true;
+
   return true;
 }
 
@@ -1120,7 +1279,10 @@ vect_detect_hybrid_slp_stmts (slp_tree node)
 	if ((stmt_vinfo = vinfo_for_stmt (use_stmt))
 	    && !STMT_SLP_TYPE (stmt_vinfo)
             && (STMT_VINFO_RELEVANT (stmt_vinfo)
-                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_vinfo))))
+                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_vinfo)))
+            && !(gimple_code (use_stmt) == GIMPLE_PHI
+                 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (use_stmt))
+                      == vect_reduction_def))
 	  vect_mark_slp_stmts (node, hybrid, i);
 
       vect_detect_hybrid_slp_stmts (SLP_TREE_LEFT (node));
@@ -1429,11 +1591,14 @@ vect_update_slp_costs_according_to_vf (loop_vec_info loop_vinfo)
 /* For constant and loop invariant defs of SLP_NODE this function returns
    (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
    OP_NUM determines if we gather defs for operand 0 or operand 1 of the scalar
-   stmts. NUMBER_OF_VECTORS is the number of vector defs to create.  */
+   stmts. NUMBER_OF_VECTORS is the number of vector defs to create.
+   REDUC_INDEX is the index of the reduction operand in the statements, unless
+   it is -1.  */
 
 static void
 vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
-                           unsigned int op_num, unsigned int number_of_vectors)
+                           unsigned int op_num, unsigned int number_of_vectors,
+                           int reduc_index)
 {
   VEC (gimple, heap) *stmts = SLP_TREE_SCALAR_STMTS (slp_node);
   gimple stmt = VEC_index (gimple, stmts, 0);
@@ -1449,6 +1614,50 @@ vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
   int number_of_copies = 1;
   VEC (tree, heap) *voprnds = VEC_alloc (tree, heap, number_of_vectors);
   bool constant_p, is_store;
+  tree neutral_op = NULL;
+
+  if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def)
+    {
+      enum tree_code code = gimple_assign_rhs_code (stmt);
+      if (reduc_index == -1)
+        {
+          VEC_free (tree, heap, *vec_oprnds);
+          return;
+        }
+
+      op_num = reduc_index - 1;
+      op = gimple_op (stmt, op_num + 1);
+      /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
+         we need either neutral operands or the original operands.  See
+         get_initial_def_for_reduction() for details.  */
+      switch (code)
+        {
+          case WIDEN_SUM_EXPR:
+          case DOT_PROD_EXPR:
+          case PLUS_EXPR:
+          case MINUS_EXPR:
+          case BIT_IOR_EXPR:
+          case BIT_XOR_EXPR:
+             if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
+               neutral_op = build_real (TREE_TYPE (op), dconst0);
+             else
+               neutral_op = build_int_cst (TREE_TYPE (op), 0);
+
+             break;
+
+          case MULT_EXPR:
+          case BIT_AND_EXPR:
+             if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
+               neutral_op = build_real (TREE_TYPE (op), dconst1);
+             else
+               neutral_op = build_int_cst (TREE_TYPE (op), 1);
+
+             break;
+
+          default:
+             neutral_op = NULL;
+        }
+    }
 
   if (STMT_VINFO_DATA_REF (stmt_vinfo))
     {
@@ -1499,6 +1708,19 @@ vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
           else
             op = gimple_op (stmt, op_num + 1);
 
+          if (reduc_index != -1)
+            {
+              struct loop *loop = (gimple_bb (stmt))->loop_father;
+              gimple def_stmt = SSA_NAME_DEF_STMT (op);
+
+              gcc_assert (loop);
+              /* Get the def before the loop.  */
+              op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
+                                          loop_preheader_edge (loop));
+              if (j != (number_of_copies - 1) && neutral_op)
+                op = neutral_op;
+            }
+
           /* Create 'vect_ = {op0,op1,...,opn}'.  */
           t = tree_cons (NULL_TREE, op, t);
@@ -1536,8 +1758,25 @@ vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
      to replicate the vectors.  */
   while (number_of_vectors > VEC_length (tree, *vec_oprnds))
     {
-      for (i = 0; VEC_iterate (tree, *vec_oprnds, i, vop) && i < vec_num; i++)
-        VEC_quick_push (tree, *vec_oprnds, vop);
+      tree neutral_vec = NULL;
+
+      if (neutral_op)
+        {
+          if (!neutral_vec)
+            {
+              t = NULL;
+              for (i = 0; i < (unsigned) nunits; i++)
+                 t = tree_cons (NULL_TREE, neutral_op, t);
+              neutral_vec = build_vector (vector_type, t);
+            }
+
+          VEC_quick_push (tree, *vec_oprnds, neutral_vec);
+        }
+      else
+        {
+          for (i = 0; VEC_iterate (tree, *vec_oprnds, i, vop) && i < vec_num; i++)
+            VEC_quick_push (tree, *vec_oprnds, vop);
+        }
     }
 }
 
@@ -1576,7 +1815,7 @@ vect_get_slp_vect_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds)
 
 void
 vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0,
-                   VEC (tree,heap) **vec_oprnds1)
+                   VEC (tree,heap) **vec_oprnds1, int reduc_index)
 {
   gimple first_stmt;
   enum tree_code code;
@@ -1607,19 +1846,26 @@ vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0,
   *vec_oprnds0 = VEC_alloc (tree, heap, number_of_vects);
 
   /* SLP_NODE corresponds either to a group of stores or to a group of
-     unary/binary operations. We don't call this function for loads.  */
-  if (SLP_TREE_LEFT (slp_node))
+     unary/binary operations. We don't call this function for loads.
+     For reduction defs we call vect_get_constant_vectors(), since we are
+     looking for initial loop invariant values.  */
+  if (SLP_TREE_LEFT (slp_node) && reduc_index == -1)
     /* The defs are already vectorized.  */
     vect_get_slp_vect_defs (SLP_TREE_LEFT (slp_node), vec_oprnds0);
   else
    /* Build vectors from scalar defs.  */
-    vect_get_constant_vectors (slp_node, vec_oprnds0, 0, number_of_vects);
+    vect_get_constant_vectors (slp_node, vec_oprnds0, 0, number_of_vects,
+                               reduc_index);
 
   if (STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)))
     /* Since we don't call this function with loads, this is a group of
        stores.  */
    return;
 
+  /* For reductions, we only need initial values.  */
+  if (reduc_index != -1)
+    return;
+
   code = gimple_assign_rhs_code (first_stmt);
   if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS || !vec_oprnds1)
     return;
@@ -1638,7 +1884,7 @@ vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0,
     vect_get_slp_vect_defs (SLP_TREE_RIGHT (slp_node), vec_oprnds1);
   else
     /* Build vectors from scalar defs.  */
-    vect_get_constant_vectors (slp_node, vec_oprnds1, 1, number_of_vects);
+    vect_get_constant_vectors (slp_node, vec_oprnds1, 1, number_of_vects, -1);
 }
 
 
@@ -2027,22 +2273,7 @@ vect_schedule_slp_instance (slp_tree node, slp_instance instance,
   si = gsi_for_stmt (stmt);
   is_store = vect_transform_stmt (stmt, &si, &strided_store, node, instance);
-  if (is_store)
-    {
-      if (DR_GROUP_FIRST_DR (stmt_info))
-        /* If IS_STORE is TRUE, the vectorization of the
-           interleaving chain was completed - free all the stores in
-           the chain.  */
-        vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
-      else
-        /* FORNOW: SLP originates only from strided stores.  */
-        gcc_unreachable ();
-
-      return true;
-    }
-
-  /* FORNOW: SLP originates only from strided stores.  */
-  return false;
+  return is_store;
 }
 
 
@@ -2075,6 +2306,26 @@ vect_schedule_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
 	fprintf (vect_dump, "vectorizing stmts using SLP.");
     }
 
+  for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
+    {
+      slp_tree root = SLP_INSTANCE_TREE (instance);
+      gimple store;
+      unsigned int j;
+      gimple_stmt_iterator gsi;
+
+      for (j = 0; VEC_iterate (gimple, SLP_TREE_SCALAR_STMTS (root), j, store)
+                  && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
+        {
+          if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
+            break;
+
+          /* Free the attached stmt_vec_info and remove the stmt.  */
+          gsi = gsi_for_stmt (store);
+          gsi_remove (&gsi, true);
+          free_stmt_vec_info (store);
+        }
+    }
+
   return is_store;
 }
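
The reduction path in vect_supported_load_permutation_p above only has to
verify that every GROUP_SIZE-long slice of LOAD_PERMUTATION repeats the
first slice. Restated as standalone C (an illustrative sketch with made-up
names, not code from the patch):

    #include <stdbool.h>

    /* Return true iff every group of GROUP_SIZE indices in
       LOAD_PERMUTATION repeats the first group's sequence, i.e. all
       the load nodes of the instance see the same permutation.  */
    static bool
    same_permutation_in_all_groups (const int *load_permutation,
                                    int group_size, int number_of_groups)
    {
      int i, k;

      for (i = 1; i < number_of_groups; i++)
        for (k = 0; k < group_size; k++)
          if (load_permutation[i * group_size + k] != load_permutation[k])
            return false;

      return true;
    }

When the check succeeds, vect_slp_rearrange_stmts reorders the statements
of every node according to the load order and the recorded permutation is
discarded, since no runtime permutation is needed for a reduction.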