author    | bstarynk <bstarynk@138bc75d-0d04-0410-961f-82ee72b054a4> | 2015-01-21 22:01:24 +0000
committer | bstarynk <bstarynk@138bc75d-0d04-0410-961f-82ee72b054a4> | 2015-01-21 22:01:24 +0000
commit    | ffde65b31066f17eef243be882bb89a6e19370aa (patch)
tree      | ea876d041c0a63eefccdac5416a8678e75da4cfc /gcc/tree-vect-data-refs.c
parent    | a8c7acc4db08ce7c8ac3ddcb943f9219e2893792 (diff)
download  | gcc-ffde65b31066f17eef243be882bb89a6e19370aa.tar.gz
[.]
2015-01-21 Basile Starynkevitch <basile@starynkevitch.net>
{{merged with trunk (i.e. GCC 5.0 in stage4) using
svn merge -r209216:219879 svn+ssh://bstarynk@gcc.gnu.org/svn/gcc/trunk
but should probably have used
svn merge -r209216:219879 ^/trunk
we don't use svnmerge.py anymore since our svn is version 1.8.10
}}
VERY UNSTABLE
2015-01-20 Basile Starynkevitch <basile@starynkevitch.net>
Move previous topdir ChangeLog.MELT to ChangeLog.MELT.2008-2014
[contrib/]
2015-01-21 Basile Starynkevitch <basile@starynkevitch.net>
* MELT-Plugin-Makefile: Able to make upgrade-melt as a
plugin. Works for GCC 5.0. Remove old GCC 4.7 stuff.
Move previous contrib/ChangeLog.MELT to ChangeLog.MELT.2008-2014
[gcc/]
2015-01-21 Basile Starynkevitch <basile@starynkevitch.net>
{{merged with trunk (i.e. GCC 5.0 in stage4) using
svn merge -r209216:219879 svn+ssh://bstarynk@gcc.gnu.org/svn/gcc/trunk
but should probably have used
svn merge -r209216:219879 ^/trunk
**@@@ UNSTABLE since libmelt-ana-gimple.melt is not compiling, but the
translator is painfully bootstrapping!! @@@ }}
* toplev.c: Merged manually by keeping MELT extra stuff.
* toplev.h: Likewise.
* gengtype.c: Add "melt-runtime.h" to the list, but merged with trunk.
* melt-runtime.h (MELT_VERSION_STRING): Bump to "1.2-pre-merged".
(meltgc_walk_gimple_seq): Remove.
(gt_ggc_mx_gimple_statement_d): Same for GCC 4.9 & 5.0.
* melt-runtime.cc: Update copyright year.
(ggc_alloc_cleared_melt_valuevector_st, melt_resize_scangcvect):
Call ggc_internal_cleared_alloc.
(melt_val2passflag): Skip TODO_verify_ssa, TODO_verify_flow,
TODO_verify_stmts, TODO_verify_rtl_sharing for GCC 5.0.
(meltgc_walkstmt_cb, meltgc_walktree_cb)
(melt_tree_walk_frame_size, meltgc_walk_gimple_seq): Remove.
(melt_gt_ggc_mx_gimple_seq_d): Call
gt_ggc_mx_gimple_statement_base.
* melt-build-script.tpl: Update copyright year. Don't symlink
meltrunsup.h anymore.
* melt-build-script.sh: Regenerate.
* melt/warmelt-base.melt: Update copyright year.
(valdesc_object, valdesc_mapobjects, valdesc_mapstrings)
(valdesc_multiple, valdesc_closure, valdesc_routine, valdesc_hook)
(valdesc_bucketlongs, valdesc_jsonobject, valdesc_string)
(valdesc_strbuf, valdesc_pair, valdesc_list, valdesc_int)
(valdesc_double, valdesc_mixint, valdesc_mixloc)
(valdesc_mixbigint, valdesc_real, valdesc_special_data): Use
ggc_internal_alloc & ggc_internal_cleared_alloc for GCC 5.0.
(json_canonical_name): Use ISUPPER, ISALPHA, TOUPPER instead of
their standard <ctype.h> lowercase macros (see the sketch after
this ChangeLog).
* melt/warmelt-modes.melt: Update copyright year.
(generate_runtypesupport_forwcopy_fun): Emit both GCC 4.9 & 5.0
compatible code.
* melt/libmelt-ana-base.melt: Update copyright year.
* melt/libmelt-ana-gimple.melt: TO BE IMPROVED
* melt/generated/*: Painfully regenerated several times through the
GCC 4.9 MELT plugin.
git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/branches/melt-branch@219975 138bc75d-0d04-0410-961f-82ee72b054a4
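For readers who hit the json_canonical_name item above: safe-ctype.h is libiberty's locale-independent replacement for <ctype.h>, and its uppercase macros (ISUPPER, ISALPHA, TOUPPER, ...) are well defined for any plain char, including negative values, where the standard macros can invoke undefined behavior. Below is a minimal hypothetical sketch of such a canonicalization loop, not the actual MELT code; it assumes gcc's include/ directory (home of safe-ctype.h) is on the include path.

#include <stdio.h>
#include "safe-ctype.h"  /* libiberty's locale-independent ctype macros */

/* Hypothetical reduction of a json_canonical_name-style loop: upper-case
   alphabetic characters and map everything else to '_'.  The safe-ctype
   macros, unlike <ctype.h>'s, accept any plain char value safely.  */
static void
canonicalize (char *s)
{
  for (; *s; s++)
    *s = ISALPHA (*s) ? TOUPPER (*s) : '_';
}

int
main (void)
{
  char name[] = "melt-runtime.h";
  canonicalize (name);
  puts (name);   /* prints MELT_RUNTIME_H */
  return 0;
}

Nothing here changes with setlocale, which is the usual reason GCC sources use these macros instead of the standard ones.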
Diffstat (limited to 'gcc/tree-vect-data-refs.c')
-rw-r--r-- | gcc/tree-vect-data-refs.c | 989
1 file changed, 820 insertions, 169 deletions
diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c index fbc35a3fe3c..52d6a869c4e 100644 --- a/gcc/tree-vect-data-refs.c +++ b/gcc/tree-vect-data-refs.c @@ -1,5 +1,5 @@ /* Data References Analysis and Manipulation Utilities for Vectorization. - Copyright (C) 2003-2014 Free Software Foundation, Inc. + Copyright (C) 2003-2015 Free Software Foundation, Inc. Contributed by Dorit Naishlos <dorit@il.ibm.com> and Ira Rosen <irar@il.ibm.com> @@ -24,10 +24,25 @@ along with GCC; see the file COPYING3. If not see #include "coretypes.h" #include "dumpfile.h" #include "tm.h" +#include "hash-set.h" +#include "machmode.h" +#include "vec.h" +#include "double-int.h" +#include "input.h" +#include "alias.h" +#include "symtab.h" +#include "wide-int.h" +#include "inchash.h" #include "tree.h" +#include "fold-const.h" #include "stor-layout.h" #include "tm_p.h" #include "target.h" +#include "predict.h" +#include "hard-reg-set.h" +#include "function.h" +#include "dominance.h" +#include "cfg.h" #include "basic-block.h" #include "gimple-pretty-print.h" #include "tree-ssa-alias.h" @@ -47,16 +62,34 @@ along with GCC; see the file COPYING3. If not see #include "tree-ssa-loop-ivopts.h" #include "tree-ssa-loop-manip.h" #include "tree-ssa-loop.h" -#include "dumpfile.h" #include "cfgloop.h" #include "tree-chrec.h" #include "tree-scalar-evolution.h" #include "tree-vectorizer.h" #include "diagnostic-core.h" +#include "hash-map.h" +#include "plugin-api.h" +#include "ipa-ref.h" #include "cgraph.h" /* Need to include rtl.h, expr.h, etc. for optabs. */ +#include "hashtab.h" +#include "rtl.h" +#include "flags.h" +#include "statistics.h" +#include "real.h" +#include "fixed-value.h" +#include "insn-config.h" +#include "expmed.h" +#include "dojump.h" +#include "explow.h" +#include "calls.h" +#include "emit-rtl.h" +#include "varasm.h" +#include "stmt.h" #include "expr.h" +#include "insn-codes.h" #include "optabs.h" +#include "builtins.h" /* Return true if load- or store-lanes optab OPTAB is implemented for COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */ @@ -65,7 +98,7 @@ static bool vect_lanes_optab_supported_p (const char *name, convert_optab optab, tree vectype, unsigned HOST_WIDE_INT count) { - enum machine_mode mode, array_mode; + machine_mode mode, array_mode; bool limit_p; mode = TYPE_MODE (vectype); @@ -373,11 +406,14 @@ vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr, .. = a[i+1]; where we will end up loading { a[i], a[i+1] } once, make sure that inserting group loads before the first load and - stores after the last store will do the right thing. */ - if ((STMT_VINFO_GROUPED_ACCESS (stmtinfo_a) - && GROUP_SAME_DR_STMT (stmtinfo_a)) - || (STMT_VINFO_GROUPED_ACCESS (stmtinfo_b) - && GROUP_SAME_DR_STMT (stmtinfo_b))) + stores after the last store will do the right thing. + Similar for groups like + a[i] = ...; + ... = a[i]; + a[i+1] = ...; + where loads from the group interleave with the store. 
*/ + if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a) + || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b)) { gimple earlier_stmt; earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb)); @@ -1066,7 +1102,7 @@ vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr, bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true); elem.npeel = npeel; - slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo).find (&elem); + slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find (&elem); if (slot) slot->count++; else @@ -1075,7 +1111,8 @@ vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr, slot->npeel = npeel; slot->dr = dr; slot->count = 1; - new_slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo).find_slot (slot, INSERT); + new_slot + = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find_slot (slot, INSERT); *new_slot = slot; } @@ -1195,15 +1232,15 @@ vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo, res.inside_cost = INT_MAX; res.outside_cost = INT_MAX; LOOP_VINFO_PEELING_HTAB (loop_vinfo) - .traverse <_vect_peel_extended_info *, - vect_peeling_hash_get_lowest_cost> (&res); + ->traverse <_vect_peel_extended_info *, + vect_peeling_hash_get_lowest_cost> (&res); } else { res.peel_info.count = 0; LOOP_VINFO_PEELING_HTAB (loop_vinfo) - .traverse <_vect_peel_extended_info *, - vect_peeling_hash_get_most_frequent> (&res); + ->traverse <_vect_peel_extended_info *, + vect_peeling_hash_get_most_frequent> (&res); } *npeel = res.peel_info.npeel; @@ -1395,8 +1432,9 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo) size_zero_node) < 0; /* Save info about DR in the hash table. */ - if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo).is_created ()) - LOOP_VINFO_PEELING_HTAB (loop_vinfo).create (1); + if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo)) + LOOP_VINFO_PEELING_HTAB (loop_vinfo) + = new hash_table<peel_info_hasher> (1); vectype = STMT_VINFO_VECTYPE (stmt_info); nelements = TYPE_VECTOR_SUBPARTS (vectype); @@ -1508,10 +1546,20 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo) || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))) do_peeling = false; - if (do_peeling && all_misalignments_unknown + /* If we don't know how many times the peeling loop will run + assume it will run VF-1 times and disable peeling if the remaining + iters are less than the vectorization factor. */ + if (do_peeling + && all_misalignments_unknown + && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) + && (LOOP_VINFO_INT_NITERS (loop_vinfo) + < 2 * (unsigned) LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1)) + do_peeling = false; + + if (do_peeling + && all_misalignments_unknown && vect_supportable_dr_alignment (dr0, false)) { - /* Check if the target requires to prefer stores over loads, i.e., if misaligned stores are more expensive than misaligned loads (taking drs with same alignment into account). */ @@ -1598,6 +1646,14 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo) &body_cost_vec); if (!dr0 || !npeel) do_peeling = false; + + /* If peeling by npeel will result in a remaining loop not iterating + enough to be vectorized then do not peel. */ + if (do_peeling + && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) + && (LOOP_VINFO_INT_NITERS (loop_vinfo) + < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + npeel)) + do_peeling = false; } if (do_peeling) @@ -2506,8 +2562,7 @@ vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo) linear. Don't modify the original vector's order, it is needed for determining what dependencies are reversed. 
*/ vec<data_reference_p> datarefs_copy = datarefs.copy (); - qsort (datarefs_copy.address (), datarefs_copy.length (), - sizeof (data_reference_p), dr_group_sort_cmp); + datarefs_copy.qsort (dr_group_sort_cmp); /* Build the interleaving chains. */ for (i = 0; i < datarefs_copy.length () - 1;) @@ -2527,11 +2582,14 @@ vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo) over them. The we can just skip ahead to the next DR here. */ /* Check that the data-refs have same first location (except init) - and they are both either store or load (not load and store). */ + and they are both either store or load (not load and store, + not masked loads or stores). */ if (DR_IS_READ (dra) != DR_IS_READ (drb) || !operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0) - || !dr_equal_offsets_p (dra, drb)) + || !dr_equal_offsets_p (dra, drb) + || !gimple_assign_single_p (DR_STMT (dra)) + || !gimple_assign_single_p (DR_STMT (drb))) break; /* Check that the data-refs have the same constant size and step. */ @@ -2677,14 +2735,6 @@ comp_dr_with_seg_len_pair (const void *p1_, const void *p2_) return 0; } -template <class T> static void -swap (T& a, T& b) -{ - T c (a); - a = b; - b = c; -} - /* Function vect_vfa_segment_size. Create an expression that computes the size of segment @@ -2817,7 +2867,7 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo) dr_with_seg_len (dr_b, segment_length_b)); if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0) - swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second); + std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second); comp_alias_ddrs.safe_push (dr_with_seg_len_pair); } @@ -2867,8 +2917,8 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo) and DR_A1 and DR_A2 are two consecutive memrefs. */ if (*dr_a1 == *dr_a2) { - swap (dr_a1, dr_b1); - swap (dr_a2, dr_b2); + std::swap (dr_a1, dr_b1); + std::swap (dr_a2, dr_b2); } if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr), @@ -2898,15 +2948,13 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo) */ - HOST_WIDE_INT - min_seg_len_b = (TREE_CODE (dr_b1->seg_len) == INTEGER_CST) ? - TREE_INT_CST_LOW (dr_b1->seg_len) : - vect_factor; + HOST_WIDE_INT min_seg_len_b = (tree_fits_shwi_p (dr_b1->seg_len) + ? 
tree_to_shwi (dr_b1->seg_len) + : vect_factor); if (diff <= min_seg_len_b - || (TREE_CODE (dr_a1->seg_len) == INTEGER_CST - && diff - (HOST_WIDE_INT) TREE_INT_CST_LOW (dr_a1->seg_len) < - min_seg_len_b)) + || (tree_fits_shwi_p (dr_a1->seg_len) + && diff - tree_to_shwi (dr_a1->seg_len) < min_seg_len_b)) { if (dump_enabled_p ()) { @@ -2956,7 +3004,7 @@ vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep, struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); tree offtype = NULL_TREE; tree decl, base, off; - enum machine_mode pmode; + machine_mode pmode; int punsignedp, pvolatilep; base = DR_REF (dr); @@ -2999,8 +3047,8 @@ vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep, { if (off == NULL_TREE) { - double_int moff = mem_ref_offset (base); - off = double_int_to_tree (sizetype, moff); + offset_int moff = mem_ref_offset (base); + off = wide_int_to_tree (sizetype, moff); } else off = size_binop (PLUS_EXPR, off, @@ -3172,7 +3220,7 @@ vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep, bool vect_analyze_data_refs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo, - int *min_vf) + int *min_vf, unsigned *n_stmts) { struct loop *loop = NULL; basic_block bb = NULL; @@ -3207,6 +3255,9 @@ vect_analyze_data_refs (loop_vec_info loop_vinfo, for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi)) { gimple stmt = gsi_stmt (gsi); + if (is_gimple_debug (stmt)) + continue; + ++*n_stmts; if (!find_data_references_in_stmt (loop, stmt, &datarefs)) { if (is_gimple_call (stmt) && loop->safelen) @@ -3214,7 +3265,7 @@ vect_analyze_data_refs (loop_vec_info loop_vinfo, tree fndecl = gimple_call_fndecl (stmt), op; if (fndecl != NULL_TREE) { - struct cgraph_node *node = cgraph_get_node (fndecl); + struct cgraph_node *node = cgraph_node::get (fndecl); if (node != NULL && node->simd_clones != NULL) { unsigned int j, n = gimple_call_num_args (stmt); @@ -3260,6 +3311,9 @@ vect_analyze_data_refs (loop_vec_info loop_vinfo, for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) { gimple stmt = gsi_stmt (gsi); + if (is_gimple_debug (stmt)) + continue; + ++*n_stmts; if (!find_data_references_in_stmt (NULL, stmt, &BB_VINFO_DATAREFS (bb_vinfo))) { @@ -3523,7 +3577,7 @@ again: tree outer_step, outer_base, outer_init; HOST_WIDE_INT pbitsize, pbitpos; tree poffset; - enum machine_mode pmode; + machine_mode pmode; int punsignedp, pvolatilep; affine_iv base_iv, offset_iv; tree dinit; @@ -3832,6 +3886,9 @@ vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name) is as follows: if LOOP=i_loop: &in (relative to i_loop) if LOOP=j_loop: &in+i*2B (relative to j_loop) + BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the + initial address. Unlike OFFSET, which is number of elements to + be added, BYTE_OFFSET is measured in bytes. Output: 1. 
Return an SSA_NAME whose value is the address of the memory location of @@ -3845,7 +3902,8 @@ tree vect_create_addr_base_for_vector_ref (gimple stmt, gimple_seq *new_stmt_list, tree offset, - struct loop *loop) + struct loop *loop, + tree byte_offset) { stmt_vec_info stmt_info = vinfo_for_stmt (stmt); struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); @@ -3898,6 +3956,12 @@ vect_create_addr_base_for_vector_ref (gimple stmt, base_offset = fold_build2 (PLUS_EXPR, sizetype, base_offset, offset); } + if (byte_offset) + { + byte_offset = fold_convert (sizetype, byte_offset); + base_offset = fold_build2 (PLUS_EXPR, sizetype, + base_offset, byte_offset); + } /* base + base_offset */ if (loop_vinfo) @@ -3919,8 +3983,12 @@ vect_create_addr_base_for_vector_ref (gimple stmt, && TREE_CODE (addr_base) == SSA_NAME) { duplicate_ssa_name_ptr_info (addr_base, DR_PTR_INFO (dr)); - if (offset) + unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info)); + int misalign = DR_MISALIGNMENT (dr); + if (offset || byte_offset || (misalign == -1)) mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base)); + else + set_ptr_info_alignment (SSA_NAME_PTR_INFO (addr_base), align, misalign); } if (dump_enabled_p ()) @@ -3955,6 +4023,10 @@ vect_create_addr_base_for_vector_ref (gimple stmt, 5. BSI: location where the new stmts are to be placed if there is no loop 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain pointing to the initial address. + 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added + to the initial address accessed by the data-ref in STMT. This is + similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET + in bytes. Output: 1. Declare a new ptr to vector_type, and have it point to the base of the @@ -3968,6 +4040,8 @@ vect_create_addr_base_for_vector_ref (gimple stmt, initial_address = &a[init]; if OFFSET is supplied: initial_address = &a[init + OFFSET]; + if BYTE_OFFSET is supplied: + initial_address = &a[init] + BYTE_OFFSET; Return the initial_address in INITIAL_ADDRESS. @@ -3985,7 +4059,7 @@ tree vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop, tree offset, tree *initial_address, gimple_stmt_iterator *gsi, gimple *ptr_incr, - bool only_init, bool *inv_p) + bool only_init, bool *inv_p, tree byte_offset) { const char *base_name; stmt_vec_info stmt_info = vinfo_for_stmt (stmt); @@ -4128,10 +4202,10 @@ vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop, /* (2) Calculate the initial address of the aggregate-pointer, and set the aggregate-pointer to point to it before the loop. */ - /* Create: (&(base[init_val+offset]) in the loop preheader. */ + /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. 
*/ new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list, - offset, loop); + offset, loop, byte_offset); if (new_stmt_list) { if (pe) @@ -4282,7 +4356,7 @@ bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi, struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); tree vectype = STMT_VINFO_VECTYPE (stmt_info); tree update = TYPE_SIZE_UNIT (vectype); - gimple incr_stmt; + gassign *incr_stmt; ssa_op_iter iter; use_operand_p use_p; tree new_dataref_ptr; @@ -4290,9 +4364,9 @@ bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi, if (bump) update = bump; - new_dataref_ptr = copy_ssa_name (dataref_ptr, NULL); - incr_stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, new_dataref_ptr, - dataref_ptr, update); + new_dataref_ptr = copy_ssa_name (dataref_ptr); + incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR, + dataref_ptr, update); vect_finish_stmt_generation (stmt, incr_stmt, gsi); /* Copy the points-to information if it exists. */ @@ -4340,9 +4414,9 @@ vect_create_destination_var (tree scalar_dest, tree vectype) name = get_name (scalar_dest); if (name) - asprintf (&new_name, "%s_%u", name, SSA_NAME_VERSION (scalar_dest)); + new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest)); else - asprintf (&new_name, "_%u", SSA_NAME_VERSION (scalar_dest)); + new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest)); vec_dest = vect_get_new_vect_var (type, kind, new_name); free (new_name); @@ -4357,15 +4431,16 @@ vect_create_destination_var (tree scalar_dest, tree vectype) bool vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count) { - enum machine_mode mode = TYPE_MODE (vectype); + machine_mode mode = TYPE_MODE (vectype); - /* vect_permute_store_chain requires the group size to be a power of two. */ - if (exact_log2 (count) == -1) + /* vect_permute_store_chain requires the group size to be equal to 3 or + be a power of two. 
*/ + if (count != 3 && exact_log2 (count) == -1) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "the size of the group of accesses" - " is not a power of 2\n"); + "the size of the group of accesses" + " is not a power of 2 or not eqaul to 3\n"); return false; } @@ -4374,23 +4449,76 @@ vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count) { unsigned int i, nelt = GET_MODE_NUNITS (mode); unsigned char *sel = XALLOCAVEC (unsigned char, nelt); - for (i = 0; i < nelt / 2; i++) + + if (count == 3) { - sel[i * 2] = i; - sel[i * 2 + 1] = i + nelt; + unsigned int j0 = 0, j1 = 0, j2 = 0; + unsigned int i, j; + + for (j = 0; j < 3; j++) + { + int nelt0 = ((3 - j) * nelt) % 3; + int nelt1 = ((3 - j) * nelt + 1) % 3; + int nelt2 = ((3 - j) * nelt + 2) % 3; + for (i = 0; i < nelt; i++) + { + if (3 * i + nelt0 < nelt) + sel[3 * i + nelt0] = j0++; + if (3 * i + nelt1 < nelt) + sel[3 * i + nelt1] = nelt + j1++; + if (3 * i + nelt2 < nelt) + sel[3 * i + nelt2] = 0; + } + if (!can_vec_perm_p (mode, false, sel)) + { + if (dump_enabled_p ()) + dump_printf (MSG_MISSED_OPTIMIZATION, + "permutaion op not supported by target.\n"); + return false; + } + + for (i = 0; i < nelt; i++) + { + if (3 * i + nelt0 < nelt) + sel[3 * i + nelt0] = 3 * i + nelt0; + if (3 * i + nelt1 < nelt) + sel[3 * i + nelt1] = 3 * i + nelt1; + if (3 * i + nelt2 < nelt) + sel[3 * i + nelt2] = nelt + j2++; + } + if (!can_vec_perm_p (mode, false, sel)) + { + if (dump_enabled_p ()) + dump_printf (MSG_MISSED_OPTIMIZATION, + "permutaion op not supported by target.\n"); + return false; + } + } + return true; } - if (can_vec_perm_p (mode, false, sel)) + else { - for (i = 0; i < nelt; i++) - sel[i] += nelt / 2; - if (can_vec_perm_p (mode, false, sel)) - return true; + /* If length is not equal to 3 then only power of 2 is supported. */ + gcc_assert (exact_log2 (count) != -1); + + for (i = 0; i < nelt / 2; i++) + { + sel[i * 2] = i; + sel[i * 2 + 1] = i + nelt; + } + if (can_vec_perm_p (mode, false, sel)) + { + for (i = 0; i < nelt; i++) + sel[i] += nelt / 2; + if (can_vec_perm_p (mode, false, sel)) + return true; + } } } if (dump_enabled_p ()) dump_printf (MSG_MISSED_OPTIMIZATION, - "interleave op not supported by target.\n"); + "permutaion op not supported by target.\n"); return false; } @@ -4410,9 +4538,9 @@ vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count) /* Function vect_permute_store_chain. Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be - a power of 2, generate interleave_high/low stmts to reorder the data - correctly for the stores. Return the final references for stores in - RESULT_CHAIN. + a power of 2 or equal to 3, generate interleave_high/low stmts to reorder + the data correctly for the stores. Return the final references for stores + in RESULT_CHAIN. E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8. The input is 4 vectors each containing 8 elements. 
We assign a number to @@ -4479,7 +4607,9 @@ vect_permute_store_chain (vec<tree> dr_chain, gimple perm_stmt; tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); tree perm_mask_low, perm_mask_high; - unsigned int i, n; + tree data_ref; + tree perm3_mask_low, perm3_mask_high; + unsigned int i, n, log_length = exact_log2 (length); unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype); unsigned char *sel = XALLOCAVEC (unsigned char, nelt); @@ -4487,47 +4617,108 @@ vect_permute_store_chain (vec<tree> dr_chain, memcpy (result_chain->address (), dr_chain.address (), length * sizeof (tree)); - for (i = 0, n = nelt / 2; i < n; i++) + if (length == 3) { - sel[i * 2] = i; - sel[i * 2 + 1] = i + nelt; - } - perm_mask_high = vect_gen_perm_mask (vectype, sel); - gcc_assert (perm_mask_high != NULL); + unsigned int j0 = 0, j1 = 0, j2 = 0; + + for (j = 0; j < 3; j++) + { + int nelt0 = ((3 - j) * nelt) % 3; + int nelt1 = ((3 - j) * nelt + 1) % 3; + int nelt2 = ((3 - j) * nelt + 2) % 3; - for (i = 0; i < nelt; i++) - sel[i] += nelt / 2; - perm_mask_low = vect_gen_perm_mask (vectype, sel); - gcc_assert (perm_mask_low != NULL); + for (i = 0; i < nelt; i++) + { + if (3 * i + nelt0 < nelt) + sel[3 * i + nelt0] = j0++; + if (3 * i + nelt1 < nelt) + sel[3 * i + nelt1] = nelt + j1++; + if (3 * i + nelt2 < nelt) + sel[3 * i + nelt2] = 0; + } + perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel); - for (i = 0, n = exact_log2 (length); i < n; i++) - { - for (j = 0; j < length/2; j++) - { - vect1 = dr_chain[j]; - vect2 = dr_chain[j+length/2]; + for (i = 0; i < nelt; i++) + { + if (3 * i + nelt0 < nelt) + sel[3 * i + nelt0] = 3 * i + nelt0; + if (3 * i + nelt1 < nelt) + sel[3 * i + nelt1] = 3 * i + nelt1; + if (3 * i + nelt2 < nelt) + sel[3 * i + nelt2] = nelt + j2++; + } + perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel); + + vect1 = dr_chain[0]; + vect2 = dr_chain[1]; /* Create interleaving stmt: - high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, ...}> */ - high = make_temp_ssa_name (vectype, NULL, "vect_inter_high"); - perm_stmt - = gimple_build_assign_with_ops (VEC_PERM_EXPR, high, - vect1, vect2, perm_mask_high); + low = VEC_PERM_EXPR <vect1, vect2, + {j, nelt, *, j + 1, nelt + j + 1, *, + j + 2, nelt + j + 2, *, ...}> */ + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low"); + perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1, + vect2, perm3_mask_low); vect_finish_stmt_generation (stmt, perm_stmt, gsi); - (*result_chain)[2*j] = high; + vect1 = data_ref; + vect2 = dr_chain[2]; /* Create interleaving stmt: - low = VEC_PERM_EXPR <vect1, vect2, {nelt/2, nelt*3/2, nelt/2+1, - nelt*3/2+1, ...}> */ - low = make_temp_ssa_name (vectype, NULL, "vect_inter_low"); - perm_stmt - = gimple_build_assign_with_ops (VEC_PERM_EXPR, low, - vect1, vect2, perm_mask_low); + low = VEC_PERM_EXPR <vect1, vect2, + {0, 1, nelt + j, 3, 4, nelt + j + 1, + 6, 7, nelt + j + 2, ...}> */ + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high"); + perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1, + vect2, perm3_mask_high); vect_finish_stmt_generation (stmt, perm_stmt, gsi); - (*result_chain)[2*j+1] = low; + (*result_chain)[j] = data_ref; + } + } + else + { + /* If length is not equal to 3 then only power of 2 is supported. 
*/ + gcc_assert (exact_log2 (length) != -1); + + for (i = 0, n = nelt / 2; i < n; i++) + { + sel[i * 2] = i; + sel[i * 2 + 1] = i + nelt; } - memcpy (dr_chain.address (), result_chain->address (), - length * sizeof (tree)); + perm_mask_high = vect_gen_perm_mask_checked (vectype, sel); + + for (i = 0; i < nelt; i++) + sel[i] += nelt / 2; + perm_mask_low = vect_gen_perm_mask_checked (vectype, sel); + + for (i = 0, n = log_length; i < n; i++) + { + for (j = 0; j < length/2; j++) + { + vect1 = dr_chain[j]; + vect2 = dr_chain[j+length/2]; + + /* Create interleaving stmt: + high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, + ...}> */ + high = make_temp_ssa_name (vectype, NULL, "vect_inter_high"); + perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1, + vect2, perm_mask_high); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + (*result_chain)[2*j] = high; + + /* Create interleaving stmt: + low = VEC_PERM_EXPR <vect1, vect2, + {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1, + ...}> */ + low = make_temp_ssa_name (vectype, NULL, "vect_inter_low"); + perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1, + vect2, perm_mask_low); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + (*result_chain)[2*j+1] = low; + } + memcpy (dr_chain.address (), result_chain->address (), + length * sizeof (tree)); + } } } @@ -4600,11 +4791,10 @@ vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi, gimple inc; tree ptr; tree data_ref; - gimple new_stmt; basic_block new_bb; tree msq_init = NULL_TREE; tree new_temp; - gimple phi_stmt; + gphi *phi_stmt; tree msq = NULL_TREE; gimple_seq stmts = NULL; bool inv_p; @@ -4695,15 +4885,16 @@ vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi, if (alignment_support_scheme == dr_explicit_realign_optimized) { /* Create msq_init = *(floor(p1)) in the loop preheader */ + gassign *new_stmt; gcc_assert (!compute_in_loop); vec_dest = vect_create_destination_var (scalar_dest, vectype); ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load, NULL_TREE, &init_addr, NULL, &inc, true, &inv_p); - new_temp = copy_ssa_name (ptr, NULL); - new_stmt = gimple_build_assign_with_ops - (BIT_AND_EXPR, new_temp, ptr, + new_temp = copy_ssa_name (ptr); + new_stmt = gimple_build_assign + (new_temp, BIT_AND_EXPR, ptr, build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype))); new_bb = gsi_insert_on_edge_immediate (pe, new_stmt); @@ -4731,6 +4922,7 @@ vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi, if (targetm.vectorize.builtin_mask_for_load) { + gcall *new_stmt; tree builtin_decl; /* Compute INIT_ADDR - the initial addressed accessed by this memref. */ @@ -4788,7 +4980,7 @@ vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi, pe = loop_preheader_edge (containing_loop); vec_dest = vect_create_destination_var (scalar_dest, vectype); - msq = make_ssa_name (vec_dest, NULL); + msq = make_ssa_name (vec_dest); phi_stmt = create_phi_node (msq, containing_loop->header); add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION); @@ -4804,38 +4996,78 @@ vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi, bool vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count) { - enum machine_mode mode = TYPE_MODE (vectype); + machine_mode mode = TYPE_MODE (vectype); - /* vect_permute_load_chain requires the group size to be a power of two. */ - if (exact_log2 (count) == -1) + /* vect_permute_load_chain requires the group size to be equal to 3 or + be a power of two. 
*/ + if (count != 3 && exact_log2 (count) == -1) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "the size of the group of accesses" - " is not a power of 2\n"); + "the size of the group of accesses" + " is not a power of 2 or not equal to 3\n"); return false; } /* Check that the permutation is supported. */ if (VECTOR_MODE_P (mode)) { - unsigned int i, nelt = GET_MODE_NUNITS (mode); + unsigned int i, j, nelt = GET_MODE_NUNITS (mode); unsigned char *sel = XALLOCAVEC (unsigned char, nelt); - for (i = 0; i < nelt; i++) - sel[i] = i * 2; - if (can_vec_perm_p (mode, false, sel)) + if (count == 3) + { + unsigned int k; + for (k = 0; k < 3; k++) + { + for (i = 0; i < nelt; i++) + if (3 * i + k < 2 * nelt) + sel[i] = 3 * i + k; + else + sel[i] = 0; + if (!can_vec_perm_p (mode, false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shuffle of 3 loads is not supported by" + " target\n"); + return false; + } + for (i = 0, j = 0; i < nelt; i++) + if (3 * i + k < 2 * nelt) + sel[i] = i; + else + sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); + if (!can_vec_perm_p (mode, false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shuffle of 3 loads is not supported by" + " target\n"); + return false; + } + } + return true; + } + else { + /* If length is not equal to 3 then only power of 2 is supported. */ + gcc_assert (exact_log2 (count) != -1); for (i = 0; i < nelt; i++) - sel[i] = i * 2 + 1; + sel[i] = i * 2; if (can_vec_perm_p (mode, false, sel)) - return true; - } + { + for (i = 0; i < nelt; i++) + sel[i] = i * 2 + 1; + if (can_vec_perm_p (mode, false, sel)) + return true; + } + } } if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "extract even/odd not supported by target\n"); + "extract even/odd not supported by target\n"); return false; } @@ -4853,8 +5085,9 @@ vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count) /* Function vect_permute_load_chain. Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be - a power of 2, generate extract_even/odd stmts to reorder the input data - correctly. Return the final references for loads in RESULT_CHAIN. + a power of 2 or equal to 3, generate extract_even/odd stmts to reorder + the input data correctly. Return the final references for loads in + RESULT_CHAIN. E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8. The input is 4 vectors each containing 8 elements. 
We assign a number to each @@ -4935,6 +5168,7 @@ vect_permute_load_chain (vec<tree> dr_chain, { tree data_ref, first_vect, second_vect; tree perm_mask_even, perm_mask_odd; + tree perm3_mask_low, perm3_mask_high; gimple perm_stmt; tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); unsigned int i, j, log_length = exact_log2 (length); @@ -4945,45 +5179,426 @@ vect_permute_load_chain (vec<tree> dr_chain, memcpy (result_chain->address (), dr_chain.address (), length * sizeof (tree)); - for (i = 0; i < nelt; ++i) - sel[i] = i * 2; - perm_mask_even = vect_gen_perm_mask (vectype, sel); - gcc_assert (perm_mask_even != NULL); + if (length == 3) + { + unsigned int k; - for (i = 0; i < nelt; ++i) - sel[i] = i * 2 + 1; - perm_mask_odd = vect_gen_perm_mask (vectype, sel); - gcc_assert (perm_mask_odd != NULL); + for (k = 0; k < 3; k++) + { + for (i = 0; i < nelt; i++) + if (3 * i + k < 2 * nelt) + sel[i] = 3 * i + k; + else + sel[i] = 0; + perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel); + + for (i = 0, j = 0; i < nelt; i++) + if (3 * i + k < 2 * nelt) + sel[i] = i; + else + sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); + + perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel); + + first_vect = dr_chain[0]; + second_vect = dr_chain[1]; + + /* Create interleaving stmt (low part of): + low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k, + ...}> */ + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low"); + perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect, + second_vect, perm3_mask_low); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); - for (i = 0; i < log_length; i++) + /* Create interleaving stmt (high part of): + high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k, + ...}> */ + first_vect = data_ref; + second_vect = dr_chain[2]; + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high"); + perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect, + second_vect, perm3_mask_high); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + (*result_chain)[k] = data_ref; + } + } + else { - for (j = 0; j < length; j += 2) + /* If length is not equal to 3 then only power of 2 is supported. 
*/ + gcc_assert (exact_log2 (length) != -1); + + for (i = 0; i < nelt; ++i) + sel[i] = i * 2; + perm_mask_even = vect_gen_perm_mask_checked (vectype, sel); + + for (i = 0; i < nelt; ++i) + sel[i] = i * 2 + 1; + perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel); + + for (i = 0; i < log_length; i++) { - first_vect = dr_chain[j]; - second_vect = dr_chain[j+1]; - - /* data_ref = permute_even (first_data_ref, second_data_ref); */ - data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even"); - perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, - first_vect, second_vect, - perm_mask_even); + for (j = 0; j < length; j += 2) + { + first_vect = dr_chain[j]; + second_vect = dr_chain[j+1]; + + /* data_ref = permute_even (first_data_ref, second_data_ref); */ + data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even"); + perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, + first_vect, second_vect, + perm_mask_even); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + (*result_chain)[j/2] = data_ref; + + /* data_ref = permute_odd (first_data_ref, second_data_ref); */ + data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd"); + perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, + first_vect, second_vect, + perm_mask_odd); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + (*result_chain)[j/2+length/2] = data_ref; + } + memcpy (dr_chain.address (), result_chain->address (), + length * sizeof (tree)); + } + } +} + +/* Function vect_shift_permute_load_chain. + + Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate + sequence of stmts to reorder the input data accordingly. + Return the final references for loads in RESULT_CHAIN. + Return true if successed, false otherwise. + + E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8. + The input is 3 vectors each containing 8 elements. We assign a + number to each element, the input sequence is: + + 1st vec: 0 1 2 3 4 5 6 7 + 2nd vec: 8 9 10 11 12 13 14 15 + 3rd vec: 16 17 18 19 20 21 22 23 + + The output sequence should be: + + 1st vec: 0 3 6 9 12 15 18 21 + 2nd vec: 1 4 7 10 13 16 19 22 + 3rd vec: 2 5 8 11 14 17 20 23 + + We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output. 
+ + First we shuffle all 3 vectors to get correct elements order: + + 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5) + 2nd vec: ( 8 11 14) ( 9 12 15) (10 13) + 3rd vec: (16 19 22) (17 20 23) (18 21) + + Next we unite and shift vector 3 times: + + 1st step: + shift right by 6 the concatenation of: + "1st vec" and "2nd vec" + ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13) + "2nd vec" and "3rd vec" + ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21) + "3rd vec" and "1st vec" + (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5) + | New vectors | + + So that now new vectors are: + + 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15) + 2nd vec: (10 13) (16 19 22) (17 20 23) + 3rd vec: (18 21) ( 0 3 6) ( 1 4 7) + + 2nd step: + shift right by 5 the concatenation of: + "1st vec" and "3rd vec" + ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7) + "2nd vec" and "1st vec" + (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15) + "3rd vec" and "2nd vec" + (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23) + | New vectors | + + So that now new vectors are: + + 1st vec: ( 9 12 15) (18 21) ( 0 3 6) + 2nd vec: (17 20 23) ( 2 5) ( 8 11 14) + 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY + + 3rd step: + shift right by 5 the concatenation of: + "1st vec" and "1st vec" + ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6) + shift right by 3 the concatenation of: + "2nd vec" and "2nd vec" + (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14) + | New vectors | + + So that now all vectors are READY: + 1st vec: ( 0 3 6) ( 9 12 15) (18 21) + 2nd vec: ( 2 5) ( 8 11 14) (17 20 23) + 3rd vec: ( 1 4 7) (10 13) (16 19 22) + + This algorithm is faster than one in vect_permute_load_chain if: + 1. "shift of a concatination" is faster than general permutation. + This is usually so. + 2. The TARGET machine can't execute vector instructions in parallel. + This is because each step of the algorithm depends on previous. + The algorithm in vect_permute_load_chain is much more parallel. + + The algorithm is applicable only for LOAD CHAIN LENGTH less than VF. 
+*/ + +static bool +vect_shift_permute_load_chain (vec<tree> dr_chain, + unsigned int length, + gimple stmt, + gimple_stmt_iterator *gsi, + vec<tree> *result_chain) +{ + tree vect[3], vect_shift[3], data_ref, first_vect, second_vect; + tree perm2_mask1, perm2_mask2, perm3_mask; + tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask; + gimple perm_stmt; + + tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); + unsigned int i; + unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype); + unsigned char *sel = XALLOCAVEC (unsigned char, nelt); + stmt_vec_info stmt_info = vinfo_for_stmt (stmt); + loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); + + result_chain->quick_grow (length); + memcpy (result_chain->address (), dr_chain.address (), + length * sizeof (tree)); + + if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4) + { + unsigned int j, log_length = exact_log2 (length); + for (i = 0; i < nelt / 2; ++i) + sel[i] = i * 2; + for (i = 0; i < nelt / 2; ++i) + sel[nelt / 2 + i] = i * 2 + 1; + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shuffle of 2 fields structure is not \ + supported by target\n"); + return false; + } + perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel); + + for (i = 0; i < nelt / 2; ++i) + sel[i] = i * 2 + 1; + for (i = 0; i < nelt / 2; ++i) + sel[nelt / 2 + i] = i * 2; + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shuffle of 2 fields structure is not \ + supported by target\n"); + return false; + } + perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel); + + /* Generating permutation constant to shift all elements. + For vector length 8 it is {4 5 6 7 8 9 10 11}. */ + for (i = 0; i < nelt; i++) + sel[i] = nelt / 2 + i; + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shift permutation is not supported by target\n"); + return false; + } + shift1_mask = vect_gen_perm_mask_checked (vectype, sel); + + /* Generating permutation constant to select vector from 2. + For vector length 8 it is {0 1 2 3 12 13 14 15}. 
*/ + for (i = 0; i < nelt / 2; i++) + sel[i] = i; + for (i = nelt / 2; i < nelt; i++) + sel[i] = nelt + i; + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "select is not supported by target\n"); + return false; + } + select_mask = vect_gen_perm_mask_checked (vectype, sel); + + for (i = 0; i < log_length; i++) + { + for (j = 0; j < length; j += 2) + { + first_vect = dr_chain[j]; + second_vect = dr_chain[j + 1]; + + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2"); + perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, + first_vect, first_vect, + perm2_mask1); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + vect[0] = data_ref; + + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2"); + perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, + second_vect, second_vect, + perm2_mask2); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + vect[1] = data_ref; + + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift"); + perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, + vect[0], vect[1], shift1_mask); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + (*result_chain)[j/2 + length/2] = data_ref; + + data_ref = make_temp_ssa_name (vectype, NULL, "vect_select"); + perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, + vect[0], vect[1], select_mask); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + (*result_chain)[j/2] = data_ref; + } + memcpy (dr_chain.address (), result_chain->address (), + length * sizeof (tree)); + } + return true; + } + if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2) + { + unsigned int k = 0, l = 0; + + /* Generating permutation constant to get all elements in rigth order. + For vector length 8 it is {0 3 6 1 4 7 2 5}. */ + for (i = 0; i < nelt; i++) + { + if (3 * k + (l % 3) >= nelt) + { + k = 0; + l += (3 - (nelt % 3)); + } + sel[i] = 3 * k + (l % 3); + k++; + } + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shuffle of 3 fields structure is not \ + supported by target\n"); + return false; + } + perm3_mask = vect_gen_perm_mask_checked (vectype, sel); + + /* Generating permutation constant to shift all elements. + For vector length 8 it is {6 7 8 9 10 11 12 13}. */ + for (i = 0; i < nelt; i++) + sel[i] = 2 * (nelt / 3) + (nelt % 3) + i; + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shift permutation is not supported by target\n"); + return false; + } + shift1_mask = vect_gen_perm_mask_checked (vectype, sel); + + /* Generating permutation constant to shift all elements. + For vector length 8 it is {5 6 7 8 9 10 11 12}. */ + for (i = 0; i < nelt; i++) + sel[i] = 2 * (nelt / 3) + 1 + i; + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shift permutation is not supported by target\n"); + return false; + } + shift2_mask = vect_gen_perm_mask_checked (vectype, sel); + + /* Generating permutation constant to shift all elements. + For vector length 8 it is {3 4 5 6 7 8 9 10}. 
*/ + for (i = 0; i < nelt; i++) + sel[i] = (nelt / 3) + (nelt % 3) / 2 + i; + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shift permutation is not supported by target\n"); + return false; + } + shift3_mask = vect_gen_perm_mask_checked (vectype, sel); + + /* Generating permutation constant to shift all elements. + For vector length 8 it is {5 6 7 8 9 10 11 12}. */ + for (i = 0; i < nelt; i++) + sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i; + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shift permutation is not supported by target\n"); + return false; + } + shift4_mask = vect_gen_perm_mask_checked (vectype, sel); + + for (k = 0; k < 3; k++) + { + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3"); + perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, + dr_chain[k], dr_chain[k], + perm3_mask); vect_finish_stmt_generation (stmt, perm_stmt, gsi); - (*result_chain)[j/2] = data_ref; + vect[k] = data_ref; + } + + for (k = 0; k < 3; k++) + { + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1"); + perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, + vect[k % 3], vect[(k + 1) % 3], + shift1_mask); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + vect_shift[k] = data_ref; + } - /* data_ref = permute_odd (first_data_ref, second_data_ref); */ - data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd"); - perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, - first_vect, second_vect, - perm_mask_odd); + for (k = 0; k < 3; k++) + { + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2"); + perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, + vect_shift[(4 - k) % 3], + vect_shift[(3 - k) % 3], + shift2_mask); vect_finish_stmt_generation (stmt, perm_stmt, gsi); - (*result_chain)[j/2+length/2] = data_ref; + vect[k] = data_ref; } - memcpy (dr_chain.address (), result_chain->address (), - length * sizeof (tree)); + + (*result_chain)[3 - (nelt % 3)] = vect[2]; + + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3"); + perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0], + vect[0], shift3_mask); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + (*result_chain)[nelt % 3] = data_ref; + + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4"); + perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1], + vect[1], shift4_mask); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + (*result_chain)[0] = data_ref; + return true; } + return false; } - /* Function vect_transform_grouped_load. Given a chain of input interleaved data-refs (in DR_CHAIN), build statements @@ -4995,13 +5610,23 @@ void vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size, gimple_stmt_iterator *gsi) { + machine_mode mode; vec<tree> result_chain = vNULL; /* DR_CHAIN contains input data-refs that are a part of the interleaving. RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted vectors, that are ready for vector computation. */ result_chain.create (size); - vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain); + + /* If reassociation width for vector type is 2 or greater target machine can + execute 2 or more vector instructions in parallel. Otherwise try to + get chain for loads group using vect_shift_permute_load_chain. 
*/ + mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt))); + if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1 + || exact_log2 (size) != -1 + || !vect_shift_permute_load_chain (dr_chain, size, stmt, + gsi, &result_chain)) + vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain); vect_record_grouped_load_vectors (stmt, result_chain); result_chain.release (); } @@ -5091,20 +5716,33 @@ vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment) if (TREE_CODE (decl) != VAR_DECL) return false; - /* We cannot change alignment of common or external symbols as another - translation unit may contain a definition with lower alignment. - The rules of common symbol linking mean that the definition - will override the common symbol. The same is true for constant - pool entries which may be shared and are not properly merged - by LTO. */ - if (DECL_EXTERNAL (decl) - || DECL_COMMON (decl) - || DECL_IN_CONSTANT_POOL (decl)) + /* With -fno-toplevel-reorder we may have already output the constant. */ + if (TREE_ASM_WRITTEN (decl)) return false; - if (TREE_ASM_WRITTEN (decl)) + /* Constant pool entries may be shared and not properly merged by LTO. */ + if (DECL_IN_CONSTANT_POOL (decl)) return false; + if (TREE_PUBLIC (decl) || DECL_EXTERNAL (decl)) + { + symtab_node *snode; + + /* We cannot change alignment of symbols that may bind to symbols + in other translation unit that may contain a definition with lower + alignment. */ + if (!decl_binds_to_current_def_p (decl)) + return false; + + /* When compiling partition, be sure the symbol is not output by other + partition. */ + snode = symtab_node::get (decl); + if (flag_ltrans + && (snode->in_other_partition + || snode->get_partitioning_class () == SYMBOL_DUPLICATE)) + return false; + } + /* Do not override the alignment as specified by the ABI when the used attribute is set. */ if (DECL_PRESERVE_P (decl)) @@ -5113,10 +5751,23 @@ vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment) /* Do not override explicit alignment set by the user when an explicit section name is also used. This is a common idiom used by many software projects. */ - if (DECL_SECTION_NAME (decl) != NULL_TREE - && !DECL_HAS_IMPLICIT_SECTION_NAME_P (decl)) + if (TREE_STATIC (decl) + && DECL_SECTION_NAME (decl) != NULL + && !symtab_node::get (decl)->implicit_section) return false; + /* If symbol is an alias, we need to check that target is OK. */ + if (TREE_STATIC (decl)) + { + tree target = symtab_node::get (decl)->ultimate_alias_target ()->decl; + if (target != decl) + { + if (DECL_PRESERVE_P (target)) + return false; + decl = target; + } + } + if (TREE_STATIC (decl)) return (alignment <= MAX_OFILE_ALIGNMENT); else @@ -5137,7 +5788,7 @@ vect_supportable_dr_alignment (struct data_reference *dr, gimple stmt = DR_STMT (dr); stmt_vec_info stmt_info = vinfo_for_stmt (stmt); tree vectype = STMT_VINFO_VECTYPE (stmt_info); - enum machine_mode mode = TYPE_MODE (vectype); + machine_mode mode = TYPE_MODE (vectype); loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); struct loop *vect_loop = NULL; bool nested_in_vect_loop = false; |