diff options
author | kyukhin <kyukhin@138bc75d-0d04-0410-961f-82ee72b054a4> | 2014-06-18 07:46:18 +0000 |
---|---|---|
committer | kyukhin <kyukhin@138bc75d-0d04-0410-961f-82ee72b054a4> | 2014-06-18 07:46:18 +0000 |
commit | 926f7a02cf54c290980c2eb8b77974337154bb6d (patch) | |
tree | 28cbde5f2df4453f929eea8f65b8e6be0303d012 /gcc | |
parent | 691447aba6b3f8f3ea8402a35e7c53e2dbb9a6bd (diff) | |
download | gcc-926f7a02cf54c290980c2eb8b77974337154bb6d.tar.gz |
gcc/
* config/i386/i386.c (ix86_reassociation_width): Add alternative for
vector case.
* config/i386/i386.h (TARGET_VECTOR_PARALLEL_EXECUTION): New.
* config/i386/x86-tune.def (X86_TUNE_VECTOR_PARALLEL_EXECUTION): New.
* tree-vect-data-refs.c (vect_shift_permute_load_chain): New.
Introduces an alternative way of load group permutations.
(vect_transform_grouped_load): Try alternative way of permutations.
gcc/testsuite/
PR tree-optimization/52252
* gcc.target/i386/pr52252-atom.c: Test on loads group of size 3.
* gcc.target/i386/pr52252-core.c: Ditto.
PR tree-optimization/61403
* gcc.target/i386/pr61403.c: Test on loads and stores group of size 3.
git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@211769 138bc75d-0d04-0410-961f-82ee72b054a4
Diffstat (limited to 'gcc')
-rw-r--r-- | gcc/ChangeLog | 10 | ||||
-rw-r--r-- | gcc/config/i386/i386.c | 10 | ||||
-rw-r--r-- | gcc/config/i386/i386.h | 2 | ||||
-rw-r--r-- | gcc/config/i386/x86-tune.def | 5 | ||||
-rw-r--r-- | gcc/testsuite/ChangeLog | 9 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/pr52252-atom.c | 29 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/pr52252-core.c | 29 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/pr61403.c | 27 | ||||
-rw-r--r-- | gcc/tree-vect-data-refs.c | 351 |
9 files changed, 471 insertions, 1 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 0fc67169c8e..93955b3465b 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,13 @@ +2014-06-18 Evgeny Stupachenko <evstupac@gmail.com> + + * config/i386/i386.c (ix86_reassociation_width): Add alternative for + vector case. + * config/i386/i386.h (TARGET_VECTOR_PARALLEL_EXECUTION): New. + * config/i386/x86-tune.def (X86_TUNE_VECTOR_PARALLEL_EXECUTION): New. + * tree-vect-data-refs.c (vect_shift_permute_load_chain): New. + Introduces alternative way of loads group permutaions. + (vect_transform_grouped_load): Try alternative way of permutations. + 2014-06-18 Jakub Jelinek <jakub@redhat.com> * gimplify.c (omp_notice_variable): If n is non-NULL diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 77d54e5bcb7..8046c67c555 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -46429,6 +46429,16 @@ ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED, { int res = 1; + /* Vector part. */ + if (VECTOR_MODE_P (mode)) + { + if (TARGET_VECTOR_PARALLEL_EXECUTION) + return 2; + else + return 1; + } + + /* Scalar part. 
*/ if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL) res = 2; else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL) diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index a4009d3a83b..9e3ef9424c3 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -433,6 +433,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; ix86_tune_features[X86_TUNE_USE_VECTOR_CONVERTS] #define TARGET_SLOW_PSHUFB \ ix86_tune_features[X86_TUNE_SLOW_PSHUFB] +#define TARGET_VECTOR_PARALLEL_EXECUTION \ + ix86_tune_features[X86_TUNE_VECTOR_PARALLEL_EXECUTION] #define TARGET_FUSE_CMP_AND_BRANCH_32 \ ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH_32] #define TARGET_FUSE_CMP_AND_BRANCH_64 \ diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def index 9b0ff360ab3..cb44dc3120c 100644 --- a/gcc/config/i386/x86-tune.def +++ b/gcc/config/i386/x86-tune.def @@ -390,6 +390,11 @@ DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS, "use_vector_converts", m_AMDFAM10) DEF_TUNE (X86_TUNE_SLOW_PSHUFB, "slow_pshufb", m_BONNELL | m_SILVERMONT | m_INTEL) +/* X86_TUNE_VECTOR_PARALLEL_EXECUTION: Indicates tunings with ability to + execute 2 or more vector instructions in parallel. */ +DEF_TUNE (X86_TUNE_VECTOR_PARALLEL_EXECUTION, "vec_parallel", + m_NEHALEM | m_SANDYBRIDGE | m_HASWELL) + /*****************************************************************************/ /* AVX instruction selection tuning (some of SSE flags affects AVX, too) */ /*****************************************************************************/ diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 9b76f2394ab..cb2984d8e94 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,12 @@ +2014-06-18 Evgeny Stupachenko <evstupac@gmail.com> + + PR tree-optimization/52252 + * gcc.target/i386/pr52252-atom.c: Test on loads group of size 3. + * gcc.target/i386/pr52252-core.c: Ditto. 
+ + PR tree-optimization/61403 + * gcc.target/i386/pr61403.c: Test on loads and stores group of size 3. + 2014-06-18 Jakub Jelinek <jakub@redhat.com> * gfortran.dg/gomp/declare-simd-1.f90: New test. diff --git a/gcc/testsuite/gcc.target/i386/pr52252-atom.c b/gcc/testsuite/gcc.target/i386/pr52252-atom.c new file mode 100644 index 00000000000..715b4594382 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr52252-atom.c @@ -0,0 +1,29 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target ssse3 } */ +/* { dg-options "-O2 -ftree-vectorize -mssse3 -mtune=slm" } */ +#define byte unsigned char + +void +matrix_mul (byte *in, byte *out, int size) +{ + int i; + for (i = 0; i < size; i++) + { + byte in0 = in[0]; + byte in1 = in[1]; + byte in2 = in[2]; + byte out0, out1, out2, out3; + out0 = in0 + in1; + out1 = in0 + in2; + out2 = in1 + in2; + out3 = in0 + in1 + in2; + out[0] = out0; + out[1] = out1; + out[2] = out2; + out[3] = out3; + in += 3; + out += 4; + } +} + +/* { dg-final { scan-assembler "palignr" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr52252-core.c b/gcc/testsuite/gcc.target/i386/pr52252-core.c new file mode 100644 index 00000000000..ac857a5fe7e --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr52252-core.c @@ -0,0 +1,29 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target ssse3 } */ +/* { dg-options "-O2 -ftree-vectorize -mssse3 -mtune=corei7" } */ +#define byte unsigned char + +void +matrix_mul (byte *in, byte *out, int size) +{ + int i; + for (i = 0; i < size; i++) + { + byte in0 = in[0]; + byte in1 = in[1]; + byte in2 = in[2]; + byte out0, out1, out2, out3; + out0 = in0 + in1; + out1 = in0 + in2; + out2 = in1 + in2; + out3 = in0 + in1 + in2; + out[0] = out0; + out[1] = out1; + out[2] = out2; + out[3] = out3; + in += 3; + out += 4; + } +} + +/* { dg-final { scan-assembler "pshufb" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr61403.c b/gcc/testsuite/gcc.target/i386/pr61403.c new file mode 100644 index 00000000000..84cc5c5c80a --- 
/dev/null +++ b/gcc/testsuite/gcc.target/i386/pr61403.c @@ -0,0 +1,27 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target sse4 } */ +/* { dg-options "-O2 -ffast-math -ftree-vectorize -msse4.2 -mtune=corei7" } */ + +#include <math.h> + +struct XYZ +{ + float x; + float y; + float z; +}; + +void +norm (struct XYZ *in, struct XYZ *out, int size) +{ + int i; + for (i = 0; i < size; ++i) + { + float n = sqrt (in[i].x * in[i].x + in[i].y * in[i].y + in[i].z * in[i].z); + out[i].x = in[i].x / n; + out[i].y = in[i].y / n; + out[i].z = in[i].z / n; + } +} + +/* { dg-final { scan-assembler "blend" } } */ diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c index b2b629c03c6..d5cd9eca2fe 100644 --- a/gcc/tree-vect-data-refs.c +++ b/gcc/tree-vect-data-refs.c @@ -5209,6 +5209,346 @@ vect_permute_load_chain (vec<tree> dr_chain, } } +/* Function vect_shift_permute_load_chain. + + Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate + sequence of stmts to reorder the input data accordingly. + Return the final references for loads in RESULT_CHAIN. + Return true if successed, false otherwise. + + E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8. + The input is 3 vectors each containing 8 elements. We assign a + number to each element, the input sequence is: + + 1st vec: 0 1 2 3 4 5 6 7 + 2nd vec: 8 9 10 11 12 13 14 15 + 3rd vec: 16 17 18 19 20 21 22 23 + + The output sequence should be: + + 1st vec: 0 3 6 9 12 15 18 21 + 2nd vec: 1 4 7 10 13 16 19 22 + 3rd vec: 2 5 8 11 14 17 20 23 + + We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output. 
+ + First we shuffle all 3 vectors to get correct elements order: + + 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5) + 2nd vec: ( 8 11 14) ( 9 12 15) (10 13) + 3rd vec: (16 19 22) (17 20 23) (18 21) + + Next we unite and shift vector 3 times: + + 1st step: + shift right by 6 the concatenation of: + "1st vec" and "2nd vec" + ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13) + "2nd vec" and "3rd vec" + ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21) + "3rd vec" and "1st vec" + (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5) + | New vectors | + + So that now new vectors are: + + 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15) + 2nd vec: (10 13) (16 19 22) (17 20 23) + 3rd vec: (18 21) ( 0 3 6) ( 1 4 7) + + 2nd step: + shift right by 5 the concatenation of: + "1st vec" and "3rd vec" + ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7) + "2nd vec" and "1st vec" + (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15) + "3rd vec" and "2nd vec" + (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23) + | New vectors | + + So that now new vectors are: + + 1st vec: ( 9 12 15) (18 21) ( 0 3 6) + 2nd vec: (17 20 23) ( 2 5) ( 8 11 14) + 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY + + 3rd step: + shift right by 5 the concatenation of: + "1st vec" and "1st vec" + ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6) + shift right by 3 the concatenation of: + "2nd vec" and "2nd vec" + (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14) + | New vectors | + + So that now all vectors are READY: + 1st vec: ( 0 3 6) ( 9 12 15) (18 21) + 2nd vec: ( 2 5) ( 8 11 14) (17 20 23) + 3rd vec: ( 1 4 7) (10 13) (16 19 22) + + This algorithm is faster than one in vect_permute_load_chain if: + 1. "shift of a concatination" is faster than general permutation. + This is usually so. + 2. The TARGET machine can't execute vector instructions in parallel. + This is because each step of the algorithm depends on previous. 
+ The algorithm in vect_permute_load_chain is much more parallel. + + The algorithm is applicable only for LOAD CHAIN LENGTH less than VF. +*/ + +static bool +vect_shift_permute_load_chain (vec<tree> dr_chain, + unsigned int length, + gimple stmt, + gimple_stmt_iterator *gsi, + vec<tree> *result_chain) +{ + tree vect[3], vect_shift[3], data_ref, first_vect, second_vect; + tree perm2_mask1, perm2_mask2, perm3_mask; + tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask; + gimple perm_stmt; + + tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); + unsigned int i; + unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype); + unsigned char *sel = XALLOCAVEC (unsigned char, nelt); + stmt_vec_info stmt_info = vinfo_for_stmt (stmt); + loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); + + result_chain->quick_grow (length); + memcpy (result_chain->address (), dr_chain.address (), + length * sizeof (tree)); + + if (length == 2 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4) + { + for (i = 0; i < nelt / 2; ++i) + sel[i] = i * 2; + for (i = 0; i < nelt / 2; ++i) + sel[nelt / 2 + i] = i * 2 + 1; + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shuffle of 2 fields structure is not \ + supported by target\n"); + return false; + } + perm2_mask1 = vect_gen_perm_mask (vectype, sel); + gcc_assert (perm2_mask1 != NULL); + + for (i = 0; i < nelt / 2; ++i) + sel[i] = i * 2 + 1; + for (i = 0; i < nelt / 2; ++i) + sel[nelt / 2 + i] = i * 2; + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shuffle of 2 fields structure is not \ + supported by target\n"); + return false; + } + perm2_mask2 = vect_gen_perm_mask (vectype, sel); + gcc_assert (perm2_mask2 != NULL); + + /* Generating permutation constant to shift all elements. + For vector length 8 it is {4 5 6 7 8 9 10 11}. 
*/ + for (i = 0; i < nelt; i++) + sel[i] = nelt / 2 + i; + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shift permutation is not supported by target\n"); + return false; + } + shift1_mask = vect_gen_perm_mask (vectype, sel); + gcc_assert (shift1_mask != NULL); + + /* Generating permutation constant to select vector from 2. + For vector length 8 it is {0 1 2 3 12 13 14 15}. */ + for (i = 0; i < nelt / 2; i++) + sel[i] = i; + for (i = nelt / 2; i < nelt; i++) + sel[i] = nelt + i; + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "select is not supported by target\n"); + return false; + } + select_mask = vect_gen_perm_mask (vectype, sel); + gcc_assert (select_mask != NULL); + + first_vect = dr_chain[0]; + second_vect = dr_chain[1]; + + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + first_vect, first_vect, + perm2_mask1); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + vect[0] = data_ref; + + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + second_vect, second_vect, + perm2_mask2); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + vect[1] = data_ref; + + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + vect[0], vect[1], + shift1_mask); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + (*result_chain)[1] = data_ref; + + data_ref = make_temp_ssa_name (vectype, NULL, "vect_select"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + vect[0], vect[1], + select_mask); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + (*result_chain)[0] = data_ref; + + return true; + } + if (length 
== 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2) + { + unsigned int k = 0, l = 0; + + /* Generating permutation constant to get all elements in rigth order. + For vector length 8 it is {0 3 6 1 4 7 2 5}. */ + for (i = 0; i < nelt; i++) + { + if (3 * k + (l % 3) >= nelt) + { + k = 0; + l += (3 - (nelt % 3)); + } + sel[i] = 3 * k + (l % 3); + k++; + } + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shuffle of 3 fields structure is not \ + supported by target\n"); + return false; + } + perm3_mask = vect_gen_perm_mask (vectype, sel); + gcc_assert (perm3_mask != NULL); + + /* Generating permutation constant to shift all elements. + For vector length 8 it is {6 7 8 9 10 11 12 13}. */ + for (i = 0; i < nelt; i++) + sel[i] = 2 * (nelt / 3) + (nelt % 3) + i; + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shift permutation is not supported by target\n"); + return false; + } + shift1_mask = vect_gen_perm_mask (vectype, sel); + gcc_assert (shift1_mask != NULL); + + /* Generating permutation constant to shift all elements. + For vector length 8 it is {5 6 7 8 9 10 11 12}. */ + for (i = 0; i < nelt; i++) + sel[i] = 2 * (nelt / 3) + 1 + i; + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shift permutation is not supported by target\n"); + return false; + } + shift2_mask = vect_gen_perm_mask (vectype, sel); + gcc_assert (shift2_mask != NULL); + + /* Generating permutation constant to shift all elements. + For vector length 8 it is {3 4 5 6 7 8 9 10}. 
*/ + for (i = 0; i < nelt; i++) + sel[i] = (nelt / 3) + (nelt % 3) / 2 + i; + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shift permutation is not supported by target\n"); + return false; + } + shift3_mask = vect_gen_perm_mask (vectype, sel); + gcc_assert (shift3_mask != NULL); + + /* Generating permutation constant to shift all elements. + For vector length 8 it is {5 6 7 8 9 10 11 12}. */ + for (i = 0; i < nelt; i++) + sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i; + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shift permutation is not supported by target\n"); + return false; + } + shift4_mask = vect_gen_perm_mask (vectype, sel); + gcc_assert (shift4_mask != NULL); + + for (k = 0; k < 3; k++) + { + data_ref = make_temp_ssa_name (vectype, NULL, "vect_suffle3"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + dr_chain[k], dr_chain[k], + perm3_mask); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + vect[k] = data_ref; + } + + for (k = 0; k < 3; k++) + { + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + vect[k % 3], + vect[(k + 1) % 3], + shift1_mask); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + vect_shift[k] = data_ref; + } + + for (k = 0; k < 3; k++) + { + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + vect_shift[(4 - k) % 3], + vect_shift[(3 - k) % 3], + shift2_mask); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + vect[k] = data_ref; + } + + (*result_chain)[3 - (nelt % 3)] = vect[2]; + + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + vect[0], vect[0], + 
shift3_mask); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + (*result_chain)[nelt % 3] = data_ref; + + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + vect[1], vect[1], + shift4_mask); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + (*result_chain)[0] = data_ref; + return true; + } + return false; +} + /* Function vect_transform_grouped_load. Given a chain of input interleaved data-refs (in DR_CHAIN), build statements @@ -5220,13 +5560,22 @@ void vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size, gimple_stmt_iterator *gsi) { + enum machine_mode mode; vec<tree> result_chain = vNULL; /* DR_CHAIN contains input data-refs that are a part of the interleaving. RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted vectors, that are ready for vector computation. */ result_chain.create (size); - vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain); + + /* If reassociation width for vector type is 2 or greater target machine can + execute 2 or more vector instructions in parallel. Otherwise try to + get chain for loads group using vect_shift_permute_load_chain. */ + mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt))); + if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1 + || !vect_shift_permute_load_chain (dr_chain, size, stmt, + gsi, &result_chain)) + vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain); vect_record_grouped_load_vectors (stmt, result_chain); result_chain.release (); } |