author     Richard Sandiford <richard.sandiford@linaro.org>   2017-10-13 12:29:59 +0100
committer  Richard Sandiford <richard.sandiford@linaro.org>   2017-11-19 14:48:28 +0000
commit     11edfd37908fd019eab616afbc2909a090d3c484
tree       a026f768742cdad4c088809bb3417e3685c710cd
parent     51714d91a47c0324fe39251244e06a32e951f5ca
Add optabs for common types of permutation
...so that we can use them for variable-length vectors.  For now
constant-length vectors continue to use VEC_PERM_EXPR and the
vec_perm_const optab even for cases that the new optabs could handle.

The vector optabs are inconsistent about whether there should be
an underscore before the mode part of the name, but the other lo/hi
optabs have one.

Doing this means that we're able to optimise some SLP tests using
non-SLP (for now) on targets with variable-length vectors, so the
patch needs to add a few XFAILs.  Most of these go away with later
patches.

2017-11-09  Richard Sandiford  <richard.sandiford@linaro.org>
	    Alan Hayward  <alan.hayward@arm.com>
	    David Sherwood  <david.sherwood@arm.com>

gcc/
	* doc/md.texi (vec_reverse, vec_interleave_lo, vec_interleave_hi)
	(vec_extract_even, vec_extract_odd): Document new optabs.
	* internal-fn.def (VEC_INTERLEAVE_LO, VEC_INTERLEAVE_HI)
	(VEC_EXTRACT_EVEN, VEC_EXTRACT_ODD, VEC_REVERSE): New internal
	functions.
	* optabs.def (vec_interleave_lo_optab, vec_interleave_hi_optab)
	(vec_extract_even_optab, vec_extract_odd_optab, vec_reverse_optab):
	New optabs.
	* tree-vect-data-refs.c: Include internal-fn.h.
	(vect_grouped_store_supported): Try using
	IFN_VEC_INTERLEAVE_{LO,HI}.
	(vect_permute_store_chain): Use them here too.
	(vect_grouped_load_supported): Try using IFN_VEC_EXTRACT_{EVEN,ODD}.
	(vect_permute_load_chain): Use them here too.
	* tree-vect-stmts.c (can_reverse_vector_p): New function.
	(get_negative_load_store_type): Use it.
	(reverse_vector): New function.
	(vectorizable_store, vectorizable_load): Use it.
	* config/aarch64/iterators.md (perm_optab): New iterator.
	* config/aarch64/aarch64-sve.md (<perm_optab>_<mode>): New expander.
	(vec_reverse_<mode>): Likewise.

gcc/testsuite/
	* gcc.dg/vect/no-vfa-vect-depend-2.c: Remove XFAIL.
	* gcc.dg/vect/no-vfa-vect-depend-3.c: Likewise.
	* gcc.dg/vect/pr33953.c: XFAIL for vect_variable_length.
	* gcc.dg/vect/pr68445.c: Likewise.
	* gcc.dg/vect/slp-12a.c: Likewise.
	* gcc.dg/vect/slp-13-big-array.c: Likewise.
	* gcc.dg/vect/slp-13.c: Likewise.
	* gcc.dg/vect/slp-14.c: Likewise.
	* gcc.dg/vect/slp-15.c: Likewise.
	* gcc.dg/vect/slp-42.c: Likewise.
	* gcc.dg/vect/slp-multitypes-2.c: Likewise.
	* gcc.dg/vect/slp-multitypes-4.c: Likewise.
	* gcc.dg/vect/slp-multitypes-5.c: Likewise.
	* gcc.dg/vect/slp-reduc-4.c: Likewise.
	* gcc.dg/vect/slp-reduc-7.c: Likewise.
	* gcc.target/aarch64/sve_vec_perm_2.c: New test.
	* gcc.target/aarch64/sve_vec_perm_2_run.c: Likewise.
	* gcc.target/aarch64/sve_vec_perm_3.c: New test.
	* gcc.target/aarch64/sve_vec_perm_3_run.c: Likewise.
	* gcc.target/aarch64/sve_vec_perm_4.c: New test.
	* gcc.target/aarch64/sve_vec_perm_4_run.c: Likewise.
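As an element-level illustration of what the five new operations compute
(a sketch based on the md.texi documentation added below, not code from
the patch; the int element type and the example width NELT are arbitrary
assumptions):

#include <stdio.h>

#define NELT 8   /* illustrative example width */

/* vec_interleave_lo: interleave the lowest-indexed halves of A and B,
   element x of A followed by element x of B.  */
void
interleave_lo (const int *a, const int *b, int *out)
{
  for (int i = 0; i < NELT / 2; ++i)
    {
      out[2 * i] = a[i];
      out[2 * i + 1] = b[i];
    }
}

/* vec_interleave_hi: the same on the highest-indexed halves.  */
void
interleave_hi (const int *a, const int *b, int *out)
{
  for (int i = 0; i < NELT / 2; ++i)
    {
      out[2 * i] = a[NELT / 2 + i];
      out[2 * i + 1] = b[NELT / 2 + i];
    }
}

/* vec_extract_even / vec_extract_odd: the even- or odd-indexed elements
   of the concatenation of A and B.  */
void
extract_even (const int *a, const int *b, int *out)
{
  for (int i = 0; i < NELT / 2; ++i)
    {
      out[i] = a[2 * i];
      out[NELT / 2 + i] = b[2 * i];
    }
}

void
extract_odd (const int *a, const int *b, int *out)
{
  for (int i = 0; i < NELT / 2; ++i)
    {
      out[i] = a[2 * i + 1];
      out[NELT / 2 + i] = b[2 * i + 1];
    }
}

/* vec_reverse: reverse the element order of A.  */
void
reverse (const int *a, int *out)
{
  for (int i = 0; i < NELT; ++i)
    out[i] = a[NELT - 1 - i];
}

int
main (void)
{
  int a[NELT], b[NELT], out[NELT];
  for (int i = 0; i < NELT; ++i)
    {
      a[i] = i;        /* a = { 0, 1, ..., 7 } */
      b[i] = 10 + i;   /* b = { 10, 11, ..., 17 } */
    }
  interleave_lo (a, b, out);   /* { 0, 10, 1, 11, 2, 12, 3, 13 } */
  for (int i = 0; i < NELT; ++i)
    printf ("%d ", out[i]);
  printf ("\n");
  return 0;
}

On AArch64 SVE these operations map to ZIP1/ZIP2, UZP1/UZP2 and REV
respectively, as the aarch64-sve.md and iterators.md hunks below show.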
-rw-r--r--  gcc/config/aarch64/aarch64-sve.md | 13
-rw-r--r--  gcc/config/aarch64/iterators.md | 5
-rw-r--r--  gcc/doc/md.texi | 40
-rw-r--r--  gcc/internal-fn.def | 11
-rw-r--r--  gcc/optabs.def | 5
-rw-r--r--  gcc/testsuite/gcc.dg/vect/no-vfa-vect-depend-2.c | 5
-rw-r--r--  gcc/testsuite/gcc.dg/vect/no-vfa-vect-depend-3.c | 5
-rw-r--r--  gcc/testsuite/gcc.dg/vect/pr33953.c | 2
-rw-r--r--  gcc/testsuite/gcc.dg/vect/pr68445.c | 2
-rw-r--r--  gcc/testsuite/gcc.dg/vect/slp-12a.c | 2
-rw-r--r--  gcc/testsuite/gcc.dg/vect/slp-13-big-array.c | 2
-rw-r--r--  gcc/testsuite/gcc.dg/vect/slp-13.c | 2
-rw-r--r--  gcc/testsuite/gcc.dg/vect/slp-14.c | 2
-rw-r--r--  gcc/testsuite/gcc.dg/vect/slp-15.c | 2
-rw-r--r--  gcc/testsuite/gcc.dg/vect/slp-42.c | 2
-rw-r--r--  gcc/testsuite/gcc.dg/vect/slp-multitypes-2.c | 2
-rw-r--r--  gcc/testsuite/gcc.dg/vect/slp-multitypes-4.c | 2
-rw-r--r--  gcc/testsuite/gcc.dg/vect/slp-multitypes-5.c | 2
-rw-r--r--  gcc/testsuite/gcc.dg/vect/slp-reduc-4.c | 2
-rw-r--r--  gcc/testsuite/gcc.dg/vect/slp-reduc-7.c | 2
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_vec_perm_2.c | 31
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_vec_perm_2_run.c | 29
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_vec_perm_3.c | 46
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_vec_perm_3_run.c | 31
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_vec_perm_4.c | 52
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_vec_perm_4_run.c | 29
-rw-r--r--  gcc/tree-vect-data-refs.c | 190
-rw-r--r--  gcc/tree-vect-stmts.c | 58
28 files changed, 482 insertions, 94 deletions
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 86ac7418a6d..535f0c3b174 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -630,6 +630,19 @@
}
)
+(define_expand "<perm_optab>_<mode>"
+ [(set (match_operand:SVE_ALL 0 "register_operand")
+ (unspec:SVE_ALL [(match_operand:SVE_ALL 1 "register_operand")
+ (match_operand:SVE_ALL 2 "register_operand")]
+ OPTAB_PERMUTE))]
+ "TARGET_SVE && !GET_MODE_NUNITS (<MODE>mode).is_constant ()")
+
+(define_expand "vec_reverse_<mode>"
+ [(set (match_operand:SVE_ALL 0 "register_operand")
+ (unspec:SVE_ALL [(match_operand:SVE_ALL 1 "register_operand")]
+ UNSPEC_REV))]
+ "TARGET_SVE && !GET_MODE_NUNITS (<MODE>mode).is_constant ()")
+
(define_insn "*aarch64_sve_tbl<mode>"
[(set (match_operand:SVE_ALL 0 "register_operand" "=w")
(unspec:SVE_ALL
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 69fd38939fe..898a0b9f23f 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -1576,6 +1576,11 @@
(UNSPEC_PACI1716 "8")
(UNSPEC_AUTI1716 "12")])
+(define_int_attr perm_optab [(UNSPEC_ZIP1 "vec_interleave_lo")
+ (UNSPEC_ZIP2 "vec_interleave_hi")
+ (UNSPEC_UZP1 "vec_extract_even")
+ (UNSPEC_UZP2 "vec_extract_odd")])
+
(define_int_attr perm_insn [(UNSPEC_ZIP1 "zip") (UNSPEC_ZIP2 "zip")
(UNSPEC_TRN1 "trn") (UNSPEC_TRN2 "trn")
(UNSPEC_UZP1 "uzp") (UNSPEC_UZP2 "uzp")])
diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 7e448153eb7..5b700d36c15 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -5050,6 +5050,46 @@ There is no need for a target to supply both @samp{vec_perm@var{m}}
and @samp{vec_perm_const@var{m}} if the former can trivially implement
the operation with, say, the vector constant loaded into a register.
+@cindex @code{vec_reverse_@var{m}} instruction pattern
+@item @samp{vec_reverse_@var{m}}
+Reverse the order of the elements in vector input operand 1 and store
+the result in vector output operand 0. Both operands have mode @var{m}.
+
+This pattern is provided mainly for targets with variable-length vectors.
+Targets with fixed-length vectors can instead handle any reverse-specific
+optimizations in @samp{vec_perm_const@var{m}}.
+
+@cindex @code{vec_interleave_lo_@var{m}} instruction pattern
+@item @samp{vec_interleave_lo_@var{m}}
+Take the lowest-indexed halves of vector input operands 1 and 2 and
+interleave the elements, so that element @var{x} of operand 1 is followed by
+element @var{x} of operand 2. Store the result in vector output operand 0.
+All three operands have mode @var{m}.
+
+This pattern is provided mainly for targets with variable-length
+vectors. Targets with fixed-length vectors can instead handle any
+interleave-specific optimizations in @samp{vec_perm_const@var{m}}.
+
+@cindex @code{vec_interleave_hi_@var{m}} instruction pattern
+@item @samp{vec_interleave_hi_@var{m}}
+Like @samp{vec_interleave_lo_@var{m}}, but operate on the highest-indexed
+halves instead of the lowest-indexed halves.
+
+@cindex @code{vec_extract_even_@var{m}} instruction pattern
+@item @samp{vec_extract_even_@var{m}}
+Concatenate vector input operands 1 and 2, extract the elements with
+even-numbered indices, and store the result in vector output operand 0.
+All three operands have mode @var{m}.
+
+This pattern is provided mainly for targets with variable-length vectors.
+Targets with fixed-length vectors can instead handle any
+extract-specific optimizations in @samp{vec_perm_const@var{m}}.
+
+@cindex @code{vec_extract_odd_@var{m}} instruction pattern
+@item @samp{vec_extract_odd_@var{m}}
+Like @samp{vec_extract_even_@var{m}}, but extract the elements with
+odd-numbered indices.
+
@cindex @code{push@var{m}1} instruction pattern
@item @samp{push@var{m}1}
Output a push instruction. Operand 0 is value to push. Used only when
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 5059adfe641..c1329c76f3d 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -102,6 +102,17 @@ DEF_INTERNAL_OPTAB_FN (STORE_LANES, ECF_CONST, vec_store_lanes, store_lanes)
DEF_INTERNAL_OPTAB_FN (MASK_STORE_LANES, 0,
vec_mask_store_lanes, mask_store_lanes)
+DEF_INTERNAL_OPTAB_FN (VEC_INTERLEAVE_LO, ECF_CONST | ECF_NOTHROW,
+ vec_interleave_lo, binary)
+DEF_INTERNAL_OPTAB_FN (VEC_INTERLEAVE_HI, ECF_CONST | ECF_NOTHROW,
+ vec_interleave_hi, binary)
+DEF_INTERNAL_OPTAB_FN (VEC_EXTRACT_EVEN, ECF_CONST | ECF_NOTHROW,
+ vec_extract_even, binary)
+DEF_INTERNAL_OPTAB_FN (VEC_EXTRACT_ODD, ECF_CONST | ECF_NOTHROW,
+ vec_extract_odd, binary)
+DEF_INTERNAL_OPTAB_FN (VEC_REVERSE, ECF_CONST | ECF_NOTHROW,
+ vec_reverse, unary)
+
DEF_INTERNAL_OPTAB_FN (RSQRT, ECF_CONST, rsqrt, unary)
/* Unary math functions. */
diff --git a/gcc/optabs.def b/gcc/optabs.def
index 17ce1d682e9..1f3fee43471 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -309,6 +309,11 @@ OPTAB_D (vec_perm_optab, "vec_perm$a")
OPTAB_D (vec_realign_load_optab, "vec_realign_load_$a")
OPTAB_D (vec_set_optab, "vec_set$a")
OPTAB_D (vec_shr_optab, "vec_shr_$a")
+OPTAB_D (vec_interleave_lo_optab, "vec_interleave_lo_$a")
+OPTAB_D (vec_interleave_hi_optab, "vec_interleave_hi_$a")
+OPTAB_D (vec_extract_even_optab, "vec_extract_even_$a")
+OPTAB_D (vec_extract_odd_optab, "vec_extract_odd_$a")
+OPTAB_D (vec_reverse_optab, "vec_reverse_$a")
OPTAB_D (vec_unpacks_float_hi_optab, "vec_unpacks_float_hi_$a")
OPTAB_D (vec_unpacks_float_lo_optab, "vec_unpacks_float_lo_$a")
OPTAB_D (vec_unpacks_hi_optab, "vec_unpacks_hi_$a")
diff --git a/gcc/testsuite/gcc.dg/vect/no-vfa-vect-depend-2.c b/gcc/testsuite/gcc.dg/vect/no-vfa-vect-depend-2.c
index acad8fc0332..1880d1edb32 100644
--- a/gcc/testsuite/gcc.dg/vect/no-vfa-vect-depend-2.c
+++ b/gcc/testsuite/gcc.dg/vect/no-vfa-vect-depend-2.c
@@ -51,7 +51,4 @@ int main (void)
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" {xfail { vect_no_align && { ! vect_hw_misalign } } } } } */
-/* Requires reverse for variable-length SVE, which is implemented for
- by a later patch. Until then we report it twice, once for SVE and
- once for 128-bit Advanced SIMD. */
-/* { dg-final { scan-tree-dump-times "dependence distance negative" 1 "vect" { xfail { aarch64_sve && vect_variable_length } } } } */
+/* { dg-final { scan-tree-dump-times "dependence distance negative" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/no-vfa-vect-depend-3.c b/gcc/testsuite/gcc.dg/vect/no-vfa-vect-depend-3.c
index 1ccfc1edacc..e5914d970e3 100644
--- a/gcc/testsuite/gcc.dg/vect/no-vfa-vect-depend-3.c
+++ b/gcc/testsuite/gcc.dg/vect/no-vfa-vect-depend-3.c
@@ -183,7 +183,4 @@ int main ()
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 4 "vect" {xfail { vect_no_align && { ! vect_hw_misalign } } } } } */
-/* f4 requires reverse for SVE, which is implemented by a later patch.
- Until then we report it twice, once for SVE and once for 128-bit
- Advanced SIMD. */
-/* { dg-final { scan-tree-dump-times "dependence distance negative" 4 "vect" { xfail { aarch64_sve && vect_variable_length } } } } */
+/* { dg-final { scan-tree-dump-times "dependence distance negative" 4 "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/pr33953.c b/gcc/testsuite/gcc.dg/vect/pr33953.c
index 4dd54cd57f3..deb66828d56 100644
--- a/gcc/testsuite/gcc.dg/vect/pr33953.c
+++ b/gcc/testsuite/gcc.dg/vect/pr33953.c
@@ -29,6 +29,6 @@ void blockmove_NtoN_blend_noremap32 (const UINT32 *srcdata, int srcwidth,
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail { vect_no_align && { ! vect_hw_misalign } } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail { vect_no_align && { ! vect_hw_misalign } } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail { { vect_no_align && { ! vect_hw_misalign } } || vect_variable_length } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/pr68445.c b/gcc/testsuite/gcc.dg/vect/pr68445.c
index e2b0c01c2af..15bffdc7e05 100644
--- a/gcc/testsuite/gcc.dg/vect/pr68445.c
+++ b/gcc/testsuite/gcc.dg/vect/pr68445.c
@@ -16,4 +16,4 @@ void IMB_double_fast_x (int *destf, int *dest, int y, int *p1f)
}
}
-/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" { xfail vect_variable_length } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-12a.c b/gcc/testsuite/gcc.dg/vect/slp-12a.c
index 08a8f55bab0..522ab64cf09 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-12a.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-12a.c
@@ -75,5 +75,5 @@ int main (void)
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { vect_strided8 && vect_int_mult } } } } */
/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" { target { ! { vect_strided8 && vect_int_mult } } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_strided8 && vect_int_mult } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_strided8 && vect_int_mult } xfail vect_variable_length } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target { ! { vect_strided8 && vect_int_mult } } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-13-big-array.c b/gcc/testsuite/gcc.dg/vect/slp-13-big-array.c
index 59781386254..a16656ace00 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-13-big-array.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-13-big-array.c
@@ -134,4 +134,4 @@ int main (void)
/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { target { { vect_interleave && vect_extract_even_odd } && { ! vect_pack_trunc } } } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target { ! vect_pack_trunc } } } } */
/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { target { { vect_interleave && vect_extract_even_odd } && vect_pack_trunc } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { target vect_pack_trunc } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { target vect_pack_trunc xfail vect_variable_length } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-13.c b/gcc/testsuite/gcc.dg/vect/slp-13.c
index e7482667e2a..8769d62cfd4 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-13.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-13.c
@@ -128,4 +128,4 @@ int main (void)
/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { target { { vect_interleave && vect_extract_even_odd } && { ! vect_pack_trunc } } } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target { ! vect_pack_trunc } } } } */
/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { target { { vect_interleave && vect_extract_even_odd } && vect_pack_trunc } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { target vect_pack_trunc } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { target vect_pack_trunc xfail vect_variable_length } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-14.c b/gcc/testsuite/gcc.dg/vect/slp-14.c
index 6af70815dd4..a5916047cef 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-14.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-14.c
@@ -111,5 +111,5 @@ int main (void)
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_int_mult } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target vect_int_mult } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target vect_int_mult xfail vect_variable_length } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-15.c b/gcc/testsuite/gcc.dg/vect/slp-15.c
index dbced88c98d..ff30fb4220f 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-15.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-15.c
@@ -112,6 +112,6 @@ int main (void)
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" {target vect_int_mult } } } */
/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" {target { ! { vect_int_mult } } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" {target vect_int_mult } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target vect_int_mult xfail vect_variable_length } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" {target { ! { vect_int_mult } } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-42.c b/gcc/testsuite/gcc.dg/vect/slp-42.c
index ea5fe167cdb..6b78246c2df 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-42.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-42.c
@@ -15,5 +15,5 @@ void foo (int n)
}
}
-/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" { xfail vect_variable_length } } } */
/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-multitypes-2.c b/gcc/testsuite/gcc.dg/vect/slp-multitypes-2.c
index 28a645c7947..0eca73af699 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-multitypes-2.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-multitypes-2.c
@@ -77,5 +77,5 @@ int main (void)
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { xfail vect_variable_length } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-multitypes-4.c b/gcc/testsuite/gcc.dg/vect/slp-multitypes-4.c
index faf17d6f0cd..05a51d2f55c 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-multitypes-4.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-multitypes-4.c
@@ -52,5 +52,5 @@ int main (void)
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_unpack } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_unpack } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_unpack xfail vect_variable_length } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-multitypes-5.c b/gcc/testsuite/gcc.dg/vect/slp-multitypes-5.c
index fb4f720aa49..1153e7b194d 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-multitypes-5.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-multitypes-5.c
@@ -52,5 +52,5 @@ int main (void)
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_pack_trunc } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_pack_trunc } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_pack_trunc xfail vect_variable_length } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-reduc-4.c b/gcc/testsuite/gcc.dg/vect/slp-reduc-4.c
index c41d322445a..d58e5b0fd22 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-reduc-4.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-reduc-4.c
@@ -57,5 +57,5 @@ int main (void)
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail vect_no_int_min_max } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail vect_no_int_min_max } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail { vect_no_int_min_max || vect_variable_length } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-reduc-7.c b/gcc/testsuite/gcc.dg/vect/slp-reduc-7.c
index d7cc6cae8b4..43d1cee9fbe 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-reduc-7.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-reduc-7.c
@@ -55,5 +55,5 @@ int main (void)
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail vect_no_int_add } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail vect_no_int_add } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail { vect_no_int_add || vect_variable_length } } } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_vec_perm_2.c b/gcc/testsuite/gcc.target/aarch64/sve_vec_perm_2.c
new file mode 100644
index 00000000000..4c3df975bab
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_vec_perm_2.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve" } */
+
+#include <stdint.h>
+
+#define VEC_PERM(TYPE) \
+TYPE __attribute__ ((noinline, noclone)) \
+vec_reverse_##TYPE (TYPE *restrict a, TYPE *restrict b, int n) \
+{ \
+ for (int i = 0; i < n; ++i) \
+ a[i] = b[n - i - 1]; \
+}
+
+#define TEST_ALL(T) \
+ T (int8_t) \
+ T (uint8_t) \
+ T (int16_t) \
+ T (uint16_t) \
+ T (int32_t) \
+ T (uint32_t) \
+ T (int64_t) \
+ T (uint64_t) \
+ T (float) \
+ T (double)
+
+TEST_ALL (VEC_PERM)
+
+/* { dg-final { scan-assembler-times {\trev\tz[0-9]+\.b, z[0-9]+\.b\n} 2 } } */
+/* { dg-final { scan-assembler-times {\trev\tz[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\trev\tz[0-9]+\.s, z[0-9]+\.s\n} 3 } } */
+/* { dg-final { scan-assembler-times {\trev\tz[0-9]+\.d, z[0-9]+\.d\n} 3 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_vec_perm_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve_vec_perm_2_run.c
new file mode 100644
index 00000000000..9a9300509ab
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_vec_perm_2_run.c
@@ -0,0 +1,29 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve" } */
+
+#include "sve_vec_perm_2.c"
+
+#define N 153
+
+#define HARNESS(TYPE) \
+ { \
+ TYPE a[N], b[N]; \
+ for (unsigned int i = 0; i < N; ++i) \
+ { \
+ b[i] = i * 2 + i % 5; \
+ asm volatile ("" ::: "memory"); \
+ } \
+ vec_reverse_##TYPE (a, b, N); \
+ for (unsigned int i = 0; i < N; ++i) \
+ { \
+ TYPE expected = (N - i - 1) * 2 + (N - i - 1) % 5; \
+ if (a[i] != expected) \
+ __builtin_abort (); \
+ } \
+ }
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+ TEST_ALL (HARNESS)
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_vec_perm_3.c b/gcc/testsuite/gcc.target/aarch64/sve_vec_perm_3.c
new file mode 100644
index 00000000000..8b4901b1014
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_vec_perm_3.c
@@ -0,0 +1,46 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve -msve-vector-bits=scalable" } */
+
+#include <stdint.h>
+
+#define VEC_PERM(TYPE) \
+TYPE __attribute__ ((noinline, noclone)) \
+vec_zip_##TYPE (TYPE *restrict a, TYPE *restrict b, \
+ TYPE *restrict c, long n) \
+{ \
+ for (long i = 0; i < n; ++i) \
+ { \
+ a[i * 8] = c[i * 4]; \
+ a[i * 8 + 1] = b[i * 4]; \
+ a[i * 8 + 2] = c[i * 4 + 1]; \
+ a[i * 8 + 3] = b[i * 4 + 1]; \
+ a[i * 8 + 4] = c[i * 4 + 2]; \
+ a[i * 8 + 5] = b[i * 4 + 2]; \
+ a[i * 8 + 6] = c[i * 4 + 3]; \
+ a[i * 8 + 7] = b[i * 4 + 3]; \
+ } \
+}
+
+#define TEST_ALL(T) \
+ T (int8_t) \
+ T (uint8_t) \
+ T (int16_t) \
+ T (uint16_t) \
+ T (int32_t) \
+ T (uint32_t) \
+ T (int64_t) \
+ T (uint64_t) \
+ T (float) \
+ T (double)
+
+TEST_ALL (VEC_PERM)
+
+/* { dg-final { scan-assembler-times {\tzip1\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} 24 } } */
+/* { dg-final { scan-assembler-times {\tzip2\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} 24 } } */
+/* { dg-final { scan-assembler-times {\tzip1\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 24 } } */
+/* { dg-final { scan-assembler-times {\tzip2\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 24 } } */
+/* Currently we can't use SLP for groups bigger than 128 bits. */
+/* { dg-final { scan-assembler-times {\tzip1\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 36 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tzip2\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 36 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tzip1\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 36 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tzip2\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 36 { xfail *-*-* } } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_vec_perm_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve_vec_perm_3_run.c
new file mode 100644
index 00000000000..c47b4050ae2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_vec_perm_3_run.c
@@ -0,0 +1,31 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve" } */
+
+#include "sve_vec_perm_3.c"
+
+#define N (43 * 8)
+
+#define HARNESS(TYPE) \
+ { \
+ TYPE a[N], b[N], c[N]; \
+ for (unsigned int i = 0; i < N; ++i) \
+ { \
+ b[i] = i * 2 + i % 5; \
+ c[i] = i * 3; \
+ asm volatile ("" ::: "memory"); \
+ } \
+ vec_zip_##TYPE (a, b, c, N / 8); \
+ for (unsigned int i = 0; i < N / 2; ++i) \
+ { \
+ TYPE expected1 = i * 3; \
+ TYPE expected2 = i * 2 + i % 5; \
+ if (a[i * 2] != expected1 || a[i * 2 + 1] != expected2) \
+ __builtin_abort (); \
+ } \
+ }
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+ TEST_ALL (HARNESS)
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_vec_perm_4.c b/gcc/testsuite/gcc.target/aarch64/sve_vec_perm_4.c
new file mode 100644
index 00000000000..c08ad23868c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_vec_perm_4.c
@@ -0,0 +1,52 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve -msve-vector-bits=scalable" } */
+
+#include <stdint.h>
+
+#define VEC_PERM(TYPE) \
+TYPE __attribute__ ((noinline, noclone)) \
+vec_uzp_##TYPE (TYPE *restrict a, TYPE *restrict b, \
+ TYPE *restrict c, long n) \
+{ \
+ for (long i = 0; i < n; ++i) \
+ { \
+ a[i * 4] = c[i * 8]; \
+ b[i * 4] = c[i * 8 + 1]; \
+ a[i * 4 + 1] = c[i * 8 + 2]; \
+ b[i * 4 + 1] = c[i * 8 + 3]; \
+ a[i * 4 + 2] = c[i * 8 + 4]; \
+ b[i * 4 + 2] = c[i * 8 + 5]; \
+ a[i * 4 + 3] = c[i * 8 + 6]; \
+ b[i * 4 + 3] = c[i * 8 + 7]; \
+ } \
+}
+
+#define TEST_ALL(T) \
+ T (int8_t) \
+ T (uint8_t) \
+ T (int16_t) \
+ T (uint16_t) \
+ T (int32_t) \
+ T (uint32_t) \
+ T (int64_t) \
+ T (uint64_t) \
+ T (float) \
+ T (double)
+
+TEST_ALL (VEC_PERM)
+
+/* We could use a single uzp1 and uzp2 per function by implementing
+ SLP load permutation for variable width. XFAIL until then. */
+/* { dg-final { scan-assembler-times {\tuzp1\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} 2 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tuzp2\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} 2 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tuzp1\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 2 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tuzp2\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 2 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tuzp1\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 3 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tuzp2\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 3 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tuzp1\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 3 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tuzp2\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 3 { xfail *-*-* } } } */
+/* Delete these if the tests above start passing instead. */
+/* { dg-final { scan-assembler-times {\tuzp1\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} 24 } } */
+/* { dg-final { scan-assembler-times {\tuzp2\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} 24 } } */
+/* { dg-final { scan-assembler-times {\tuzp1\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 24 } } */
+/* { dg-final { scan-assembler-times {\tuzp2\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 24 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_vec_perm_4_run.c b/gcc/testsuite/gcc.target/aarch64/sve_vec_perm_4_run.c
new file mode 100644
index 00000000000..a096b6c5353
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_vec_perm_4_run.c
@@ -0,0 +1,29 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve" } */
+
+#include "sve_vec_perm_4.c"
+
+#define N (43 * 8)
+
+#define HARNESS(TYPE) \
+ { \
+ TYPE a[N], b[N], c[N]; \
+ for (unsigned int i = 0; i < N; ++i) \
+ { \
+ c[i] = i * 2 + i % 5; \
+ asm volatile ("" ::: "memory"); \
+ } \
+ vec_uzp_##TYPE (a, b, c, N / 8); \
+ for (unsigned int i = 0; i < N; ++i) \
+ { \
+ TYPE expected = i * 2 + i % 5; \
+ if ((i & 1 ? b[i / 2] : a[i / 2]) != expected) \
+ __builtin_abort (); \
+ } \
+ }
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+ TEST_ALL (HARNESS)
+}
diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
index 5bede0661c9..23dbe6e3ae4 100644
--- a/gcc/tree-vect-data-refs.c
+++ b/gcc/tree-vect-data-refs.c
@@ -52,6 +52,7 @@ along with GCC; see the file COPYING3. If not see
#include "params.h"
#include "tree-cfg.h"
#include "tree-hash-traits.h"
+#include "internal-fn.h"
/* Return true if load- or store-lanes optab OPTAB is implemented for
COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
@@ -4636,7 +4637,16 @@ vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
return false;
}
- /* Check that the permutation is supported. */
+ /* Powers of 2 use a tree of interleaving operations. See whether the
+ target supports them directly. */
+ if (count != 3
+ && direct_internal_fn_supported_p (IFN_VEC_INTERLEAVE_LO, vectype,
+ OPTIMIZE_FOR_SPEED)
+ && direct_internal_fn_supported_p (IFN_VEC_INTERLEAVE_HI, vectype,
+ OPTIMIZE_FOR_SPEED))
+ return true;
+
+ /* Otherwise check for support in the form of general permutations. */
unsigned int nelt;
if (VECTOR_MODE_P (mode) && GET_MODE_NUNITS (mode).is_constant (&nelt))
{
@@ -4881,50 +4891,78 @@ vect_permute_store_chain (vec<tree> dr_chain,
/* If length is not equal to 3 then only power of 2 is supported. */
gcc_assert (pow2p_hwi (length));
- /* vect_grouped_store_supported ensures that this is constant. */
- unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
- auto_vec_perm_indices sel (nelt);
- sel.quick_grow (nelt);
- for (i = 0, n = nelt / 2; i < n; i++)
+ if (direct_internal_fn_supported_p (IFN_VEC_INTERLEAVE_LO, vectype,
+ OPTIMIZE_FOR_SPEED)
+ && direct_internal_fn_supported_p (IFN_VEC_INTERLEAVE_HI, vectype,
+ OPTIMIZE_FOR_SPEED))
{
- sel[i * 2] = i;
- sel[i * 2 + 1] = i + nelt;
+ /* We could support the case where only one of the optabs is
+ implemented, but that seems unlikely. */
+ perm_mask_low = NULL_TREE;
+ perm_mask_high = NULL_TREE;
}
- perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
-
- for (i = 0; i < nelt; i++)
- sel[i] += nelt / 2;
- perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
+ else
+ {
+ /* vect_grouped_store_supported ensures that this is constant. */
+ unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
+ auto_vec_perm_indices sel (nelt);
+ sel.quick_grow (nelt);
+ for (i = 0, n = nelt / 2; i < n; i++)
+ {
+ sel[i * 2] = i;
+ sel[i * 2 + 1] = i + nelt;
+ }
+ perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
- for (i = 0, n = log_length; i < n; i++)
- {
- for (j = 0; j < length/2; j++)
- {
- vect1 = dr_chain[j];
- vect2 = dr_chain[j+length/2];
+ for (i = 0; i < nelt; i++)
+ sel[i] += nelt / 2;
+ perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
+ }
- /* Create interleaving stmt:
- high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
- ...}> */
- high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
- perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
- vect2, perm_mask_high);
- vect_finish_stmt_generation (stmt, perm_stmt, gsi);
- (*result_chain)[2*j] = high;
-
- /* Create interleaving stmt:
- low = VEC_PERM_EXPR <vect1, vect2,
- {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
- ...}> */
- low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
+ for (i = 0, n = log_length; i < n; i++)
+ {
+ for (j = 0; j < length / 2; j++)
+ {
+ vect1 = dr_chain[j];
+ vect2 = dr_chain[j + length / 2];
+
+ /* Create interleaving stmt:
+ high = VEC_PERM_EXPR <vect1, vect2,
+ {0, nelt, 1, nelt + 1, ...}> */
+ low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
+ if (perm_mask_low)
perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
vect2, perm_mask_low);
- vect_finish_stmt_generation (stmt, perm_stmt, gsi);
- (*result_chain)[2*j+1] = low;
- }
- memcpy (dr_chain.address (), result_chain->address (),
- length * sizeof (tree));
- }
+ else
+ {
+ perm_stmt = gimple_build_call_internal
+ (IFN_VEC_INTERLEAVE_LO, 2, vect1, vect2);
+ gimple_set_lhs (perm_stmt, low);
+ }
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ (*result_chain)[2 * j] = low;
+
+ /* Create interleaving stmt:
+ high = VEC_PERM_EXPR <vect1, vect2,
+ {nelt / 2, nelt * 3 / 2,
+ nelt / 2 + 1, nelt * 3 / 2 + 1,
+ ...}> */
+ high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
+ if (perm_mask_high)
+ perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
+ vect2, perm_mask_high);
+ else
+ {
+ perm_stmt = gimple_build_call_internal
+ (IFN_VEC_INTERLEAVE_HI, 2, vect1, vect2);
+ gimple_set_lhs (perm_stmt, high);
+ }
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ (*result_chain)[2 * j + 1] = high;
+ }
+ memcpy (dr_chain.address (), result_chain->address (),
+ length * sizeof (tree));
+ }
}
}
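To make the store-chain logic above concrete, here is a standalone C
model of the interleave tree it builds for a power-of-two group.  This
is an illustration only, not GCC code; LENGTH and NELT are assumed
example values.

#include <stdio.h>
#include <string.h>

#define NELT 4     /* illustrative elements per vector */
#define LENGTH 4   /* illustrative group size; must be a power of 2 */

/* Models of IFN_VEC_INTERLEAVE_LO / _HI on NELT-element vectors.  */
static void
interleave_lo (const int *a, const int *b, int *out)
{
  for (int i = 0; i < NELT / 2; ++i)
    {
      out[2 * i] = a[i];
      out[2 * i + 1] = b[i];
    }
}

static void
interleave_hi (const int *a, const int *b, int *out)
{
  for (int i = 0; i < NELT / 2; ++i)
    {
      out[2 * i] = a[NELT / 2 + i];
      out[2 * i + 1] = b[NELT / 2 + i];
    }
}

int
main (void)
{
  int chain[LENGTH][NELT], result[LENGTH][NELT];

  /* Vector V holds the values of field V of the group: element I is
     encoded as V * 100 + I.  */
  for (int v = 0; v < LENGTH; ++v)
    for (int i = 0; i < NELT; ++i)
      chain[v][i] = v * 100 + i;

  /* log2 (LENGTH) rounds of pairwise interleaves, mirroring the loop
     structure of vect_permute_store_chain.  */
  for (int round = 0; (1 << round) < LENGTH; ++round)
    {
      for (int j = 0; j < LENGTH / 2; ++j)
        {
          interleave_lo (chain[j], chain[j + LENGTH / 2], result[2 * j]);
          interleave_hi (chain[j], chain[j + LENGTH / 2], result[2 * j + 1]);
        }
      memcpy (chain, result, sizeof chain);
    }

  /* Concatenating the chain now gives the memory order of the grouped
     store: 0 100 200 300 1 101 201 301 ...  */
  for (int v = 0; v < LENGTH; ++v)
    for (int i = 0; i < NELT; ++i)
      printf ("%d ", chain[v][i]);
  printf ("\n");
  return 0;
}

After the final round, concatenating the chain yields element 0 of every
input vector, then element 1, and so on, which is exactly the memory
order a grouped store needs.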
@@ -5235,7 +5273,16 @@ vect_grouped_load_supported (tree vectype, bool single_element_p,
return false;
}
- /* Check that the permutation is supported. */
+ /* Powers of 2 use a tree of extract operations. See whether the
+ target supports them directly. */
+ if (count != 3
+ && direct_internal_fn_supported_p (IFN_VEC_EXTRACT_EVEN, vectype,
+ OPTIMIZE_FOR_SPEED)
+ && direct_internal_fn_supported_p (IFN_VEC_EXTRACT_ODD, vectype,
+ OPTIMIZE_FOR_SPEED))
+ return true;
+
+ /* Otherwise check for support in the form of general permutations. */
unsigned int nelt;
if (VECTOR_MODE_P (mode) && GET_MODE_NUNITS (mode).is_constant (&nelt))
{
@@ -5464,17 +5511,30 @@ vect_permute_load_chain (vec<tree> dr_chain,
/* If length is not equal to 3 then only power of 2 is supported. */
gcc_assert (pow2p_hwi (length));
- /* vect_grouped_load_supported ensures that this is constant. */
- unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
- auto_vec_perm_indices sel (nelt);
- sel.quick_grow (nelt);
- for (i = 0; i < nelt; ++i)
- sel[i] = i * 2;
- perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
+ if (direct_internal_fn_supported_p (IFN_VEC_EXTRACT_EVEN, vectype,
+ OPTIMIZE_FOR_SPEED)
+ && direct_internal_fn_supported_p (IFN_VEC_EXTRACT_ODD, vectype,
+ OPTIMIZE_FOR_SPEED))
+ {
+ /* We could support the case where only one of the optabs is
+ implemented, but that seems unlikely. */
+ perm_mask_even = NULL_TREE;
+ perm_mask_odd = NULL_TREE;
+ }
+ else
+ {
+ /* vect_grouped_load_supported ensures that this is constant. */
+ unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
+ auto_vec_perm_indices sel (nelt);
+ sel.quick_grow (nelt);
+ for (i = 0; i < nelt; ++i)
+ sel[i] = i * 2;
+ perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
- for (i = 0; i < nelt; ++i)
- sel[i] = i * 2 + 1;
- perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
+ for (i = 0; i < nelt; ++i)
+ sel[i] = i * 2 + 1;
+ perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
+ }
for (i = 0; i < log_length; i++)
{
@@ -5485,19 +5545,33 @@ vect_permute_load_chain (vec<tree> dr_chain,
/* data_ref = permute_even (first_data_ref, second_data_ref); */
data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
- perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
- first_vect, second_vect,
- perm_mask_even);
+ if (perm_mask_even)
+ perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
+ first_vect, second_vect,
+ perm_mask_even);
+ else
+ {
+ perm_stmt = gimple_build_call_internal
+ (IFN_VEC_EXTRACT_EVEN, 2, first_vect, second_vect);
+ gimple_set_lhs (perm_stmt, data_ref);
+ }
vect_finish_stmt_generation (stmt, perm_stmt, gsi);
- (*result_chain)[j/2] = data_ref;
+ (*result_chain)[j / 2] = data_ref;
/* data_ref = permute_odd (first_data_ref, second_data_ref); */
data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
- perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
- first_vect, second_vect,
- perm_mask_odd);
+ if (perm_mask_odd)
+ perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
+ first_vect, second_vect,
+ perm_mask_odd);
+ else
+ {
+ perm_stmt = gimple_build_call_internal
+ (IFN_VEC_EXTRACT_ODD, 2, first_vect, second_vect);
+ gimple_set_lhs (perm_stmt, data_ref);
+ }
vect_finish_stmt_generation (stmt, perm_stmt, gsi);
- (*result_chain)[j/2+length/2] = data_ref;
+ (*result_chain)[j / 2 + length / 2] = data_ref;
}
memcpy (dr_chain.address (), result_chain->address (),
length * sizeof (tree));
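Similarly, a minimal model of the extract tree used by
vect_permute_load_chain (again an illustration with assumed LENGTH and
NELT, not GCC code): each round sends the even extracts to the first
half of the chain and the odd extracts to the second half, so after
log2(LENGTH) rounds each vector holds the elements of one field of the
group.

#include <assert.h>
#include <string.h>

#define NELT 4     /* illustrative elements per vector */
#define LENGTH 4   /* illustrative group size; must be a power of 2 */

/* Models of IFN_VEC_EXTRACT_EVEN / _ODD: the even- or odd-indexed
   elements of the concatenation of A and B.  */
static void
extract_even (const int *a, const int *b, int *out)
{
  for (int i = 0; i < NELT / 2; ++i)
    {
      out[i] = a[2 * i];
      out[NELT / 2 + i] = b[2 * i];
    }
}

static void
extract_odd (const int *a, const int *b, int *out)
{
  for (int i = 0; i < NELT / 2; ++i)
    {
      out[i] = a[2 * i + 1];
      out[NELT / 2 + i] = b[2 * i + 1];
    }
}

int
main (void)
{
  int chain[LENGTH][NELT], result[LENGTH][NELT];

  /* The chain starts as LENGTH contiguous loads of interleaved group
     data: position P holds field P % LENGTH, index P / LENGTH
     (encoded as field * 100 + index).  */
  for (int v = 0; v < LENGTH; ++v)
    for (int i = 0; i < NELT; ++i)
      {
        int pos = v * NELT + i;
        chain[v][i] = (pos % LENGTH) * 100 + pos / LENGTH;
      }

  /* log2 (LENGTH) rounds, mirroring vect_permute_load_chain.  */
  for (int round = 0; (1 << round) < LENGTH; ++round)
    {
      for (int j = 0; j < LENGTH; j += 2)
        {
          extract_even (chain[j], chain[j + 1], result[j / 2]);
          extract_odd (chain[j], chain[j + 1], result[j / 2 + LENGTH / 2]);
        }
      memcpy (chain, result, sizeof chain);
    }

  /* Each vector now holds the elements of one field, in field order.  */
  for (int f = 0; f < LENGTH; ++f)
    for (int i = 0; i < NELT; ++i)
      assert (chain[f][i] == f * 100 + i);
  return 0;
}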
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index e3041b9dcf8..0be681c23b5 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -1725,6 +1725,46 @@ perm_mask_for_reverse (tree vectype)
return vect_gen_perm_mask_checked (vectype, sel);
}
+/* Return true if the target can reverse the elements in a vector of
+ type VECTOR_TYPE. */
+
+static bool
+can_reverse_vector_p (tree vector_type)
+{
+ return (direct_internal_fn_supported_p (IFN_VEC_REVERSE, vector_type,
+ OPTIMIZE_FOR_SPEED)
+ || perm_mask_for_reverse (vector_type));
+}
+
+/* Generate a statement to reverse the elements in vector INPUT and
+ return the SSA name that holds the result. GSI is a statement iterator
+ pointing to STMT, which is the scalar statement we're vectorizing.
+ VEC_DEST is the destination variable with which new SSA names
+ should be associated. */
+
+static tree
+reverse_vector (tree vec_dest, tree input, gimple *stmt,
+ gimple_stmt_iterator *gsi)
+{
+ tree new_temp = make_ssa_name (vec_dest);
+ tree vector_type = TREE_TYPE (input);
+ gimple *perm_stmt;
+ if (direct_internal_fn_supported_p (IFN_VEC_REVERSE, vector_type,
+ OPTIMIZE_FOR_SPEED))
+ {
+ perm_stmt = gimple_build_call_internal (IFN_VEC_REVERSE, 1, input);
+ gimple_set_lhs (perm_stmt, new_temp);
+ }
+ else
+ {
+ tree perm_mask = perm_mask_for_reverse (vector_type);
+ perm_stmt = gimple_build_assign (new_temp, VEC_PERM_EXPR,
+ input, input, perm_mask);
+ }
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ return new_temp;
+}
+
/* STMT is either a masked or unconditional store. Return the value
being stored. */
@@ -1948,7 +1988,7 @@ get_negative_load_store_type (gimple *stmt, tree vectype,
return VMAT_CONTIGUOUS_DOWN;
}
- if (!perm_mask_for_reverse (vectype))
+ if (!can_reverse_vector_p (vectype))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -6467,20 +6507,10 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
{
- tree perm_mask = perm_mask_for_reverse (vectype);
tree perm_dest
= vect_create_destination_var (vect_get_store_rhs (stmt),
vectype);
- tree new_temp = make_ssa_name (perm_dest);
-
- /* Generate the permute statement. */
- gimple *perm_stmt
- = gimple_build_assign (new_temp, VEC_PERM_EXPR, vec_oprnd,
- vec_oprnd, perm_mask);
- vect_finish_stmt_generation (stmt, perm_stmt, gsi);
-
- perm_stmt = SSA_NAME_DEF_STMT (new_temp);
- vec_oprnd = new_temp;
+ vec_oprnd = reverse_vector (perm_dest, vec_oprnd, stmt, gsi);
}
/* Arguments are ready. Create the new vector stmt. */
@@ -7692,9 +7722,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
{
- tree perm_mask = perm_mask_for_reverse (vectype);
- new_temp = permute_vec_elements (new_temp, new_temp,
- perm_mask, stmt, gsi);
+ new_temp = reverse_vector (vec_dest, new_temp, stmt, gsi);
new_stmt = SSA_NAME_DEF_STMT (new_temp);
}
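For context on how reverse_vector is used: an access classified as
VMAT_CONTIGUOUS_REVERSE is handled as a contiguous vector access plus an
element reverse, which can_reverse_vector_p now accepts either via
IFN_VEC_REVERSE or via a reversing permute mask.  A scalar model of the
resulting behaviour follows; it is a sketch, not the GCC implementation,
and VF is an assumed vectorization factor.

#include <assert.h>

#define VF 4   /* illustrative vectorization factor */

/* Model of IFN_VEC_REVERSE / a reversing VEC_PERM_EXPR.  */
static void
vec_reverse (const int *in, int *out)
{
  for (int i = 0; i < VF; ++i)
    out[i] = in[VF - 1 - i];
}

/* Scalar loop: a[i] = b[n - 1 - i].  The vectorized form, assuming n is
   a multiple of VF, loads VF contiguous elements ending at b[n - 1 - i],
   reverses them, and stores them contiguously.  */
static void
copy_reversed (int *a, const int *b, int n)
{
  for (int i = 0; i < n; i += VF)
    {
      int tmp[VF];
      vec_reverse (&b[n - i - VF], tmp);   /* contiguous "load" + reverse */
      for (int j = 0; j < VF; ++j)         /* contiguous "store" */
        a[i + j] = tmp[j];
    }
}

int
main (void)
{
  int a[8], b[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
  copy_reversed (a, b, 8);
  for (int i = 0; i < 8; ++i)
    assert (a[i] == b[8 - 1 - i]);
  return 0;
}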