author     Richard Sandiford <richard.sandiford@linaro.org>  2017-11-17 16:57:22 +0000
committer  Richard Sandiford <richard.sandiford@linaro.org>  2017-11-20 11:42:56 +0000
commit     9c4d5a5738e1a17ae328b81bbd147b6ea9675505 (patch)
tree       5a8313dca464bf8ead111a2b608d20c3c807a613
parent     c1802e041d9b69f83ff072a323840bb0deaa5a60 (diff)
Add support for SVE gather loads
This patch adds support for SVE gather loads.  It uses basically the same
analysis code as the AVX gather support, but after that there are two major
differences:

- It uses new internal functions rather than target built-ins.  The
  interface is:

     IFN_GATHER_LOAD (base, offsets, scale)
     IFN_MASK_GATHER_LOAD (base, offsets, scale, mask)

  which should be reasonably generic.  One of the advantages of using
  internal functions is that other passes can understand what the functions
  do, but a more immediate advantage is that we can query the underlying
  target pattern to see which scales it supports.

- It uses pattern recognition to convert the offset to the right width,
  if it was originally narrower than that.  This avoids having to do a
  widening operation as part of the gather expansion itself.

2017-11-17  Richard Sandiford  <richard.sandiford@linaro.org>
	    Alan Hayward  <alan.hayward@arm.com>
	    David Sherwood  <david.sherwood@arm.com>

gcc/
	* doc/md.texi (gather_load@var{m}): Document.
	(mask_gather_load@var{m}): Likewise.
	* genopinit.c (main): Add supports_vec_gather_load and
	supports_vec_gather_load_cached to target_optabs.
	* optabs-tree.c (init_tree_optimization_optabs): Use
	ggc_cleared_alloc to allocate target_optabs.
	* optabs.def (gather_load_optab, mask_gather_load_optab): New optabs.
	* internal-fn.def (GATHER_LOAD, MASK_GATHER_LOAD): New internal
	functions.
	* internal-fn.h (internal_load_fn_p): Declare.
	(internal_gather_scatter_fn_p): Likewise.
	(internal_fn_mask_index): Likewise.
	(internal_gather_scatter_fn_supported_p): Likewise.
	* internal-fn.c (gather_load_direct): New macro.
	(expand_gather_load_optab_fn): New function.
	(direct_gather_load_optab_supported_p): New macro.
	(direct_internal_fn_optab): New function.
	(internal_load_fn_p): Likewise.
	(internal_gather_scatter_fn_p): Likewise.
	(internal_fn_mask_index): Likewise.
	(internal_gather_scatter_fn_supported_p): Likewise.
	* optabs-query.c (supports_at_least_one_mode_p): New function.
	(supports_vec_gather_load_p): Likewise.
	* optabs-query.h (supports_vec_gather_load_p): Declare.
	* tree-vectorizer.h (gather_scatter_info): Add ifn, element_type
	and memory_type field.
	(NUM_PATTERNS): Bump to 15.
	* tree-vect-data-refs.c (vect_gather_scatter_fn_p): New function.
	(vect_describe_gather_scatter_call): Likewise.
	(vect_check_gather_scatter): Try using internal functions for
	gather loads.  Recognize existing calls to a gather load function.
	(vect_analyze_data_refs): Consider using gather loads if
	supports_vec_gather_load_p.
	* tree-vect-patterns.c (vect_get_load_store_mask): New function.
	(vect_get_gather_scatter_offset_type): Likewise.
	(vect_convert_mask_for_vectype): Likewise.
	(vect_add_conversion_to_patterm): Likewise.
	(vect_try_gather_scatter_pattern): Likewise.
	(vect_recog_gather_scatter_pattern): New pattern recognizer.
	(vect_vect_recog_func_ptrs): Add it.
	* tree-vect-stmts.c (exist_non_indexing_operands_for_use_p): Use
	internal_fn_mask_index and internal_gather_scatter_fn_p.
	(check_load_store_masking): Take the gather_scatter_info as an
	argument and handle gather loads.
	(vect_get_gather_scatter_ops): New function.
	(vectorizable_call): Check internal_load_fn_p.
	(vectorizable_load): Likewise.  Handle gather load internal
	functions.
	(vectorizable_store): Update call to check_load_store_masking.
	* config/aarch64/aarch64.md (UNSPEC_LD1_GATHER): New unspec.
	* config/aarch64/iterators.md (SVE_S, SVE_D): New mode iterators.
	* config/aarch64/predicates.md (aarch64_gather_scale_operand_w)
	(aarch64_gather_scale_operand_d): New predicates.
	* config/aarch64/aarch64-sve.md (gather_load<mode>): New expander.
	(mask_gather_load<mode>): New insns.

gcc/testsuite/
	* gcc.target/aarch64/sve_gather_load_1.c: New test.
	* gcc.target/aarch64/sve_gather_load_2.c: Likewise.
	* gcc.target/aarch64/sve_gather_load_3.c: Likewise.
	* gcc.target/aarch64/sve_gather_load_4.c: Likewise.
	* gcc.target/aarch64/sve_gather_load_5.c: Likewise.
	* gcc.target/aarch64/sve_gather_load_6.c: Likewise.
	* gcc.target/aarch64/sve_gather_load_7.c: Likewise.
	* gcc.target/aarch64/sve_mask_gather_load_1.c: Likewise.
	* gcc.target/aarch64/sve_mask_gather_load_2.c: Likewise.
	* gcc.target/aarch64/sve_mask_gather_load_3.c: Likewise.
	* gcc.target/aarch64/sve_mask_gather_load_4.c: Likewise.
	* gcc.target/aarch64/sve_mask_gather_load_5.c: Likewise.
	* gcc.target/aarch64/sve_mask_gather_load_6.c: Likewise.
	* gcc.target/aarch64/sve_mask_gather_load_7.c: Likewise.
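As an illustration of the kind of loop this enables (a minimal sketch adapted
from the new test sve_gather_load_1.c below; the function name here is purely
for the example):

    #include <stdint.h>

    /* With -O2 -ftree-vectorize -march=armv8-a+sve, the indexed load from
       SRC is recognized as a gather load internal function (IFN_GATHER_LOAD
       or IFN_MASK_GATHER_LOAD (base, offsets, scale[, mask]), depending on
       masking) and expanded via the new SVE patterns to a gather such as
       "ld1w z0.s, p0/z, [x1, z1.s, sxtw 2]".  */
    void
    gather_f32 (float *restrict dest, float *restrict src,
                int32_t *restrict indices, int n)
    {
      for (int i = 0; i < n; ++i)
        dest[i] += src[indices[i]];
    }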
-rw-r--r--  gcc/config/aarch64/aarch64-sve.md                          |  57
-rw-r--r--  gcc/config/aarch64/aarch64.md                              |   1
-rw-r--r--  gcc/config/aarch64/iterators.md                            |   6
-rw-r--r--  gcc/config/aarch64/predicates.md                           |   8
-rw-r--r--  gcc/doc/md.texi                                            |  29
-rw-r--r--  gcc/genopinit.c                                            |   5
-rw-r--r--  gcc/internal-fn.c                                          | 134
-rw-r--r--  gcc/internal-fn.def                                        |   5
-rw-r--r--  gcc/internal-fn.h                                          |   6
-rw-r--r--  gcc/optabs-query.c                                         |  29
-rw-r--r--  gcc/optabs-query.h                                         |   1
-rw-r--r--  gcc/optabs-tree.c                                          |   2
-rw-r--r--  gcc/optabs.def                                             |   3
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_gather_load_1.c       |  32
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_gather_load_2.c       |  10
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_gather_load_3.c       |  32
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_gather_load_4.c       |  10
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_gather_load_5.c       |  23
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_gather_load_6.c       |  36
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_gather_load_7.c       |  15
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_mask_gather_load_1.c  |  52
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_mask_gather_load_2.c  |  19
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_mask_gather_load_3.c  |  52
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_mask_gather_load_4.c  |  19
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_mask_gather_load_5.c  |  38
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_mask_gather_load_6.c  |  38
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve_mask_gather_load_7.c  |  53
-rw-r--r--  gcc/tree-vect-data-refs.c                                  | 152
-rw-r--r--  gcc/tree-vect-patterns.c                                   | 202
-rw-r--r--  gcc/tree-vect-stmts.c                                      | 142
-rw-r--r--  gcc/tree-vectorizer.h                                      |  15
31 files changed, 1183 insertions, 43 deletions
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 502d8afe94c..681cca90ea0 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -189,6 +189,63 @@
"st1<Vesize>\t%1.<Vetype>, %2, %0"
)
+;; Unpredicated gather loads.
+(define_expand "gather_load<mode>"
+ [(set (match_operand:SVE_SD 0 "register_operand")
+ (unspec:SVE_SD
+ [(match_dup 5)
+ (match_operand:DI 1 "aarch64_reg_or_zero")
+ (match_operand:<V_INT_EQUIV> 2 "register_operand")
+ (match_operand:DI 3 "const_int_operand")
+ (match_operand:DI 4 "aarch64_gather_scale_operand_<Vesize>")
+ (mem:BLK (scratch))]
+ UNSPEC_LD1_GATHER))]
+ "TARGET_SVE"
+ {
+ operands[5] = force_reg (<VPRED>mode, CONSTM1_RTX (<VPRED>mode));
+ }
+)
+
+;; Predicated gather loads for 32-bit elements. Operand 3 is true for
+;; unsigned extension and false for signed extension.
+(define_insn "mask_gather_load<mode>"
+ [(set (match_operand:SVE_S 0 "register_operand" "=w, w, w, w, w")
+ (unspec:SVE_S
+ [(match_operand:<VPRED> 5 "register_operand" "Upl, Upl, Upl, Upl, Upl")
+ (match_operand:DI 1 "aarch64_reg_or_zero" "Z, rk, rk, rk, rk")
+ (match_operand:<V_INT_EQUIV> 2 "register_operand" "w, w, w, w, w")
+ (match_operand:DI 3 "const_int_operand" "i, Z, Ui1, Z, Ui1")
+ (match_operand:DI 4 "aarch64_gather_scale_operand_w" "Ui1, Ui1, Ui1, i, i")
+ (mem:BLK (scratch))]
+ UNSPEC_LD1_GATHER))]
+ "TARGET_SVE"
+ "@
+ ld1w\t%0.s, %5/z, [%2.s]
+ ld1w\t%0.s, %5/z, [%1, %2.s, sxtw]
+ ld1w\t%0.s, %5/z, [%1, %2.s, uxtw]
+ ld1w\t%0.s, %5/z, [%1, %2.s, sxtw %p4]
+ ld1w\t%0.s, %5/z, [%1, %2.s, uxtw %p4]"
+)
+
+;; Predicated gather loads for 64-bit elements. The value of operand 3
+;; doesn't matter in this case.
+(define_insn "mask_gather_load<mode>"
+ [(set (match_operand:SVE_D 0 "register_operand" "=w, w, w")
+ (unspec:SVE_D
+ [(match_operand:<VPRED> 5 "register_operand" "Upl, Upl, Upl")
+ (match_operand:DI 1 "aarch64_reg_or_zero" "Z, rk, rk")
+ (match_operand:<V_INT_EQUIV> 2 "register_operand" "w, w, w")
+ (match_operand:DI 3 "const_int_operand")
+ (match_operand:DI 4 "aarch64_gather_scale_operand_d" "Ui1, Ui1, i")
+ (mem:BLK (scratch))]
+ UNSPEC_LD1_GATHER))]
+ "TARGET_SVE"
+ "@
+ ld1d\t%0.d, %5/z, [%2.d]
+ ld1d\t%0.d, %5/z, [%1, %2.d]
+ ld1d\t%0.d, %5/z, [%1, %2.d, lsl %p4]"
+)
+
;; SVE structure moves.
(define_expand "mov<mode>"
[(set (match_operand:SVE_STRUCT 0 "nonimmediate_operand")
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index bbfb4858dbe..dcebaccdc0b 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -151,6 +151,7 @@
UNSPEC_XPACLRI
UNSPEC_LD1_SVE
UNSPEC_ST1_SVE
+ UNSPEC_LD1_GATHER
UNSPEC_MERGE_PTRUE
UNSPEC_PTEST_PTRUE
UNSPEC_UNPACKSHI
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index f57c88da5aa..a94b068097d 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -276,6 +276,12 @@
;; All SVE vector modes that have 32-bit or 64-bit elements.
(define_mode_iterator SVE_SD [VNx4SI VNx2DI VNx4SF VNx2DF])
+;; All SVE vector modes that have 32-bit elements.
+(define_mode_iterator SVE_S [VNx4SI VNx4SF])
+
+;; All SVE vector modes that have 64-bit elements.
+(define_mode_iterator SVE_D [VNx2DI VNx2DF])
+
;; All SVE integer vector modes that have 32-bit or 64-bit elements.
(define_mode_iterator SVE_SDI [VNx4SI VNx2DI])
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index 5a5e22fc5ac..b9bbf680960 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -596,3 +596,11 @@
(define_predicate "aarch64_sve_vec_perm_operand"
(ior (match_operand 0 "register_operand")
(match_operand 0 "aarch64_constant_vector_operand")))
+
+(define_predicate "aarch64_gather_scale_operand_w"
+ (and (match_code "const_int")
+ (match_test "INTVAL (op) == 1 || INTVAL (op) == 4")))
+
+(define_predicate "aarch64_gather_scale_operand_d"
+ (and (match_code "const_int")
+ (match_test "INTVAL (op) == 1 || INTVAL (op) == 8")))
diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 75a1110192f..be31f223ddd 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -4905,6 +4905,35 @@ for (j = 0; j < GET_MODE_NUNITS (@var{n}); j++)
This pattern is not allowed to @code{FAIL}.
+@cindex @code{gather_load@var{m}} instruction pattern
+@item @samp{gather_load@var{m}}
+Load several separate memory locations into a vector of mode @var{m}.
+Operand 1 is a scalar base address and operand 2 is a vector of
+offsets from that base. Operand 0 is a destination vector with the
+same number of elements as the offset. For each element index @var{i}:
+
+@itemize @bullet
+@item
+extend the offset element @var{i} to address width, using zero
+extension if operand 3 is 1 and sign extension if operand 3 is zero;
+@item
+multiply the extended offset by operand 4;
+@item
+add the result to the base; and
+@item
+load the value at that address into element @var{i} of operand 0.
+@end itemize
+
+The value of operand 3 does not matter if the offsets are already
+address width.
+
+@cindex @code{mask_gather_load@var{m}} instruction pattern
+@item @samp{mask_gather_load@var{m}}
+Like @samp{gather_load@var{m}}, but takes an extra mask operand as
+operand 5. Bit @var{i} of the mask is set if element @var{i}
+of the result should be loaded from memory and clear if element @var{i}
+of the result should be set to zero.
+
@cindex @code{vec_set@var{m}} instruction pattern
@item @samp{vec_set@var{m}}
Set given field in the vector value. Operand 0 is the vector to modify,
diff --git a/gcc/genopinit.c b/gcc/genopinit.c
index 1ad9054770a..0aac7b6a77e 100644
--- a/gcc/genopinit.c
+++ b/gcc/genopinit.c
@@ -234,6 +234,11 @@ main (int argc, const char **argv)
"struct target_optabs {\n"
" /* Patterns that are used by optabs that are enabled for this target. */\n"
" bool pat_enable[NUM_OPTAB_PATTERNS];\n"
+ "\n"
+ " /* Cache if the target supports vec_gather_load for at least one vector\n"
+ " mode. */\n"
+ " bool supports_vec_gather_load;\n"
+ " bool supports_vec_gather_load_cached;\n"
"};\n"
"extern void init_all_optabs (struct target_optabs *);\n"
"\n"
diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c
index f8974c5c8ca..96095d696a6 100644
--- a/gcc/internal-fn.c
+++ b/gcc/internal-fn.c
@@ -83,6 +83,7 @@ init_internal_fns ()
#define mask_load_direct { -1, 2, false }
#define load_lanes_direct { -1, -1, false }
#define mask_load_lanes_direct { -1, -1, false }
+#define gather_load_direct { -1, 1, false }
#define mask_store_direct { 3, 2, false }
#define store_lanes_direct { 0, 0, false }
#define mask_store_lanes_direct { 0, 0, false }
@@ -2676,6 +2677,38 @@ expand_LAUNDER (internal_fn, gcall *call)
expand_assignment (lhs, gimple_call_arg (call, 0), false);
}
+/* Expand {MASK_,}GATHER_LOAD call CALL using optab OPTAB. */
+
+static void
+expand_gather_load_optab_fn (internal_fn, gcall *stmt, direct_optab optab)
+{
+ tree lhs = gimple_call_lhs (stmt);
+ tree base = gimple_call_arg (stmt, 0);
+ tree offset = gimple_call_arg (stmt, 1);
+ tree scale = gimple_call_arg (stmt, 2);
+
+ rtx lhs_rtx = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
+ rtx base_rtx = expand_normal (base);
+ rtx offset_rtx = expand_normal (offset);
+ HOST_WIDE_INT scale_int = tree_to_shwi (scale);
+
+ int i = 0;
+ struct expand_operand ops[6];
+ create_output_operand (&ops[i++], lhs_rtx, TYPE_MODE (TREE_TYPE (lhs)));
+ create_address_operand (&ops[i++], base_rtx);
+ create_input_operand (&ops[i++], offset_rtx, TYPE_MODE (TREE_TYPE (offset)));
+ create_integer_operand (&ops[i++], TYPE_UNSIGNED (TREE_TYPE (offset)));
+ create_integer_operand (&ops[i++], scale_int);
+ if (optab == mask_gather_load_optab)
+ {
+ tree mask = gimple_call_arg (stmt, 3);
+ rtx mask_rtx = expand_normal (mask);
+ create_input_operand (&ops[i++], mask_rtx, TYPE_MODE (TREE_TYPE (mask)));
+ }
+ insn_code icode = direct_optab_handler (optab, TYPE_MODE (TREE_TYPE (lhs)));
+ expand_insn (icode, i, ops);
+}
+
/* Expand DIVMOD() using:
a) optab handler for udivmod/sdivmod if it is available.
b) If optab_handler doesn't exist, generate call to
@@ -2915,12 +2948,32 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
#define direct_mask_load_optab_supported_p direct_optab_supported_p
#define direct_load_lanes_optab_supported_p multi_vector_optab_supported_p
#define direct_mask_load_lanes_optab_supported_p multi_vector_optab_supported_p
+#define direct_gather_load_optab_supported_p direct_optab_supported_p
#define direct_mask_store_optab_supported_p direct_optab_supported_p
#define direct_store_lanes_optab_supported_p multi_vector_optab_supported_p
#define direct_mask_store_lanes_optab_supported_p multi_vector_optab_supported_p
#define direct_while_optab_supported_p convert_optab_supported_p
#define direct_fold_extract_optab_supported_p direct_optab_supported_p
+/* Return the optab used by internal function FN. */
+
+static optab
+direct_internal_fn_optab (internal_fn fn)
+{
+ switch (fn)
+ {
+#define DEF_INTERNAL_FN(CODE, FLAGS, FNSPEC) \
+ case IFN_##CODE: break;
+#define DEF_INTERNAL_OPTAB_FN(CODE, FLAGS, OPTAB, TYPE) \
+ case IFN_##CODE: return OPTAB##_optab;
+#include "internal-fn.def"
+
+ case IFN_LAST:
+ break;
+ }
+ gcc_unreachable ();
+}
+
/* Return true if FN is supported for the types in TYPES when the
optimization type is OPT_TYPE. The types are those associated with
the "type0" and "type1" fields of FN's direct_internal_fn_info
@@ -3022,6 +3075,87 @@ get_conditional_internal_fn (tree_code code, tree type)
}
}
+/* Return true if IFN is some form of load from memory. */
+
+bool
+internal_load_fn_p (internal_fn fn)
+{
+ switch (fn)
+ {
+ case IFN_MASK_LOAD:
+ case IFN_LOAD_LANES:
+ case IFN_MASK_LOAD_LANES:
+ case IFN_GATHER_LOAD:
+ case IFN_MASK_GATHER_LOAD:
+ return true;
+
+ default:
+ return false;
+ }
+}
+
+/* Return true if IFN is some form of gather load or scatter store. */
+
+bool
+internal_gather_scatter_fn_p (internal_fn fn)
+{
+ switch (fn)
+ {
+ case IFN_GATHER_LOAD:
+ case IFN_MASK_GATHER_LOAD:
+ return true;
+
+ default:
+ return false;
+ }
+}
+
+/* If FN takes a vector mask argument, return the index of that argument,
+ otherwise return -1. */
+
+int
+internal_fn_mask_index (internal_fn fn)
+{
+ switch (fn)
+ {
+ case IFN_MASK_LOAD:
+ case IFN_MASK_LOAD_LANES:
+ case IFN_MASK_STORE:
+ case IFN_MASK_STORE_LANES:
+ return 2;
+
+ case IFN_MASK_GATHER_LOAD:
+ return 3;
+
+ default:
+ return -1;
+ }
+}
+
+/* Return true if the target supports gather load or scatter store function
+ IFN. For loads, VECTOR_TYPE is the vector type of the load result,
+ while for stores it is the vector type of the stored data argument.
+ MEMORY_ELEMENT_TYPE is the type of the memory elements being loaded
+ or stored. OFFSET_SIGN is the sign of the offset argument, which is
+ only relevant when the offset is narrower than an address. SCALE is
+ the amount by which the offset should be multiplied *after* it has
+ been extended to address width. */
+
+bool
+internal_gather_scatter_fn_supported_p (internal_fn ifn, tree vector_type,
+ tree memory_element_type,
+ signop offset_sign, int scale)
+{
+ if (!tree_int_cst_equal (TYPE_SIZE (TREE_TYPE (vector_type)),
+ TYPE_SIZE (memory_element_type)))
+ return false;
+ optab optab = direct_internal_fn_optab (ifn);
+ insn_code icode = direct_optab_handler (optab, TYPE_MODE (vector_type));
+ return (icode != CODE_FOR_nothing
+ && insn_operand_matches (icode, 3, GEN_INT (offset_sign == UNSIGNED))
+ && insn_operand_matches (icode, 4, GEN_INT (scale)));
+}
+
/* Expand STMT as though it were a call to internal function FN. */
void
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 9c1b190f442..e9cdad39775 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -47,6 +47,7 @@ along with GCC; see the file COPYING3. If not see
- mask_load: currently just maskload
- load_lanes: currently just vec_load_lanes
- mask_load_lanes: currently just vec_mask_load_lanes
+ - gather_load: used for {mask_,}gather_load
- mask_store: currently just maskstore
- store_lanes: currently just vec_store_lanes
@@ -110,6 +111,10 @@ DEF_INTERNAL_OPTAB_FN (LOAD_LANES, ECF_CONST, vec_load_lanes, load_lanes)
DEF_INTERNAL_OPTAB_FN (MASK_LOAD_LANES, ECF_PURE,
vec_mask_load_lanes, mask_load_lanes)
+DEF_INTERNAL_OPTAB_FN (GATHER_LOAD, ECF_PURE, gather_load, gather_load)
+DEF_INTERNAL_OPTAB_FN (MASK_GATHER_LOAD, ECF_PURE,
+ mask_gather_load, gather_load)
+
DEF_INTERNAL_OPTAB_FN (MASK_STORE, 0, maskstore, mask_store)
DEF_INTERNAL_OPTAB_FN (STORE_LANES, ECF_CONST, vec_store_lanes, store_lanes)
DEF_INTERNAL_OPTAB_FN (MASK_STORE_LANES, 0,
diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
index 15691a6eb1a..4b323a9d247 100644
--- a/gcc/internal-fn.h
+++ b/gcc/internal-fn.h
@@ -192,6 +192,12 @@ extern bool set_edom_supported_p (void);
extern internal_fn get_conditional_internal_fn (tree_code, tree);
+extern bool internal_load_fn_p (internal_fn);
+extern bool internal_gather_scatter_fn_p (internal_fn);
+extern int internal_fn_mask_index (internal_fn);
+extern bool internal_gather_scatter_fn_supported_p (internal_fn, tree,
+ tree, signop, int);
+
extern void expand_internal_call (gcall *);
extern void expand_internal_call (internal_fn, gcall *);
extern void expand_PHI (internal_fn, gcall *);
diff --git a/gcc/optabs-query.c b/gcc/optabs-query.c
index dc955e1093b..041834148ad 100644
--- a/gcc/optabs-query.c
+++ b/gcc/optabs-query.c
@@ -621,3 +621,32 @@ lshift_cheap_p (bool speed_p)
return cheap[speed_p];
}
+
+/* Return true if optab OP supports at least one mode. */
+
+static bool
+supports_at_least_one_mode_p (optab op)
+{
+ for (int i = 0; i < NUM_MACHINE_MODES; ++i)
+ if (direct_optab_handler (op, (machine_mode) i) != CODE_FOR_nothing)
+ return true;
+
+ return false;
+}
+
+/* Return true if vec_gather_load is available for at least one vector
+ mode. */
+
+bool
+supports_vec_gather_load_p ()
+{
+ if (this_fn_optabs->supports_vec_gather_load_cached)
+ return this_fn_optabs->supports_vec_gather_load;
+
+ this_fn_optabs->supports_vec_gather_load_cached = true;
+
+ this_fn_optabs->supports_vec_gather_load
+ = supports_at_least_one_mode_p (gather_load_optab);
+
+ return this_fn_optabs->supports_vec_gather_load;
+}
diff --git a/gcc/optabs-query.h b/gcc/optabs-query.h
index 2701e2594e6..15ecc0febb0 100644
--- a/gcc/optabs-query.h
+++ b/gcc/optabs-query.h
@@ -187,6 +187,7 @@ bool can_compare_and_swap_p (machine_mode, bool);
bool can_atomic_exchange_p (machine_mode, bool);
bool can_atomic_load_p (machine_mode);
bool lshift_cheap_p (bool);
+bool supports_vec_gather_load_p ();
/* Version of find_widening_optab_handler_and_mode that operates on
specific mode types. */
diff --git a/gcc/optabs-tree.c b/gcc/optabs-tree.c
index 6d75d1109ce..beb7d26e002 100644
--- a/gcc/optabs-tree.c
+++ b/gcc/optabs-tree.c
@@ -380,7 +380,7 @@ init_tree_optimization_optabs (tree optnode)
if (tmp_optabs)
memset (tmp_optabs, 0, sizeof (struct target_optabs));
else
- tmp_optabs = ggc_alloc<target_optabs> ();
+ tmp_optabs = ggc_cleared_alloc<target_optabs> ();
/* Generate a new set of optabs into tmp_optabs. */
init_all_optabs (tmp_optabs);
diff --git a/gcc/optabs.def b/gcc/optabs.def
index e4ba5d20d78..da7e9c9a9b6 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -390,6 +390,9 @@ OPTAB_D (atomic_xor_optab, "atomic_xor$I$a")
OPTAB_D (get_thread_pointer_optab, "get_thread_pointer$I$a")
OPTAB_D (set_thread_pointer_optab, "set_thread_pointer$I$a")
+OPTAB_D (gather_load_optab, "gather_load$a")
+OPTAB_D (mask_gather_load_optab, "mask_gather_load$a")
+
OPTAB_DC (vec_duplicate_optab, "vec_duplicate$a", VEC_DUPLICATE)
OPTAB_DC (vec_series_optab, "vec_series$a", VEC_SERIES)
OPTAB_D (vec_shl_insert_optab, "vec_shl_insert_$a")
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_gather_load_1.c b/gcc/testsuite/gcc.target/aarch64/sve_gather_load_1.c
new file mode 100644
index 00000000000..6ed5c06bd51
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_gather_load_1.c
@@ -0,0 +1,32 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve --save-temps" } */
+
+#include <stdint.h>
+
+#ifndef INDEX32
+#define INDEX32 int32_t
+#define INDEX64 int64_t
+#endif
+
+/* Invoked 18 times for each data size. */
+#define TEST_LOOP(DATA_TYPE, BITS) \
+ void __attribute__ ((noinline, noclone)) \
+ f_##DATA_TYPE (DATA_TYPE *restrict dest, DATA_TYPE *restrict src, \
+ INDEX##BITS *indices, int n) \
+ { \
+ for (int i = 9; i < n; ++i) \
+ dest[i] += src[indices[i]]; \
+ }
+
+#define TEST_ALL(T) \
+ T (int32_t, 32) \
+ T (uint32_t, 32) \
+ T (float, 32) \
+ T (int64_t, 64) \
+ T (uint64_t, 64) \
+ T (double, 64)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, sxtw 2\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 3 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_gather_load_2.c b/gcc/testsuite/gcc.target/aarch64/sve_gather_load_2.c
new file mode 100644
index 00000000000..4e348db3bf1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_gather_load_2.c
@@ -0,0 +1,10 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve --save-temps" } */
+
+#define INDEX32 uint32_t
+#define INDEX64 uint64_t
+
+#include "sve_gather_load_1.c"
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, uxtw 2\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 3 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_gather_load_3.c b/gcc/testsuite/gcc.target/aarch64/sve_gather_load_3.c
new file mode 100644
index 00000000000..a113a0faeb9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_gather_load_3.c
@@ -0,0 +1,32 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve --save-temps" } */
+
+#include <stdint.h>
+
+#ifndef INDEX32
+#define INDEX32 int32_t
+#define INDEX64 int64_t
+#endif
+
+/* Invoked 18 times for each data size. */
+#define TEST_LOOP(DATA_TYPE, BITS) \
+ void __attribute__ ((noinline, noclone)) \
+ f_##DATA_TYPE (DATA_TYPE *restrict dest, DATA_TYPE *restrict src, \
+ INDEX##BITS *indices, int n) \
+ { \
+ for (int i = 9; i < n; ++i) \
+ dest[i] += *(DATA_TYPE *) ((char *) src + indices[i]); \
+ }
+
+#define TEST_ALL(T) \
+ T (int32_t, 32) \
+ T (uint32_t, 32) \
+ T (float, 32) \
+ T (int64_t, 64) \
+ T (uint64_t, 64) \
+ T (double, 64)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, sxtw\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d\]\n} 3 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_gather_load_4.c b/gcc/testsuite/gcc.target/aarch64/sve_gather_load_4.c
new file mode 100644
index 00000000000..5382e523689
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_gather_load_4.c
@@ -0,0 +1,10 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve --save-temps" } */
+
+#define INDEX32 uint32_t
+#define INDEX64 uint64_t
+
+#include "sve_gather_load_3.c"
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, uxtw\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d\]\n} 3 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_gather_load_5.c b/gcc/testsuite/gcc.target/aarch64/sve_gather_load_5.c
new file mode 100644
index 00000000000..8e4f689243b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_gather_load_5.c
@@ -0,0 +1,23 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve --save-temps" } */
+
+#include <stdint.h>
+
+/* Invoked 18 times for each data size. */
+#define TEST_LOOP(DATA_TYPE) \
+ void __attribute__ ((noinline, noclone)) \
+ f_##DATA_TYPE (DATA_TYPE *restrict dest, DATA_TYPE *restrict *src, \
+ int n) \
+ { \
+ for (int i = 9; i < n; ++i) \
+ dest[i] += *src[i]; \
+ }
+
+#define TEST_ALL(T) \
+ T (int64_t) \
+ T (uint64_t) \
+ T (double)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[z[0-9]+.d\]\n} 3 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_gather_load_6.c b/gcc/testsuite/gcc.target/aarch64/sve_gather_load_6.c
new file mode 100644
index 00000000000..745e00f1e50
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_gather_load_6.c
@@ -0,0 +1,36 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -fwrapv -march=armv8-a+sve --save-temps" } */
+
+#include <stdint.h>
+
+#ifndef INDEX32
+#define INDEX16 int16_t
+#define INDEX32 int32_t
+#endif
+
+/* Invoked 18 times for each data size. */
+#define TEST_LOOP(DATA_TYPE, BITS) \
+ void __attribute__ ((noinline, noclone)) \
+ f_##DATA_TYPE (DATA_TYPE *restrict dest, DATA_TYPE *restrict src, \
+ INDEX##BITS *indices, INDEX##BITS mask, int n) \
+ { \
+ for (int i = 9; i < n; ++i) \
+ dest[i] = src[(INDEX##BITS) (indices[i] | mask)]; \
+ }
+
+#define TEST_ALL(T) \
+ T (int32_t, 16) \
+ T (uint32_t, 16) \
+ T (float, 16) \
+ T (int64_t, 32) \
+ T (uint64_t, 32) \
+ T (double, 32)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tsunpkhi\tz[0-9]+\.s, z[0-9]+\.h\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tsunpklo\tz[0-9]+\.s, z[0-9]+\.h\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tsunpkhi\tz[0-9]+\.d, z[0-9]+\.s\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tsunpklo\tz[0-9]+\.d, z[0-9]+\.s\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, sxtw 2\]\n} 6 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 6 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_gather_load_7.c b/gcc/testsuite/gcc.target/aarch64/sve_gather_load_7.c
new file mode 100644
index 00000000000..8f2dfb75149
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_gather_load_7.c
@@ -0,0 +1,15 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve --save-temps" } */
+
+#define INDEX16 uint16_t
+#define INDEX32 uint32_t
+
+#include "sve_gather_load_6.c"
+
+/* { dg-final { scan-assembler-times {\tuunpkhi\tz[0-9]+\.s, z[0-9]+\.h\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tuunpklo\tz[0-9]+\.s, z[0-9]+\.h\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tuunpkhi\tz[0-9]+\.d, z[0-9]+\.s\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tuunpklo\tz[0-9]+\.d, z[0-9]+\.s\n} 3 } } */
+/* Either extension type is OK here. */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, [us]xtw 2\]\n} 6 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 6 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_mask_gather_load_1.c b/gcc/testsuite/gcc.target/aarch64/sve_mask_gather_load_1.c
new file mode 100644
index 00000000000..469e3c670d3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_mask_gather_load_1.c
@@ -0,0 +1,52 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math -march=armv8-a+sve --save-temps" } */
+
+#include <stdint.h>
+
+#ifndef INDEX32
+#define INDEX32 int32_t
+#define INDEX64 int64_t
+#endif
+
+#define TEST_LOOP(DATA_TYPE, CMP_TYPE, BITS) \
+ void \
+ f_##DATA_TYPE##_##CMP_TYPE \
+ (DATA_TYPE *restrict dest, DATA_TYPE *restrict src, \
+ CMP_TYPE *cmp1, CMP_TYPE *cmp2, INDEX##BITS *indices, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ if (cmp1[i] == cmp2[i]) \
+ dest[i] += src[indices[i]]; \
+ }
+
+#define TEST32(T, DATA_TYPE) \
+ T (DATA_TYPE, int32_t, 32) \
+ T (DATA_TYPE, uint32_t, 32) \
+ T (DATA_TYPE, float, 32)
+
+#define TEST64(T, DATA_TYPE) \
+ T (DATA_TYPE, int64_t, 64) \
+ T (DATA_TYPE, uint64_t, 64) \
+ T (DATA_TYPE, double, 64)
+
+#define TEST_ALL(T) \
+ TEST32 (T, int32_t) \
+ TEST32 (T, uint32_t) \
+ TEST32 (T, float) \
+ TEST64 (T, int64_t) \
+ TEST64 (T, uint64_t) \
+ TEST64 (T, double)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 2\]\n} 36 } } */
+/* { dg-final { scan-assembler-times {\tcmpeq\tp[0-7]\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 6 } } */
+/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-7]\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+\.s, sxtw 2\]\n} 9 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, x[0-9]+, lsl 2\]\n} 9 } } */
+
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 3\]\n} 36 } } */
+/* { dg-final { scan-assembler-times {\tcmpeq\tp[0-7]\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 6 } } */
+/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-7]\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+\.d, lsl 3\]\n} 9 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, x[0-9]+, lsl 3\]\n} 9 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_mask_gather_load_2.c b/gcc/testsuite/gcc.target/aarch64/sve_mask_gather_load_2.c
new file mode 100644
index 00000000000..8dd48462b51
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_mask_gather_load_2.c
@@ -0,0 +1,19 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math -march=armv8-a+sve --save-temps" } */
+
+#define INDEX32 uint32_t
+#define INDEX64 uint64_t
+
+#include "sve_mask_gather_load_1.c"
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 2\]\n} 36 } } */
+/* { dg-final { scan-assembler-times {\tcmpeq\tp[0-7]\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 6 } } */
+/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-7]\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+\.s, uxtw 2\]\n} 9 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, x[0-9]+, lsl 2\]\n} 9 } } */
+
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 3\]\n} 36 } } */
+/* { dg-final { scan-assembler-times {\tcmpeq\tp[0-7]\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 6 } } */
+/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-7]\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+\.d, lsl 3\]\n} 9 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, x[0-9]+, lsl 3\]\n} 9 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_mask_gather_load_3.c b/gcc/testsuite/gcc.target/aarch64/sve_mask_gather_load_3.c
new file mode 100644
index 00000000000..b370f532f2c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_mask_gather_load_3.c
@@ -0,0 +1,52 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve -ffast-math --save-temps" } */
+
+#include <stdint.h>
+
+#ifndef INDEX32
+#define INDEX32 int32_t
+#define INDEX64 int64_t
+#endif
+
+#define TEST_LOOP(DATA_TYPE, CMP_TYPE, BITS) \
+ void \
+ f_##DATA_TYPE##_##CMP_TYPE \
+ (DATA_TYPE *restrict dest, DATA_TYPE *restrict src, \
+ CMP_TYPE *cmp1, CMP_TYPE *cmp2, INDEX##BITS *indices, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ if (cmp1[i] == cmp2[i]) \
+ dest[i] += *(DATA_TYPE *) ((char *) src + indices[i]); \
+ }
+
+#define TEST32(T, DATA_TYPE) \
+ T (DATA_TYPE, int32_t, 32) \
+ T (DATA_TYPE, uint32_t, 32) \
+ T (DATA_TYPE, float, 32)
+
+#define TEST64(T, DATA_TYPE) \
+ T (DATA_TYPE, int64_t, 64) \
+ T (DATA_TYPE, uint64_t, 64) \
+ T (DATA_TYPE, double, 64)
+
+#define TEST_ALL(T) \
+ TEST32 (T, int32_t) \
+ TEST32 (T, uint32_t) \
+ TEST32 (T, float) \
+ TEST64 (T, int64_t) \
+ TEST64 (T, uint64_t) \
+ TEST64 (T, double)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 2\]\n} 36 } } */
+/* { dg-final { scan-assembler-times {\tcmpeq\tp[0-7]\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 6 } } */
+/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-7]\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+\.s, sxtw\]\n} 9 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, x[0-9]+, lsl 2\]\n} 9 } } */
+
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 3\]\n} 36 } } */
+/* { dg-final { scan-assembler-times {\tcmpeq\tp[0-7]\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 6 } } */
+/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-7]\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+\.d\]\n} 9 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, x[0-9]+, lsl 3\]\n} 9 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_mask_gather_load_4.c b/gcc/testsuite/gcc.target/aarch64/sve_mask_gather_load_4.c
new file mode 100644
index 00000000000..0464e9343a3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_mask_gather_load_4.c
@@ -0,0 +1,19 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve -ffast-math --save-temps" } */
+
+#define INDEX32 uint32_t
+#define INDEX64 uint64_t
+
+#include "sve_mask_gather_load_3.c"
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 2\]\n} 36 } } */
+/* { dg-final { scan-assembler-times {\tcmpeq\tp[0-7]\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 6 } } */
+/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-7]\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+\.s, uxtw\]\n} 9 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, x[0-9]+, lsl 2\]\n} 9 } } */
+
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 3\]\n} 36 } } */
+/* { dg-final { scan-assembler-times {\tcmpeq\tp[0-7]\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 6 } } */
+/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-7]\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+\.d\]\n} 9 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, x[0-9]+, lsl 3\]\n} 9 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_mask_gather_load_5.c b/gcc/testsuite/gcc.target/aarch64/sve_mask_gather_load_5.c
new file mode 100644
index 00000000000..831d594654a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_mask_gather_load_5.c
@@ -0,0 +1,38 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve -ffast-math --save-temps" } */
+
+#include <stdint.h>
+
+#ifndef INDEX32
+#define INDEX32 int32_t
+#define INDEX64 int64_t
+#endif
+
+#define TEST_LOOP(DATA_TYPE, CMP_TYPE) \
+ void \
+ f_##DATA_TYPE##_##CMP_TYPE \
+ (DATA_TYPE *restrict dest, DATA_TYPE *restrict *restrict src, \
+ CMP_TYPE *cmp1, CMP_TYPE *cmp2, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ if (cmp1[i] == cmp2[i]) \
+ dest[i] += *src[i]; \
+ }
+
+#define TEST_TYPE(T, DATA_TYPE) \
+ T (DATA_TYPE, int64_t) \
+ T (DATA_TYPE, uint64_t) \
+ T (DATA_TYPE, double)
+
+#define TEST_ALL(T) \
+ TEST_TYPE (T, int64_t) \
+ TEST_TYPE (T, uint64_t) \
+ TEST_TYPE (T, double)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 3\]\n} 36 } } */
+/* { dg-final { scan-assembler-times {\tcmpeq\tp[0-7]\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 6 } } */
+/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-7]\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[z[0-9]+\.d\]\n} 9 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, x[0-9]+, lsl 3\]\n} 9 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_mask_gather_load_6.c b/gcc/testsuite/gcc.target/aarch64/sve_mask_gather_load_6.c
new file mode 100644
index 00000000000..64eb0c46278
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_mask_gather_load_6.c
@@ -0,0 +1,38 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math -march=armv8-a+sve --save-temps" } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(DATA_TYPE, CMP_TYPE, INDEX_TYPE) \
+ void \
+ f_##DATA_TYPE##_##CMP_TYPE##_##INDEX_TYPE \
+ (DATA_TYPE *restrict dest, DATA_TYPE *restrict src, \
+ CMP_TYPE *cmp1, CMP_TYPE *cmp2, INDEX_TYPE *indices, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ if (cmp1[i] == cmp2[i]) \
+ dest[i] += src[indices[i]]; \
+ }
+
+#define TEST32(T, DATA_TYPE) \
+ T (DATA_TYPE, int64_t, int32_t) \
+ T (DATA_TYPE, uint64_t, int32_t) \
+ T (DATA_TYPE, double, int32_t) \
+ T (DATA_TYPE, int64_t, uint32_t) \
+ T (DATA_TYPE, uint64_t, uint32_t) \
+ T (DATA_TYPE, double, uint32_t)
+
+#define TEST_ALL(T) \
+ TEST32 (T, int32_t) \
+ TEST32 (T, uint32_t) \
+ TEST32 (T, float)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 3\]\n} 72 } } */
+/* { dg-final { scan-assembler-times {\tcmpeq\tp[0-7]\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 24 } } */
+/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-7]\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 12 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 2\]\n} 36 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+\.s, sxtw 2\]\n} 9 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+\.s, uxtw 2\]\n} 9 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, x[0-9]+, lsl 2\]\n} 18 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve_mask_gather_load_7.c b/gcc/testsuite/gcc.target/aarch64/sve_mask_gather_load_7.c
new file mode 100644
index 00000000000..4a8b38e13af
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve_mask_gather_load_7.c
@@ -0,0 +1,53 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math -march=armv8-a+sve --save-temps" } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(DATA_TYPE, CMP_TYPE, INDEX_TYPE) \
+ void \
+ f_##DATA_TYPE##_##CMP_TYPE##_##INDEX_TYPE \
+ (DATA_TYPE *restrict dest, DATA_TYPE *restrict src, \
+ CMP_TYPE *cmp1, CMP_TYPE *cmp2, INDEX_TYPE *indices, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ if (cmp1[i] == cmp2[i]) \
+ dest[i] += src[indices[i]]; \
+ }
+
+#define TEST32(T, DATA_TYPE) \
+ T (DATA_TYPE, int16_t, int32_t) \
+ T (DATA_TYPE, uint16_t, int32_t) \
+ T (DATA_TYPE, _Float16, int32_t) \
+ T (DATA_TYPE, int16_t, uint32_t) \
+ T (DATA_TYPE, uint16_t, uint32_t) \
+ T (DATA_TYPE, _Float16, uint32_t)
+
+#define TEST64(T, DATA_TYPE) \
+ T (DATA_TYPE, int32_t, int64_t) \
+ T (DATA_TYPE, uint32_t, int64_t) \
+ T (DATA_TYPE, float, int64_t) \
+ T (DATA_TYPE, int32_t, uint64_t) \
+ T (DATA_TYPE, uint32_t, uint64_t) \
+ T (DATA_TYPE, float, uint64_t)
+
+#define TEST_ALL(T) \
+ TEST32 (T, int32_t) \
+ TEST32 (T, uint32_t) \
+ TEST32 (T, float) \
+ TEST64 (T, int64_t) \
+ TEST64 (T, uint64_t) \
+ TEST64 (T, double)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.h, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 1\]\n} 36 } } */
+/* { dg-final { scan-assembler-times {\tcmpeq\tp[0-7]\.h, p[0-7]/z, z[0-9]+\.h, z[0-9]+\.h\n} 12 } } */
+/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-7]\.h, p[0-7]/z, z[0-9]+\.h, z[0-9]+\.h\n} 6 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+\.s, sxtw 2\]\n} 18 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+\.s, uxtw 2\]\n} 18 } } */
+
+/* Also used for the TEST32 indices. */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 2\]\n} 72 } } */
+/* { dg-final { scan-assembler-times {\tcmpeq\tp[0-7]\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 12 } } */
+/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-7]\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 6 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+\.d, lsl 3\]\n} 36 } } */
diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
index 20f9cf17cb5..61413e9daae 100644
--- a/gcc/tree-vect-data-refs.c
+++ b/gcc/tree-vect-data-refs.c
@@ -3296,6 +3296,74 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
return true;
}
+/* Check whether we can use an internal function for a gather load
+ or scatter store. READ_P is true for loads and false for stores.
+ MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
+ the type of the memory elements being loaded or stored. OFFSET_BITS
+ is the number of bits in each scalar offset and OFFSET_SIGN is the
+ sign of the offset. SCALE is the amount by which the offset should
+ be multiplied *after* it has been converted to address width.
+
+ Return true if the function is supported, storing the function
+ id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
+
+static bool
+vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
+ tree memory_type, unsigned int offset_bits,
+ signop offset_sign, int scale,
+ internal_fn *ifn_out, tree *element_type_out)
+{
+ unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
+ unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
+ if (offset_bits > element_bits)
+ /* Internal functions require the offset to be the same width as
+ the vector elements. We can extend narrower offsets, but it isn't
+ safe to truncate wider offsets. */
+ return false;
+
+ if (element_bits != memory_bits)
+ /* For now the vector elements must be the same width as the
+ memory elements. */
+ return false;
+
+ /* Work out which function we need. */
+ internal_fn ifn;
+ if (read_p)
+ ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
+ else
+ return false;
+
+ /* Test whether the target supports this combination. */
+ if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
+ offset_sign, scale))
+ return false;
+
+ *ifn_out = ifn;
+ *element_type_out = TREE_TYPE (vectype);
+ return true;
+}
+
+/* CALL is a call to an internal gather load or scatter store function.
+ Describe the operation in INFO. */
+
+static void
+vect_describe_gather_scatter_call (gcall *call, gather_scatter_info *info)
+{
+ stmt_vec_info stmt_info = vinfo_for_stmt (call);
+ tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+ data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
+
+ info->ifn = gimple_call_internal_fn (call);
+ info->decl = NULL_TREE;
+ info->base = gimple_call_arg (call, 0);
+ info->offset = gimple_call_arg (call, 1);
+ info->offset_dt = vect_unknown_def_type;
+ info->offset_vectype = NULL_TREE;
+ info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
+ info->element_type = TREE_TYPE (vectype);
+ info->memory_type = TREE_TYPE (DR_REF (dr));
+}
+
/* Return true if a non-affine read or write in STMT is suitable for a
gather load or scatter store. Describe the operation in *INFO if so. */
@@ -3309,17 +3377,38 @@ vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
tree offtype = NULL_TREE;
- tree decl, base, off;
+ tree decl = NULL_TREE, base, off;
+ tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+ tree memory_type = TREE_TYPE (DR_REF (dr));
machine_mode pmode;
int punsignedp, reversep, pvolatilep = 0;
+ internal_fn ifn;
+ tree element_type;
+ bool masked_p = false;
+
+ /* See whether this is already a call to a gather/scatter internal function.
+ If not, see whether it's a masked load or store. */
+ gcall *call = dyn_cast <gcall *> (stmt);
+ if (call && gimple_call_internal_p (call))
+ {
+ ifn = gimple_call_internal_fn (stmt);
+ if (internal_gather_scatter_fn_p (ifn))
+ {
+ vect_describe_gather_scatter_call (call, info);
+ return true;
+ }
+ masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
+ }
+
+ /* True if we should aim to use internal functions rather than
+ built-in functions. */
+ bool use_ifn_p = (DR_IS_READ (dr)
+ && supports_vec_gather_load_p ());
base = DR_REF (dr);
/* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
see if we can use the def stmt of the address. */
- if (is_gimple_call (stmt)
- && gimple_call_internal_p (stmt)
- && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
- || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
+ if (masked_p
&& TREE_CODE (base) == MEM_REF
&& TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
&& integer_zerop (TREE_OPERAND (base, 1))
@@ -3450,7 +3539,17 @@ vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
case MULT_EXPR:
if (scale == 1 && tree_fits_shwi_p (op1))
{
- scale = tree_to_shwi (op1);
+ int new_scale = tree_to_shwi (op1);
+ /* Only treat this as a scaling operation if the target
+ supports it. */
+ if (use_ifn_p
+ && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
+ vectype, memory_type, 1,
+ TYPE_SIGN (TREE_TYPE (op0)),
+ new_scale, &ifn,
+ &element_type))
+ break;
+ scale = new_scale;
off = op0;
continue;
}
@@ -3468,6 +3567,15 @@ vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
off = op0;
continue;
}
+
+ /* The internal functions need the offset to be the same width
+ as the elements of VECTYPE. Don't include operations that
+ cast the offset from that width to a different width. */
+ if (use_ifn_p
+ && (int_size_in_bytes (TREE_TYPE (vectype))
+ == int_size_in_bytes (TREE_TYPE (off))))
+ break;
+
if (TYPE_PRECISION (TREE_TYPE (op0))
< TYPE_PRECISION (TREE_TYPE (off)))
{
@@ -3492,22 +3600,37 @@ vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
if (offtype == NULL_TREE)
offtype = TREE_TYPE (off);
- if (DR_IS_READ (dr))
- decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
- offtype, scale);
+ if (use_ifn_p)
+ {
+ if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
+ memory_type, TYPE_PRECISION (offtype),
+ TYPE_SIGN (offtype), scale, &ifn,
+ &element_type))
+ return false;
+ }
else
- decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
- offtype, scale);
+ {
+ if (DR_IS_READ (dr))
+ decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
+ else
+ decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
- if (decl == NULL_TREE)
- return false;
+ if (!decl)
+ return false;
+
+ ifn = IFN_LAST;
+ element_type = TREE_TYPE (vectype);
+ }
+ info->ifn = ifn;
info->decl = decl;
info->base = base;
info->offset = off;
info->offset_dt = vect_unknown_def_type;
info->offset_vectype = NULL_TREE;
info->scale = scale;
+ info->element_type = element_type;
+ info->memory_type = memory_type;
return true;
}
@@ -3588,7 +3711,8 @@ again:
bool maybe_gather
= DR_IS_READ (dr)
&& !TREE_THIS_VOLATILE (DR_REF (dr))
- && targetm.vectorize.builtin_gather != NULL;
+ && (targetm.vectorize.builtin_gather != NULL
+ || supports_vec_gather_load_p ());
bool maybe_scatter
= DR_IS_WRITE (dr)
&& !TREE_THIS_VOLATILE (DR_REF (dr))
diff --git a/gcc/tree-vect-patterns.c b/gcc/tree-vect-patterns.c
index 5b82c462dff..7a2bb13f3a9 100644
--- a/gcc/tree-vect-patterns.c
+++ b/gcc/tree-vect-patterns.c
@@ -69,6 +69,7 @@ static gimple *vect_recog_mixed_size_cond_pattern (vec<gimple *> *,
tree *, tree *);
static gimple *vect_recog_bool_pattern (vec<gimple *> *, tree *, tree *);
static gimple *vect_recog_mask_conversion_pattern (vec<gimple *> *, tree *, tree *);
+static gimple *vect_recog_gather_scatter_pattern (vec<gimple *> *, tree *, tree *);
struct vect_recog_func
{
@@ -93,6 +94,10 @@ static vect_recog_func vect_vect_recog_func_ptrs[NUM_PATTERNS] = {
{ vect_recog_mult_pattern, "mult" },
{ vect_recog_mixed_size_cond_pattern, "mixed_size_cond" },
{ vect_recog_bool_pattern, "bool" },
+ /* This must come before mask conversion, and includes the parts
+ of mask conversion that are needed for gather and scatter
+ internal functions. */
+ { vect_recog_gather_scatter_pattern, "gather_scatter" },
{ vect_recog_mask_conversion_pattern, "mask_conversion" }
};
@@ -4090,6 +4095,203 @@ vect_recog_mask_conversion_pattern (vec<gimple *> *stmts, tree *type_in,
return pattern_stmt;
}
+/* STMT is a load or store. If the load or store is conditional, return
+ the boolean condition under which it occurs, otherwise return null. */
+
+static tree
+vect_get_load_store_mask (gimple *stmt)
+{
+ if (gassign *def_assign = dyn_cast <gassign *> (stmt))
+ {
+ gcc_assert (gimple_assign_single_p (def_assign));
+ return NULL_TREE;
+ }
+
+ if (gcall *def_call = dyn_cast <gcall *> (stmt))
+ {
+ internal_fn ifn = gimple_call_internal_fn (def_call);
+ int mask_index = internal_fn_mask_index (ifn);
+ return gimple_call_arg (def_call, mask_index);
+ }
+
+ gcc_unreachable ();
+}
+
+/* Return the scalar offset type that an internal gather/scatter function
+ should use. GS_INFO describes the gather/scatter operation. */
+
+static tree
+vect_get_gather_scatter_offset_type (gather_scatter_info *gs_info)
+{
+ tree offset_type = TREE_TYPE (gs_info->offset);
+ unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (gs_info->element_type));
+
+ /* Enforced by vect_check_gather_scatter. */
+ unsigned int offset_bits = TYPE_PRECISION (offset_type);
+ gcc_assert (element_bits >= offset_bits);
+
+ /* If the offset is narrower than the elements, extend it according
+ to its sign. */
+ if (element_bits > offset_bits)
+ return build_nonstandard_integer_type (element_bits,
+ TYPE_UNSIGNED (offset_type));
+
+ return offset_type;
+}
+
+/* Return MASK if MASK is suitable for masking an operation on vectors
+ of type VECTYPE, otherwise convert it into such a form and return
+ the result. Associate any conversion statements with STMT_INFO's
+ pattern. */
+
+static tree
+vect_convert_mask_for_vectype (tree mask, tree vectype,
+ stmt_vec_info stmt_info, vec_info *vinfo)
+{
+ tree mask_type = search_type_for_mask (mask, vinfo);
+ if (mask_type)
+ {
+ tree mask_vectype = get_mask_type_for_scalar_type (mask_type);
+ if (mask_vectype
+ && may_ne (TYPE_VECTOR_SUBPARTS (vectype),
+ TYPE_VECTOR_SUBPARTS (mask_vectype)))
+ mask = build_mask_conversion (mask, vectype, stmt_info, vinfo);
+ }
+ return mask;
+}
+
+/* Return the equivalent of:
+
+ fold_convert (TYPE, VALUE)
+
+ with the expectation that the operation will be vectorized.
+ If new statements are needed, add them as pattern statements
+ to STMT_INFO. */
+
+static tree
+vect_add_conversion_to_patterm (tree type, tree value,
+ stmt_vec_info stmt_info,
+ vec_info *vinfo)
+{
+ if (useless_type_conversion_p (type, TREE_TYPE (value)))
+ return value;
+
+ tree new_value = vect_recog_temp_ssa_var (type, NULL);
+ gassign *conversion = gimple_build_assign (new_value, CONVERT_EXPR, value);
+ stmt_vec_info new_stmt_info = new_stmt_vec_info (conversion, vinfo);
+ set_vinfo_for_stmt (conversion, new_stmt_info);
+ STMT_VINFO_VECTYPE (new_stmt_info) = get_vectype_for_scalar_type (type);
+ append_pattern_def_seq (stmt_info, conversion);
+ return new_value;
+}
+
+/* Try to convert STMT into a call to a gather load or scatter store
+ internal function. Return the final statement on success and set
+ *TYPE_IN and *TYPE_OUT to the vector type being loaded or stored.
+
+ This function only handles gathers and scatters that were recognized
+ as such from the outset (indicated by STMT_VINFO_GATHER_SCATTER_P). */
+
+static gimple *
+vect_try_gather_scatter_pattern (gimple *stmt, stmt_vec_info last_stmt_info,
+ tree *type_in, tree *type_out)
+{
+ /* Currently we only support this for loop vectorization. */
+ stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+ loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (stmt_info->vinfo);
+ if (!loop_vinfo)
+ return NULL;
+
+ /* Make sure that we're looking at a gather load or scatter store. */
+ data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
+ if (!dr || !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+ return NULL;
+
+ /* Reject stores for now. */
+ if (!DR_IS_READ (dr))
+ return NULL;
+
+ /* Get the boolean that controls whether the load or store happens.
+ This is null if the operation is unconditional. */
+ tree mask = vect_get_load_store_mask (stmt);
+
+ /* Make sure that the target supports an appropriate internal
+ function for the gather/scatter operation. */
+ gather_scatter_info gs_info;
+ if (!vect_check_gather_scatter (stmt, loop_vinfo, &gs_info)
+ || gs_info.decl)
+ return NULL;
+
+ /* Convert the mask to the right form. */
+ tree gs_vectype = get_vectype_for_scalar_type (gs_info.element_type);
+ if (mask)
+ mask = vect_convert_mask_for_vectype (mask, gs_vectype, last_stmt_info,
+ loop_vinfo);
+
+ /* Get the invariant base and non-invariant offset, converting the
+ latter to the same width as the vector elements. */
+ tree base = gs_info.base;
+ tree offset_type = vect_get_gather_scatter_offset_type (&gs_info);
+ tree offset = vect_add_conversion_to_patterm (offset_type, gs_info.offset,
+ last_stmt_info, loop_vinfo);
+
+ /* Build the new pattern statement. */
+ tree scale = size_int (gs_info.scale);
+ gcall *pattern_stmt;
+ if (DR_IS_READ (dr))
+ {
+ if (mask != NULL)
+ pattern_stmt = gimple_build_call_internal (gs_info.ifn, 4, base,
+ offset, scale, mask);
+ else
+ pattern_stmt = gimple_build_call_internal (gs_info.ifn, 3, base,
+ offset, scale);
+ tree load_lhs = vect_recog_temp_ssa_var (gs_info.element_type, NULL);
+ gimple_call_set_lhs (pattern_stmt, load_lhs);
+ }
+ else
+ /* Not yet supported. */
+ gcc_unreachable ();
+ gimple_call_set_nothrow (pattern_stmt, true);
+
+ /* Copy across relevant vectorization info and associate DR with the
+ new pattern statement instead of the original statement. */
+ stmt_vec_info pattern_stmt_info = new_stmt_vec_info (pattern_stmt,
+ loop_vinfo);
+ set_vinfo_for_stmt (pattern_stmt, pattern_stmt_info);
+ STMT_VINFO_DATA_REF (pattern_stmt_info) = dr;
+ STMT_VINFO_DR_WRT_VEC_LOOP (pattern_stmt_info)
+ = STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info);
+ STMT_VINFO_GATHER_SCATTER_P (pattern_stmt_info)
+ = STMT_VINFO_GATHER_SCATTER_P (stmt_info);
+ DR_STMT (dr) = pattern_stmt;
+
+ tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+ *type_out = vectype;
+ *type_in = vectype;
+
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "gather/scatter pattern detected:\n");
+
+ return pattern_stmt;
+}
+
+/* Pattern wrapper around vect_try_gather_scatter_pattern. */
+
+static gimple *
+vect_recog_gather_scatter_pattern (vec<gimple *> *stmts, tree *type_in,
+ tree *type_out)
+{
+ gimple *last_stmt = stmts->pop ();
+ stmt_vec_info last_stmt_info = vinfo_for_stmt (last_stmt);
+ gimple *pattern_stmt = vect_try_gather_scatter_pattern (last_stmt,
+ last_stmt_info,
+ type_in, type_out);
+ if (pattern_stmt)
+ stmts->safe_push (last_stmt);
+ return pattern_stmt;
+}
/* Mark statements that are involved in a pattern. */
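[Illustrative sketch, not part of the patch; the function and variable names below are invented for the example.] A loop of the following shape is the kind of access the new recognizer targets: a read-only load whose address is an invariant base plus a loop-varying index. On an LP64 target the "int" offsets are narrower than the 64-bit "double" elements, so the recognizer would also queue the widening conversion built by vect_add_conversion_to_patterm before forming the gather call, and the call itself is only built when the target supports the internal function (see the vect_check_gather_scatter / gs_info.ifn checks above).

/* Read-only indexed access: a candidate for IFN_GATHER_LOAD
   (or IFN_MASK_GATHER_LOAD in a fully-masked loop).  */
void
gather_example (double *restrict dest, const double *restrict src,
                const int *restrict idx, int n)
{
  for (int i = 0; i < n; ++i)
    dest[i] = src[idx[i]];
}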
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 88ac271093a..80f8a8dc93b 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -389,21 +389,19 @@ exist_non_indexing_operands_for_use_p (tree use, gimple *stmt)
{
if (is_gimple_call (stmt)
&& gimple_call_internal_p (stmt))
- switch (gimple_call_internal_fn (stmt))
- {
- case IFN_MASK_STORE:
- operand = gimple_call_arg (stmt, 3);
- if (operand == use)
- return true;
- /* FALLTHRU */
- case IFN_MASK_LOAD:
- operand = gimple_call_arg (stmt, 2);
- if (operand == use)
- return true;
- break;
- default:
- break;
- }
+ {
+ internal_fn ifn = gimple_call_internal_fn (stmt);
+ int mask_index = internal_fn_mask_index (ifn);
+ if (mask_index >= 0
+ && use == gimple_call_arg (stmt, mask_index))
+ return true;
+ if (internal_gather_scatter_fn_p (ifn)
+ && use == gimple_call_arg (stmt, 1))
+ return true;
+ if (ifn == IFN_MASK_STORE
+ && use == gimple_call_arg (stmt, 3))
+ return true;
+ }
return false;
}
@@ -1725,6 +1723,8 @@ static tree permute_vec_elements (tree, tree, tree, gimple *,
is the type of the vector being loaded or stored. MEMORY_ACCESS_TYPE
says how the load or store is going to be implemented and GROUP_SIZE
is the number of load or store statements in the containing group.
+ If the access is a gather load or scatter store, GS_INFO describes
+ its arguments.
Clear LOOP_VINFO_CAN_FULLY_MASK_P if a fully-masked loop is not
supported, otherwise record the required mask types. */
@@ -1732,7 +1732,8 @@ static tree permute_vec_elements (tree, tree, tree, gimple *,
static void
check_load_store_masking (loop_vec_info loop_vinfo, tree vectype,
vec_load_store_type vls_type, int group_size,
- vect_memory_access_type memory_access_type)
+ vect_memory_access_type memory_access_type,
+ gather_scatter_info *gs_info)
{
/* Invariant loads need no special support. */
if (memory_access_type == VMAT_INVARIANT)
@@ -1760,6 +1761,29 @@ check_load_store_masking (loop_vec_info loop_vinfo, tree vectype,
return;
}
+ if (memory_access_type == VMAT_GATHER_SCATTER)
+ {
+ gcc_assert (is_load);
+ tree offset_type = TREE_TYPE (gs_info->offset);
+ if (!internal_gather_scatter_fn_supported_p (IFN_MASK_GATHER_LOAD,
+ vectype,
+ gs_info->memory_type,
+ TYPE_SIGN (offset_type),
+ gs_info->scale))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "can't use a fully-masked loop because the"
+ " target doesn't have an appropriate masked"
+ " gather load instruction.\n");
+ LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
+ return;
+ }
+ unsigned int ncopies = vect_get_num_copies (loop_vinfo, vectype);
+ vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype);
+ return;
+ }
+
if (memory_access_type != VMAT_CONTIGUOUS
&& memory_access_type != VMAT_CONTIGUOUS_PERMUTE)
{
@@ -2591,6 +2615,31 @@ vect_build_gather_load_calls (gimple *stmt, gimple_stmt_iterator *gsi,
}
}
+/* Prepare the base and offset in GS_INFO for vectorization.
+ Set *DATAREF_PTR to the loop-invariant base address and *VEC_OFFSET
+ to the vectorized offset argument for the first copy of STMT. STMT
+ is the statement described by GS_INFO and LOOP is the containing loop. */
+
+static void
+vect_get_gather_scatter_ops (struct loop *loop, gimple *stmt,
+ gather_scatter_info *gs_info,
+ tree *dataref_ptr, tree *vec_offset)
+{
+ gimple_seq stmts = NULL;
+ *dataref_ptr = force_gimple_operand (gs_info->base, &stmts, true, NULL_TREE);
+ if (stmts != NULL)
+ {
+ basic_block new_bb;
+ edge pe = loop_preheader_edge (loop);
+ new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
+ gcc_assert (!new_bb);
+ }
+ tree offset_type = TREE_TYPE (gs_info->offset);
+ tree offset_vectype = get_vectype_for_scalar_type (offset_type);
+ *vec_offset = vect_get_vec_def_for_operand (gs_info->offset, stmt,
+ offset_vectype);
+}
+
/* Check and perform vectorization of BUILT_IN_BSWAP{16,32,64}. */
static bool
@@ -2780,7 +2829,7 @@ vectorizable_call (gimple *gs, gimple_stmt_iterator *gsi, gimple **vec_stmt,
return false;
if (gimple_call_internal_p (stmt)
- && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
+ && (internal_load_fn_p (gimple_call_internal_fn (stmt))
|| gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
/* Handled by vectorizable_load and vectorizable_store. */
return false;
@@ -5965,7 +6014,7 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
if (loop_vinfo
&& LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
check_load_store_masking (loop_vinfo, vectype, vls_type, group_size,
- memory_access_type);
+ memory_access_type, &gs_info);
STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
/* The SLP costs are calculated during SLP analysis. */
@@ -6937,7 +6986,11 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
else
{
gcall *call = dyn_cast <gcall *> (stmt);
- if (!call || !gimple_call_internal_p (call, IFN_MASK_LOAD))
+ if (!call || !gimple_call_internal_p (call))
+ return false;
+
+ internal_fn ifn = gimple_call_internal_fn (call);
+ if (!internal_load_fn_p (ifn))
return false;
scalar_dest = gimple_call_lhs (call);
@@ -6952,9 +7005,13 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
return false;
}
- mask = gimple_call_arg (call, 2);
- if (!vect_check_load_store_mask (stmt, mask, &mask_vectype))
- return false;
+ int mask_index = internal_fn_mask_index (ifn);
+ if (mask_index >= 0)
+ {
+ mask = gimple_call_arg (call, mask_index);
+ if (!vect_check_load_store_mask (stmt, mask, &mask_vectype))
+ return false;
+ }
}
if (!STMT_VINFO_DATA_REF (stmt_info))
@@ -7078,7 +7135,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
TYPE_MODE (mask_vectype), true))
return false;
}
- else if (memory_access_type == VMAT_GATHER_SCATTER)
+ else if (memory_access_type == VMAT_GATHER_SCATTER && gs_info.decl)
{
tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gs_info.decl));
tree masktype
@@ -7092,7 +7149,8 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
return false;
}
}
- else if (memory_access_type != VMAT_LOAD_STORE_LANES)
+ else if (memory_access_type != VMAT_LOAD_STORE_LANES
+ && memory_access_type != VMAT_GATHER_SCATTER)
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -7109,7 +7167,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
if (loop_vinfo
&& LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
check_load_store_masking (loop_vinfo, vectype, VLS_LOAD, group_size,
- memory_access_type);
+ memory_access_type, &gs_info);
STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
/* The SLP costs are calculated during SLP analysis. */
@@ -7131,7 +7189,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
ensure_base_align (dr);
- if (memory_access_type == VMAT_GATHER_SCATTER)
+ if (memory_access_type == VMAT_GATHER_SCATTER && gs_info.decl)
{
vect_build_gather_load_calls (stmt, gsi, vec_stmt, &gs_info, mask);
return true;
@@ -7576,6 +7634,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
aggr_type = vectype;
tree vec_mask = NULL_TREE;
+ tree vec_offset = NULL_TREE;
prev_stmt_info = NULL;
poly_uint64 group_elt = 0;
vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
@@ -7618,6 +7677,12 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi,
stmt, diff);
}
+ else if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+ {
+ vect_get_gather_scatter_ops (loop, stmt, &gs_info,
+ &dataref_ptr, &vec_offset);
+ inv_p = false;
+ }
else
dataref_ptr
= vect_create_data_ref_ptr (first_stmt, aggr_type, at_loop,
@@ -7633,6 +7698,13 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
if (dataref_offset)
dataref_offset = int_const_binop (PLUS_EXPR, dataref_offset,
TYPE_SIZE_UNIT (aggr_type));
+ else if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+ {
+ gimple *def_stmt;
+ vect_def_type dt;
+ vect_is_simple_use (vec_offset, loop_vinfo, &def_stmt, &dt);
+ vec_offset = vect_get_vec_def_for_stmt_copy (dt, vec_offset);
+ }
else
dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt,
TYPE_SIZE_UNIT (aggr_type));
@@ -7721,6 +7793,24 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
{
unsigned int align, misalign;
+ if (memory_access_type == VMAT_GATHER_SCATTER)
+ {
+ tree scale = size_int (gs_info.scale);
+ gcall *call;
+ if (masked_loop_p)
+ call = gimple_build_call_internal
+ (IFN_MASK_GATHER_LOAD, 4, dataref_ptr,
+ vec_offset, scale, final_mask);
+ else
+ call = gimple_build_call_internal
+ (IFN_GATHER_LOAD, 3, dataref_ptr,
+ vec_offset, scale);
+ gimple_call_set_nothrow (call, true);
+ new_stmt = call;
+ data_ref = NULL_TREE;
+ break;
+ }
+
align = DR_TARGET_ALIGNMENT (dr);
if (alignment_support_scheme == dr_aligned)
{
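[Illustrative sketch, not part of the patch; the names below are invented for the example.] In a fully-masked loop, check_load_store_masking first confirms via internal_gather_scatter_fn_supported_p that a masked gather is available, and vectorizable_load then emits IFN_MASK_GATHER_LOAD with the mask as its fourth argument, as in the hunks above. A conditional access such as the one below reaches that path with a non-null mask; the unconditional loop from the earlier example would likewise take the masked form in a fully-masked loop, using just the loop mask.

/* Conditional gather: the load is guarded, so the call's mask
   reflects the idx[i] >= 0 condition as well as the loop mask.  */
void
masked_gather_example (double *restrict dest, const double *restrict src,
                       const long *restrict idx, int n)
{
  for (int i = 0; i < n; ++i)
    if (idx[i] >= 0)
      dest[i] = src[idx[i]];
}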
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index e03751ce4d1..125e178136b 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -844,7 +844,12 @@ typedef struct _stmt_vec_info {
/* Information about a gather/scatter call. */
struct gather_scatter_info {
- /* The FUNCTION_DECL for the built-in gather/scatter function. */
+ /* The internal function to use for the gather/scatter operation,
+ or IFN_LAST if a built-in function should be used instead. */
+ internal_fn ifn;
+
+ /* The FUNCTION_DECL for the built-in gather/scatter function,
+ or null if an internal function should be used instead. */
tree decl;
/* The loop-invariant base value. */
@@ -862,6 +867,12 @@ struct gather_scatter_info {
/* The type of the vectorized offset. */
tree offset_vectype;
+
+ /* The type of the scalar elements after loading or before storing. */
+ tree element_type;
+
+ /* The type of the scalar elements being loaded or stored. */
+ tree memory_type;
};
/* Access Functions. */
@@ -1529,7 +1540,7 @@ extern void duplicate_and_interleave (gimple_seq *, tree, vec<tree>,
Additional pattern recognition functions can (and will) be added
in the future. */
typedef gimple *(* vect_recog_func_ptr) (vec<gimple *> *, tree *, tree *);
-#define NUM_PATTERNS 14
+#define NUM_PATTERNS 15
void vect_pattern_recog (vec_info *);
/* In tree-vectorizer.c. */
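[Illustrative contrast case, not part of the patch; the names below are invented for the example.] Scatter stores are deliberately left alone for now: vect_try_gather_scatter_pattern returns NULL for any data reference that is not a read ("Reject stores for now"), so a loop like the following is not rewritten into an internal-function call by this patch.

/* Scatter store: rejected by vect_try_gather_scatter_pattern because
   DR_IS_READ is false, so no internal-function call is built yet.  */
void
scatter_example (double *restrict dest, const double *restrict src,
                 const int *restrict idx, int n)
{
  for (int i = 0; i < n; ++i)
    dest[idx[i]] = src[i];
}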